In [10]:
# Simple ZIP File Scraper - Step by Step
# This notebook scrapes ZIP files from kmr.gov.ua and processes JSON data

## Cell 1: Import Required Libraries
import pathlib
import zipfile
from datetime import datetime
import os
import json
import re
import time
import requests
import lxml.html
import sqlite3
from requests.exceptions import RequestException

print("All libraries imported successfully!")

## Cell 2: Configuration and Setup
# Configuration
URL_FULL = "https://kmr.gov.ua/uk/result_golosuvanya?title=&field_start_date_n_h_value%5Bmin%5D=&field_start_date_n_h_value%5Bmax%5D=&page="
DB_NAME = 'voting_json_data.db'
MAX_PAGES = 18  # Adjust as needed
DELAY_BETWEEN_REQUESTS = 2  # seconds

# Create directories
os.makedirs("zip_files", exist_ok=True)
os.makedirs("extracted_files", exist_ok=True)

# Initialize tracking variables
timestamp = int(time.time())
counter = 0
processed_files = []

print(f"Setup complete. Timestamp: {timestamp}")
print(f"Will process pages 1 to {MAX_PAGES}")

All libraries imported successfully!
Setup complete. Timestamp: 1748345800
Will process pages 1 to 18


In [11]:


## Cell 3: Helper Functions
def safe_request(url, max_retries=3, delay=5):
    """Safely request a page with retries"""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response
        except RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(delay)
            else:
                print(f"Failed to get {url} after {max_retries} attempts")
                return None

def preprocess_json(file_path):
    """Preprocess JSON file to fix formatting issues"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # Correct invalid newline characters within strings
        corrected_content = re.sub(r'(?<!\\)"\s*\n\s*"', r'\"', content)
        return corrected_content
    except Exception as e:
        print(f"Error preprocessing {file_path}: {e}")
        return None

def setup_database():
    """Initialize SQLite database with proper schema"""
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    
    # Drop table if it exists to ensure clean schema
    cursor.execute("DROP TABLE IF EXISTS ua_kmr_voting_json")
    
    # Create table with all required columns
    cursor.execute('''
        CREATE TABLE ua_kmr_voting_json (
            id TEXT,
            file_name TEXT,
            Num_Question TEXT,
            source_url TEXT,
            orgName TEXT,
            SName TEXT,
            GLType TEXT,
            GLTime TEXT,
            PD_NPP TEXT,
            GL_Text TEXT,
            DocTime TEXT,
            DPName TEXT,
            DPGolos TEXT,
            RESULT TEXT,
            retrieved_at TEXT,
            PRIMARY KEY (id, file_name, DPName)
        )
    ''')
    
    conn.commit()
    print("Database table created successfully!")
    return conn

print("Helper functions defined!")

## Cell 4: Initialize Database
# First, let's check if database exists and what tables it has
import os
if os.path.exists(DB_NAME):
    print(f"Database {DB_NAME} already exists. Checking schema...")
    temp_conn = sqlite3.connect(DB_NAME)
    cursor = temp_conn.cursor()
    
    # Check existing tables
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print(f"Existing tables: {tables}")
    
    # If our table exists, check its schema
    if ('ua_kmr_voting_json',) in tables:
        cursor.execute("PRAGMA table_info(ua_kmr_voting_json)")
        columns = cursor.fetchall()
        print("Existing columns:")
        for col in columns:
            print(f"  {col[1]} ({col[2]})")
    
    temp_conn.close()

# Now create/recreate the database with correct schema
conn = setup_database()
print("Database initialized successfully!")


Helper functions defined!
Database voting_json_data.db already exists. Checking schema...
Existing tables: [('ua_kmr_voting_json',)]
Existing columns:
  id (TEXT)
  file_name (TEXT)
  Num_Question (TEXT)
  source_url (TEXT)
  orgName (TEXT)
  SName (TEXT)
  GLType (TEXT)
  GLTime (TEXT)
  PD_NPP (TEXT)
  GL_Text (TEXT)
  DocTime (TEXT)
  DPName (TEXT)
  DPGolos (TEXT)
  RESULT (TEXT)
  retrieved_at (TEXT)
Database table created successfully!
Database initialized successfully!


In [12]:

## Cell 5: Step 1 - Discover ZIP File Links
print("Step 1: Discovering ZIP file links...")

zip_links = []
failed_pages = []

for i in range(0, MAX_PAGES):
    url = URL_FULL + str(i)
    print(f"Processing page {i}: {url}")
    
    response = safe_request(url)
    if not response:
        failed_pages.append(i)
        continue
    
    try:
        doc = lxml.html.fromstring(response.text)
        blocks = doc.xpath('//div[@class="view-content"]')
        
        for block in blocks[:1]:  # Process first block only
            for element in block.getchildren():
                if 'zip' in element.text_content().lower():
                    links = element.xpath('.//a/@href')
                    if len(links) >= 2:
                        zip_link = links[1]  # Use second link
                        zip_links.append(zip_link)
                        print(f"  Found ZIP: {zip_link}")
    
    except Exception as e:
        print(f"  Error processing page {i}: {e}")
        failed_pages.append(i)
    
    # Delay between requests to be respectful
    time.sleep(DELAY_BETWEEN_REQUESTS)

print(f"\nDiscovery complete!")
print(f"Found {len(zip_links)} ZIP files")
print(f"Failed pages: {failed_pages}")


Step 1: Discovering ZIP file links...
Processing page 0: https://kmr.gov.ua/uk/result_golosuvanya?title=&field_start_date_n_h_value%5Bmin%5D=&field_start_date_n_h_value%5Bmax%5D=&page=0
  Found ZIP: https://kmr.gov.ua/sites/default/files/15.05.2025_nonovyy.zip
  Found ZIP: https://kmr.gov.ua/sites/default/files/15.04.2025.zip
  Found ZIP: https://kmr.gov.ua/sites/default/files/prod03.04.2025.zip
  Found ZIP: https://kmr.gov.ua/sites/default/files/08.04.2025.zip
  Found ZIP: https://kmr.gov.ua/sites/default/files/03.04.2025_0.zip
  Found ZIP: https://kmr.gov.ua/sites/default/files/03.04.2025_prod.zip
Processing page 1: https://kmr.gov.ua/uk/result_golosuvanya?title=&field_start_date_n_h_value%5Bmin%5D=&field_start_date_n_h_value%5Bmax%5D=&page=1
  Found ZIP: https://kmr.gov.ua/sites/default/files/13.03.2025.zip
  Found ZIP: https://kmr.gov.ua/sites/default/files/20.02.2025.zip
  Found ZIP: https://kmr.gov.ua/sites/default/files/2025.01.30gg.zip
  Found ZIP: https://kmr.gov.ua/sites/defa

In [13]:

## Cell 6: Step 2 - Download ZIP Files
print("Step 2: Downloading ZIP files...")

downloaded_zips = []
download_failures = []

for i, link in enumerate(zip_links):
    print(f"Downloading {i+1}/{len(zip_links)}: {link}")
    
    response = safe_request(link)
    if not response:
        download_failures.append(link)
        continue
    
    try:
        # Create unique filename
        file_name = f"{timestamp}_{counter}.zip"
        zip_path = os.path.join("zip_files", file_name)
        
        # Save ZIP file
        with open(zip_path, "wb") as f:
            f.write(response.content)
        
        downloaded_zips.append({
            'path': zip_path,
            'source_url': link,
            'counter': counter
        })
        
        counter += 1
        print(f"  Saved: {zip_path}")
        
    except Exception as e:
        print(f"  Error saving {link}: {e}")
        download_failures.append(link)
    
    # Small delay between downloads
    time.sleep(1)

print(f"\nDownload complete!")
print(f"Downloaded: {len(downloaded_zips)} files")
print(f"Failed downloads: {len(download_failures)}")


Step 2: Downloading ZIP files...
Downloading 1/87: https://kmr.gov.ua/sites/default/files/15.05.2025_nonovyy.zip
  Saved: zip_files/1748345800_0.zip
Downloading 2/87: https://kmr.gov.ua/sites/default/files/15.04.2025.zip
  Saved: zip_files/1748345800_1.zip
Downloading 3/87: https://kmr.gov.ua/sites/default/files/prod03.04.2025.zip
  Saved: zip_files/1748345800_2.zip
Downloading 4/87: https://kmr.gov.ua/sites/default/files/08.04.2025.zip
  Saved: zip_files/1748345800_3.zip
Downloading 5/87: https://kmr.gov.ua/sites/default/files/03.04.2025_0.zip
  Saved: zip_files/1748345800_4.zip
Downloading 6/87: https://kmr.gov.ua/sites/default/files/03.04.2025_prod.zip
  Saved: zip_files/1748345800_5.zip
Downloading 7/87: https://kmr.gov.ua/sites/default/files/13.03.2025.zip
  Saved: zip_files/1748345800_6.zip
Downloading 8/87: https://kmr.gov.ua/sites/default/files/20.02.2025.zip
  Saved: zip_files/1748345800_7.zip
Downloading 9/87: https://kmr.gov.ua/sites/default/files/2025.01.30gg.zip
  Saved: z

In [14]:

## Cell 7: Step 3 - Extract ZIP Files
print("Step 3: Extracting ZIP files...")

extracted_paths = []
extraction_failures = []

for zip_info in downloaded_zips:
    zip_path = zip_info['path']
    extract_dir = os.path.join("extracted_files", f"extract_{zip_info['counter']}")
    
    print(f"Extracting: {zip_path}")
    
    try:
        os.makedirs(extract_dir, exist_ok=True)
        
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
        
        extracted_paths.append({
            'extract_dir': extract_dir,
            'source_url': zip_info['source_url'],
            'zip_path': zip_path
        })
        
        print(f"  Extracted to: {extract_dir}")
        
    except Exception as e:
        print(f"  Error extracting {zip_path}: {e}")
        extraction_failures.append(zip_path)

print(f"\nExtraction complete!")
print(f"Extracted: {len(extracted_paths)} archives")
print(f"Failed extractions: {len(extraction_failures)}")

Step 3: Extracting ZIP files...
Extracting: zip_files/1748345800_0.zip
  Extracted to: extracted_files/extract_0
Extracting: zip_files/1748345800_1.zip
  Extracted to: extracted_files/extract_1
Extracting: zip_files/1748345800_2.zip
  Extracted to: extracted_files/extract_2
Extracting: zip_files/1748345800_3.zip
  Extracted to: extracted_files/extract_3
Extracting: zip_files/1748345800_4.zip
  Extracted to: extracted_files/extract_4
Extracting: zip_files/1748345800_5.zip
  Extracted to: extracted_files/extract_5
Extracting: zip_files/1748345800_6.zip
  Extracted to: extracted_files/extract_6
Extracting: zip_files/1748345800_7.zip
  Extracted to: extracted_files/extract_7
Extracting: zip_files/1748345800_8.zip
  Extracted to: extracted_files/extract_8
Extracting: zip_files/1748345800_9.zip
  Extracted to: extracted_files/extract_9
Extracting: zip_files/1748345800_10.zip
  Extracted to: extracted_files/extract_10
Extracting: zip_files/1748345800_11.zip
  Extracted to: extracted_files/ext

In [6]:

## Cell 8: Step 4 - Find and Process JSON Files
print("Step 4: Finding and processing JSON files...")

all_json_files = []
json_processing_errors = []

for extract_info in extracted_paths:
    extract_dir = extract_info['extract_dir']
    source_url = extract_info['source_url']
    
    print(f"Scanning: {extract_dir}")
    
    try:
        # Find all JSON files in the extracted directory
        for root, dirs, files in os.walk(extract_dir):
            for file in files:
                if file.endswith('.json'):
                    json_path = os.path.join(root, file)
                    all_json_files.append({
                        'path': json_path,
                        'source_url': source_url,
                        'extract_dir': extract_dir
                    })
    
    except Exception as e:
        print(f"  Error scanning {extract_dir}: {e}")

print(f"Found {len(all_json_files)} JSON files")



Step 4: Finding and processing JSON files...
Scanning: extracted_files/extract_0
Scanning: extracted_files/extract_1
Scanning: extracted_files/extract_2
Scanning: extracted_files/extract_3
Scanning: extracted_files/extract_4
Scanning: extracted_files/extract_5
Scanning: extracted_files/extract_6
Scanning: extracted_files/extract_7
Scanning: extracted_files/extract_8
Scanning: extracted_files/extract_9
Scanning: extracted_files/extract_10
Scanning: extracted_files/extract_11
Scanning: extracted_files/extract_12
Scanning: extracted_files/extract_13
Scanning: extracted_files/extract_14
Scanning: extracted_files/extract_15
Scanning: extracted_files/extract_16
Scanning: extracted_files/extract_17
Scanning: extracted_files/extract_18
Scanning: extracted_files/extract_19
Scanning: extracted_files/extract_20
Scanning: extracted_files/extract_21
Scanning: extracted_files/extract_22
Scanning: extracted_files/extract_23
Scanning: extracted_files/extract_24
Scanning: extracted_files/extract_25
Sca

In [7]:

# Process first few files as a test (remove [:5] to process all)
processed_count = 0
for json_info in all_json_files:  # Process first 10 for testing
    json_path = json_info['path']
    source_url = json_info['source_url']
    
    print(f"Processing: {os.path.basename(json_path)}")
    
    try:
        # Preprocess JSON
        corrected_json = preprocess_json(json_path)
        if not corrected_json:
            continue
        
        # Parse JSON
        json_data = json.loads(corrected_json)
        
        # Extract data
        file_name = os.path.basename(json_path)
        
        # Store in processed_files for next step
        processed_files.append({
            'file_name': file_name,
            'source_url': source_url,
            'json_data': json_data,
            'path': json_path
        })
        
        processed_count += 1
        
    except Exception as e:
        print(f"  Error processing {json_path}: {e}")
        json_processing_errors.append(json_path)

print(f"\nJSON processing complete!")
print(f"Successfully processed: {processed_count} files")
print(f"Processing errors: {len(json_processing_errors)}")


Processing: 13.03.2025.json
Processing: 250220_13.json
Processing: 250220_12.json
  Error processing extracted_files/extract_1/20.02.2025/250220_12.json: Invalid control character at: line 8 column 197 (char 400)
Processing: 250220_15.json
Processing: 250220_19.json
Processing: 250220_18.json
Processing: 250220_14.json
Processing: 250220_17.json
Processing: 250220_16.json
Processing: 250220_11.json
Processing: 250130_70.json
Processing: 250130_27.json
Processing: 250130_31.json
Processing: 250130_89.json
Processing: 250130_66.json
Processing: 250130_6.json
Processing: 250130_11.json
Processing: 250130_50.json
Processing: 250130_93.json
Processing: 250130_85.json
Processing: 250130_84.json
Processing: 250130_92.json
Processing: 250130_10.json
Processing: 250130_88.json
Processing: 250130_67.json
Processing: 250130_7.json
Processing: 250130_30.json
Processing: 250130_26.json
Processing: 250130_71.json
Processing: 250130_40.json
Processing: 250130_95.json
Processing: 250130_83.json
Proces

In [8]:
## Cell 9: Step 5 - Store Data in Database
print("Step 5: Storing data in database...")

stored_records = 0
storage_errors = []

cursor = conn.cursor()

for file_info in processed_files:
    try:
        json_data = file_info['json_data']
        file_name = file_info['file_name']
        source_url = file_info['source_url']
        
        # Prepare main document data
        doc_data = {
            'id': f"{file_name}_{timestamp}",
            'file_name': file_name,
            'Num_Question': file_name.split('_')[1].split('.')[0] if '_' in file_name else '',
            'source_url': source_url,
            'orgName': "КИЇВСЬКА МІСЬКА РАДА",
            'SName': json_data.get('SName', ''),
            'GLType': json_data.get('GLType', ''),
            'GLTime': json_data.get('GLTime', ''),
            'PD_NPP': json_data.get('PD_NPP', ''),
            'GL_Text': json_data.get('GL_Text', ''),
            'DocTime': json_data.get('DocTime', ''),
            'RESULT': json_data.get('RESULT', ''),
            'retrieved_at': datetime.now().isoformat()
        }
        
        # Process DP List
        dp_list = json_data.get('DPList', [])
        combined_data = []
        
        if dp_list:
            for dp_entry in dp_list:
                combined_entry = doc_data.copy()
                combined_entry.update({
                    'DPName': dp_entry.get('DPName', ''),
                    'DPGolos': 'Відсутній' if dp_entry.get('DPGolos') == '.........' else dp_entry.get('DPGolos', '')
                })
                combined_data.append(combined_entry)
        else:
            doc_data.update({'DPName': '', 'DPGolos': ''})
            combined_data.append(doc_data)
        
        # Insert data with simpler INSERT OR REPLACE
        for entry in combined_data:
            try:
                cursor.execute('''
                    INSERT OR REPLACE INTO ua_kmr_voting_json 
                    (id, file_name, Num_Question, source_url, orgName, SName, GLType, 
                     GLTime, PD_NPP, GL_Text, DocTime, DPName, DPGolos, RESULT, retrieved_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    entry['id'], entry['file_name'], entry['Num_Question'], 
                    entry['source_url'], entry['orgName'], entry['SName'], 
                    entry['GLType'], entry['GLTime'], entry['PD_NPP'], 
                    entry['GL_Text'], entry['DocTime'], entry['DPName'], 
                    entry['DPGolos'], entry['RESULT'], entry['retrieved_at']
                ))
            except sqlite3.Error as e:
                print(f"    SQL Error for {entry['file_name']}: {e}")
                continue
        
        stored_records += len(combined_data)
        print(f"  Stored {len(combined_data)} records from {file_name}")
        
    except Exception as e:
        print(f"  Error storing {file_info['file_name']}: {e}")
        storage_errors.append(file_info['file_name'])

# Commit all changes
conn.commit()

print(f"\nDatabase storage complete!")
print(f"Stored: {stored_records} records")
print(f"Storage errors: {len(storage_errors)}")


Step 5: Storing data in database...
  Stored 121 records from 13.03.2025.json
  Stored 121 records from 250220_13.json
  Stored 121 records from 250220_15.json
  Stored 121 records from 250220_19.json
  Stored 121 records from 250220_18.json
  Stored 121 records from 250220_14.json
  Stored 121 records from 250220_17.json
  Stored 121 records from 250220_16.json
  Stored 121 records from 250220_11.json
  Stored 121 records from 250130_70.json
  Stored 121 records from 250130_27.json
  Stored 121 records from 250130_31.json
  Stored 121 records from 250130_89.json
  Stored 121 records from 250130_66.json
  Stored 121 records from 250130_6.json
  Stored 121 records from 250130_11.json
  Stored 121 records from 250130_50.json
  Stored 121 records from 250130_93.json
  Stored 121 records from 250130_85.json
  Stored 121 records from 250130_84.json
  Stored 121 records from 250130_92.json
  Stored 121 records from 250130_10.json
  Stored 121 records from 250130_88.json
  Stored 121 records 

In [9]:
## Cell 10: Summary and Verification
print("="*50)
print("PROCESSING SUMMARY")
print("="*50)

# Query database to verify data
cursor.execute("SELECT COUNT(*) FROM ua_kmr_voting_json")
total_records = cursor.fetchone()[0]

cursor.execute("SELECT COUNT(DISTINCT file_name) FROM ua_kmr_voting_json")
unique_files = cursor.fetchone()[0]

print(f"Pages processed: {MAX_PAGES - 1}")
print(f"ZIP links found: {len(zip_links)}")
print(f"ZIP files downloaded: {len(downloaded_zips)}")
print(f"Archives extracted: {len(extracted_paths)}")
print(f"JSON files found: {len(all_json_files)}")
print(f"JSON files processed: {len(processed_files)}")
print(f"Database records: {total_records}")
print(f"Unique files in DB: {unique_files}")

print(f"\nFailed pages: {failed_pages}")
print(f"Download failures: {len(download_failures)}")
print(f"Extraction failures: {len(extraction_failures)}")
print(f"JSON processing errors: {len(json_processing_errors)}")
print(f"Storage errors: {len(storage_errors)}")

## Cell 11: Sample Data Verification
print("\nSample records from database:")
cursor.execute("SELECT file_name, GLType, SName, DPName LIMIT 5")
sample_records = cursor.fetchall()

for i, record in enumerate(sample_records, 1):
    print(f"{i}. File: {record[0]}, Type: {record[1]}, Session: {record[2]}, Deputy: {record[3]}")

# Close database connection
conn.close()
print("\nDatabase connection closed.")
print("Processing complete!")

PROCESSING SUMMARY
Pages processed: 17
ZIP links found: 81
ZIP files downloaded: 81
Archives extracted: 81
JSON files found: 6990
JSON files processed: 6747
Database records: 813223
Unique files in DB: 6747

Failed pages: []
Download failures: 0
Extraction failures: 0
JSON processing errors: 243
Storage errors: 0

Sample records from database:


OperationalError: no such column: file_name