In [18]:
import requests
from bs4 import BeautifulSoup
import json
import re
import time

def fetch_ukrainian_frequency_words():
    """
    Fetches Ukrainian frequency words from Wiktionary and organizes them by frequency ranges.
    Returns a dictionary with frequency ranges as keys and word lists as values.
    """
    url = "https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Ukrainian/Mixed_(2012-2022)"
    
    # Add headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        print("Fetching page...")
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Dictionary to store frequency ranges
        frequency_dict = {}
        
        # Find all sections with frequency ranges
        # Look for h2 elements that contain frequency ranges like "1-1000", "1001-2000", etc.
        headings = soup.find_all('h2', id=re.compile(r'\d+-\d+'))
        
        if not headings:
            # Alternative approach: look for h2 elements with text containing ranges
            all_h2 = soup.find_all('h2')
            headings = [h for h in all_h2 if h.get_text() and re.search(r'\d+-\d+', h.get_text())]
        
        print(f"Found {len(headings)} frequency range sections")
        
        for heading in headings:
            # Extract the range from heading text
            heading_text = heading.get_text().strip()
            range_match = re.search(r'(\d+)-(\d+)', heading_text)
            
            if not range_match:
                continue
                
            start_num, end_num = range_match.groups()
            range_key = end_num  # Use the end number as the key (e.g., "1000", "2000")
            
            print(f"Processing range {start_num}-{end_num}...")
            
            # Find the parent div of the heading
            heading_div = heading.find_parent('div', class_='mw-heading')
            words = []
            
            if heading_div:
                # Look for the next paragraph element after the heading div
                next_element = heading_div.next_sibling
                
                # Skip whitespace and find the first paragraph
                while next_element and (not hasattr(next_element, 'name') or next_element.name != 'p'):
                    next_element = next_element.next_sibling
                
                # Process all consecutive paragraph elements in this section
                while next_element and hasattr(next_element, 'name') and next_element.name == 'p':
                    # Extract all links from this paragraph
                    links = next_element.find_all('a')
                    for link in links:
                        # Check if it's a Ukrainian word link (contains #Ukrainian in href)
                        href = link.get('href', '')
                        if '#Ukrainian' in href:
                            word = link.get_text().strip()
                            if word and len(word) > 0:
                                words.append(word)
                    
                    # Move to next sibling
                    next_element = next_element.next_sibling
                    
                    # Stop if we hit another heading (next section)
                    if next_element and hasattr(next_element, 'name'):
                        if next_element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                            break
                        # Also check for div with heading class
                        if next_element.name == 'div' and 'mw-heading' in next_element.get('class', []):
                            break
            
            if words:
                frequency_dict[range_key] = words
                print(f"  Found {len(words)} words for range {start_num}-{end_num}")
            else:
                print(f"  No words found for range {start_num}-{end_num}")
            
            # Small delay to be respectful to the server
            time.sleep(0.5)
        
        return frequency_dict
        
    except requests.RequestException as e:
        print(f"Error fetching the page: {e}")
        return {}
    except Exception as e:
        print(f"Error processing the page: {e}")
        return {}

def save_to_json(data, filename="ukrainian_frequency_words.json"):
    """
    Save the frequency dictionary to a JSON file.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"Data saved to {filename}")
        return True
    except Exception as e:
        print(f"Error saving to JSON: {e}")
        return False

def main():
    """
    Main function to run the scraper.
    """
    print("Ukrainian Frequency Words Scraper")
    print("=" * 40)
    
    # Fetch the words
    frequency_data = fetch_ukrainian_frequency_words()
    
    if frequency_data:
        print(f"\nSuccessfully extracted {len(frequency_data)} frequency ranges:")
        for range_key, words in frequency_data.items():
            print(f"  Range ending at {range_key}: {len(words)} words")
        
        # Save to JSON
        if save_to_json(frequency_data):
            print("\nScript completed successfully!")
        else:
            print("\nScript completed but failed to save JSON file.")
    else:
        print("No data was extracted. Please check the website structure.")

if __name__ == "__main__":
    main()

Ukrainian Frequency Words Scraper
Fetching page...
Found 10 frequency range sections
Processing range 1-1000...
  Found 947 words for range 1-1000
Processing range 1001-2000...
  Found 833 words for range 1001-2000
Processing range 2001-3000...
  Found 754 words for range 2001-3000
Processing range 3001-4000...
  Found 700 words for range 3001-4000
Processing range 4001-5000...
  Found 667 words for range 4001-5000
Processing range 5001-6000...
  Found 608 words for range 5001-6000
Processing range 6001-7000...
  Found 622 words for range 6001-7000
Processing range 7001-8000...
  Found 557 words for range 7001-8000
Processing range 8001-9000...
  Found 553 words for range 8001-9000
Processing range 9001-10000...
  Found 576 words for range 9001-10000

Successfully extracted 10 frequency ranges:
  Range ending at 1000: 947 words
  Range ending at 2000: 833 words
  Range ending at 3000: 754 words
  Range ending at 4000: 700 words
  Range ending at 5000: 667 words
  Range ending at 6000: 

## Initializing the items rating

In [21]:
import requests
from bs4 import BeautifulSoup
import time
import json
from typing import Dict, Tuple, Optional
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Define the file path
file_path = f"items.json"

# Write the lemmas list to the JSON file
try:
    with open(file_path) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        items = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    items = []
except ValueError as e:
    print(f"Error: {e}")
    items = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {file_path}")
    items = []

# Define the file path
freq = f"ukrainian_frequencies.json"

# Write the lemmas list to the JSON file
try:
    with open(freq) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        frequencies = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    items = []
except ValueError as e:
    print(f"Error: {e}")
    items = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {file_path}")
    items = []

print(items["keys"][:10])
print(frequencies["1000"][90:100])


[{'w': 'а', 'r': 200}, {'w': 'а-а', 'r': 200}, {'w': 'а-а-а', 'r': 200}, {'w': 'а-а-а-а', 'r': 200}, {'w': 'а-конто', 'r': 200}, {'w': 'а-ля', 'r': 200}, {'w': 'а-темпо', 'r': 200}, {'w': 'а-форфе', 'r': 200}, {'w': 'аакуватий', 'r': 200}, {'w': 'аахенський', 'r': 200}]
['населення', 'зі', 'можуть', 'цьому', 'сказав', 'російських', 'якщо', 'окупанти', 'Росія', 'навіть']


In [23]:
import random
from math import floor

new_keys = []
for item in tqdm(items["keys"], desc="Processing items"):
    if item["w"] in frequencies["1000"]:
        val = random.randint(1, 400)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["2000"]:
        val = random.randint(400, 500)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["3000"]:
        val = random.randint(500, 600)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["4000"]:
        val = random.randint(600, 700)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["5000"]:
        val = random.randint(700, 750)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["6000"]:
        val = random.randint(750, 800)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["7000"]:
        val = random.randint(750, 800)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["8000"]:
        val = random.randint(800, 850)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["9000"]:
        val = random.randint(850, 900)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["10000"]:
        val = random.randint(900, 950)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    else:
        val = floor(random.randint(950, 2000))
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    

# order the items by rating
new_keys.sort(key=lambda x: x["r"])
items["keys"] = new_keys


new_dis = []
for item in tqdm(items["distractors"], desc="Processing items"):
    val = floor(random.randint(0, 2000))
    val -= val%5
    new_dis.append(
        {"w": item["w"], "r": val}
    )
new_dis.sort(key=lambda x: x["r"])
items["distractors"] = new_dis

# Write the updated items back to the JSON file
with open(file_path, "w") as f:
    json.dump(items, f, indent=0, ensure_ascii=False)

Processing items: 100%|█| 269430/269430 [00:16<00:00, 16612.26it
Processing items: 100%|█| 49999/49999 [00:00<00:00, 817191.71it/
