In [6]:
import json
import re
import time
from typing import Dict, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry


BASE_URL = "https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists_(Belgium,_finance)/"
RANGES = ["1-1000", "1001-2000", "2001-4000", "4001-6000", "6001-8000", "8001-10000"]
OUTPUT_JSON = "fr-frequencies.json"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    )
}


def make_session() -> requests.Session:
    """Create a requests session with polite retry/backoff."""
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET"]),
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    session.headers.update(HEADERS)
    return session


def page_url_for_range(range_label: str) -> str:
    """Construct the subpage URL like .../(1-1000)."""
    return urljoin(BASE_URL, f"{range_label}")


def fetch_html(session: requests.Session, url: str) -> BeautifulSoup:
    """Fetch and parse HTML for a URL."""
    resp = session.get(url, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.content, "html.parser")


def extract_words_from_page(soup: BeautifulSoup) -> List[str]:
    """
    Extract French words from the page.
    The page layout uses a <table> containing <ul><li> entries like:
      8001. <span class="Latn" lang="fr"><a href="/wiki/obtenue#French">obtenue</a></span>
    Strategy:
      - Look inside tables (to stay scoped to the frequency list)
      - Find <a> with href containing '#French'
      - Ensure the anchor is within an element marked lang="fr"
    """
    words: List[str] = []

    # Restrict to tables to avoid side links / navigation
    tables = soup.find_all("table")
    for table in tables:
        # Find anchors that link to the French section
        for a in table.select('a[href*="#French"]'):
            # Ensure we are in a French-language span/container
            if a.find_parent(attrs={"lang": "fr"}) is None:
                continue
            text = a.get_text(strip=True)
            if not text:
                continue
            words.append(text)

    # De-duplicate while preserving order
    seen = set()
    deduped = []
    for w in words:
        if w not in seen:
            seen.add(w)
            deduped.append(w)
    return deduped


def fetch_french_frequency_words() -> Dict[str, List[str]]:
    """
    Fetch French frequency words from the five (actually six) subpages.
    Returns a dict keyed by the full range string, e.g. "1-1000".
    """
    session = make_session()
    result: Dict[str, List[str]] = {}

    print("French Frequency Words Scraper (Belgium, finance)")
    print("=" * 50)

    for i, range_label in enumerate(RANGES, 1):
        url = page_url_for_range(range_label)
        print(f"[{i}/{len(RANGES)}] Fetching {range_label}: {url}")
        try:
            soup = fetch_html(session, url)
            words = extract_words_from_page(soup)
            print(f"  → Found {len(words)} words")
            result[range_label.split("-")[-1]] = words
        except requests.HTTPError as e:
            print(f"  ! HTTP error for {range_label}: {e}")
            result[range_label] = []
        except requests.RequestException as e:
            print(f"  ! Request error for {range_label}: {e}")
            result[range_label] = []
        except Exception as e:
            print(f"  ! Parsing error for {range_label}: {e}")
            result[range_label] = []

        # Be polite to the server
        time.sleep(0.6)

    return result


def save_to_json(data: Dict[str, List[str]], filename: str = OUTPUT_JSON) -> bool:
    """Save the frequency dictionary to a JSON file (UTF-8, pretty)."""
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"\nData saved to {filename}")
        return True
    except Exception as e:
        print(f"\nError saving to JSON: {e}")
        return False


def main():
    data = fetch_french_frequency_words()

    nonempty = {k: v for k, v in data.items() if v}
    print("\nSummary")
    print("-" * 50)
    for rng in RANGES:
        count = len(data.get(rng, []))
        print(f"  {rng:>11}: {count:>5} words")

    if nonempty:
        save_to_json(data, OUTPUT_JSON)
        print("\nDone.")
    else:
        print("\nNo data extracted. Check the site structure or selectors.")


if __name__ == "__main__":
    main()


French Frequency Words Scraper (Belgium, finance)
[1/6] Fetching 1-1000: https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists_(Belgium,_finance)/1-1000
  → Found 919 words
[2/6] Fetching 1001-2000: https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists_(Belgium,_finance)/1001-2000
  → Found 866 words
[3/6] Fetching 2001-4000: https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists_(Belgium,_finance)/2001-4000
  → Found 1693 words
[4/6] Fetching 4001-6000: https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists_(Belgium,_finance)/4001-6000
  → Found 1667 words
[5/6] Fetching 6001-8000: https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists_(Belgium,_finance)/6001-8000
  → Found 1629 words
[6/6] Fetching 8001-10000: https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists_(Belgium,_finance)/8001-10000
  → Found 1549 words

Summary
--------------------------------------------------
       1-1000:     0 words
    1001-2000:     0 wo

In [9]:
import requests
from bs4 import BeautifulSoup
import time
import json
from typing import Dict, Tuple, Optional
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Define the file path
file_path = f"items.json"

# Write the lemmas list to the JSON file
try:
    with open(file_path) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        items = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    items = []
except ValueError as e:
    print(f"Error: {e}")
    items = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {file_path}")
    items = []

# Define the file path
freq = f"fr-frequencies.json"

# Write the lemmas list to the JSON file
try:
    with open(freq) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        frequencies = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    items = []
except ValueError as e:
    print(f"Error: {e}")
    items = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {file_path}")
    items = []

print(items["keys"][:10])
print(frequencies.keys())

[{'w': 'a b c', 'r': 200}, {'w': 'a capella', 'r': 200}, {'w': 'a cappella', 'r': 200}, {'w': 'a fortiori', 'r': 200}, {'w': 'a giorno', 'r': 200}, {'w': 'a posteriori', 'r': 200}, {'w': 'a priori', 'r': 200}, {'w': 'ab intestat', 'r': 200}, {'w': 'abaca', 'r': 200}, {'w': 'abaissable', 'r': 200}]
dict_keys(['1000', '2000', '4000', '6000', '8000', '10000'])


In [10]:
import random
from math import floor

new_keys = []
for item in tqdm(items["keys"], desc="Processing items"):
    if item["w"] in frequencies["1000"]:
        val = random.randint(1, 400)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["2000"]:
        val = random.randint(400, 500)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["4000"]:
        val = random.randint(500, 700)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["6000"]:
        val = random.randint(700, 800)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["8000"]:
        val = random.randint(800, 875)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    elif item["w"] in frequencies["10000"]:
        val = random.randint(875, 950)
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    else:
        val = floor(random.randint(950, 2000))
        val -= val%5
        new_keys.append(
            {"w": item["w"], "r": val}
        )
    

# order the items by rating
new_keys.sort(key=lambda x: x["r"])
items["keys"] = new_keys


new_dis = []
for item in tqdm(items["distractors"], desc="Processing items"):
    val = floor(random.randint(0, 2000))
    val -= val%5
    new_dis.append(
        {"w": item["w"], "r": val}
    )
new_dis.sort(key=lambda x: x["r"])
items["distractors"] = new_dis

# Write the updated items back to the JSON file
with open(file_path, "w") as f:
    json.dump(items, f, indent=0, ensure_ascii=False)

Processing items: 100%|█| 45270/45270 [00:03<00:00, 12670.96it/s
Processing items: 100%|█| 50000/50000 [00:00<00:00, 751473.45it/
