## Download a Hunspell Dictionary
Clicking on [this link](https://mozilla-l10n.github.io/firefox-dictionaries/complete.html) you will find a list of available and up-to-date dictionaries.
Find the dictionary you want to train your model on. Once on the page of the dictionary you want to download, instead of clicking on "add to Firefox", right-click and select "copy the link". Then past the value to assign it to the variable `dictionary_url`.
Think to also set the value of the variable `locale`, checkout the column "Dictionary Locale" of the table in the list of the dictionaries, as they might not be shaped the same way, for example, Welsh is "cy_GB" but British English is "en-GB".

In [1]:
%pip install requests spylls


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
import zipfile
import io
import os
import shutil

# Replace this link
# for Breton (be): https://addons.mozilla.org/firefox/downloads/file/4270474/difazier_an_drouizig-0.17resigned1.xpi
# for Welsh (cy_GB): https://addons.mozilla.org/firefox/downloads/file/4270302/geiriadur_cymraeg-1.8.3resigned1.xpi
# for English (en-GB): https://addons.mozilla.org/firefox/downloads/file/4270302/geiriadur_cymraeg-1.8.3resigned1.xpi
# for Dutch (nl): https://addons.mozilla.org/firefox/downloads/file/3776797/woordenboek_nederlands-4.20.19.xpi
dictionary_url = "https://addons.mozilla.org/firefox/downloads/file/4270302/geiriadur_cymraeg-1.8.3resigned1.xpi"

# Replace with the appropriate ISO-369 code
locale = "cy_GB"

if os.path.isdir(f"./{locale}"):
    shutil.rmtree(f"./{locale}")

# Download and extract dictionary
response = requests.get(dictionary_url)
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall("locales")
    print(f"{locale} dictionary decompressed")

# standardize name of files
for file in os.listdir(f"./"):
    if file.endswith(".dic"):
        os.rename(f"./dictionaries/{file}", f"./{locale}.dic")
    elif file.endswith(".aff"):
        os.rename(f"./dictionaries/{file}", f"./{locale}.aff")

try:
    files = os.listdir("locales")
    shutil.rmtree("locales/META-INF/")
    for file in files:
        file_path = os.path.join("locales", file)
        if os.path.isfile(file_path):
            os.remove(file_path)
    print("Folder cleaned successfully.")
except OSError:
    print("Error occurred while deleting files.")

os.rename("locales/dictionaries", f"./hunspell")
os.rmdir("locales")
print("Check out your dictionary in", f"./hunspell")

cy_GB dictionary decompressed
Folder cleaned successfully.
Check out your dictionary in ./hunspell


In [3]:
# Crawl Wiktionary "Category:Welsh lemmas" from ?from=AA, collect all lemma titles with a tqdm progress bar
import re, time, math, json
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

START_URL = "https://en.wiktionary.org/w/index.php?title=Category:Welsh_lemmas&from=AA"
BASE = "https://en.wiktionary.org"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/126.0 Safari/537.36",
    "Accept-Language": "en;q=0.9",
}

SESSION = requests.Session()
SESSION.headers.update(HEADERS)

def fetch(url, retries=3, timeout=20):
    for i in range(retries):
        try:
            r = SESSION.get(url, timeout=timeout)
            if r.status_code == 200:
                return r.text
            # simple backoff on non-200
            time.sleep(0.8 * (2 ** i))
        except requests.RequestException:
            if i == retries - 1:
                raise
            time.sleep(0.8 * (2 ** i))
    raise RuntimeError(f"Failed to fetch after {retries} attempts: {url}")

def parse_total_and_next(soup):
    """
    Returns (total_entries:int|None, next_href:str|None)
    Looks under #mw-pages for the 'out of N total' text and 'next page' link.
    """
    total = None
    mw_pages = soup.select_one("#mw-pages")
    if mw_pages:
        # The text looks like: "The following 200 pages are in this category, out of 15,740 total."
        text = mw_pages.get_text(" ", strip=True)
        m = re.search(r"out of\s+([\d,\.]+)\s+total", text, flags=re.I)
        if m:
            total = int(m.group(1).replace(",", "").replace(".", ""))
        # Find the "next page" anchor inside #mw-pages
        next_a = None
        for a in mw_pages.find_all("a", string=True, href=True):
            if a.get_text(strip=True).lower() == "next page":
                next_a = a
                break
        next_href = next_a["href"] if next_a else None
        return total, next_href
    return None, None

def parse_lemmas_on_page(soup):
    """
    Extract lemma titles listed under 'Pages in category "Welsh lemmas"'.
    Those are the anchors inside #mw-pages .mw-category a (list items).
    """
    lemmas = []
    for group in soup.select("#mw-pages .mw-category-group"):
        for a in group.select("ul li a[href]"):
            title = a.get_text(strip=True)
            if title:
                lemmas.append(title)
    return lemmas

# 1) Prime: fetch first page, estimate total & pages
html = fetch(START_URL)
soup = BeautifulSoup(html, "html.parser")
total_entries, next_href = parse_total_and_next(soup)
page_size = 200  # per Wiktionary category page
estimated_pages = math.ceil(total_entries / page_size) if total_entries else None

all_lemmas = []
page_idx = 0

# 2) Progress bar over pages (uses estimated total if available)
with tqdm(total=estimated_pages, unit="page", desc="Downloading Welsh lemmas", disable=(estimated_pages is None)) as pbar:
    # Process the first page
    all_lemmas.extend(parse_lemmas_on_page(soup))
    page_idx += 1
    if estimated_pages is not None:
        pbar.update(1)

    # 3) Follow "next page" links until done
    while next_href:
        next_url = urljoin(BASE, next_href)
        html = fetch(next_url)
        soup = BeautifulSoup(html, "html.parser")
        all_lemmas.extend(parse_lemmas_on_page(soup))
        total_entries, next_href = parse_total_and_next(soup)
        page_idx += 1
        if estimated_pages is not None:
            pbar.update(1)

# If total was unknown initially, print how many pages we actually walked
print(f"Visited {page_idx} page(s). Collected {len(all_lemmas)} raw titles.")

# 4) De-duplicate (preserve order), then sort case-insensitively for stability
seen = set()
deduped = []
for t in all_lemmas:
    if t not in seen:
        seen.add(t)
        deduped.append(t)

deduped_sorted = sorted(deduped, key=lambda s: s.casefold())

# 5) Save
with open("welsh_lemmas.json", "w", encoding="utf-8") as f:
    json.dump(deduped_sorted, f, ensure_ascii=False, indent=2)

print(f"Saved {len(deduped_sorted)} unique lemmas to welsh_lemmas.json")


  from .autonotebook import tqdm as notebook_tqdm
Downloading Welsh lemmas: 100%|█| 79/79 [00:57<00:00,  1.39page/

Visited 79 page(s). Collected 15646 raw titles.
Saved 15646 unique lemmas to welsh_lemmas.json





In [31]:
import os
import re

# Path to the dictionary files
dic_path = f"lemmae.json"

try:
    with open(dic_path) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        lemmas = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {dic_path}")
    lemmas = []
except ValueError as e:
    print(f"Error: {e}")
    lemmas = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {dic_path}")
    lemmas = []

print(f"{len(lemmas)} items loaded from {dic_path}")

# Function to extract lemmas from a .dic file
def extract_lemmas(lemmae):
    lemmas = []
    chars_to_filter_out = {',', '’', '.', '/', '4', ':', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z','È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ò', 'Ó', 'Ô', 'Ö', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'à', 'á', 'ã', 'ä', 'è', 'é', 'ë', 'ì', 'í', 'ï', 'ò', 'ó', 'ö', 'ù', 'ú', 'ü', 'ý', 'ÿ', 'Ŵ', 'Ŷ', 'Ÿ', '̀', '́', '̂', '̈', 'Ẁ', 'ẁ', 'Ẃ', 'ẃ', 'Ẅ', 'ẅ', 'Ẏ', 'ẏ', 'Ỳ', 'ỳ', '\u200c', '◌', 'Ꝺ', 'ꝺ', 'Ᵹ', 'Ꝿ', 'ꝿ', 'Ꞁ', 'ꞁ', 'Ꞇ', 'ꞇ'}
    for lemma in lemmae:
        if not ((len(lemma) == 2 or len(lemma) == 3) and lemma[0] == "'") and not any(char in lemma for char in chars_to_filter_out) and lemma[-1] != "-" and lemma[0] != "-" and not any([l == lemma for l in ["gw", "ch", "rh", "th", "dd", "ng", "ff", "ph", "ll"]]):
            lemmas.append(lemma)
        else:
            pass
            # print(lemma)
    print(len(lemmas))
    return lemmas

lemmas = set(extract_lemmas(lemmas))
                           
# Print the first 10 lemmas as a sample
chars = list({char for string in lemmas for char in string})
types = list(lemmas)
chars.sort()
types.sort()

print(chars, len(types))

13421 items loaded from lemmae.json
13414
[' ', "'", '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'â', 'ê', 'î', 'ô', 'û', 'ŵ', 'ŷ'] 13414


In [32]:
# Dump the lemmas to a json file
import json

# Define the output file path
output_file_path = f"lemmae.json"

# Write the lemmas list to the JSON file
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    json.dump(types, outfile, ensure_ascii=False, indent=2)

print(f"{len(types)} items extracted and saved to {output_file_path}")

13414 items extracted and saved to lemmae.json
