## Download a Hunspell Dictionary
Follow [this link](https://mozilla-l10n.github.io/firefox-dictionaries/complete.html) to find a list of available, up-to-date dictionaries.
Find the dictionary you want to train your model on. Once you are on the page of that dictionary, do not click on "add to Firefox"; instead, right-click that button and select "copy the link". Then paste the copied link as the value of the variable `dictionary_url`.
Remember to also set the value of the variable `locale`: check the "Dictionary Locale" column of the table in the list of dictionaries, because the codes are not all formatted the same way. For example, Welsh is "cy_GB" but British English is "en-GB".


In [1]:
# Install the third-party packages into the kernel's environment (%pip targets the running kernel).
# `requests` is used by the download cell; NOTE(review): `spylls` is not used in the cells shown here — confirm it is still needed.
%pip install requests spylls


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [10]:
import requests
import zipfile
import io
import os
import shutil

# Download a Firefox dictionary add-on (.xpi, which is a zip archive), extract it,
# rename its .dic/.aff files after `locale`, and leave the result in ./hunspell.
#
# Replace this link
# for Breton (br): https://addons.mozilla.org/firefox/downloads/file/4270474/difazier_an_drouizig-0.17resigned1.xpi
# for Welsh (cy_GB): https://addons.mozilla.org/firefox/downloads/file/4270302/geiriadur_cymraeg-1.8.3resigned1.xpi
# for English (en-GB): FIXME the link below is a copy of the Welsh one; replace it with the real en-GB .xpi link
#   https://addons.mozilla.org/firefox/downloads/file/4270302/geiriadur_cymraeg-1.8.3resigned1.xpi
# for Dutch (nl): https://addons.mozilla.org/firefox/downloads/file/3776797/woordenboek_nederlands-4.20.19.xpi
dictionary_url = "https://addons.mozilla.org/firefox/downloads/file/4481074/ukrainian_dictionary-6.6.1.xpi"

# Replace with the appropriate locale code (ISO 639 language code, optionally
# followed by a region), written exactly as in the "Dictionary Locale" column
locale = "uk-UA"

# Working folders created by this cell
extraction_dir = "locales"
extracted_dictionaries_dir = os.path.join(extraction_dir, "dictionaries")
hunspell_dir = "hunspell"

# Start from a clean slate: leftovers from a previous run would make the
# os.rename() into ./hunspell fail, so the cell could not be re-run.
for stale_dir in (extraction_dir, hunspell_dir):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

# Download and extract dictionary
response = requests.get(dictionary_url, timeout=60)
# Fail here with a clear HTTP error instead of a confusing BadZipFile below
response.raise_for_status()
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall(extraction_dir)
print(f"{locale} dictionary decompressed")

if not os.path.isdir(extracted_dictionaries_dir):
    raise FileNotFoundError(
        f"No 'dictionaries' folder found in the archive downloaded from {dictionary_url}"
    )

# standardize name of files: the following cells expect hunspell/{locale}.dic,
# whatever name the add-on author gave to the files
for file in os.listdir(extracted_dictionaries_dir):
    extension = os.path.splitext(file)[1]
    if extension not in (".dic", ".aff"):
        continue
    standardized_name = f"{locale}{extension}"
    if file != standardized_name:
        os.rename(
            os.path.join(extracted_dictionaries_dir, file),
            os.path.join(extracted_dictionaries_dir, standardized_name),
        )

# Keep only the dictionary folder ...
os.rename(extracted_dictionaries_dir, hunspell_dir)

# ... and drop the rest of the add-on (manifest, META-INF, ...). This is
# best-effort: the dictionary is already in place if the cleanup fails.
try:
    shutil.rmtree(extraction_dir)
    print("Folder cleaned successfully.")
except OSError as error:
    print("Error occurred while deleting files.", error)

print("Check out your dictionary in", "./hunspell")

uk-UA dictionary decompressed
Folder cleaned successfully.
Check out your dictionary in ./hunspell


Extract and filter the types from the dictionary.

In [49]:
import os
import re

# Function to extract lemmas from a .dic file
def extract_lemmas(dic_path, chars_to_filter_out=None):
    """Return the entries of a Hunspell .dic file that pass the character filter.

    Each entry is the part of a line that precedes the first '/' (the affix
    flags are dropped). An entry is discarded when it is empty, when it starts
    or ends with '-', or when it contains any of `chars_to_filter_out`.
    If the first line is purely numeric (the word-count header of a .dic file)
    it is skipped as well.

    Args:
        dic_path: Path to the UTF-8 encoded .dic file.
        chars_to_filter_out: Iterable of characters that disqualify an entry.
            Defaults to the Ukrainian uppercase letters plus '3', '4' and '=',
            i.e. the set this notebook was written with.

    Returns:
        The kept entries as a list, in file order (duplicates are preserved).
        The number of kept entries is also printed.
    """
    if chars_to_filter_out is None:
        chars_to_filter_out = {'4', 'Ч', 'Ц', 'Щ', 'Н', 'З', 'А', 'Ф', 'Х', 'У', 'С', 'Д', 'О', 'П', 'Ї', 'К', 'Ж', 'И', 'Ґ', 'І', 'М', 'Б', 'Л', '3', 'Т', 'Ю', 'Й', 'Є', 'Ш', '=', 'Е', 'Я', 'Г', 'В', 'Р'}

    lemmas = []
    # 'utf-8-sig' reads plain UTF-8 too, and drops a leading BOM if there is one
    with open(dic_path, 'r', encoding='utf-8-sig') as file:
        for line_number, line in enumerate(file):
            # Each line in the .dic file contains a word followed by its affix flags
            # We only need the word part, which is before the '/' character
            word = line.rstrip('\n').split('/')[0]
            if not word:
                # Blank line (or a line starting with '/'): nothing to keep,
                # and indexing word[0] / word[-1] below would raise IndexError
                continue
            if line_number == 0 and word.strip().isdigit():
                # Word-count header, not a dictionary entry
                continue
            if word[0] == "-" or word[-1] == "-":
                # Prefix / suffix fragments are not standalone words
                continue
            if any(char in word for char in chars_to_filter_out):
                continue
            lemmas.append(word)
    print(len(lemmas))
    return lemmas

# Word list produced by the download cell for the selected locale
dic_path = f"hunspell/{locale}.dic"

# Keep each entry once
lemmas = set(extract_lemmas(dic_path))

# Alphabet used by the kept entries
chars = set("".join(lemmas))

# Unique entries in sorted order
types = sorted(lemmas)

# Show the alphabet and the number of unique entries
print(chars, len(types))

281096
{'к', 'т', 'ю', 'ґ', 'ж', 'а', 'й', 'ї', 'ч', 'і', 'є', 'с', 'у', 'щ', 'в', 'г', 'п', 'ш', "'", 'о', 'ь', 'р', 'е', 'и', 'х', 'л', '-', 'з', 'н', 'б', 'д', 'ц', 'м', 'ф', 'я'} 269430


Save the types in a new file

In [48]:
# Persist the sorted list of types as a JSON array
import json

# Destination file, relative to the notebook's working directory
output_file_path = "lemmae.json"

# ensure_ascii=False keeps the non-ASCII letters readable in the file
with open(output_file_path, mode='w', encoding='utf-8') as outfile:
    json.dump(types, outfile, indent=2, ensure_ascii=False)

saved_count = len(types)
print(f"{saved_count} items extracted and saved to {output_file_path}")

269430 items extracted and saved to lemmae.json
