In [1]:
%pip install requests spylls


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Download a Hunspell Dictionary
Clicking on [this link](https://mozilla-l10n.github.io/firefox-dictionaries/complete.html) you will find a list of available and up-to-date dictionaries.
Find the dictionary you want to train your model on. Once you are on that dictionary's page, do not click "Add to Firefox"; instead, right-click the button and select "Copy link". Then paste the copied link as the value of the variable `dictionary_url`.
Remember to also set the variable `locale`. Check the "Dictionary Locale" column of the table in the list of dictionaries, because locales are not all written the same way: for example, Welsh is "cy_GB" but British English is "en-GB".


In [5]:
import requests
import zipfile
import io
import os
import shutil

# Replace this link with the one copied from the dictionary's page.
# for Breton (br): https://addons.mozilla.org/firefox/downloads/file/4270474/difazier_an_drouizig-0.17resigned1.xpi
# for Welsh (cy_GB): https://addons.mozilla.org/firefox/downloads/file/4270302/geiriadur_cymraeg-1.8.3resigned1.xpi
# for English (en-GB): FIXME - the link previously listed here was a copy of the Welsh one;
#                      copy the real link from the dictionary's page.
# for Dutch (nl): https://addons.mozilla.org/firefox/downloads/file/3776797/woordenboek_nederlands-4.20.19.xpi
dictionary_url = "https://addons.mozilla.org/firefox/downloads/file/3581786/dictionnaire_francais1-7.0b.xpi"

# Replace with the value of the "Dictionary Locale" column
# (an ISO 639 language code, optionally followed by a region, e.g. "cy_GB").
locale = "fr"

# Working locations: the archive is unpacked into `extract_dir`, its
# "dictionaries" sub-folder is then moved to `hunspell_dir`.
extract_dir = "locales"
dictionaries_dir = os.path.join(extract_dir, "dictionaries")
hunspell_dir = "./hunspell"

# Remove leftovers from a previous run so the cell can be re-executed:
# os.rename() below cannot move a directory onto an existing non-empty one.
for stale_dir in (f"./{locale}", extract_dir, hunspell_dir):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

# Download and extract dictionary (an .xpi file is a zip archive).
response = requests.get(dictionary_url, timeout=60)
response.raise_for_status()  # fail loudly instead of unzipping an HTML error page
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall(extract_dir)
    print(f"{locale} dictionary decompressed")

# Standardize the names of the extracted files to <locale>.dic / <locale>.aff.
for file in os.listdir(dictionaries_dir):
    extension = os.path.splitext(file)[1]
    if extension not in (".dic", ".aff"):
        continue
    source_path = os.path.join(dictionaries_dir, file)
    target_path = os.path.join(dictionaries_dir, f"{locale}{extension}")
    if source_path == target_path:
        continue  # already has the standard name
    if os.path.exists(target_path):
        # Several dictionaries in one archive: never overwrite one with another.
        print(f"Skipping {file}: {locale}{extension} already exists")
        continue
    os.rename(source_path, target_path)

# Keep only the dictionaries; everything else in the archive is discarded.
os.rename(dictionaries_dir, hunspell_dir)
try:
    shutil.rmtree(extract_dir)
    print("Folder cleaned successfully.")
except OSError:
    print("Error occurred while deleting files.")

print("Check out your dictionary in", hunspell_dir)

fr dictionary decompressed
Folder cleaned successfully.
Check out your dictionary in ./hunspell


Extract and filter the types from the dictionary.

In [23]:
import requests
from bs4 import BeautifulSoup
import re
import time
import string

def fetch_words_from_letter_page(base_url, letter, timeout=30):
    """
    Fetch all words listed on the index page of one letter section.

    Args:
        base_url: URL of the index; the section name is appended as a path segment.
        letter: Section to fetch (a letter, or a section name such as 'Autres').
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A list of the link texts found in the ``<ul class="entrées">`` element
        that are longer than one character, contain no uppercase character and
        neither start nor end with a hyphen. An empty list is returned when the
        list element is missing or when any error occurs (the error is printed).
    """
    url = f"{base_url}/{letter}"

    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        print(f"Fetching words starting with '{letter}'...")
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # The entries are the links inside the <ul class="entrées"> element.
        entries_ul = soup.find('ul', class_='entrées')
        if not entries_ul:
            print(f"  No entries found for letter '{letter}'")
            return []

        words = []
        for li in entries_ul.find_all('li'):
            link = li.find('a')
            if not link:
                continue

            word = link.get_text().strip()

            # Keep an entry only if it:
            # 1. has more than one character,
            # 2. contains no uppercase character,
            # 3. neither starts nor ends with a hyphen.
            if (len(word) > 1
                    and not any(c.isupper() for c in word)
                    and not (word.startswith('-') or word.endswith('-'))):
                words.append(word)

        print(f"  Found {len(words)} valid words for letter '{letter}'")
        return words

    except requests.RequestException as e:
        print(f"Error fetching page for letter '{letter}': {e}")
        return []
    except Exception as e:
        # Best effort: a page that cannot be parsed must not abort the whole scrape.
        print(f"Error processing page for letter '{letter}': {e}")
        return []

def fetch_all_french_dictionary_words():
    """
    Collect the entries of every section of the Usito index.

    Each lowercase ASCII letter plus the 'Autres' section is fetched in turn
    through fetch_words_from_letter_page, pausing one second after each fetch.
    A keyboard interrupt stops the loop early; any other error skips the
    current section.

    Returns:
        The collected entries with duplicates removed, in first-seen order.
    """
    base_url = "https://usito.usherbrooke.ca/index/mots/tous"

    # Sections of the index: a..z, then 'Autres' for everything else.
    sections = [*string.ascii_lowercase, 'Autres']

    print("French Dictionary Scraper - Usito")
    print("=" * 40)
    print(f"Processing {len(sections)} letter sections...")

    collected = []
    for letter in sections:
        try:
            collected.extend(fetch_words_from_letter_page(base_url, letter))
            # Pause between requests to avoid hammering the server.
            time.sleep(1)
        except KeyboardInterrupt:
            print(f"\nScraping interrupted by user at letter '{letter}'")
            break
        except Exception as error:
            print(f"Unexpected error processing letter '{letter}': {error}")

    # dict keys keep insertion order, so this de-duplicates without reordering.
    unique_lemmae = list(dict.fromkeys(collected))

    print(f"\nCompleted! Found {len(unique_lemmae)} unique French dictionary entries")
    print(f"Sample words: {unique_lemmae[:10] if unique_lemmae else 'None'}")

    return unique_lemmae

def save_lemmae_as_python_list(lemmae, filename="lemmae.json"):
    """
    Save the lemmae as a JSON list, merging with existing entries if the file exists.

    Args:
        lemmae: Iterable of entries to store.
        filename: Path of the JSON file to merge into and (over)write.

    Returns:
        True when the file was written, False when writing failed.

    Entries already in the file come first, followed by the new ones;
    duplicates are dropped and first-seen order is kept. A file that is
    missing, unreadable, not valid JSON, or that does not contain a JSON list
    is ignored and replaced.
    """
    import json

    new_lemmae = list(lemmae)
    existing_lemmae = []

    # Try to load existing lemmae from the file
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
        if isinstance(loaded, list):
            existing_lemmae = loaded
            print(f"Loaded {len(existing_lemmae)} existing entries from {filename}")
        else:
            # Valid JSON of another shape (object, string, ...) cannot be merged with a list.
            print(f"Unexpected content in {filename} (expected a JSON list), starting fresh")
    except FileNotFoundError:
        print("No existing file found, starting fresh")
    except json.JSONDecodeError:
        print(f"Error reading JSON from {filename}, starting fresh")
    except Exception as e:
        print(f"Error loading existing file: {e}, starting fresh")

    # dict keys keep insertion order, so this de-duplicates without reordering.
    unique_lemmae = list(dict.fromkeys(existing_lemmae + new_lemmae))

    print(f"Combined: {len(existing_lemmae)} existing + {len(new_lemmae)} new = {len(unique_lemmae)} unique entries")

    try:
        with open(filename, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps accented characters readable in the file.
            json.dump(unique_lemmae, f, ensure_ascii=False, indent=2)
        print(f"Lemmae saved to {filename}")
        return True
    except Exception as e:
        print(f"Error saving lemmae to JSON: {e}")
        return False

def main():
    """
    Run the scraper end to end: fetch every entry, preview it, save it.

    When nothing was fetched, only a warning is printed and no file is written.
    """
    lemmae = fetch_all_french_dictionary_words()

    # Nothing scraped: report it and stop here.
    if not lemmae:
        print("No words were extracted. Please check the website structure.")
        return

    head, tail = lemmae[:20], lemmae[-20:]
    print(f"\nFirst 20 words: {head}")
    print(f"Last 20 words: {tail}")

    # Merge the entries into the JSON file (default file name).
    save_lemmae_as_python_list(lemmae)

    print("\nScript completed successfully!")
    print(f"Variable 'lemmae' contains {len(lemmae)} French dictionary entries")


if __name__ == "__main__":
    main()

French Dictionary Scraper - Usito
Processing 27 letter sections...
Fetching words starting with 'a'...
  Found 3717 valid words for letter 'a'
Fetching words starting with 'b'...
  Found 2447 valid words for letter 'b'
Fetching words starting with 'c'...
  Found 5332 valid words for letter 'c'
Fetching words starting with 'd'...
  Found 3053 valid words for letter 'd'
Fetching words starting with 'e'...
  Found 3024 valid words for letter 'e'
Fetching words starting with 'f'...
  Found 1742 valid words for letter 'f'
Fetching words starting with 'g'...
  Found 1541 valid words for letter 'g'
Fetching words starting with 'h'...
  Found 1193 valid words for letter 'h'
Fetching words starting with 'i'...
  Found 2015 valid words for letter 'i'
Fetching words starting with 'j'...
  Found 395 valid words for letter 'j'
Fetching words starting with 'k'...
  Found 211 valid words for letter 'k'
Fetching words starting with 'l'...
  Found 1284 valid words for letter 'l'
Fetching words starting

In [43]:
import os
import re

# Imported here so this cell does not depend on another cell having run first.
import json

# Path to the JSON file holding the list of entries.
dic_path = "lemmae.json"

# Any failure below leaves an empty list so the rest of the cell still runs.
lemmas = []
try:
    # The file is written as UTF-8 with non-ASCII characters kept as-is, so
    # read it with the same encoding instead of the platform default.
    with open(dic_path, encoding="utf-8") as f:
        content = f.read()
    if not content.strip():
        raise ValueError("The JSON file is empty.")
    lemmas = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {dic_path}")
except json.JSONDecodeError:
    # Must come before ValueError: JSONDecodeError is a subclass of it and
    # would otherwise never reach this branch.
    print(f"Error: Invalid JSON content in {dic_path}")
except ValueError as e:
    print(f"Error: {e}")

print(f"{len(lemmas)} items loaded from {dic_path}")

# Filter a list of entries down to the ones kept as lemmas.
def extract_lemmas(lemmae):
    """
    Normalize the entries and keep only those that pass the filters.

    Each entry first has "(-)" replaced by "-" and "(s)" replaced by "s".
    It is then kept unless it is empty, equals "''", starts or ends with a
    hyphen, or contains one of the characters in ``chars_to_filter_out``.
    Rejected entries are printed, followed by the number of entries kept.

    Args:
        lemmae: Iterable of strings.

    Returns:
        A list of the normalized entries that were kept, in input order.
    """
    chars_to_filter_out = {'²', ':', '.', 'µ', '2', '/', '(', '3',  '’', '4', ',', '1', '9','8','³', 'ñ', ')'}
    lemmas = []
    for lemma in lemmae:
        lemma = lemma.replace("(-)", "-").replace("(s)", "s")
        keep = (
            lemma != ""  # an empty entry used to crash the hyphen checks
            and lemma != "''"
            and not lemma.startswith("-")
            and not lemma.endswith("-")
            and chars_to_filter_out.isdisjoint(lemma)
        )
        if keep:
            lemmas.append(lemma)
        else:
            print(lemma)
    print(len(lemmas))
    return lemmas

# Filter the loaded entries and drop duplicates.
lemmas = set(extract_lemmas(lemmas))

# Inventory of every character that occurs in the retained entries.
chars = set("".join(lemmas))

# Sorted list of the distinct entries.
types = sorted(lemmas)

print(chars, len(types))

45270 items loaded from lemmae.json
45270
{'à', 'y', 'z', 's', 'v', "'", 'n', 'b', '-', 'm', 'ô', 'r', 'î', 'é', 'd', 'p', 'ù', 'h', 'g', 'â', 'a', 'x', 'u', ' ', 'ö', 'q', 'ü', 'æ', 'o', 'e', 'è', 'l', 'c', 'w', 'ç', 't', 'œ', 'k', 'ï', 'û', 'j', 'ê', 'f', 'i', 'ë', 'ä'} 45270


In [42]:
# Dump the lemmas to a json file
import json

# Destination of the JSON dump, relative to the working directory.
output_file_path = "lemmae.json"

# Write `types` as indented UTF-8 JSON; ensure_ascii=False keeps accented
# characters as-is instead of escaping them.
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(types, json_file, ensure_ascii=False, indent=2)

saved_count = len(types)
print(f"{saved_count} items extracted and saved to {output_file_path}")

45270 items extracted and saved to lemmae.json
