In [1]:
import csv
import sys
import os
import subprocess
import pandas as pd
import spacy
import spacy_transformers

from string import punctuation
from wordfreq import zipf_frequency

In [2]:
def count_csv_elements_in_file(filepath):

    total_elements = 0

    with open(filepath, "r", encoding="utf-8") as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            total_elements += len(row)

    return total_elements

language = []
total_words = []

for path, subdirs, files in os.walk("../raw-word-list"):
    for name in files:
        filename = (os.path.join(path, name))
        language += [name.split(".")[0]]
        total_words += [count_csv_elements_in_file(filename)]

In [3]:
pd.DataFrame({
    "language" : language,
    "total_words" : total_words
})

Unnamed: 0,language,total_words
0,Afrikaans,3667
1,Amharic,2811
2,Arabic,5691498
3,Armenian,981
4,Azerbaijani,38502
...,...,...
104,Welsh,3407
105,Xhosa,2936
106,Yiddish,3603
107,Yoruba,2728


In [4]:
spacy_models = {
    "Catalan": "ca_core_news_trf",
    "Croatian": "hr_core_news_lg",
    "Danish": "da_core_news_trf",
    "Dutch": "nl_core_news_lg",
    "English": "en_core_web_trf",
    "Finnish": "fi_core_news_lg",
    "French": "fr_dep_news_trf",
    "German": "de_dep_news_trf",
    "Greek": "el_core_news_lg",
    "Italian": "it_core_news_lg",
    "Lithuanian": "lt_core_news_lg",
    "Macedonian": "mk_core_news_lg",
    "Norwegian": "nb_core_news_lg",
    "Polish": "pl_core_news_lg",
    "Portuguese": "pt_core_news_lg",
    "Romanian": "ro_core_news_lg",
    "Russian": "ru_core_news_lg",
    "Slovenian": "sl_core_news_trf",
    "Spanish": "es_dep_news_trf",
    "Swedish": "sv_core_news_lg",
    "Ukrainian": "uk_core_news_trf"
}

In [None]:
import sys
import subprocess

def run_cmd(cmd):
    return subprocess.run(cmd, capture_output=True, text=True)

pip_check = run_cmd([sys.executable, "-m", "pip", "--version"])

if pip_check.returncode != 0:
    print("pip not found, installing pip using ensurepip...")
    ensure = run_cmd([sys.executable, "-m", "ensurepip", "--upgrade"])
    print(ensure.stdout)
    print(ensure.stderr)

    upgrade = run_cmd([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    print(upgrade.stdout)
    print(upgrade.stderr)


spacy_check = run_cmd([sys.executable, "-c", "import spacy"])

if spacy_check.returncode != 0:
    print("spaCy not found, installing spaCy...")
    install_spacy = run_cmd([sys.executable, "-m", "pip", "install", "spacy"])
    print(install_spacy.stdout)
    print(install_spacy.stderr)


for model in spacy_models.values():
    print(f"Downloading spaCy model: {model}")
    result = run_cmd([sys.executable, "-m", "spacy", "download", model])

    print(result.stdout)
    print(result.stderr)


In [30]:
os.mkdir("../data")

In [31]:
for language in spacy_models.keys():
    try:
        os.mkdir(f"../data/{language}")
        print(f"Directory {language} created.")
    except:
        print(f"Directory {language} already exists.")

Directory Catalan created.
Directory Croatian created.
Directory Danish created.
Directory Dutch created.
Directory English created.
Directory Finnish created.
Directory French created.
Directory German created.
Directory Greek created.
Directory Italian created.
Directory Lithuanian created.
Directory Macedonian created.
Directory Norwegian created.
Directory Polish created.
Directory Portuguese created.
Directory Romanian created.
Directory Russian created.
Directory Slovenian created.
Directory Spanish created.
Directory Swedish created.
Directory Ukrainian created.


In [8]:
def load_and_clean_word_list(language : str) -> pd.DataFrame:    
    with open(f"../raw-word-list/{language}/{language}.txt", "r", encoding = "utf-8") as f:
        word_list = f.read().split(",")

    word_df = pd.DataFrame({
        "word" : word_list
    })

    word_df['word'] = word_df["word"].str.strip(punctuation)

    return word_df

In [9]:
load_and_clean_word_list("English")

Unnamed: 0,word
0,aardvark
1,aardvarks
2,aardwolf
3,aardwolves
4,Aaren
...,...
466429,Zwolle
466430,Zworykin
466431,ZZ
466432,zZt


In [10]:
nlp = spacy.load(spacy_models["English"], disable = ["parser", "ner", "textcat"])

In [12]:
def add_lemma(df : pd.DataFrame,
              nlp,
              batch_size : int = 1000) -> pd.DataFrame:
    
    docs = nlp.pipe(df["word"].to_list(), batch_size = batch_size)
    lemmas = [doc[0].lemma_ for doc in docs]
    df["lemma"] = pd.DataFrame(lemmas, index = df.index)

    return df

In [13]:
add_lemma(
    load_and_clean_word_list("English")[:100], nlp
)

Unnamed: 0,word,lemma
0,aardvark,aardvark
1,aardvarks,aardvarks
2,aardwolf,aardwolf
3,aardwolves,aardwolf
4,Aaren,Aaren
...,...,...
95,Abama,Abama
96,abamp,abamp
97,abampere,abampere
98,abamperes,abamperes


In [16]:
def add_word_frequencies(df : pd.DataFrame,
                         language : str)-> pd.DataFrame :
    
    language_group = spacy_models[language].split("_")[0]
    df["zipf_freq_lemma"] = [zipf_frequency(w, language_group) for w in df["lemma"]]

    return df

In [17]:
add_word_frequencies(
    add_lemma(
        load_and_clean_word_list("English")[:100], nlp
    ), "English"
)

Unnamed: 0,word,lemma,zipf_freq_lemma
0,aardvark,aardvark,2.39
1,aardvarks,aardvarks,1.68
2,aardwolf,aardwolf,1.11
3,aardwolves,aardwolf,1.11
4,Aaren,Aaren,0.00
...,...,...,...
95,Abama,Abama,0.00
96,abamp,abamp,0.00
97,abampere,abampere,0.00
98,abamperes,abamperes,0.00


In [18]:
def clean_up_and_export(df : pd.DataFrame, language : str) -> None:
    df = (
        df.loc[df.groupby("lemma", sort = False)["zipf_freq_lemma"].idxmax()]
        .reset_index(drop = True)
    )

    df = df[(df["zipf_freq_lemma"] > 0)]

    df.loc[:, "word_difficulty"] = pd.cut(
        df["zipf_freq_lemma"],
        bins = [-float("inf"), 2.0, 4.0, float("inf")],
        labels = ["advanced", "intermediate", "beginner"],
        include_lowest = True,
        right = True
    )

    df = df.drop(columns = ["word", "zipf_freq_lemma"])
    df = df.rename(columns = {
        "lemma" : "word"
    })

    df.to_json(f"../data/{language}/word-list-cleaned.json", orient = "index")

In [19]:
clean_up_and_export(
    add_word_frequencies(
        add_lemma(
            load_and_clean_word_list("English")[:100], nlp
        ), "English"
    ), "English"
)

In [39]:
def create_clean_word_list(language : str) -> None:

    nlp = spacy.load(spacy_models[language], disable = ["parser", "ner", "textcat"])

    print("Load in dataset")
    lang_df = load_and_clean_word_list(language)

    print("Lemmatise Words")
    lang_df = add_lemma(lang_df, nlp)

    print("Add the word frequencies")
    lang_df = add_word_frequencies(lang_df, language)

    print("Do the final clean ups and export the file")
    clean_up_and_export(lang_df, language)

    return None

In [27]:
def process_all_languages(spacy_models: dict):
    for language in spacy_models.keys():
        print(f"Processing language: {language}")
        create_clean_word_list(language)

In [32]:
process_all_languages(spacy_models)

Processing language: Catalan
Load in dataset
Lemmatise Words
Add the word frequencies
Do the final clean ups and export the file
Processing language: Croatian
Load in dataset
Lemmatise Words
Add the word frequencies
Do the final clean ups and export the file
Processing language: Danish
Load in dataset
Lemmatise Words
Add the word frequencies
Do the final clean ups and export the file
Processing language: Dutch
Load in dataset
Lemmatise Words
Add the word frequencies
Do the final clean ups and export the file
Processing language: English
Load in dataset
Lemmatise Words
Add the word frequencies
Do the final clean ups and export the file
Processing language: Finnish
Load in dataset
Lemmatise Words
Add the word frequencies
Do the final clean ups and export the file
Processing language: French
Load in dataset
Lemmatise Words
Add the word frequencies
Do the final clean ups and export the file
Processing language: German
Load in dataset
Lemmatise Words
Add the word frequencies
Do the final cl

In [41]:
import json

language_raw = []
total_words_raw = []

for path, subdirs, files in os.walk("../raw-word-list"):
    for name in files:
        filename = (os.path.join(path, name))
        language_raw += [name.split(".")[0]]
        total_words_raw += [count_csv_elements_in_file(filename)]

raw_data = pd.DataFrame({
    "language": language_raw,
    "type": ["Raw"] *  len(total_words_raw),
    "total_words_raw": total_words_raw,
})

In [44]:
language_clean = []
total_words_clean = []

for path, subdirs, files in os.walk("data"):
    for name in files:
        filename = (os.path.join(path, name))
        language_clean += [path.split("/")[1]]
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
            total_words_clean += [len(data.keys())]

clean_data = pd.DataFrame({
    "language": language_clean,
    "type": ["Clean"] * len(total_words_clean),
    "total_words_raw": total_words_clean,
})

pd.concat([raw_data, clean_data])

Unnamed: 0,language,type,total_words_raw
0,Afrikaans,Raw,3667.0
1,Amharic,Raw,2811.0
2,Arabic,Raw,5691498.0
3,Armenian,Raw,981.0
4,Azerbaijani,Raw,38502.0
...,...,...,...
104,Welsh,Raw,3407.0
105,Xhosa,Raw,2936.0
106,Yiddish,Raw,3603.0
107,Yoruba,Raw,2728.0
