In [1]:
import json
from tqdm import tqdm

def is_wordle_compatible(data):
    if not "word" in data:
        return(False)
    word = data["word"]
    if len(word) != 5:
        return(False)
    if not word.isalpha():
        return(False)
    if not data["lang"] == "Deutsch":
        return(False)
    if data["pos"] == "name":
        return(False)
    alphabet = set("abcdefghijklmnopqrstuvwxyzäöü")
    if not all(c in alphabet for c in word.lower()):
        return(False)
    return(True)       

wordset = set()
# data from https://kaikki.org/dictionary/rawdata.html
with open("../data/de-extract.jsonl", encoding="utf-8") as f:
    for line in tqdm(f):
        data = json.loads(line)
        if is_wordle_compatible(data):
            base_word = data["word"].lower()
            wordset.add(base_word)

0it [00:00, ?it/s]

1291181it [00:21, 60584.63it/s]


In [2]:
with open("../data/de_wiktionary_5_letter.txt", "w", encoding="utf-8") as f:
    for item in sorted(wordset):
        f.write(item + "\n")

In [11]:
# Für leipzig textcorpus frequenzen
import csv

def is_wordle_compatible(word):
    if len(word) != 5:
        return(False)
    if not word.isalpha():
        return(False)
    alphabet = set("abcdefghijklmnopqrstuvwxyzäöü")
    if not all(c in alphabet for c in word.lower()):
        return(False)
    return(True)       

wordset = set()
freq_list = []

with open('../data/wordlist_deu-de_web-public_2019_20260118193645.csv', newline='', encoding='utf-8') as f:
    for i in range(2):
        next(f)

    reader = csv.DictReader(f)

    for row in reader:
        word = row["Item"]
        freq = row["Frequency"]
        if is_wordle_compatible(word):
            wordset.add(word)
            freq_list.append({"word": word, "freq": int(freq)})

# Step 2: Compute total
total_freq = sum(w['freq'] for w in freq_list)

# Step 3: Add relative frequency
for w in freq_list:
    w['rel_freq'] = w['freq'] / total_freq






In [13]:
with open("../data/de_leipzig_5_letter.txt", "w", encoding="utf-8") as f:
    for item in sorted(wordset):
        f.write(item + "\n")

with open('../data/de_leipzig_frequencies.csv', 'w', newline='', encoding='utf-8') as f:
    # Use the keys of the first dict as header
    writer = csv.DictWriter(f, fieldnames=freq_list[0].keys())
    writer.writeheader()
    writer.writerows(freq_list)

In [39]:
len(wordset) #TODO: seltsame wörter wie 'burin', 'rösch', 'schul', 'insvv'; nicht-grundformen in liste

9587

In [30]:
with open("../data/de-extract.jsonl", encoding="utf-8") as f:
    for line in tqdm(f):
        data = json.loads(line)
        try:
            base_word = data["word"].lower()
        except: #errors happen when hard redirects
            print("word" in data)
            continue
        if base_word == "xaver":
            print(data)

3424it [00:00, 11663.01it/s]

False
False
{'word': 'Xaver', 'pos': 'name', 'pos_title': 'Vorname', 'lang_code': 'de', 'lang': 'Deutsch', 'senses': [{'glosses': ['männlicher Vorname'], 'examples': [{'text': 'Xaver ließ gestern wieder lange auf sich warten.', 'bold_text_offsets': [[0, 5]]}], 'sense_index': '1'}], 'translations': [{'sense': 'männlicher Vorname', 'word': 'Xabier', 'lang_code': 'eu', 'lang': 'Baskisch', 'sense_index': '1'}, {'sense': 'männlicher Vorname', 'word': 'Xavier', 'lang_code': 'en', 'lang': 'Englisch', 'sense_index': '1'}, {'sense': 'männlicher Vorname', 'word': 'Xavior', 'lang_code': 'en', 'lang': 'Englisch', 'sense_index': '1'}, {'sense': 'männlicher Vorname', 'word': 'Xzavier', 'lang_code': 'en', 'lang': 'Englisch', 'sense_index': '1'}, {'sense': 'männlicher Vorname', 'word': 'Zavier', 'lang_code': 'en', 'lang': 'Englisch', 'sense_index': '1'}, {'sense': 'männlicher Vorname', 'word': 'Xavier', 'lang_code': 'fr', 'lang': 'Französisch', 'sense_index': '1'}, {'sense': 'männlicher Vorname', 'wor

8968it [00:00, 16767.40it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


15059it [00:00, 19256.17it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


19624it [00:01, 21346.74it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


23954it [00:01, 20672.96it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


28478it [00:01, 21769.08it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False


36966it [00:01, 34014.15it/s]

False
False
False
False
False
False


48550it [00:01, 46159.70it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


57342it [00:02, 39011.33it/s]

False
False
False
False
False
False
False
False
False
False


65086it [00:02, 31796.16it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


71469it [00:02, 28662.62it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


77100it [00:02, 26308.85it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


93606it [00:03, 43015.86it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


98049it [00:03, 40790.04it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


106057it [00:03, 31944.82it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


109435it [00:03, 29651.25it/s]

False
False
False
False
False
False
False
False
False
False


115669it [00:04, 29261.64it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


125555it [00:04, 31846.71it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


133803it [00:04, 36234.95it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


144700it [00:04, 46260.70it/s]

False
False
False
False
False
False
False
False
False
False


160749it [00:04, 63013.01it/s]

False
False
False
False
False
False
False
False
False
False
False


182779it [00:05, 86845.75it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


209379it [00:05, 109306.48it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


234928it [00:05, 106518.81it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


246093it [00:05, 71907.38it/s] 

False
False
False
False
False
False
False
False
False
False
False
False
False


255095it [00:06, 58019.58it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


269330it [00:06, 57531.53it/s]

False
False
False
False
False
False


283665it [00:06, 63618.12it/s]

False
False
False
False
False
False
False


298148it [00:06, 65703.40it/s]

False
False
False
False


320767it [00:07, 71952.02it/s]

False
False
False
False
False
False
False
False


353223it [00:07, 78374.82it/s]

False
False
False
False
False
False
False


377928it [00:07, 81198.10it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


393578it [00:08, 72201.76it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False


400898it [00:08, 62858.79it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


413499it [00:08, 58808.04it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


427169it [00:08, 63435.99it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False


441417it [00:08, 67175.80it/s]

False
False
False


455134it [00:09, 67891.18it/s]

False
False


477113it [00:09, 71341.49it/s]

False


508143it [00:09, 75342.61it/s]

False
False


536112it [00:10, 88237.51it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


563156it [00:10, 88596.63it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


583104it [00:10, 94754.72it/s]

False
False


614368it [00:10, 100375.88it/s]

False
False


634486it [00:11, 98297.63it/s] 

False
False
False
False
False
False
False
False
False


661041it [00:11, 77540.12it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


683421it [00:11, 67539.80it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


696419it [00:12, 57264.02it/s]

False
False
False
False
False
False
False
False
False
False


718207it [00:12, 65013.94it/s]

False
False
False
False
False
False
False
False
False


731714it [00:12, 65503.94it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


745392it [00:12, 65853.72it/s]

False
False
False
False
False


768075it [00:13, 69206.89it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


790645it [00:13, 67079.94it/s]

False
False
False


804133it [00:13, 66233.32it/s]

False
False


817333it [00:13, 65101.54it/s]

False
False
False
False
False
False


844306it [00:14, 63445.40it/s]

False
False
False


856845it [00:14, 61336.01it/s]

False


901506it [00:15, 62709.28it/s]

False
False


914833it [00:15, 64725.51it/s]

False
False


948504it [00:15, 65140.12it/s]

False
False
False
False


961349it [00:16, 62947.19it/s]

False
False
False
False
False
False


974680it [00:16, 64776.48it/s]

False
False
False
False
False
False


987826it [00:16, 62331.49it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


1012797it [00:17, 60258.52it/s]

False
False
False
False
False
False
False
False
False
False


1031242it [00:17, 60936.77it/s]

False
False
False
False


1049368it [00:17, 59364.76it/s]

False
False


1061311it [00:17, 58923.74it/s]

False
False
False
False
False
False


1073217it [00:18, 58338.38it/s]

False


1119492it [00:18, 68713.40it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False


1133797it [00:18, 70147.01it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


1140934it [00:19, 70513.17it/s]

False
False
False
False


1163871it [00:19, 68574.71it/s]

False
False
False
False
False
False
False


1185009it [00:19, 68779.62it/s]

False
False
False
False
False
False
False
False


1206015it [00:20, 68174.92it/s]

False
False
False


1221578it [00:20, 73027.68it/s]

False
False


1236395it [00:20, 73410.97it/s]

False
False
False
False
False
False


1273206it [00:20, 72881.89it/s]

False
False
False
False


1291181it [00:21, 61017.39it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False





In [26]:
wordlist #TODO: duplicate words, words from other languages ('lang': 'Deutsch'), names ('word': 'Xaver', 'pos': 'name', 'pos_title': 'Vorname'), and words with special characters need to be removed

['hallo',
 'seria',
 'seria',
 'seria',
 'seria',
 'april',
 'april',
 'april',
 'april',
 'liebe',
 'liebe',
 'polen',
 'polen',
 'polen',
 'polen',
 'polen',
 'krieg',
 'feuer',
 'japan',
 'japan',
 'japan',
 'lunes',
 'nihon',
 'seite',
 'kampf',
 'licht',
 'licht',
 'woche',
 'stein',
 'stein',
 'stein',
 'stein',
 'macht',
 'vater',
 'auto-',
 'auto-',
 'mushi',
 'kleid',
 'notiz',
 'p. t.',
 'tisch',
 'bruch',
 'bruch',
 'rauch',
 'rauch',
 'fisch',
 'fisch',
 'fisch',
 'apfel',
 'apfel',
 'maler',
 'athen',
 'musik',
 'hilfe',
 'murks',
 'stand',
 'schau',
 'tasse',
 'verbe',
 'crude',
 'ambi-',
 'ambi-',
 'faðir',
 'faðir',
 'faðir',
 'zebra',
 'zebra',
 'zebra',
 'zebra',
 'stadt',
 'figur',
 'chlor',
 'pferd',
 'islam',
 'islam',
 'unagi',
 'milbe',
 'achat',
 'nichi',
 'agora',
 'fuchs',
 'fuchs',
 'pomme',
 'agate',
 'agate',
 'agate',
 'agata',
 'agata',
 'agato',
 'ágata',
 'monat',
 'mücke',
 'mücke',
 'mücke',
 'nauru',
 'nauru',
 'biene',
 'sehen',
 'sehen',
 'handy',


In [27]:
len(wordlist)

46925