In [1]:
from wiktionaryparser import WiktionaryParser
# Shared Wiktionary parser instance; its default lookup language is set to
# Dutch so that fetch calls which pass no language use it.
parser = WiktionaryParser()
parser.set_default_language('dutch')

from words import WordData

from paths import audio_dir, word_path

In [2]:
import os
import requests



# HTTP headers sent with every audio download request (see download_audio).
# A browser-like User-Agent is used — presumably because the audio host
# rejects or throttles the default `requests` User-Agent; TODO confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

def download_audio(word: str, audio_url: str) -> str:
    """Download the pronunciation audio for ``word`` and store it as an ``.ogg`` file.

    Args:
        word: Word the audio belongs to; the file is written to
            ``<audio_dir>/<word>.ogg``.
        audio_url: Location of the audio file. If it does not already start
            with ``http://`` or ``https://`` (e.g. a protocol-relative
            ``//host/path`` link), ``https:`` is prepended.

    Returns:
        The path of the file that was written.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. No file is written in
            that case.
    """
    audio_path = os.path.join(audio_dir, f"{word}.ogg")

    # Only complete URLs that lack a scheme; prepending to an absolute URL
    # would produce an invalid "https:https://..." address.
    if not audio_url.startswith(("http://", "https://")):
        audio_url = "https:" + audio_url

    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(audio_url, headers=headers, timeout=30)
    # Fail loudly instead of saving an HTTP error page as if it were audio.
    resp.raise_for_status()

    with open(audio_path, "wb") as f:
        f.write(resp.content)

    return audio_path

In [3]:

def download_wikidict_data(word_data: WordData) -> None:
    """Fetch the Wiktionary entry for ``word_data.word`` and store its definitions.

    Does nothing when ``word_data.definitions`` is already non-empty or when
    the parser returns no entry for the word. Otherwise:

    * if the first entry lists at least one pronunciation audio link, the
      first link is downloaded via ``download_audio``;
    * every definition of every returned entry is collected as
      ``[part_of_speech, detail, remaining_text]`` and the resulting list is
      assigned to ``word_data.definitions``.

    Args:
        word_data: Object exposing a ``word`` attribute (looked up) and a
            ``definitions`` attribute (read as the "already done" flag and
            overwritten with the collected definitions).
    """
    # Existing definitions mean this word was already processed.
    if word_data.definitions:
        return

    result = parser.fetch(word_data.word)
    if not result:
        # Does not exist. E.g., an English word
        return

    audio_links = result[0]['pronunciations']['audio']
    audio_url = audio_links[0] if audio_links else ""
    if audio_url:
        # Audio is fetched before definitions are assigned, so a failed
        # download leaves the word unmarked and it is retried on the next run.
        download_audio(word_data.word, audio_url)

    definitions = []
    for etymology in result:
        for definition in etymology['definitions']:
            text = definition['text']
            if not text:
                # No first line to split off; skip instead of raising
                # IndexError on an empty text list.
                continue

            # First line becomes the detail (presumably the headword line —
            # TODO confirm); the rest is kept as a new list so the parser's
            # own data is not mutated.
            detail, *remaining_text = text

            # Noun: rewrite the gender marker that follows a non-breaking
            # space into the matching Dutch article.
            # NOTE(review): this matches '\xa0n' / '\xa0m' anywhere in the
            # line and leaves '\xa0f' untouched — confirm that is intended.
            detail = detail.replace('\xa0n', '(het)')
            detail = detail.replace('\xa0m', '(de)')

            definitions.append([definition['partOfSpeech'], detail, remaining_text])

    word_data.definitions = definitions


In [4]:
import pickle


# Load the previously pickled word collection from word_path.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# this is only safe if word_path is a trusted, locally produced file; confirm.
with open(word_path, "rb") as f:
    all_words = pickle.load(f)

In [5]:
from tqdm import tqdm

# Number of processed words between two checkpoints of the pickle file.
SAVE_EVERY = 10

# Fetch Wiktionary data for every word, checkpointing periodically so an
# interrupted run can resume (already-processed words are skipped by
# download_wikidict_data).
for i, word in enumerate(tqdm(all_words), start=1):
    download_wikidict_data(word)

    if i % SAVE_EVERY == 0:
        with open(word_path, "wb") as f:
            pickle.dump(all_words, f)

# Final save: without it, the words processed after the last multiple of
# SAVE_EVERY would never be written to disk.
with open(word_path, "wb") as f:
    pickle.dump(all_words, f)



100%|██████████| 5838/5838 [1:17:37<00:00,  1.25it/s]
