In [1]:
# Load IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell execution.
%autoreload 2

In [26]:
# SETUP
import os
import sys
import logging
logging.basicConfig(level=logging.INFO)

# Resolve the parent of the current working directory as the project root
# (presumably the notebook sits one level below the root -- confirm).
ROOT_DIR = os.path.abspath(os.path.join('.', '..'))
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)
print(f"Added {ROOT_DIR!r} to PYTHONPATH")

# IMPORTS
import typing as tp
import click
from omegaconf import OmegaConf
from tqdm import tqdm

from deck_utils.config_helpers import read_config, pprint_config

Added '/Users/Rodion.Khvorostov/Desktop/Programming/Python/PythonPetProjects/DecksGenerator' to PYTHONPATH


In [27]:
# Load the project configuration and print it for inspection.
# NOTE(review): read_config() is called without arguments, so it presumably
# falls back to a default config location -- confirm in deck_utils.config_helpers.
# NOTE(review): the annotation names the OmegaConf class; the returned object is
# presumably a DictConfig -- confirm before tightening the type.
cfg: OmegaConf = read_config()
pprint_config(cfg)

{
  "root_dir": "/Users/Rodion.Khvorostov/Desktop/Programming/Python/PythonPetProjects/DecksGenerator",
  "data_dir": "/Users/Rodion.Khvorostov/Desktop/Programming/Python/PythonPetProjects/DecksGenerator/data",
  "deck": {
    "name": "b1_words",
    "lang": "English",
    "anki_deck": "English_b1_words",
    "words": "/Users/Rodion.Khvorostov/Desktop/Programming/Python/PythonPetProjects/DecksGenerator/data/b1_words.csv",
    "results_raw": "/Users/Rodion.Khvorostov/Desktop/Programming/Python/PythonPetProjects/DecksGenerator/data/b1_words_results_raw.json",
    "results": "/Users/Rodion.Khvorostov/Desktop/Programming/Python/PythonPetProjects/DecksGenerator/data/b1_words_results.json"
  },
  "data_de": {
    "raw_location": "/Users/Rodion.Khvorostov/Desktop/Programming/Python/PythonPetProjects/DecksGenerator/data/words/interim/german_a2b1/static",
    "raw_files_a2": [
      "/Users/Rodion.Khvorostov/Desktop/Programming/Python/PythonPetProjects/DecksGenerator/data/words/interim/german_a

In [21]:
from deck_utils.read import read_json
from dataclasses import dataclass

@dataclass
class RawWord:
    """One vocabulary entry: the word, its translation and its part of speech."""

    word: str  # the word as it appears in the source word list
    translation: str  # translation text stored for the word
    pos: str  # part-of-speech tag

    def __str__(self) -> str:
        """Render the entry as ``word (pos): translation``."""
        rendered = f"{self.word} ({self.pos}): {self.translation}"
        return rendered
    
def path_to_pos(path: str) -> str:
    """Derive a lowercase part-of-speech tag from a raw word-list file path.

    The file name is expected to look like ``<level><plural pos>`` with an
    optional extension, e.g. ``A2verbs`` or ``B1nouns.json``.

    Args:
        path: Path (or bare file name) of a raw word-list file.

    Returns:
        The part of the file name after the two-character level prefix,
        lowercased, without extension and without a trailing plural "s"
        (``"A2verbs"`` -> ``"verb"``).
    """
    filename = os.path.basename(path)
    # Drop the extension (if any) so that it cannot leak into the tag.
    stem, _extension = os.path.splitext(filename)
    # remove prefix
    SYMBOLS_TO_REMOVE = len("A2")
    pos = stem[SYMBOLS_TO_REMOVE:].lower()
    # remove the plural suffix -- only when it is really there, so that names
    # which do not end in "s" are not truncated
    if pos.endswith("s"):
        pos = pos[:-1]
    return pos

# Quick sanity check: the bare file name "A2verbs" should map to 'verb'.
path_to_pos("A2verbs")

'verb'

In [22]:
def extract_raw_words(filenames: list[str]) -> list[RawWord]:
    """Read every raw word file and flatten the contents into ``RawWord`` records.

    Each file is loaded with ``read_json`` and iterated via ``.items()`` as
    word -> translation pairs; the part of speech of every record in a file is
    derived from that file's path with ``path_to_pos``.

    Args:
        filenames: Paths of the raw word files to load.

    Returns:
        All records, in file order and then in the order the file yields them.
    """
    collected: list[RawWord] = []
    for source_path in filenames:
        tag = path_to_pos(source_path)
        entries = read_json(source_path)
        collected.extend(
            RawWord(word=term, translation=meaning, pos=tag)
            for term, meaning in entries.items()
        )
    return collected

# Build the A2 word list. The loaded config exposes the raw-file lists under
# `data_de` (it has no `data_ge` key), so read them from there.
a2_words = extract_raw_words(cfg.data_de.raw_files_a2)
print("Words", len(a2_words))
print(a2_words[:3])

Words 1214
[RawWord(word='das Angebot', translation='offer', pos='noun'), RawWord(word='die Adresse', translation='address', pos='noun'), RawWord(word='der Anfang', translation='Beginning', pos='noun')]


In [23]:
# Build the B1 word list. The loaded config has no `deck_ge` section; the raw
# file lists live under `data_de`.
# NOTE(review): `raw_files_b1` is assumed to sit next to `raw_files_a2` -- the
# printed config is truncated before that key, so confirm it exists.
b1_words = extract_raw_words(cfg.data_de.raw_files_b1)
print("Words", len(b1_words))
print(b1_words[:3])

Words 2632
[RawWord(word='die Abbildung', translation='Illustration', pos='noun'), RawWord(word='der Abfall', translation='waste; rubbish; garbage', pos='noun'), RawWord(word='das Abgas', translation='exhaust; waste gas', pos='noun')]


In [25]:
# convert to pandas
import pandas as pd

def to_df(raw_words: list[RawWord]) -> pd.DataFrame:
    """Turn a list of ``RawWord`` records into a DataFrame, one row per record.

    Each record contributes its attribute dictionary (``vars(record)``), so the
    attribute names become the column names.
    """
    rows = list(map(vars, raw_words))
    return pd.DataFrame(rows)

# Convert both word lists to DataFrames and preview the first A2 rows.
a2_df = to_df(a2_words)
b1_df = to_df(b1_words)
a2_df.head()

Unnamed: 0,word,translation,pos
0,das Angebot,offer,noun
1,die Adresse,address,noun
2,der Anfang,Beginning,noun
3,die Angst,anxiety / fear,noun
4,die Ahnung,idea,noun


In [None]:
# save to csv
# index=False keeps the positional row index out of the written files.
# NOTE(review): `words_a2` / `words_b1` are not visible in the truncated config
# printout earlier in this notebook -- confirm both keys exist under `data_de`.
a2_df.to_csv(cfg.data_de.words_a2, index=False)
b1_df.to_csv(cfg.data_de.words_b1, index=False)