In [11]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from languages import LANGUAGES

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Directory holding the per-language CSV files. Paths are built by plain string
# concatenation (FILE_DIR + lang + ".csv"), so the trailing slash is required.
FILE_DIR = "./langdata/"

def split_pronunciation(row, lang):
    """Expand one word's comma-separated pronunciations into one row each.

    Entries delimited as ``/.../`` or ``[...]`` are kept with the delimiters
    removed; every other entry is dropped.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "pronunciation"
            field holding comma-separated transcriptions and a "word" field.
        lang: Language label copied onto every output row.

    Returns:
        pd.DataFrame with columns "pronunciation", "word" and "language", one
        row per kept transcription (empty when none qualify).
    """
    raw = row["pronunciation"]
    # A missing CSV cell is read back as NaN (a float), which has no .split();
    # treat any non-string value as "no pronunciations".
    if not isinstance(raw, str):
        raw = ""

    transcriptions = []
    for entry in raw.split(','):
        # Whitespace around the comma separator would otherwise hide the
        # leading/trailing delimiter from the checks below.
        entry = entry.strip()
        is_slashed = entry.startswith("/") and entry.endswith("/")
        is_bracketed = entry.startswith("[") and entry.endswith("]")
        if is_slashed or is_bracketed:
            transcriptions.append(entry[1:-1])

    count = len(transcriptions)
    return pd.DataFrame({"pronunciation": transcriptions,
                         "word": [row["word"]] * count,
                         "language": [lang] * count})

def process_language(lang):
    """Load one language's CSV and expand it to one row per transcription.

    Reads ``FILE_DIR + lang + ".csv"`` and passes every row through
    ``split_pronunciation``.

    Args:
        lang: Language name; used both as the CSV file stem and as the value
            of the "language" column.

    Returns:
        pd.DataFrame with columns "pronunciation", "word" and "language".
    """
    df = pd.read_csv(FILE_DIR + lang + ".csv")

    # Gather the per-word frames and concatenate once: calling pd.concat
    # inside the loop re-copies the accumulated frame on every row.
    frames = [split_pronunciation(df.iloc[i], lang) for i in range(len(df))]
    if not frames:
        # pd.concat raises on an empty list; keep the empty three-column result.
        return pd.DataFrame({"pronunciation": [], "word": [], "language": []})

    return pd.concat(frames, ignore_index=True)

In [13]:
# Process the two sample languages and stack them (Japanese rows first) into a
# single frame before writing it out.
df = pd.concat(
    [process_language(name) for name in ("Japanese", "English")],
    ignore_index=True,
)

df.to_csv("Japanese_English_Processed.csv")


# Process All Languages

In [None]:
# Build the dataset in batches of BATCH_SIZE languages, writing each batch to
# its own CSV (full_dataset_v0_<batch number>.csv).
BATCH_SIZE = 10

# Resume point: `lang_ctr` is the batch number to start from and `ctr` the
# offset inside that batch. With lang_ctr = 7 the first 70 languages are
# skipped; set it to 0 to rebuild every batch.
ctr = 0
lang_ctr = 7

loaded_df = pd.DataFrame({"pronunciation": [], "word": [], "language": []})
batch_frames = []

start_idx = lang_ctr * BATCH_SIZE + ctr
# Iterate over the language names themselves. Wrapping the slice in an extra
# list yields one list object instead of each name, which then cannot be
# concatenated into a file path.
for lang in LANGUAGES[start_idx:]:
    ctr += 1
    print(lang_ctr * BATCH_SIZE + ctr - 1, lang)

    # process_language performs the read-and-expand step for one language.
    batch_frames.append(process_language(lang))

    if ctr == BATCH_SIZE:
        # One concat per batch instead of one per row.
        loaded_df = pd.concat(batch_frames, ignore_index=True)
        loaded_df.to_csv(f"full_dataset_v0_{lang_ctr}.csv")
        batch_frames = []
        lang_ctr += 1
        ctr = 0

# Write the trailing partial batch; without this, languages left over after
# the last full batch would never reach disk.
if batch_frames:
    loaded_df = pd.concat(batch_frames, ignore_index=True)
    loaded_df.to_csv(f"full_dataset_v0_{lang_ctr}.csv")

In [None]:
# Merge all entries: read batch files 0..11, keep only the three data columns
# (dropping the index column saved alongside them) and stack them in order.
# NOTE(review): the batch files are read from "dataset/" here, while the batch
# cell writes them to the working directory - presumably they are moved by
# hand in between; confirm.
dataset_columns = ["pronunciation", "word", "language"]

batch_frames = []
for batch_idx in range(12):
    batch_path = f"dataset/full_dataset_v0_{batch_idx}.csv"
    batch_frames.append(pd.read_csv(batch_path)[dataset_columns])

merged_df = pd.concat(batch_frames, ignore_index=True)

merged_df.to_csv("full_dataset_v0.csv")

In [None]:
# Length of the longest pronunciation string in the merged dataset.
pronunciation_lengths = merged_df["pronunciation"].str.len()
print(pronunciation_lengths.max())