## Batch Game Generator

Other possible clues:
  - Other linguistic info about the language
  - Place of origin: 
  - Specific to the audio sample: 
    - Wompy audio version
  - Look into Semantic Scholar; only import an entry if these fields are not blank or null

## Settings

In [1]:
# NUMBER OF DAYS TO GENERATE USING THIS BATCH PROCESS
# The batch generator draws this many languages and fills one day per language.
number_to_generate = 80

In [2]:
import pandas as pd
import numpy as np
from langcodes import Language
from datasets import load_dataset, Audio as HF_Audio
import librosa, scipy.signal as sig
from datetime import datetime
from datetime import date, timedelta
from pathlib import Path
import subprocess
import shutil, soundfile as sf
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time

In [3]:
# Metadata
# CSV of per-language metadata; the batch generator reads it into languages_df.
LANGUAGE_DATA_PATH = 'languages.csv'
# CSV holding the generated daily games.
GAME_DATA_PATH = 'game_data.csv'
# Path form of GAME_DATA_PATH; the batch generator uses it for the existence
# check, for reading the table, and as the CSV_PATH argument of fill_day.
CSV_PATH = Path(GAME_DATA_PATH)

## Methods

In [4]:
# Sampling strategy for the languages list
def sample_with_repeat_rounds(arr, N):
    """Draw N items from arr in successive rounds without replacement.

    Every round is a ``random.sample`` of arr, so no item repeats inside a
    round. Rounds are concatenated until N items have been collected; the
    last round is cut short when fewer than ``len(arr)`` items are needed.
    An empty arr yields an empty list.
    """
    picked = []
    if not arr:
        return picked

    pool_size = len(arr)
    while len(picked) < N:
        still_needed = N - len(picked)
        picked += random.sample(arr, min(still_needed, pool_size))
    return picked

# Number of whitespace-separated words in a string
def count_words(text):
    """Return how many whitespace-delimited tokens text contains."""
    tokens = text.split()
    return len(tokens)

# Request a sample from Mozilla's Common Voice
def sample_common_voice(cv_code: str, version: int = 15, min_length: int = 200000):
    """Fetch one random training sample for a Common Voice language.

    Streams the ``train`` split of ``mozilla-foundation/common_voice_{version}_0``
    for the configuration ``cv_code``, shuffles it with a random seed and
    inspects the first five samples.

    Args:
        cv_code: Common Voice configuration name passed to ``load_dataset``.
        version: Major version used to build the dataset name.
        min_length: Minimum length of the decoded audio array (in array
            elements) for a sample to count as "long enough".

    Returns:
        A one-element list holding the first sufficiently long sample, or the
        first sample drawn if none is long enough.  An empty list is returned
        if anything fails or no sample could be drawn.
    """
    try:
        ds_stream = load_dataset(
            f'mozilla-foundation/common_voice_{version}_0',
            name=cv_code,
            split='train',
            streaming=True,
        )
        # cast_column returns a new dataset instead of modifying the existing
        # one, so the result has to be kept for the cast to take effect.
        ds_stream = ds_stream.cast_column('audio', HF_Audio(decode=True))
        seed = random.randint(0, 2**32 - 1)
        ds_shuffled = ds_stream.shuffle(buffer_size=2048, seed=seed)

        samples = list(ds_shuffled.take(5))
        if not samples:
            # Handle the empty case explicitly instead of relying on an
            # IndexError from samples[0] being caught below.
            print(f'Failed to sample from the Common Voice dataset:\nno samples returned for {cv_code}')
            return []

        for sample in samples:
            if sample['audio']['array'].shape[0] >= min_length:
                print(f'Loaded a sample from Common Voice dataset for {cv_code} that was long enough >:)')
                return [sample]
        print(f'Loaded a sample from Common Voice dataset for {cv_code} but it was too short :3')
        return [samples[0]]

    except Exception as e:
        # Deliberate best-effort: any download/decoding problem is reported
        # and turned into an empty result for the caller to handle.
        print(f'Failed to sample from the Common Voice dataset:\n{e}')
        return []

# Exploit Google Translate to translate this sentence (webscraping!)
def translate(language, text):
    """Translate text by driving the Google Translate web page with Selenium.

    The text is typed into the page, then the target-language button is
    clicked, ``language`` is typed into the language picker and confirmed
    with Enter.  The translation is read from the first matching CSS
    selector whose text is non-empty and differs from the input.

    Args:
        language: Text typed into the target-language picker.
        text: Sentence to type into the page.

    Returns:
        The scraped translation, or None if nothing was found or an error
        occurred while driving the page.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    try:
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    except Exception:
        # The browser is already running at this point; close it before
        # letting the error propagate so it is not leaked.
        driver.quit()
        raise

    try:
        # Navigate to Google Translate
        driver.get("https://translate.google.com")
        wait = WebDriverWait(driver, 15)
        actions = ActionChains(driver)

        # Wait for page to load
        time.sleep(0.5)

        # Enter text (cursor should be ready in text area)
        actions.send_keys(text).perform()

        # Find and click the target language button
        target_lang_button = wait.until(EC.element_to_be_clickable((
            By.CSS_SELECTOR,
            "button[aria-label*='target language'], .VfPpkd-Bz112c-RLmnJb"
        )))
        target_lang_button.click()

        # Wait briefly for dropdown, then type language and press Enter
        time.sleep(0.5)
        actions.send_keys(language).perform()
        time.sleep(0.5)
        actions.send_keys(Keys.RETURN).perform()

        # Wait for translation and extract
        time.sleep(0.5)

        # Candidate selectors for the translated text, tried in order
        translation_selectors = [
            "span[jsname='W297wb']",
            "span[lang]:not([lang='auto']):not([lang=''])",
            ".ryNqvb span",
            ".J0lOec span"
        ]

        for selector in translation_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                for element in elements:
                    translated_text = element.text.strip()
                    # Skip empty elements and ones that merely echo the input
                    if translated_text and translated_text != text:
                        print(f"Translation found: {translated_text}")
                        return translated_text
            except Exception:
                # A failing selector is not fatal; try the next one.  Unlike a
                # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
                continue
        return None

    except Exception as e:
        print(f"Error: {str(e)}")
        return None

    finally:
        driver.quit()

def find_espeak():
    """Locate the eSpeak executable on PATH, preferring espeak-ng over espeak.

    Raises FileNotFoundError when neither binary is available.
    """
    for candidate in ("espeak-ng", "espeak"):
        located = shutil.which(candidate)
        if located:
            return located
    raise FileNotFoundError("No espeak-ng/espeak binary on PATH")

def phonemize(text: str, lang: str = "en", ipa: bool = False,
              ipa_level: int = 1, keep_stress: bool = True) -> str:
    """Phonemize text using eSpeak (NG).

    Args:
        text: Text to convert.
        lang: eSpeak voice name, passed as ``-v{lang}``.
        ipa: If True request IPA output (``--ipa``); otherwise eSpeak's
            ASCII phoneme mnemonics (``-x``).
        ipa_level: Value passed to ``--ipa=`` when ``ipa`` is True.
        keep_stress: If False, stress markers are removed from the output.

    Returns:
        The phoneme string on a single line, or ``"???"`` if eSpeak is
        missing or the call fails (the error is printed).
    """
    try:
        exe = find_espeak()

        # Build the command line
        args = [exe, "-q", f"-v{lang}"]
        if ipa:
            args.append(f"--ipa={ipa_level}")
        else:
            args.append("-x")
            if not keep_stress:
                # NOTE(review): kept from the original, which said this was
                # meant to help strip stress marks; the "-" separators
                # themselves are left in the output -- confirm this is intended.
                args.append("--sep=-")
        args.append(text)

        # Run the command; decode as UTF-8 explicitly so IPA characters do
        # not depend on the platform's locale encoding.
        proc = subprocess.run(args, capture_output=True, check=True, encoding="utf-8")
        out = proc.stdout.strip()

        if not keep_stress:
            # Primary and secondary stress: "ˈ"/"ˌ" in IPA output,
            # "'"/"," in eSpeak's ASCII mnemonics.
            stress_marks = "ˈˌ" if ipa else "',"
            out = out.translate(str.maketrans("", "", stress_marks))

        # Sometimes line breaks appear in the output; remove them
        out = out.replace("\n", " ").replace("\r", " ").strip()

        return out

    except Exception as e:
        # Best-effort: the caller stores the placeholder for manual fixing.
        print(f"Error getting IPA representation, enter manually: {str(e)}")
        return "???"
    
def get_row(chosen_language, day, languages_df):
    """Build one game-data row for a language and write its audio file.

    Looks up the language in ``languages_df`` by ``espeak_code``, draws a
    Common Voice sample for it, translates and phonemizes the sentence, and
    writes the audio to ``assets/audio/<YYYY-MM-DD>.mp3``.

    Args:
        chosen_language: Value matched against the ``espeak_code`` column.
        day: Date of the game; formatted with ``%Y-%m-%d`` for the file name.
        languages_df: DataFrame of language metadata.

    Returns:
        A dict with the row's fields (date, language metadata, sentence,
        translation, audio file name, sampling rate and IPA).

    Raises:
        IndexError: If the language is not in ``languages_df`` or no Common
            Voice sample could be loaded for it.
    """
    # Get language data from languages dataframe
    languages_row = languages_df[languages_df['espeak_code'] == chosen_language].iloc[0]
    LANGUAGE = {
        'ESPEAK_CODE': chosen_language,
        'ENGLISH_NAME': languages_row['english_name'],
        'ISO': languages_row['iso3'],
        'CV': languages_row['cv_code'],
        'ESPEAK': languages_row['espeak_code'],
        # NOTE(security): eval executes whatever expression the CSV cell
        # contains.  Only safe while languages.csv is trusted; consider
        # ast.literal_eval if the cell is always a plain literal.
        'LINEAGE': eval(languages_row['lineage']),
        'FAMILY_0': languages_row['family_0'],
        'FAMILY_1': languages_row['family_1'],
        'FAMILY_2': languages_row['family_2']
    }

    print(f"Generating an example for {LANGUAGE['ENGLISH_NAME']} ({LANGUAGE['ESPEAK_CODE']})")

    # Get a sample from that language
    LANGUAGE['SAMPLE'] = sample_common_voice(LANGUAGE['CV'])
    if not LANGUAGE['SAMPLE']:
        # sample_common_voice returns [] on failure; fail with a clear message
        # (same exception type the bare [0] lookup used to raise).
        raise IndexError(
            f"No Common Voice sample could be loaded for "
            f"{LANGUAGE['ENGLISH_NAME']} ({LANGUAGE['CV']})"
        )
    sample = LANGUAGE['SAMPLE'][0]
    sample['sentence'] = sample['sentence'].replace('\n', ' ')

    # Try to translate it
    try:
        translation = translate(LANGUAGE['ENGLISH_NAME'], sample['sentence'])
    except Exception as e:
        print(f'Failed to add translation, enter it manually:\n{e}')
        translation = None
    if not translation:
        # translate() reports most failures by returning None rather than
        # raising, so the manual-entry placeholder is applied here as well.
        translation = '???????'
    sample['translation'] = translation

    # Extract audio data of sample
    wave = sample['audio']['array']
    rate = sample['audio']['sampling_rate']

    # Convert to phonetics
    sample['IPA'] = phonemize(text=sample['sentence'], lang=LANGUAGE['ESPEAK'],
                              ipa=True, ipa_level=1, keep_stress=False)

    # Generate and store corresponding audio file
    AUDIO_DIR = Path('assets/audio')
    # parents=True: the directory is nested, so 'assets' may not exist yet
    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    fname_base = f"{day:%Y-%m-%d}"
    file_path = AUDIO_DIR / f"{fname_base}.mp3"
    # muffled_file_path = AUDIO_DIR / f"{fname_base}_muffled.mp3"
    # muffled_wave = sig.sosfiltfilt(sig.butter(1, 700, "low", fs = rate, output = "sos"),
    #                          librosa.effects.pitch_shift(wave.astype(np.float32), sr = rate, n_steps = -2))
    sf.write(file_path, wave, rate, format='MP3')
    # sf.write(muffled_file_path, muffled_wave, rate, format='WAV')

    # build a new row based on the generated LANGUAGE dict from above
    new_row = {
        'date': day,
        'language': LANGUAGE['ENGLISH_NAME'],
        'iso': LANGUAGE['ISO'],
        'cv_code': LANGUAGE['CV'],
        'espeak_code': LANGUAGE['ESPEAK'],
        'lineage': LANGUAGE['LINEAGE'],
        'family_0': LANGUAGE['FAMILY_0'],
        'family_1': LANGUAGE['FAMILY_1'],
        'family_2': LANGUAGE['FAMILY_2'],
        'sentence': sample['sentence'],
        'translation': sample['translation'],
        'wave': file_path.name,
        # 'muffled_wave': muffled_file_path.name,
        'sampling_rate': rate,
        'IPA': sample['IPA'],
    }

    return new_row
    
def fill_day(chosen_language, day, languages_df, game_df, CSV_PATH):
    """Generate a row for ``day`` and append it to the game table on disk.

    Args:
        chosen_language: Language code forwarded to ``get_row``.
        day: Date of the new row; formatted with ``%Y-%m-%d`` in the log line.
        languages_df: Language metadata forwarded to ``get_row``.
        game_df: Current game table.  Columns missing from it are added in
            place; the appended row only exists in the written CSV.
        CSV_PATH: Destination passed to ``DataFrame.to_csv``.
    """
    new_row = get_row(chosen_language, day, languages_df)

    # Ensure any new columns are present in the dataframe.  Walk the row's
    # keys in their own order: iterating a set of names here would make the
    # column order of the written CSV vary from run to run.
    for column in new_row:
        if column not in game_df.columns:
            game_df[column] = pd.NA  # create blank column for any new field

    # append and reset the index
    game_df = pd.concat([game_df, pd.DataFrame([new_row])], ignore_index=True)

    # store into the game_df
    game_df.to_csv(CSV_PATH, index=False)

    print(f"Filled {day:%Y-%m-%d} with {new_row['language']}\n")
    

def replace_day(chosen_language, day, languages_df, game_df, CSV_PATH):
    """Regenerate the row for an existing date and overwrite it in place.

    Args:
        chosen_language: Language code forwarded to ``get_row``.
        day: Date to replace, either a ``"YYYY-MM-DD"`` string or a
            ``date``/``datetime`` object (as accepted by ``fill_day``).
        languages_df: Language metadata forwarded to ``get_row``.
        game_df: Game table whose ``date`` column is compared against the
            target date; the matching row is overwritten in place.
        CSV_PATH: Destination passed to ``DataFrame.to_csv``.

    Raises:
        KeyError: If no row, or more than one row, has the given date.
    """
    # Normalise the requested day to a plain date
    if isinstance(day, str):
        target_date = datetime.strptime(day, "%Y-%m-%d").date()
    elif isinstance(day, datetime):
        target_date = day.date()
    else:
        target_date = day

    # Find index for matching date; checked before any sample is generated
    mask = game_df['date'] == target_date
    hits = mask.sum()
    if hits == 0:
        raise KeyError(f"No row found with date {day}")
    if hits > 1:
        raise KeyError(f"Expected one row with date {day}, found {hits}")

    new_row = get_row(chosen_language, target_date, languages_df)
    # Align the new values with the table's columns; fields the table does
    # not have are dropped, columns the row lacks become NA.
    pd_new_row = pd.Series(new_row).reindex(game_df.columns, fill_value=pd.NA)

    idx = game_df.index[mask][0]
    game_df.loc[idx] = pd_new_row

    # store into the game_df
    game_df.to_csv(CSV_PATH, index=False)

    print(f"Replaced {day} with {new_row['language']}")

## Batch Generator

In [5]:
# All available languages, removing english
languages_df = pd.read_csv(LANGUAGE_DATA_PATH)
ENGLISH_CODES = ['en', 'en-gb', 'en-sc', 'en-uk-north', 'en-uk-wmids', 'en-us', 'en-wi']
# Filter in one pass, then de-duplicate (order of first appearance is kept)
espeak_codes = languages_df['espeak_code']
all_languages = espeak_codes[~espeak_codes.isin(ENGLISH_CODES)].unique()

# Select number_to_generate many languages, with some non-repetition baked into this choice
sample = sample_with_repeat_rounds(list(all_languages), number_to_generate)

START_DAY = date(2025, 6, 13) # first day in the daily games series

# Generate examples
for chosen_language in sample:

    # Reload the table every iteration: fill_day writes the new row to the
    # CSV but does not hand the updated dataframe back.
    if CSV_PATH.exists():
        game_df = pd.read_csv(CSV_PATH, parse_dates = ['date'])
        # mutate the column to a Python date, not Timestamp
        game_df['date'] = game_df['date'].dt.date
    else:
        game_df = pd.DataFrame(columns=['date'])

    # The new example goes on the day after the latest one already stored,
    # or on START_DAY when the table is empty
    next_day = (max(game_df['date']) + timedelta(days = 1)) if not game_df.empty else START_DAY

    # Fill in the next day with an example of the chosen language
    fill_day(chosen_language, next_day, languages_df, game_df, CSV_PATH)

Generating an example for Greek (el)


Reading metadata...: 1913it [00:00, 15457.96it/s]


Loaded a sample from Common Voice dataset for el that was long enough >:)
Translation found: and placed it in front of them, on an iron pan.
Filled 2025-07-20 with Greek

Generating an example for French (fr-fr)


Reading metadata...: 527554it [00:16, 32828.48it/s]


Loaded a sample from Common Voice dataset for fr that was long enough >:)
Translation found: Do you want me for your husband?
Filled 2025-07-21 with French

Generating an example for Russian (ru)


Reading metadata...: 26328it [00:01, 23028.98it/s]


Loaded a sample from Common Voice dataset for ru that was long enough >:)
Translation found: I bought paint to paint the balcony, and they painted the doors in the kitchen.
Filled 2025-07-22 with Russian

Generating an example for Slovak (sk)


Reading metadata...: 3276it [00:00, 19038.44it/s]


Loaded a sample from Common Voice dataset for sk that was long enough >:)
Translation found: patent owner;
Filled 2025-07-23 with Slovak

Generating an example for Vietnamese (vi-sgn)


Reading metadata...: 2383it [00:00, 26607.21it/s]


Loaded a sample from Common Voice dataset for vi that was long enough >:)
Translation found: Only then did I remember what Thao said last night.
Filled 2025-07-24 with Vietnamese

Generating an example for Indonesian (id)


Reading metadata...: 4968it [00:00, 29922.87it/s]


Loaded a sample from Common Voice dataset for id that was long enough >:)
Translation found: Everyone is registered by year of birth and grouped into generations.
Filled 2025-07-25 with Indonesian

Generating an example for Serbian (sr)


Reading metadata...: 1521it [00:00, 17063.24it/s]


Loaded a sample from Common Voice dataset for sr but it was too short :3
Translation found: All right!
Filled 2025-07-26 with Serbian

Generating an example for Hungarian (hu)


Reading metadata...: 34498it [00:01, 27720.10it/s]


Loaded a sample from Common Voice dataset for hu that was long enough >:)
Translation found: The next day, Cécile refuses to let anyone into her room.
Filled 2025-07-27 with Hungarian

Generating an example for Malayalam (ml)


Reading metadata...: 1249it [00:00, 14696.05it/s]


Loaded a sample from Common Voice dataset for ml that was long enough >:)
Translation found: Cases will pile up and things will get to the point where healthcare systems cannot handle it.
Filled 2025-07-28 with Malayalam

Generating an example for Polish (pl)


Reading metadata...: 19119it [00:00, 30675.55it/s]


Loaded a sample from Common Voice dataset for pl that was long enough >:)
Translation found: The draft resolution in question is in line with this
Filled 2025-07-29 with Polish

Generating an example for Czech (cs)


Reading metadata...: 19358it [00:00, 24244.38it/s]


Loaded a sample from Common Voice dataset for cs that was long enough >:)
Translation found: He was already a member of student organizations during his studies.
Filled 2025-07-30 with Czech

Generating an example for Nepali (ne)


Reading metadata...: 194it [00:00, 4351.54it/s]


Loaded a sample from Common Voice dataset for ne-NP that was long enough >:)
Translation found: The by-road is winding and winding.
Filled 2025-07-31 with Nepali

Generating an example for Icelandic (is)


Reading metadata...: 8it [00:00, 150.40it/s]


Loaded a sample from Common Voice dataset for is that was long enough >:)
Translation found: The playing season is from late May to mid-September.
Filled 2025-08-01 with Icelandic

Generating an example for Estonian (et)


Reading metadata...: 3148it [00:00, 13920.08it/s]


Loaded a sample from Common Voice dataset for et that was long enough >:)
Translation found: The latter gives a youthful and sophisticated appearance, which women really like.
Filled 2025-08-02 with Estonian

Generating an example for French (fr-be)


Reading metadata...: 527554it [00:13, 38405.34it/s]


Loaded a sample from Common Voice dataset for fr that was long enough >:)
Translation found: His mother Olga Davidovna Petrenko, née Weintraub, is a musicologist.
Filled 2025-08-03 with French

Generating an example for Portuguese (pt-br)


Reading metadata...: 21202it [00:00, 29984.07it/s]


Loaded a sample from Common Voice dataset for pt that was long enough >:)
Translation found: Certifications will be issued with the approval of the President.
Filled 2025-08-04 with Portuguese

Generating an example for Swedish (sv)


Reading metadata...: 7584it [00:00, 28357.02it/s]


Loaded a sample from Common Voice dataset for sv-SE that was long enough >:)
Translation found: The large, high window looked out onto the city and the sea.
Filled 2025-08-05 with Swedish

Generating an example for German (de)


Reading metadata...: 567993it [00:16, 33930.18it/s]


Loaded a sample from Common Voice dataset for de that was long enough >:)
Translation found: This was completed by his son-in-law Georges Bizet.
Filled 2025-08-06 with German

Generating an example for Cantonese (zh-yue)


Reading metadata...: 3074it [00:00, 21305.06it/s]


Loaded a sample from Common Voice dataset for yue that was long enough >:)
Translation found: A good night’s sleep makes the excellent you, better than the weekdays
Filled 2025-08-07 with Cantonese

Generating an example for Spanish (es)


Reading metadata...: 311392it [00:10, 31002.45it/s]


Loaded a sample from Common Voice dataset for es that was long enough >:)
Translation found: Until then, the Jacobite forces had encountered almost negligible resistance.
Filled 2025-08-08 with Spanish

Generating an example for Irish (ga)


Reading metadata...: 536it [00:00, 6540.81it/s]


Loaded a sample from Common Voice dataset for ga-IE that was long enough >:)
Translation found: There's no sore throat like your own sore throat.
Filled 2025-08-09 with Irish

Generating an example for Vietnamese (vi)


Reading metadata...: 2383it [00:00, 5045.75it/s]


Loaded a sample from Common Voice dataset for vi that was long enough >:)
Translation found: Time goes by endlessly
Filled 2025-08-10 with Vietnamese

Generating an example for Albanian (sq)


Reading metadata...: 494it [00:00, 6185.51it/s]


Loaded a sample from Common Voice dataset for sq that was long enough >:)
Translation found: Macedonian Prime Minister visits Dublin.
Filled 2025-08-11 with Albanian

Generating an example for Tamil (ta)


Reading metadata...: 44044it [00:01, 30159.01it/s]


Loaded a sample from Common Voice dataset for ta that was long enough >:)
Translation found: They showed him to the doctor, fearing that he would have another attack like this.
Filled 2025-08-12 with Tamil

Generating an example for Danish (da)


Reading metadata...: 3381it [00:00, 17339.25it/s]


Loaded a sample from Common Voice dataset for da that was long enough >:)
Translation found: said the queen, something that is profound and instructive!
Filled 2025-08-13 with Danish

Generating an example for Swahili (sw)


Reading metadata...: 44075it [00:01, 23064.29it/s]


Loaded a sample from Common Voice dataset for sw that was long enough >:)
Translation found: that helped the country eliminate the friction of
Filled 2025-08-14 with Swahili

Generating an example for Armenian (hy)


Reading metadata...: 651it [00:00, 9543.50it/s]


Loaded a sample from Common Voice dataset for hy-AM that was long enough >:)
Translation found: As a radio operator, she fights for her love at sea and wins.
Filled 2025-08-15 with Armenian

Generating an example for Bulgarian (bg)


Reading metadata...: 3413it [00:00, 23593.21it/s]


Loaded a sample from Common Voice dataset for bg that was long enough >:)
Translation found: The leader's body was quickly carried away.
Filled 2025-08-16 with Bulgarian

Generating an example for Dutch (nl)


Reading metadata...: 34088it [00:01, 25722.40it/s]


Loaded a sample from Common Voice dataset for nl that was long enough >:)
Translation found: The types of measures are increasing and the number of actors is also growing.
Filled 2025-08-17 with Dutch

Generating an example for Macedonian (mk)


Reading metadata...: 67it [00:00, 1265.47it/s]


Loaded a sample from Common Voice dataset for mk that was long enough >:)
Translation found: It was not clear who had the right of way, because the traffic lights were not working.
Filled 2025-08-18 with Macedonian

Generating an example for Romanian (ro)


Reading metadata...: 5172it [00:00, 26059.62it/s]


Loaded a sample from Common Voice dataset for ro that was long enough >:)
Translation found: Young people are not the only problematic category.
Filled 2025-08-19 with Romanian

Generating an example for Portuguese (pt-pt)


Reading metadata...: 21202it [00:00, 32834.66it/s]


Loaded a sample from Common Voice dataset for pt that was long enough >:)
Translation found: Language support for languages in India has improved in recent months.
Filled 2025-08-20 with Portuguese

Generating an example for Mandarin (zh)


Reading metadata...: 7048it [00:00, 33770.32it/s]


Loaded a sample from Common Voice dataset for zh-TW that was long enough >:)
Translation found: Isn’t computerization being promoted now?
Filled 2025-08-21 with Mandarin

Generating an example for Finnish (fi)


Reading metadata...: 2148it [00:00, 16895.67it/s]


Loaded a sample from Common Voice dataset for fi that was long enough >:)
Translation found: After about half an hour, she came back out the door looking very disappointed.
Filled 2025-08-22 with Finnish

Generating an example for Lithuanian (lt)


Reading metadata...: 6715it [00:00, 27929.11it/s]


Loaded a sample from Common Voice dataset for lt that was long enough >:)
Translation found: The tower now houses a book museum.
Filled 2025-08-23 with Lithuanian

Generating an example for Spanish (es-la)


Reading metadata...: 311392it [00:11, 28168.59it/s]


Loaded a sample from Common Voice dataset for es that was long enough >:)
Translation found: Thus, stations, bus stops, and the signaling system completed their final day.
Filled 2025-08-24 with Spanish

Generating an example for Hindi (hi)


Reading metadata...: 4630it [00:00, 12370.91it/s]


Loaded a sample from Common Voice dataset for hi that was long enough >:)
Translation found: Will the Gondwana Ganatantra Party become a major force in Jaisinghnagar Assembly?
Filled 2025-08-25 with Hindi

Generating an example for Italian (it)


Reading metadata...: 166503it [00:09, 17233.20it/s]


Loaded a sample from Common Voice dataset for it that was long enough >:)
Translation found: His main interest, however, remained poetry.
Filled 2025-08-26 with Italian

Generating an example for Georgian (ka)


Reading metadata...: 39326it [00:03, 11419.22it/s]


Loaded a sample from Common Voice dataset for ka that was long enough >:)
Translation found: It was mandatory for students to complete a summer internship.
Filled 2025-08-27 with Georgian

Generating an example for Persian (fa)


Reading metadata...: 28756it [00:01, 17246.46it/s]


Loaded a sample from Common Voice dataset for fa that was long enough >:)
Translation found: Months ago I decided to take action.
Filled 2025-08-28 with Persian

Generating an example for Armenian (hy-west)


Reading metadata...: 651it [00:00, 6932.98it/s]


Loaded a sample from Common Voice dataset for hy-AM that was long enough >:)
Translation found: He is fluent in Armenian, Russian, English, and French.
Filled 2025-08-29 with Armenian

Generating an example for Vietnamese (vi-hue)


Reading metadata...: 2383it [00:00, 16963.90it/s]


Loaded a sample from Common Voice dataset for vi that was long enough >:)
Translation found: And remember, don't let the wild sunflowers out of your hands.
Filled 2025-08-30 with Vietnamese

Generating an example for Punjabi (pa)


Reading metadata...: 728it [00:00, 5748.21it/s]


Loaded a sample from Common Voice dataset for pa-IN that was long enough >:)
Translation found: Muslims, Christians and Parsis have their own acts.
Filled 2025-08-31 with Punjabi

Generating an example for Welsh (cy)


Reading metadata...: 7871it [00:00, 13423.88it/s]


Loaded a sample from Common Voice dataset for cy that was long enough >:)
Translation found: He was also a conservative trade unionist.
Filled 2025-09-01 with Welsh

Generating an example for Catalan (ca)


Reading metadata...: 1142607it [00:48, 23609.60it/s]


Loaded a sample from Common Voice dataset for ca that was long enough >:)
Translation found: Tomorrow Dan and Roc will go to Torrebesses.
Filled 2025-09-02 with Catalan

Generating an example for Turkish (tr)


Reading metadata...: 31465it [00:01, 18064.71it/s]


Loaded a sample from Common Voice dataset for tr that was long enough >:)
Translation found: Three days later, the ventilator was removed and Lamba was declared dead.
Filled 2025-09-03 with Turkish

Generating an example for Esperanto (eo)


Reading metadata...: 144070it [00:04, 33819.53it/s]


Loaded a sample from Common Voice dataset for eo that was long enough >:)
Translation found: He was the first to dare to lay a hand on me, and we fought savagely.
Filled 2025-09-04 with Esperanto

Generating an example for Latvian (lv)


Reading metadata...: 3157it [00:00, 19090.17it/s]


Loaded a sample from Common Voice dataset for lv that was long enough >:)
Translation found: Speaking of finesse, this is an expensive restaurant.
Filled 2025-09-05 with Latvian

Generating an example for Afrikaans (af)


Reading metadata...: 15it [00:00, 167.18it/s]


Loaded a sample from Common Voice dataset for af that was long enough >:)
Translation found: There are some differences in the format of the volume and page numbers.
Filled 2025-09-06 with Afrikaans

Generating an example for Persian (fa-pin)


Reading metadata...: 28756it [00:01, 17599.01it/s]


Loaded a sample from Common Voice dataset for fa that was long enough >:)
Translation found: She should not put all her inventory up for sale.
Filled 2025-09-07 with Persian

Generating an example for Polish (pl)


Reading metadata...: 19119it [00:00, 25201.01it/s]


Loaded a sample from Common Voice dataset for pl that was long enough >:)
Translation found: It contributed to opening a new chapter in the process of building the European Union.
Filled 2025-09-08 with Polish

Generating an example for Portuguese (pt-pt)


Reading metadata...: 21202it [00:00, 26043.89it/s]


Loaded a sample from Common Voice dataset for pt that was long enough >:)
Translation found: Woman listening to music player while smoking a cigarette.
Filled 2025-09-09 with Portuguese

Generating an example for Armenian (hy)


Reading metadata...: 651it [00:00, 3138.98it/s]


Loaded a sample from Common Voice dataset for hy-AM that was long enough >:)
Translation found: He was one of the best performers of the classical repertoire.
Filled 2025-09-10 with Armenian

Generating an example for Dutch (nl)


Reading metadata...: 34088it [00:01, 26841.01it/s]


Loaded a sample from Common Voice dataset for nl that was long enough >:)
Translation found: Everything that has been said about this can be summed up in one word: tear it down.
Filled 2025-09-11 with Dutch

Generating an example for Czech (cs)


Reading metadata...: 19358it [00:00, 33475.84it/s]


Loaded a sample from Common Voice dataset for cs that was long enough >:)
Translation found: Most species grow in Indochina and Madagascar.
Filled 2025-09-12 with Czech

Generating an example for Serbian (sr)


Reading metadata...: 1521it [00:00, 14509.33it/s]


Loaded a sample from Common Voice dataset for sr but it was too short :3
Translation found: Who can't?
Filled 2025-09-13 with Serbian

Generating an example for Catalan (ca)


Reading metadata...: 1142607it [00:23, 48908.90it/s]


Loaded a sample from Common Voice dataset for ca that was long enough >:)
Translation found: For Sant Jordi, Roc and Garbí want to go visit my uncle.
Filled 2025-09-14 with Catalan

Generating an example for Swedish (sv)


Reading metadata...: 7584it [00:00, 16981.68it/s]


Loaded a sample from Common Voice dataset for sv-SE but it was too short :3
Translation found: I borrowed it from a guy.
Filled 2025-09-15 with Swedish

Generating an example for Swahili (sw)


Reading metadata...: 44075it [00:02, 20887.55it/s]


Loaded a sample from Common Voice dataset for sw that was long enough >:)
Translation found: Good leadership is very important in many African countries.
Filled 2025-09-16 with Swahili

Generating an example for Danish (da)


Reading metadata...: 3381it [00:00, 21336.21it/s]


Loaded a sample from Common Voice dataset for da that was long enough >:)
Translation found: Timber made from spruce can have good strength for many years
Filled 2025-09-17 with Danish

Generating an example for Persian (fa-pin)


Reading metadata...: 28756it [00:01, 22177.90it/s]


Loaded a sample from Common Voice dataset for fa that was long enough >:)
Translation found: Color cannot be felt by touch.
Filled 2025-09-18 with Persian

Generating an example for Macedonian (mk)


Reading metadata...: 67it [00:00, 1310.23it/s]


Loaded a sample from Common Voice dataset for mk that was long enough >:)
Translation found: Some new force was chasing me down the steep street.
Filled 2025-09-19 with Macedonian

Generating an example for Italian (it)


Reading metadata...: 166503it [00:03, 54461.94it/s]


Loaded a sample from Common Voice dataset for it that was long enough >:)
Translation found: Poirot then suggests holding a séance, and the others reluctantly agree.
Filled 2025-09-20 with Italian

Generating an example for Slovak (sk)


Reading metadata...: 3276it [00:00, 20176.15it/s]


Loaded a sample from Common Voice dataset for sk but it was too short :3
Translation found: disagreement
Filled 2025-09-21 with Slovak

Generating an example for Bulgarian (bg)


Reading metadata...: 3413it [00:00, 16736.53it/s]


Loaded a sample from Common Voice dataset for bg that was long enough >:)
Translation found: It is located three hours away from Stara Zagora, surrounded by Turkish villages.
Filled 2025-09-22 with Bulgarian

Generating an example for Nepali (ne)


Reading metadata...: 194it [00:00, 2352.94it/s]


Loaded a sample from Common Voice dataset for ne-NP that was long enough >:)
Translation found: A movie you like is good for you, a movie you don't like is bad for you.
Filled 2025-09-23 with Nepali

Generating an example for German (de)


Reading metadata...: 567993it [00:12, 47235.32it/s]


Loaded a sample from Common Voice dataset for de that was long enough >:)
Translation found: The former Catholic chapel of the old Krupp settlement Altenhof is now the rebuilt hospital chapel.
Filled 2025-09-24 with German

Generating an example for Spanish (es-la)


Reading metadata...: 311392it [00:06, 45358.49it/s]


Loaded a sample from Common Voice dataset for es that was long enough >:)
Translation found: The President may only be re-elected for a second term.
Filled 2025-09-25 with Spanish

Generating an example for Portuguese (pt-br)


Reading metadata...: 21202it [00:00, 29490.80it/s]


Loaded a sample from Common Voice dataset for pt that was long enough >:)
Translation found: A boy is kicking a blue ball on the carpet surrounded by toys.
Filled 2025-09-26 with Portuguese

Generating an example for Mandarin (zh)


Reading metadata...: 7048it [00:00, 30277.98it/s]


Loaded a sample from Common Voice dataset for zh-TW but it was too short :3
Translation found: At first I thought it was coming
Filled 2025-09-27 with Mandarin

Generating an example for Armenian (hy-west)


Reading metadata...: 651it [00:00, 9867.84it/s]


Loaded a sample from Common Voice dataset for hy-AM that was long enough >:)
Translation found: He is considered one of the most famous actors in Indian cinema.
Filled 2025-09-28 with Armenian

Generating an example for Hindi (hi)


Reading metadata...: 4630it [00:00, 20033.87it/s]


Loaded a sample from Common Voice dataset for hi that was long enough >:)
Translation found: Madhuri looks charming in the film 'Dedh Ishqiya', see the first look
Filled 2025-09-29 with Hindi

Generating an example for Vietnamese (vi)


Reading metadata...: 2383it [00:00, 16815.91it/s]


Loaded a sample from Common Voice dataset for vi that was long enough >:)
Translation found: Then will turn into a ghost soldier to serve the devil
Filled 2025-09-30 with Vietnamese

Generating an example for Welsh (cy)


Reading metadata...: 7871it [00:00, 23103.46it/s]


Loaded a sample from Common Voice dataset for cy that was long enough >:)
Filled 2025-10-01 with Welsh

Generating an example for Russian (ru)


Reading metadata...: 26328it [00:01, 21250.34it/s]


Loaded a sample from Common Voice dataset for ru that was long enough >:)
Translation found: At the high-level meeting five years ago, Ireland made a very specific commitment.
Filled 2025-10-02 with Russian

Generating an example for Punjabi (pa)


Reading metadata...: 728it [00:00, 11382.10it/s]


Loaded a sample from Common Voice dataset for pa-IN that was long enough >:)
Translation found: While this is Sania's first Grand Slam
Filled 2025-10-03 with Punjabi

Generating an example for Turkish (tr)


Reading metadata...: 31465it [00:01, 29559.40it/s]


Loaded a sample from Common Voice dataset for tr that was long enough >:)
Translation found: Kerim Agha weighed it with her eyes and said: "Take it to the town market tomorrow."
Filled 2025-10-04 with Turkish

Generating an example for Spanish (es)


Reading metadata...: 311392it [00:07, 43402.75it/s]


Loaded a sample from Common Voice dataset for es that was long enough >:)
Translation found: The study took place in the city of Quito, Pichincha, Ecuador.
Filled 2025-10-05 with Spanish

Generating an example for Vietnamese (vi-sgn)


Reading metadata...: 2383it [00:00, 23269.41it/s]


Loaded a sample from Common Voice dataset for vi that was long enough >:)
Translation found: Unable to escape, he also killed his relatives and friends.
Filled 2025-10-06 with Vietnamese

Generating an example for Greek (el)


Reading metadata...: 1913it [00:00, 19139.37it/s]


Loaded a sample from Common Voice dataset for el that was long enough >:)
Translation found: "Old man!"
Filled 2025-10-07 with Greek



## Replacer

In [6]:
# Manual override: regenerate the entry for one specific day.
date_string = "2025-07-01"

# Use the saved game data when the CSV is present; otherwise start from an
# empty frame that only carries the 'date' column.
if not CSV_PATH.exists():
    game_df = pd.DataFrame(columns=['date'])
else:
    game_df = pd.read_csv(CSV_PATH, parse_dates=['date'])
    # Reduce the parsed timestamps to plain calendar dates.
    game_df['date'] = game_df['date'].dt.date

# Replace the example stored for date_string with one for language code 'sw'.
replace_day(
    'sw',
    date_string,
    pd.read_csv(LANGUAGE_DATA_PATH),
    game_df,
    Path(GAME_DATA_PATH),
)

Generating an example for Swahili (sw)


Reading metadata...: 44075it [00:02, 16429.99it/s]


Loaded a sample from Common Voice dataset for sw that was long enough >:)
Clicked target language dropdown
Translation found: denied entry to Chinese people, the law aimed to reduce
Replaced 2025-07-01 with Swahili)


## View Game Data

In [13]:
# Display the last 10 rows of the game data.
game_df.tail(n=10)

Unnamed: 0,date,iso,family_2,translation,sentence,family_0,lineage,espeak_code,family_1,sampling_rate,IPA,wave,language,cv_code,muffled_wave
23,2025-07-06,hin,Shaurasenic,These apps will make your train journey easier,ये एप्स आपके ट्रेन के सफर को बनाएंगे और आसान,Indo-European,"['Indo-European', 'Classical Indo-European', '...",hi,Indo-Iranian,48000,jeː ˈeːps ˌaːpkˌeː ʈɾˈeːn keː sˈʌpʰəɾ koː bˌən...,2025-07-06.mp3,Hindi,hi,
24,2025-07-07,vie,Viet-Muong,I took the pack of cigarettes out to the porch...,Tôi cầm bao thuốc ra bên ngoài hiên hóng gió,Austroasiatic,"['Austroasiatic', 'Vietic', 'Viet-Muong', 'Vie...",vi,Vietic,48000,t̪ˈo͡ɪ1 ɡˈə2m bˈaː͡ʊ1 tˈu͡əɜk͡h zˈaː1 bˈe1n ŋˈ...,2025-07-07.mp3,Vietnamese,vi,
25,2025-07-08,nep,Eastern Pahari,If anyone has a program that can be held on th...,यदि कसैको क्याम्पसमा कार्यक्रम गर्न मिल्ने छ भ...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",ne,Indo-Iranian,48000,jˈʌdɪ kəsˈɛːkoː kːjˈaːmpəsˌəmaː kˈaːɾrjəkɾˌəmə...,2025-07-08.mp3,Nepali,ne-NP,
26,2025-07-09,vie,Viet–Mường,"Looking at the clock, it was already ten o'clo...",Nhìn đồng hồ lúc này cũng đã là mười giờ tối,Austroasiatic,"['Austroasiatic', 'Vietic', 'Viet-Muong', 'Vie...",vi-hue,Vietic,48000,ɲˈi2ŋ ɗˈo2 hˈo2 lˌuɜk͡h nˈa͡ɪ2 ɡˈu5ŋ ɗˌaː5 lˌa...,2025-07-09.mp3,Vietnamese,vi,
27,2025-07-10,ell,Modern Koineic,His knees buckled for a moment and he leaned a...,Τα γόνατά του κόπηκαν προς στιγμή και ακούμπησ...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",el,Graeco-Phrygian,48000,ta ɣˈonatˈa tu kˈopikˌam brˈos stiɣmˈi ke akˈu...,2025-07-10.mp3,Greek,el,
28,2025-07-11,cym,Brythonic,Porridge is oatmeal boiled in water or milk.,Blawd ceirch wedi'i ferwi mewn dŵr neu laeth y...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",cy,Celtic,48000,blˈa͡ʊd kˈə͡ɪrx wɛdˈiːɨ vˈɛrwɨ me͡ʊn dˈuːr nˈə...,2025-07-11.mp3,Welsh,cy,
29,2025-07-12,mkd,South Slavic,"The houses are incredibly realistic, with mode...","Куќичките се неверојатно реални, со модерни де...",Indo-European,"['Indo-European', 'Classical Indo-European', '...",mk,Balto-Slavic,48000,kʊk^ˈit͡ɕkɪtˌe se nˌeverˈojætnˌo rˈeælnˌɪ\n s ...,2025-07-12.mp3,Macedonian,mk,
30,2025-07-13,lit,Eastern Baltic,Its area is about half the area of Abruzzo.,Jos plotas sudaro apie pusę Abrucų ploto.,Indo-European,"['Indo-European', 'Classical Indo-European', '...",lt,Balto-Slavic,48000,jˈoːs pl̩ˈoːtas sudˈaroː ˈapʲi͡e pˈuʂe͡ɑ abrˈu...,2025-07-13.mp3,Lithuanian,lt,
31,2025-07-14,fin,Coastal Finnic,"Or has some other species, perhaps a bacterium...","Vai onko jokin muu laji, bakteeri kenties, tuh...",Uralic,"['Uralic', 'Finnic', 'Coastal Finnic', 'Neva',...",fi,Finnic,48000,va͡i ˈoŋko jˈokɪn mˈuː lˈajɪ\n bˈakteːrɪ kˈent...,2025-07-14.mp3,Finnish,fi,
32,2025-07-15,afr,Macro-Dutch,Your school choir performed the choral works b...,Jou skoolkoor het die koorwerke hieronder gedu...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",af,Germanic,48000,jə͡ʊ skˈʊ͡əlkʊ͡ər hɛ di kˈʊ͡ərværkə hˈiːrɔnər ...,2025-07-15.mp3,Afrikaans,af,
