## Batch Game Generator

Other possible clues:
  - Other linguistic info about the language
  - Place of origin: 
  - Specific to the audio sample: 
    - Wompy audio version
  - Look into Semantic Scholar; only import if these fields are not blank or null

## Settings

In [6]:
# NUMBER OF DAYS TO GENERATE USING THIS BATCH PROCESS
# (the batch generator below draws this many languages and fills one new day per language)
number_to_generate = 20

In [7]:
import pandas as pd
import numpy as np
from langcodes import Language
from datasets import load_dataset, Audio as HF_Audio
import librosa, scipy.signal as sig
from datetime import datetime
from datetime import date, timedelta
from pathlib import Path
import subprocess
import shutil, soundfile as sf
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import os

In [8]:
# Metadata
# CSV describing every candidate language (read with pandas by the batch generator and the replacer)
LANGUAGE_DATA_PATH = 'languages.csv'
# CSV holding the generated game rows, one per day
GAME_DATA_PATH = 'game_data.csv'
# Path object for GAME_DATA_PATH, used for the existence check and as the read/write target
CSV_PATH = Path(GAME_DATA_PATH)

## Methods

In [9]:
# How to sample from the languages list
def sample_with_repeat_rounds(arr, N):
    """Draw N items from ``arr`` in successive rounds without replacement.

    Each round is a ``random.sample`` of ``arr`` without replacement, so an
    item can only repeat once a full round of ``len(arr)`` draws has been
    taken. The last round is cut short to reach exactly N items.

    Returns an empty list when ``arr`` is empty or N is not positive.
    """
    picks = []
    if not arr:
        return picks

    pool_size = len(arr)
    still_needed = N
    while still_needed > 0:
        # One round: at most a full pass over arr, never more than is still missing
        round_picks = random.sample(arr, min(still_needed, pool_size))
        picks += round_picks
        still_needed -= len(round_picks)

    return picks

# Count number of words in a string
def count_words(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)

# Request a sample from Mozilla's Common Voice
def sample_common_voice(cv_code: str, version: int = 15, min_length: int = 200000,
                        candidates: int = 5):
    """Fetch one random training clip for a language from Mozilla Common Voice.

    Streams the ``train`` split of ``mozilla-foundation/common_voice_{version}_0``,
    shuffles it with a fresh random seed and inspects the first ``candidates``
    clips of the shuffled stream.

    Args:
        cv_code: Common Voice configuration name (e.g. ``'cy'``, ``'sv-SE'``).
        version: Major Common Voice release to load.
        min_length: Minimum number of audio samples (array length) for a clip
            to count as long enough.
        candidates: How many shuffled clips to inspect.

    Returns:
        A one-element list with the chosen sample dict: the first inspected
        clip whose audio array has at least ``min_length`` entries, or the
        first inspected clip when none is long enough. An empty list when the
        dataset cannot be loaded or yields no clips.
    """
    try:
        ds_stream = load_dataset(
            f'mozilla-foundation/common_voice_{version}_0',
            name = cv_code,
            split = 'train',
            streaming = True
            # token = os.environ["HF_TOKEN"]
        )
        # cast_column returns a new dataset instead of modifying the stream in
        # place, so the result must be rebound for the cast to take effect.
        ds_stream = ds_stream.cast_column('audio', HF_Audio(decode = True))
        seed = random.randint(0, 2**32 - 1)
        ds_shuffled = ds_stream.shuffle(buffer_size=2048, seed=seed)

        samples = list(ds_shuffled.take(candidates))
        if not samples:
            # Report an empty stream explicitly instead of relying on an IndexError below
            print(f'Failed to sample from the Common Voice dataset:\nno clips available for {cv_code}')
            return []

        for sample in samples:
            if sample['audio']['array'].shape[0] >= min_length:
                print(f'Loaded a sample from Common Voice dataset for {cv_code} that was long enough >:)')
                return [sample]
        print(f'Loaded a sample from Common Voice dataset for {cv_code} but it was too short :3')
        return [samples[0]]

    except Exception as e:
        # Deliberate best effort: any loading/decoding problem is reported and
        # signalled to the caller through an empty list.
        print(f'Failed to sample from the Common Voice dataset:\n{e}')
        return []

# Exploit Google Translate to translate this sentence (webscraping!)
def translate(language, text):
    """Drive a Chrome session against Google Translate and scrape the result.

    Opens translate.google.com, types ``text`` into the page, clicks the first
    clickable element matching the language-button selector, types
    ``language`` into the picker and confirms with Enter. It then returns the
    first non-empty text found under a list of candidate result selectors that
    differs from ``text``.

    Args:
        language: Language name typed into the language picker.
        text: Sentence typed into the page.

    Returns:
        The scraped translation string, or None when no candidate element
        produced a usable text or the page interaction failed.

    Raises:
        Whatever Selenium raises while starting Chrome or applying the
        anti-automation script; interaction errors after that are printed and
        turned into a None return.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)

    # Everything after the browser has started sits under this try/finally so
    # the Chrome process is always shut down, even if the setup script fails.
    try:
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        try:
            # Navigate to Google Translate
            driver.get("https://translate.google.com")
            wait = WebDriverWait(driver, 15)
            actions = ActionChains(driver)

            # Wait for page to load
            time.sleep(0.5)

            # Enter text (cursor should be ready in text area)
            actions.send_keys(text).perform()

            # Find and click the target language button
            target_lang_button = wait.until(EC.element_to_be_clickable((
                By.CSS_SELECTOR,
                "button[aria-label*='target language'], .VfPpkd-Bz112c-RLmnJb"
            )))
            target_lang_button.click()

            # Wait briefly for dropdown, then type language and press Enter
            time.sleep(0.5)
            actions.send_keys(language).perform()
            time.sleep(0.5)
            actions.send_keys(Keys.RETURN).perform()

            # Wait for translation and extract
            time.sleep(0.5)

            # Candidate selectors for the result text, tried in order
            translation_selectors = [
                "span[jsname='W297wb']",
                "span[lang]:not([lang='auto']):not([lang=''])",
                ".ryNqvb span",
                ".J0lOec span"
            ]

            # NOTE(review): only the first non-empty element is returned; if the
            # page splits a long translation over several elements the result
            # may be partial - confirm whether the pieces should be joined.
            for selector in translation_selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        translated_text = element.text.strip()
                        if translated_text and translated_text != text:
                            print(f"Translation found: {translated_text}")
                            return translated_text
                except Exception:
                    # A selector that fails (e.g. stale element) is skipped, but
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    continue
            return None

        except Exception as e:
            print(f"Error: {str(e)}")
            return None

    finally:
        driver.quit()

def find_espeak():
    """Locate the eSpeak executable on PATH.

    ``espeak-ng`` is preferred; plain ``espeak`` is the fallback.
    Raises FileNotFoundError when neither binary can be found.
    """
    for binary_name in ("espeak-ng", "espeak"):
        located = shutil.which(binary_name)
        if located:
            return located
    raise FileNotFoundError("No espeak-ng/espeak binary on PATH")

def phonemize(text: str, lang: str = "en", ipa: bool = False,
              ipa_level: int = 1, keep_stress: bool = True) -> str:
    """Phonemize text using eSpeak (NG).

    Args:
        text: Sentence to transcribe.
        lang: eSpeak voice name, passed as ``-v{lang}``.
        ipa: When True request IPA output (``--ipa``); otherwise request
            eSpeak's ASCII phoneme mnemonics (``-x``).
        ipa_level: Value passed to ``--ipa=``; only used when ``ipa`` is True.
        keep_stress: When False, stress marks are stripped from the output.

    Returns:
        The transcription as a single line with underscores removed, or the
        placeholder ``"???"`` if eSpeak is missing or the call fails.
    """
    try:
        exe = find_espeak()

        # Build the command line
        args = [exe, "-q", f"-v{lang}"]
        if ipa:
            args.append(f"--ipa={ipa_level}")
        else:
            args.append("-x")
            if not keep_stress:
                # NOTE(review): the separator is requested here but never
                # removed from the output below - confirm this is intended.
                args.append("--sep=-")
        args.append(text)

        # Run the command
        proc = subprocess.run(args, text = True, capture_output = True, check = True)
        out = proc.stdout.strip()

        if not keep_stress:
            # Strip primary AND secondary stress. IPA output marks them with
            # U+02C8 / U+02CC; the ASCII mnemonics use an apostrophe / comma.
            stress_marks = "\u02c8\u02cc" if ipa else "',"
            out = out.translate(str.maketrans("", "", stress_marks))

        # Sometimes line breaks appear in the output; remove them
        out = out.replace("\n", " ").replace("_", "").replace("\r", " ").strip()

        return out

    except Exception as e:
        # Deliberate best effort: the caller gets a placeholder to fix by hand
        print(f"Error getting IPA representation, enter manually: {str(e)}")
        return "???"
    
def get_row(chosen_language, day, languages_df):
    """Build one game-data row for ``chosen_language`` on ``day``.

    Looks the language up in ``languages_df``, pulls a Common Voice clip,
    tries to translate and phonemize its sentence, writes the clip to
    ``assets/audio/<YYYY-MM-DD>.mp3`` and returns the row as a dict.

    Args:
        chosen_language: eSpeak code matched against the ``espeak_code`` column.
        day: Date of the game; formatted with ``%Y-%m-%d`` for the file name.
        languages_df: Language table with the columns ``espeak_code``,
            ``english_name``, ``iso3``, ``cv_code``, ``lineage``,
            ``family_0``, ``family_1`` and ``family_2``.

    Returns:
        Dict with the keys ``date``, ``language``, ``iso``, ``cv_code``,
        ``espeak_code``, ``lineage``, ``family_0``, ``family_1``, ``family_2``,
        ``sentence``, ``translation``, ``wave``, ``sampling_rate`` and ``IPA``.

    Raises:
        IndexError: If no language row matches ``chosen_language`` or no
            Common Voice sample could be loaded.
    """
    import ast  # local import: only needed to parse the stored lineage literal

    # Get language data from languages dataframe
    matches = languages_df[languages_df['espeak_code'] == chosen_language]
    if matches.empty:
        raise IndexError(f"No language with espeak_code {chosen_language!r} in languages_df")
    languages_row = matches.iloc[0]

    english_name = languages_row['english_name']
    cv_code = languages_row['cv_code']
    espeak_code = languages_row['espeak_code']
    # The lineage column stores a Python list literal; literal_eval parses it
    # without executing arbitrary code the way eval would.
    lineage = ast.literal_eval(languages_row['lineage'])

    print(f"Generating an example for {english_name} ({chosen_language})")

    # Get a sample from that language
    samples = sample_common_voice(cv_code)
    if not samples:
        # sample_common_voice signals failure with an empty list
        raise IndexError(f"No Common Voice sample could be loaded for {cv_code!r}")
    sample = samples[0]
    sentence = sample['sentence'].replace('\n', ' ')

    # Try to translate it
    try:
        translation = translate(english_name, sentence)
    except Exception as e:
        print(f'Failed to add translation, enter it manually:\n{e}')
        translation = None
    if not translation:
        # translate() reports its own failures by returning None; store the
        # same manual-entry placeholder as for a raised error.
        translation = '???????'

    # Extract audio data of sample
    wave = sample['audio']['array']
    rate = sample['audio']['sampling_rate']

    # Convert to phonetics
    ipa_text = phonemize(text = sentence, lang = espeak_code, ipa = True, ipa_level = 1, keep_stress = False)

    # Generate and store corresponding audio file
    AUDIO_DIR = Path('assets/audio')
    # parents=True so a missing top-level 'assets' directory is created too
    AUDIO_DIR.mkdir(parents = True, exist_ok = True)
    fname_base = f"{day:%Y-%m-%d}"
    file_path = AUDIO_DIR / f"{fname_base}.mp3"
    # muffled_file_path = AUDIO_DIR / f"{fname_base}_muffled.mp3"
    # muffled_wave = sig.sosfiltfilt(sig.butter(1, 700, "low", fs = rate, output = "sos"),
    #                          librosa.effects.pitch_shift(wave.astype(np.float32), sr = rate, n_steps = -2))
    sf.write(file_path, wave, rate, format='MP3')
    # sf.write(muffled_file_path, muffled_wave, rate, format='WAV')

    # Assemble the row in the column order used by the game data
    new_row = { 'date': day,
                'language': english_name,
                'iso': languages_row['iso3'],
                'cv_code': cv_code,
                'espeak_code': espeak_code,
                'lineage': lineage,
                'family_0': languages_row['family_0'],
                'family_1': languages_row['family_1'],
                'family_2': languages_row['family_2'],
                'sentence': sentence,
                'translation': translation,
                'wave': file_path.name,
                # 'muffled_wave': muffled_file_path.name,
                'sampling_rate': rate,
                'IPA': ipa_text
    }

    return new_row
    
def fill_day(chosen_language, day, languages_df, game_df, CSV_PATH):
    """Generate the game row for ``day`` and append it to the game data CSV.

    Args:
        chosen_language: eSpeak code of the language to generate.
        day: Date the new row belongs to.
        languages_df: Language table handed on to ``get_row``.
        game_df: Current game data; columns it lacks are added to it in place.
        CSV_PATH: Destination the extended table is written to.

    Returns:
        The extended DataFrame (``game_df`` plus the new row).
    """
    new_row = get_row(chosen_language, day, languages_df)

    # Ensure any new columns are present in the dataframe. Walk the row's keys
    # in their dict order rather than iterating a set: set order varies between
    # interpreter runs, which made the CSV column order unpredictable.
    for column in new_row:
        if column not in game_df.columns:
            game_df[column] = pd.NA # create blank column for any new field

    # append and reset the index
    game_df = pd.concat([game_df, pd.DataFrame([new_row])], ignore_index = True)

    # store into the game_df
    game_df.to_csv(CSV_PATH, index = False)

    print(f"Filled {day:%Y-%m-%d} with {new_row['language']}\n")

    return game_df
    

def replace_day(chosen_language, day, languages_df, game_df, CSV_PATH):
    """Regenerate the row stored for ``day`` and write the table back to disk.

    ``day`` is a ``YYYY-MM-DD`` string. Exactly one row of ``game_df`` must
    carry that date; otherwise a KeyError is raised before anything is
    generated. The matching row is overwritten in place with a fresh example
    for ``chosen_language`` (fields missing from the new row become pd.NA)
    and the whole frame is saved to ``CSV_PATH``.
    """
    target_date = datetime.strptime(day, "%Y-%m-%d").date()

    # Locate the row for that date and insist on a unique match
    is_target = game_df['date'] == target_date
    match_count = is_target.sum()
    if match_count != 1:
        if match_count == 0:
            raise KeyError(f"No row found with date {day}")
        raise KeyError(f"Expected one row with date {day}, found {match_count}")

    fresh_row = get_row(chosen_language, target_date, languages_df)

    # Align the new values with the existing column layout before assigning
    aligned = pd.Series(fresh_row).reindex(game_df.columns, fill_value=pd.NA)
    row_label = game_df.index[is_target][0]
    game_df.loc[row_label] = aligned

    # Persist the updated table
    game_df.to_csv(CSV_PATH, index = False)

    print(f"Replaced {day} with {fresh_row['language']}")

## Batch Generator

In [10]:
# Load the language table and collect every eSpeak code except the English variants
languages_df = pd.read_csv(LANGUAGE_DATA_PATH)
all_languages = languages_df.loc[
    ~languages_df['espeak_code'].isin(['en', 'en-gb', 'en-sc', 'en-uk-north', 'en-uk-wmids', 'en-us', 'en-wi']),
    'espeak_code'
].unique()

# Draw number_to_generate languages in rounds, so repeats only start once a full round is used up
sample = sample_with_repeat_rounds(list(all_languages), number_to_generate)

START_DAY = date(2025, 6, 13) # first day in the daily games series

# Generate one example per sampled language
for chosen_language in sample:

    # Reload the game data on every pass so the row written by the previous pass is visible
    if not CSV_PATH.exists():
        game_df = pd.DataFrame(columns=['date'])
    else:
        game_df = pd.read_csv(CSV_PATH, parse_dates = ['date'])
        # keep plain Python dates rather than pandas Timestamps
        game_df['date'] = game_df['date'].dt.date

    # The new example goes on the day after the latest stored one, or on START_DAY for an empty table
    if game_df.empty:
        next_day = START_DAY
    else:
        next_day = max(game_df['date']) + timedelta(days = 1)

    # Fill in that day with an example of the chosen language
    fill_day(chosen_language, next_day, languages_df, game_df, CSV_PATH)

Generating an example for Welsh (cy)


Reading metadata...: 7871it [00:00, 23850.14it/s]


Loaded a sample from Common Voice dataset for cy that was long enough >:)
Translation found: This article covers malnutrition and overnutrition.
Filled 2025-12-17 with Welsh

Generating an example for Macedonian (mk)


Reading metadata...: 67it [00:00, 891.70it/s]


Loaded a sample from Common Voice dataset for mk that was long enough >:)
Translation found: Relativistic corrections are also needed for quantum mechanics.
Filled 2025-12-18 with Macedonian

Generating an example for Tamil (ta)


Reading metadata...: 44044it [00:01, 24278.56it/s]


Loaded a sample from Common Voice dataset for ta that was long enough >:)
Translation found: Finally, I have to write a little about the play Nandanar, which I have chosen as the seventh play.
Filled 2025-12-19 with Tamil

Generating an example for Georgian (ka)


Reading metadata...: 39326it [00:02, 16778.73it/s]


Loaded a sample from Common Voice dataset for ka that was long enough >:)
Translation found: The first five books in the series review the facts of The Silmarillion and related events.
Filled 2025-12-20 with Georgian

Generating an example for Turkish (tr)


Reading metadata...: 31465it [00:01, 28225.55it/s]


Loaded a sample from Common Voice dataset for tr but it was too short :3
Translation found: Come on, come out.
Filled 2025-12-21 with Turkish

Generating an example for Swedish (sv)


Reading metadata...: 7584it [00:00, 29453.47it/s]


Loaded a sample from Common Voice dataset for sv-SE that was long enough >:)
Translation found: I can't send a bill to the Chinese guy who peed on the carpet.
Filled 2025-12-22 with Swedish

Generating an example for German (de)


Reading metadata...: 567993it [00:17, 32243.63it/s]


Loaded a sample from Common Voice dataset for de that was long enough >:)
Translation found: The British commander also did not consider entrenchment necessary.
Filled 2025-12-23 with German

Generating an example for Swahili (sw)


Reading metadata...: 44075it [00:01, 33329.18it/s]


Loaded a sample from Common Voice dataset for sw that was long enough >:)
Error: 'NoneType' object has no attribute 'is_displayed'
Filled 2025-12-24 with Swahili

Generating an example for Nepali (ne)


Reading metadata...: 194it [00:00, 3140.89it/s]


Loaded a sample from Common Voice dataset for ne-NP that was long enough >:)
Translation found: Yes, the dozer should have been used in the open field, but after following the procedure.
Filled 2025-12-25 with Nepali

Generating an example for Persian (fa)


Reading metadata...: 28756it [00:00, 31804.85it/s]


KeyboardInterrupt: 

## Replacer

In [6]:
# Date of the existing game row to regenerate by hand
date_string = "2025-07-01"

# Read the stored game data, or start from an empty table if there is none yet
if not CSV_PATH.exists():
    game_df = pd.DataFrame(columns=['date'])
else:
    game_df = pd.read_csv(CSV_PATH, parse_dates = ['date'])
    game_df['date'] = game_df['date'].dt.date

# Overwrite that date's example with one for the language of choice
replace_day(
    chosen_language = 'sw',
    day = date_string,
    languages_df = pd.read_csv(LANGUAGE_DATA_PATH),
    game_df = game_df,
    CSV_PATH = Path(GAME_DATA_PATH),
)

Generating an example for Swahili (sw)


Reading metadata...: 44075it [00:02, 16429.99it/s]


Loaded a sample from Common Voice dataset for sw that was long enough >:)
Clicked target language dropdown
Translation found: denied entry to Chinese people, the law aimed to reduce
Replaced 2025-07-01 with Swahili)


## View Game Data

In [13]:
# Show the last ten rows of the game data currently held in memory
game_df.tail(10)

Unnamed: 0,date,iso,family_2,translation,sentence,family_0,lineage,espeak_code,family_1,sampling_rate,IPA,wave,language,cv_code,muffled_wave
23,2025-07-06,hin,Shaurasenic,These apps will make your train journey easier,ये एप्स आपके ट्रेन के सफर को बनाएंगे और आसान,Indo-European,"['Indo-European', 'Classical Indo-European', '...",hi,Indo-Iranian,48000,jeː ˈeːps ˌaːpkˌeː ʈɾˈeːn keː sˈʌpʰəɾ koː bˌən...,2025-07-06.mp3,Hindi,hi,
24,2025-07-07,vie,Viet-Muong,I took the pack of cigarettes out to the porch...,Tôi cầm bao thuốc ra bên ngoài hiên hóng gió,Austroasiatic,"['Austroasiatic', 'Vietic', 'Viet-Muong', 'Vie...",vi,Vietic,48000,t̪ˈo͡ɪ1 ɡˈə2m bˈaː͡ʊ1 tˈu͡əɜk͡h zˈaː1 bˈe1n ŋˈ...,2025-07-07.mp3,Vietnamese,vi,
25,2025-07-08,nep,Eastern Pahari,If anyone has a program that can be held on th...,यदि कसैको क्याम्पसमा कार्यक्रम गर्न मिल्ने छ भ...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",ne,Indo-Iranian,48000,jˈʌdɪ kəsˈɛːkoː kːjˈaːmpəsˌəmaː kˈaːɾrjəkɾˌəmə...,2025-07-08.mp3,Nepali,ne-NP,
26,2025-07-09,vie,Viet–Mường,"Looking at the clock, it was already ten o'clo...",Nhìn đồng hồ lúc này cũng đã là mười giờ tối,Austroasiatic,"['Austroasiatic', 'Vietic', 'Viet-Muong', 'Vie...",vi-hue,Vietic,48000,ɲˈi2ŋ ɗˈo2 hˈo2 lˌuɜk͡h nˈa͡ɪ2 ɡˈu5ŋ ɗˌaː5 lˌa...,2025-07-09.mp3,Vietnamese,vi,
27,2025-07-10,ell,Modern Koineic,His knees buckled for a moment and he leaned a...,Τα γόνατά του κόπηκαν προς στιγμή και ακούμπησ...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",el,Graeco-Phrygian,48000,ta ɣˈonatˈa tu kˈopikˌam brˈos stiɣmˈi ke akˈu...,2025-07-10.mp3,Greek,el,
28,2025-07-11,cym,Brythonic,Porridge is oatmeal boiled in water or milk.,Blawd ceirch wedi'i ferwi mewn dŵr neu laeth y...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",cy,Celtic,48000,blˈa͡ʊd kˈə͡ɪrx wɛdˈiːɨ vˈɛrwɨ me͡ʊn dˈuːr nˈə...,2025-07-11.mp3,Welsh,cy,
29,2025-07-12,mkd,South Slavic,"The houses are incredibly realistic, with mode...","Куќичките се неверојатно реални, со модерни де...",Indo-European,"['Indo-European', 'Classical Indo-European', '...",mk,Balto-Slavic,48000,kʊk^ˈit͡ɕkɪtˌe se nˌeverˈojætnˌo rˈeælnˌɪ\n s ...,2025-07-12.mp3,Macedonian,mk,
30,2025-07-13,lit,Eastern Baltic,Its area is about half the area of Abruzzo.,Jos plotas sudaro apie pusę Abrucų ploto.,Indo-European,"['Indo-European', 'Classical Indo-European', '...",lt,Balto-Slavic,48000,jˈoːs pl̩ˈoːtas sudˈaroː ˈapʲi͡e pˈuʂe͡ɑ abrˈu...,2025-07-13.mp3,Lithuanian,lt,
31,2025-07-14,fin,Coastal Finnic,"Or has some other species, perhaps a bacterium...","Vai onko jokin muu laji, bakteeri kenties, tuh...",Uralic,"['Uralic', 'Finnic', 'Coastal Finnic', 'Neva',...",fi,Finnic,48000,va͡i ˈoŋko jˈokɪn mˈuː lˈajɪ\n bˈakteːrɪ kˈent...,2025-07-14.mp3,Finnish,fi,
32,2025-07-15,afr,Macro-Dutch,Your school choir performed the choral works b...,Jou skoolkoor het die koorwerke hieronder gedu...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",af,Germanic,48000,jə͡ʊ skˈʊ͡əlkʊ͡ər hɛ di kˈʊ͡ərværkə hˈiːrɔnər ...,2025-07-15.mp3,Afrikaans,af,
