## Batch Game Generator

In [6]:
# How many consecutive daily games a single run of this batch should produce
number_to_generate = 4

# CSV files used by the batch (relative paths)
LANGUAGE_DATA_PATH = "languages.csv"  # per-language metadata table
GAME_DATA_PATH = "game_data.csv"      # table of generated daily games

In [7]:
import pprint
import pandas as pd
import numpy as np
from langcodes import Language
from datasets import load_dataset, Audio as HF_Audio
from IPython.display import Audio
from itertools import islice
from datetime import date, timedelta
from pathlib import Path
import subprocess
import shutil, soundfile as sf
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time

In [None]:
def sample_common_voice(cv_code: str, n_samples: int = 1):
    """Draw random samples from Mozilla's Common Voice for one language.

    The ``train`` split is streamed, shuffled through a 2048-item buffer with
    a freshly drawn random seed, and the first ``n_samples`` items are taken.

    Args:
        cv_code: Common Voice configuration name (language code) to load.
        n_samples: Number of samples to take from the shuffled stream.

    Returns:
        A list of sample records as yielded by the dataset. Any failure
        (network, unknown language code, ...) is reported on stdout and an
        empty list is returned instead of raising.
    """
    version = 15
    try:
        ds_stream = load_dataset(
            f'mozilla-foundation/common_voice_{version}_0',
            name = cv_code,
            split = 'train',
            streaming = True
        )
        # cast_column returns a new dataset rather than mutating in place, so
        # the result must be rebound for the cast to take effect.
        ds_stream = ds_stream.cast_column('audio', HF_Audio(decode = True))

        # A new seed per call so repeated runs do not return the same sample.
        seed = random.randint(0, 2**32 - 1)
        ds_shuffled = ds_stream.shuffle(buffer_size=2048, seed=seed)

        sample = list(ds_shuffled.take(n_samples))
        print(f'Successfully loaded {len(sample)} sample{"" if len(sample) == 1 else "s"} from Common Voice dataset for language code: {cv_code}')
        return sample

    except Exception as e:
        # Best-effort by design: callers receive [] and decide how to proceed.
        print(f'Failed to sample from the Common Voice dataset:\n{e}')
        return []

def _extract_translation(driver, text):
    """Return the first candidate translation found on the page, else None.

    Tries each known result selector in order and returns the first non-empty
    element text that differs from the submitted ``text``.
    """
    from selenium.common.exceptions import WebDriverException

    translation_selectors = [
        "span[jsname='W297wb']",
        "span[lang]:not([lang='auto']):not([lang=''])",
        ".ryNqvb span",
        ".J0lOec span"
    ]

    for selector in translation_selectors:
        try:
            for element in driver.find_elements(By.CSS_SELECTOR, selector):
                translated_text = element.text.strip()
                if translated_text and translated_text != text:
                    return translated_text
        except WebDriverException:
            # e.g. an element went stale while the page re-rendered;
            # fall through to the next selector.
            continue

    return None


# Exploit Google Translate to translate this sentence
def translate(language: str, text: str):
    """Translate ``text`` by driving the Google Translate web UI with Selenium.

    Opens a Chrome session, types ``text`` into the page, opens the language
    dropdown, types ``language`` and confirms with Enter, then scrapes the
    result from the page.

    Args:
        language: Language name typed into the language dropdown search.
        text: Sentence to translate.

    Returns:
        The scraped translation string, or ``None`` if no translation was
        found or any error occurred while driving the page (the error is
        printed). Failure to start Chrome itself propagates to the caller.
    """
    from selenium.common.exceptions import TimeoutException

    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)

    try:
        # Inside the try so the browser is always closed by `finally`, even
        # if this call fails.
        # NOTE(review): this runs before navigation, so the override may not
        # persist on the loaded page - confirm it has the intended effect.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        # Navigate to Google Translate
        driver.get("https://translate.google.com")
        wait = WebDriverWait(driver, 15)
        actions = ActionChains(driver)

        # Wait for page to load
        time.sleep(0.5)

        # Step 1: Enter text (cursor should be ready in text area)
        actions.send_keys(text).perform()

        # Step 2: Click language dropdown and select language
        target_lang_button = wait.until(EC.element_to_be_clickable((
            By.CSS_SELECTOR,
            "button[aria-label*='target language'], .VfPpkd-Bz112c-RLmnJb"
        )))
        target_lang_button.click()
        print("Clicked target language dropdown")

        # Wait briefly for dropdown, then type language and press Enter
        time.sleep(0.5)
        actions.send_keys(language).perform()
        time.sleep(0.5)
        actions.send_keys(Keys.RETURN).perform()

        # Step 3: Wait for translation and extract.
        # Keep the initial pause, then poll for a few more seconds instead of
        # giving up after a single look; slow responses no longer yield None.
        time.sleep(0.5)
        try:
            translated_text = WebDriverWait(driver, 5).until(
                lambda d: _extract_translation(d, text)
            )
        except TimeoutException:
            return None

        print(f"Translation found: {translated_text}")
        return translated_text

    except Exception as e:
        print(f"Error: {str(e)}")
        return None

    finally:
        driver.quit()

def find_espeak():
    """Locate an eSpeak executable on PATH.

    ``espeak-ng`` is preferred; ``espeak`` is used as a fallback.

    Returns:
        The path reported by ``shutil.which`` for the first binary found.

    Raises:
        FileNotFoundError: If neither binary is available on PATH.
    """
    for candidate in ("espeak-ng", "espeak"):
        located = shutil.which(candidate)
        if located:
            return located
    raise FileNotFoundError("No espeak-ng/espeak binary on PATH")

def phonemize(text: str, lang: str = "en", ipa: bool = False,
              ipa_level: int = 1, keep_stress: bool = True) -> str:
    """Convert text to a phoneme string by invoking eSpeak (NG).

    Args:
        text: Text to phonemize; passed as the final command-line argument.
        lang: eSpeak voice name, passed as ``-v<lang>``.
        ipa: If true, request IPA output via ``--ipa=<ipa_level>``; otherwise
            request eSpeak's own phoneme symbols via ``-x``.
        ipa_level: Value supplied to ``--ipa`` when ``ipa`` is true.
        keep_stress: Only relevant when ``ipa`` is false. If false,
            ``--sep=-`` is added to the command and every ``'`` character is
            removed from the output.

    Returns:
        The stripped stdout of the eSpeak process, or the placeholder
        ``"???"`` when anything fails (the error is printed).
    """
    # Stress stripping applies only to eSpeak's own notation.
    strip_stress = not ipa and not keep_stress

    try:
        command = [find_espeak(), "-q", f"-v{lang}"]

        if ipa:
            command.append(f"--ipa={ipa_level}")
        elif strip_stress:
            # NOTE(review): the '-' separators requested here are not removed
            # from the returned string - confirm that is the desired output.
            command += ["-x", "--sep=-"]
        else:
            command.append("-x")       # eSpeak's own symbols
        command.append(text)

        result = subprocess.run(
            command,
            text = True, capture_output = True, check = True
        )
        phonemes = result.stdout.strip()

        # eSpeak stress marks are the `'` characters; drop them on request.
        return phonemes.replace("'", "") if strip_stress else phonemes

    except Exception as e:
        print(f"Error getting IPA representation, enter manually: {str(e)}")
        return "???"


In [9]:
# Candidate languages: every distinct eSpeak code except the English variants
languages_df = pd.read_csv(LANGUAGE_DATA_PATH)
english_codes = ['en', 'en-gb', 'en-sc', 'en-uk-north', 'en-uk-wmids', 'en-us', 'en-wi']
all_languages = np.asarray(languages_df['espeak_code'].unique())
# One boolean mask drops all English codes at once
all_languages = all_languages[~np.isin(all_languages, english_codes)]
all_languages

# Daily-game table: where it is stored and when the series begins
CSV_PATH = Path(GAME_DATA_PATH)
START_DAY = date(2025, 6, 13)  # date of the first game in the daily series

# Start from an empty single-column frame unless a saved table is on disk
if not CSV_PATH.exists():
    game_df = pd.DataFrame(columns=['date'])
else:
    game_df = pd.read_csv(CSV_PATH, parse_dates = ['date'])
    # keep plain datetime.date values in the column rather than Timestamps
    game_df['date'] = game_df['date'].dt.date

# Generate examples
import ast  # stdlib; parses the stored lineage list without executing code

# Create the audio output directory once, including a missing `assets/` parent
AUDIO_DIR = Path('assets/audio')
AUDIO_DIR.mkdir(parents = True, exist_ok = True)

for _ in range(number_to_generate):

    # Choose a language
    chosen_language = random.choice(all_languages)

    # Get language data from languages dataframe
    languages_row = languages_df[languages_df['espeak_code'] == chosen_language].iloc[0]
    LANGUAGE = {
        'ESPEAK_CODE': chosen_language,
        'ENGLISH_NAME' : languages_row['english_name'],
        'ISO' : languages_row['iso3'],
        'CV' : languages_row['cv_code'],
        'ESPEAK' : languages_row['espeak_code'],
        # literal_eval only accepts Python literals, so a malformed or
        # malicious CSV cell cannot execute code the way eval() would
        'LINEAGE' : ast.literal_eval(languages_row['lineage']),
        'FAMILY_0' : languages_row['family_0'],
        'FAMILY_1' : languages_row['family_1'],
        'FAMILY_2' : languages_row['family_2']
    }

    print(f"Generating an example for {LANGUAGE['ENGLISH_NAME']} ({LANGUAGE['ESPEAK_CODE']})")

    # Get a sample from that language
    LANGUAGE['SAMPLE'] = sample_common_voice(LANGUAGE['CV'], n_samples = 1)

    # sample_common_voice returns [] on failure; without a sample there is
    # nothing to store, so skip this iteration instead of crashing on [0].
    # Note this means the run may produce fewer than number_to_generate rows.
    if not LANGUAGE['SAMPLE']:
        print(f"No sample available for {LANGUAGE['ENGLISH_NAME']}, skipping this example\n")
        continue
    sample = LANGUAGE['SAMPLE'][0]

    # Try to translate it. translate() signals most failures by returning
    # None rather than raising, so both paths fall back to the placeholder.
    try:
        translation = translate(LANGUAGE['ENGLISH_NAME'], sample['sentence'])
    except Exception as e:
        print(f'Failed to add translation, enter it manually:\n{e}')
        translation = None
    if not translation:
        print('No translation available, enter it manually')
        translation = '???????'
    sample['translation'] = translation

    # Extract audio data of sample
    wave = sample['audio']['array']
    rate = sample['audio']['sampling_rate']

    # Convert to phonetics
    sample['IPA'] = phonemize(text = sample['sentence'], lang = LANGUAGE['ESPEAK'], ipa = True, ipa_level = 1)

    # Decide date to which this example corresponds: the day after the latest
    # stored game, or the first day of the series for an empty table
    next_day = (max(game_df['date']) + timedelta(days = 1)) if not game_df.empty else START_DAY

    # Generate and store corresponding audio file, named after its date
    fname_base = f"{next_day:%Y-%m-%d}"
    file_path = AUDIO_DIR / f"{fname_base}.mp3"
    sf.write(file_path, wave, rate, format='MP3')

    # build a new row based on the generated LANGUAGE dict from above
    new_row = { 'date': next_day,
                'language': LANGUAGE['ENGLISH_NAME'],
                'iso': LANGUAGE['ISO'],
                'cv_code': LANGUAGE['CV'],
                'espeak_code': LANGUAGE['ESPEAK'],
                'lineage': LANGUAGE['LINEAGE'],
                'family_0': LANGUAGE['FAMILY_0'],
                'family_1': LANGUAGE['FAMILY_1'],
                'family_2': LANGUAGE['FAMILY_2'],
                'sentence': sample['sentence'],
                'translation': sample['translation'],
                'wave': file_path.name,
                'sampling_rate': rate,
                'IPA': sample['IPA']
    }

    # ensure any new columns are present in the dataframe
    missing_cols = set(new_row) - set(game_df.columns)
    for c in missing_cols:
        game_df[c] = pd.NA # create blank column for any new field

    # append and reset the index
    game_df = pd.concat([game_df, pd.DataFrame([new_row])], ignore_index = True)

    # persist after every example so a later failure does not lose earlier rows
    game_df.to_csv(CSV_PATH, index = False)

    print("\n")

Generating an example for Cantonese (zh-yue)


Reading metadata...: 3074it [00:00, 8095.97it/s]


Successfully loaded 1 sample from Common Voice dataset for language code: yue
Entering text...
Entered: 我係獨生嘅
Selecting target language...
Clicked target language dropdown
Selected language: Cantonese
Waiting for translation...
Translation found: I was an only child



Generating an example for Persian (fa)


Reading metadata...: 28756it [00:03, 7961.65it/s]


Successfully loaded 1 sample from Common Voice dataset for language code: fa
Entering text...
Entered: اگه عاشقته وای به حالش
Selecting target language...
Clicked target language dropdown
Selected language: Persian
Waiting for translation...
Translation found: If you love her, then woe to her.



Generating an example for Hindi (hi)


Reading metadata...: 4630it [00:00, 7789.34it/s]


Successfully loaded 1 sample from Common Voice dataset for language code: hi
Entering text...
Entered: मुंबई: बोरीवली और विरार के बीच रेल सेवा सामान्य
Selecting target language...
Clicked target language dropdown
Selected language: Hindi
Waiting for translation...
Translation found: Mumbai: Rail service between Borivali and Virar is normal



Generating an example for Vietnamese (vi-hue)


Reading metadata...: 2383it [00:00, 8485.83it/s]


Successfully loaded 1 sample from Common Voice dataset for language code: vi
Entering text...
Entered: Đang ngắm trời sao
Selecting target language...
Clicked target language dropdown
Selected language: Vietnamese
Waiting for translation...
Translation found: Stargazing





In [10]:
game_df.tail(10)

Unnamed: 0,date,iso,family_2,translation,sentence,family_0,lineage,espeak_code,family_1,sampling_rate,IPA,wave,language,cv_code
0,2025-06-13,ita,Italic,"Unfortunately, the entire Christian family has...",Purtroppo l’intera famiglia cristiana non ha a...,Indo-European,"['Indo-European', 'Classical Indo-European', '...",it,Classical Indo-European,48000,pʊrtrˈɔpːo lintˈɛːra famˈiːʎa kristjˈaːna nˌon...,2025-06-13.mp3,Italian,it
1,2025-06-14,fas,Iranian,She distorted my statements.,او اظهارات مرا تحریف کرد.,Indo-European,"['Indo-European', 'Indo-Iranian', 'Iranian']",fa-pin,Indo-Iranian,48000,ˈuː ezhˈɑrɑt marˈɑ tˈahrif kˈard,2025-06-14.mp3,Persian,fa
2,2025-06-15,hun,Hungarian,The value of her textbooks is confirmed by the...,Tankönyveinek értékét minősíti azok többszöri ...,Uralic,"['Uralic', 'Hungaric', 'Hungarian']",hu,Hungaric,48000,tˈɑnkøɲvɛinɛk ˈeːrteːkeːt mˈinøːʃiːti ˈɑzok tˈ...,2025-06-15.mp3,Hungarian,hu
3,2025-06-16,rus,Balto-Slavic,The doctor is crazy too.,— Доктор тоже сумасшедший.,Indo-European,"['Indo-European', 'Classical Indo-European', '...",ru,Classical Indo-European,48000,dˈoktʌr tˈoʒy sumaʃʃˈɛdʃyj,2025-06-16.mp3,Russian,ru
4,2025-06-17,fin,Coastal Finnic,Rosina turns around in surprise.,Rosina käännähtää ihmeissään.,Uralic,"['Uralic', 'Finnic', 'Coastal Finnic', 'Neva',...",fi,Finnic,48000,rˈosɪna kˈæːnnæhtæː ˈihme͡issæːn,2025-06-17.mp3,Finnish,fi
5,2025-06-18,yue,Classical-Middle-Modern Sinitic,I was an only child,我係獨生嘅,Sino-Tibetan,"[Sino-Tibetan, Sinitic, Classical-Middle-Moder...",zh-yue,Sinitic,48000,lˈe1tə1 lˈe1tə1 lˈe1tə1 lˈe1tə1 lˈe1tə1,2025-06-18.mp3,Cantonese,yue
6,2025-06-19,fas,Iranian,"If you love her, then woe to her.",اگه عاشقته وای به حالش,Indo-European,"[Indo-European, Indo-Iranian, Iranian]",fa,Indo-Iranian,48000,ˈaːɡeː ˈɑʃq1atˌeː vˈɑj beː hˈɑlaʃ,2025-06-19.mp3,Persian,fa
7,2025-06-20,hin,Indo-Iranian,Mumbai: Rail service between Borivali and Vira...,मुंबई: बोरीवली और विरार के बीच रेल सेवा सामान्य,Indo-European,"[Indo-European, Classical Indo-European, Indo-...",hi,Classical Indo-European,48000,mˈũbˌi boːɾˈiːʋlˌi ɔːɾ wɪɾˈaːɾ keː bˈiːc ɾˈeː...,2025-06-20.mp3,Hindi,hi
8,2025-06-21,vie,Viet-Muong,Stargazing,Đang ngắm trời sao,Austroasiatic,"[Austroasiatic, Vietic, Viet-Muong, Vietnamese]",vi-hue,Vietic,48000,ɗˌaː1ŋ ŋˈaɜm c͡rˈəː͡ɪ2 ʂˈaː͡ʊ7,2025-06-21.mp3,Vietnamese,vi
