# Example Language Generator
Using **Mozilla Common Voice**, **Tatoeba**, **phonemizer/eSpeak‑NG**, **Google Cloud TTS**, and/or **Glottolog**. 

In this notebook, we explore the various components needed to present the "evidence" a player uses to guess a language in the Langr game.

In [91]:
import ast
import pprint
import random
import shutil
import subprocess
import time
from datetime import date, timedelta
from itertools import islice
from pathlib import Path

import numpy as np
import pandas as pd
import soundfile as sf
from datasets import load_dataset, Audio as HF_Audio
from IPython.display import Audio
from langcodes import Language
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def print_object(obj):
    """Display ``obj`` legibly: pretty-print dicts, walk lists, print the rest.

    Lists are traversed recursively, so every element is rendered by these
    same rules. Nothing is returned.
    """
    if isinstance(obj, list):
        for element in obj:
            print_object(element)
        return
    if isinstance(obj, dict):
        pprint.pprint(obj)
        return
    print(obj)

def english_name_to_iso(name: str, three_letter: bool = True) -> str:
    """Convert an English language name to an ISO code.

    The name is resolved with ``Language.find``. When ``three_letter`` is true
    the result of ``to_alpha3()`` is returned; otherwise the ``language``
    attribute of the match is returned.
    """
    found = Language.find(name)
    if three_letter:
        return found.to_alpha3()
    return found.language

def iso_to_english_name(code: str) -> str:
    """Convert an ISO code to an English language name.

    ``code`` is parsed with ``Language.get`` and the parsed object's
    ``display_name()`` is returned; ``None`` is returned if parsing yields a
    falsy value.
    """
    parsed = Language.get(code)
    if not parsed:
        return None
    return parsed.display_name()

In [84]:
# Load the languages table from languages.csv (a relative path, so it is resolved
# against the current working directory). Later cells read its english_name and
# espeak_code columns, among others.
languages_df = pd.read_csv('languages.csv')

In [85]:
# Print the distinct English language names in the table, sorted alphabetically
print(sorted(pd.unique(languages_df['english_name'])))

['Afrikaans', 'Albanian', 'Armenian', 'Bulgarian', 'Cantonese', 'Catalan', 'Chinese', 'Czech', 'Danish', 'Dutch', 'English', 'Esperanto', 'Estonian', 'Finnish', 'French', 'Georgian', 'German', 'Greek', 'Hindi', 'Hungarian', 'Icelandic', 'Indonesian', 'Irish', 'Italian', 'Latvian', 'Lithuanian', 'Macedonian', 'Malayalam', 'Nepali', 'Persian', 'Polish', 'Portuguese', 'Punjabi', 'Romanian', 'Russian', 'Serbian', 'Slovak', 'Spanish', 'Swahili', 'Swedish', 'Tamil', 'Turkish', 'Vietnamese', 'Welsh']


In [95]:
# Collect the distinct eSpeak codes, leaving out the listed 'en*' codes
excluded_codes = ['en', 'en-gb', 'en-sc', 'en-uk-north', 'en-uk-wmids', 'en-us', 'en-wi']
espeak_codes = languages_df['espeak_code'].unique()
# One boolean mask replaces the repeated np.delete calls; original order is kept
all_languages = espeak_codes[~np.isin(espeak_codes, excluded_codes)]
all_languages

array(['af', 'bg', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'eo', 'et', 'fa',
       'fa-pin', 'fi', 'fr-be', 'fr-fr', 'ga', 'hi', 'hu', 'hy',
       'hy-west', 'id', 'is', 'it', 'ka', 'lv', 'lt', 'ml', 'mk', 'ne',
       'nl', 'pa', 'pl', 'pt-br', 'pt-pt', 'ro', 'ru', 'sk', 'es',
       'es-la', 'sq', 'sr', 'sw', 'sv', 'ta', 'tr', 'vi', 'vi-hue',
       'vi-sgn', 'zh-yue', 'zh'], dtype=object)

In [96]:
# Pick one eSpeak code at random from the filtered list. `random` is not seeded
# in this cell, so the pick may differ from run to run.
chosen = random.choice(all_languages)
chosen

'nl'

In [71]:
# Look up the chosen eSpeak code in the languages table and collect its metadata
LANGUAGE = {
    'ESPEAK_CODE': chosen.lower(),
}

# structure: english_name, iso3, cv_code, espeak_code, lineage, family_0, family_1, family_2
matching_rows = languages_df[languages_df['espeak_code'] == LANGUAGE['ESPEAK_CODE']]
if matching_rows.empty:
    # Same exception type that .iloc[0] would raise, but with a message that
    # says which code could not be found.
    raise IndexError(f"No row in languages_df has espeak_code {LANGUAGE['ESPEAK_CODE']!r}")
languages_row = matching_rows.iloc[0]

LANGUAGE['ENGLISH_NAME'] = languages_row['english_name']
LANGUAGE['ISO'] = languages_row['iso3']
LANGUAGE['CV'] = languages_row['cv_code']
LANGUAGE['ESPEAK'] = languages_row['espeak_code']
# The lineage cell is text holding a Python list literal. Parse it with
# ast.literal_eval rather than eval(): eval() would execute arbitrary code read
# from the CSV, while literal_eval only accepts literals.
LANGUAGE['LINEAGE'] = ast.literal_eval(languages_row['lineage'])
LANGUAGE['FAMILY_0'] = languages_row['family_0']
LANGUAGE['FAMILY_1'] = languages_row['family_1']
LANGUAGE['FAMILY_2'] = languages_row['family_2']

print_object(LANGUAGE)

{'CV': 'es',
 'ENGLISH_NAME': 'Spanish',
 'ESPEAK': 'es',
 'ESPEAK_CODE': 'es',
 'FAMILY_0': 'Indo-European',
 'FAMILY_1': 'Classical Indo-European',
 'FAMILY_2': 'Italic',
 'ISO': 'spa',
 'LINEAGE': ['Indo-European',
             'Classical Indo-European',
             'Italic',
             'Latino-Faliscan',
             'Latinic',
             'Imperial Latin',
             'Romance',
             'Italo-Western Romance',
             'Western Romance',
             'Shifted Western Romance',
             'Southwestern Shifted Romance',
             'West Ibero-Romance',
             'Castilic',
             'Spanish']}


In [74]:
# Request a sample from Mozilla's Common Voice
def sample_common_voice(cv_code: str, n_samples: int = 1, version: int = 15):
    """Stream randomly shuffled records for one language from Mozilla Common Voice.

    Args:
        cv_code: Dataset configuration name, passed as ``name`` to ``load_dataset``.
        n_samples: Number of records to take from the shuffled stream.
        version: Common Voice major release; selects the
            ``mozilla-foundation/common_voice_{version}_0`` dataset.

    Returns:
        A list of the records taken from the stream, or an empty list if
        anything fails (the error is printed, not raised).
    """
    try:
        ds_stream = load_dataset(
            f'mozilla-foundation/common_voice_{version}_0',
            name = cv_code,
            split = 'train',
            streaming = True
        )
        # cast_column returns a new dataset instead of modifying in place, so
        # the result has to be kept for the cast to take effect.
        ds_stream = ds_stream.cast_column('audio', HF_Audio(decode = True))
        # A fresh seed per call, so repeated calls draw different records
        seed = random.randint(0, 2**32 - 1)
        ds_shuffled = ds_stream.shuffle(buffer_size=2048, seed=seed)

        sample = list(ds_shuffled.take(n_samples))
        print(f'Successfully loaded {len(sample)} sample{"" if len(sample) == 1 else "s"} from Common Voice dataset for language code: {cv_code}')
        return sample

    except Exception as e:
        # Best effort: report the failure and let the caller see an empty result
        print(f'Failed to sample from the Common Voice dataset:\n{e}')
        return []

In [75]:
# Pull one record from Common Voice using the language's cv_code and keep the
# returned list on LANGUAGE; the list is empty if sampling failed.
LANGUAGE['SAMPLE'] = sample_common_voice(LANGUAGE['CV'], n_samples = 1)

Reading metadata...: 311392it [00:50, 6128.02it/s] 


Successfully loaded 1 sample from Common Voice dataset for language code: es


In [76]:
# Inspect the raw list of records returned by Common Voice
LANGUAGE['SAMPLE']

[{'client_id': 'f4d21c57a1db02eab64c39638b9f5efc11233273bcf34e46cd1d15c9221863719b66e17c7feb45e59d1fa90a21e6cde3f3c9cdb7505244fde83f4773589dbf9e',
  'path': 'es_train_0/common_voice_es_19690431.mp3',
  'audio': {'path': 'es_train_0/common_voice_es_19690431.mp3',
   'array': array([ 0.00000000e+00,  1.15004035e-15, -4.52476327e-14, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00], shape=(263808,)),
   'sampling_rate': 48000},
  'sentence': 'La crisis política desatada termina por provocar la renuncia del Presidente a su cargo.',
  'up_votes': 2,
  'down_votes': 0,
  'age': 'twenties',
  'gender': 'male',
  'accent': 'Chileno: Chile, Cuyo',
  'locale': 'es',
  'segment': '',
  'variant': ''}]

In [77]:
# Show the transcript (the 'sentence' field) of the first sampled record
LANGUAGE['SAMPLE'][0]['sentence']

'La crisis política desatada termina por provocar la renuncia del Presidente a su cargo.'

In [78]:
# Exploit Google Translate to translate this sentence
def translate(language, text):
    """Translate ``text`` by driving the Google Translate web page with Selenium.

    A Chrome session is started, ``text`` is typed into the page, the target
    language picker is opened, ``language`` is typed into it and confirmed with
    Enter, and the page is then scanned for the translated text.

    Args:
        language: Text typed into the target-language picker.
        text: The sentence to translate.

    Returns:
        The first non-empty element text that differs from ``text``, or ``None``
        if no such element is found or any step after browser start-up fails
        (the error is printed).

    Raises:
        Whatever ``webdriver.Chrome`` or the initial ``execute_script`` call
        raises; the browser is closed before the exception propagates.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    # The next three options and the script below hide the usual signs of an
    # automated browser session.
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    try:
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    except Exception:
        # The browser is already running at this point; close it before
        # propagating so a failure here does not leak a Chrome process.
        driver.quit()
        raise

    try:
        # Navigate to Google Translate
        driver.get("https://translate.google.com")
        wait = WebDriverWait(driver, 15)
        actions = ActionChains(driver)

        # Wait for page to load
        time.sleep(0.5)

        # Step 1: Enter text. The keys are sent to whatever has focus, so this
        # relies on the source text area being focused after load.
        print("Entering text...")
        actions.send_keys(text).perform()
        print(f"Entered: {text}")

        # Step 2: Click language dropdown and select language
        print("Selecting target language...")

        # Find and click the target language button
        target_lang_button = wait.until(EC.element_to_be_clickable((
            By.CSS_SELECTOR,
            "button[aria-label*='target language'], .VfPpkd-Bz112c-RLmnJb"
        )))
        target_lang_button.click()
        print("Clicked target language dropdown")

        # Wait briefly for dropdown, then type language and press Enter
        time.sleep(0.5)
        actions.send_keys(language).perform()
        time.sleep(0.5)
        actions.send_keys(Keys.RETURN).perform()
        print(f"Selected language: {language}")

        # Step 3: Wait for translation and extract
        print("Waiting for translation...")
        time.sleep(0.5)

        # Candidate selectors for the translated text, tried in order
        translation_selectors = [
            "span[jsname='W297wb']",
            "span[lang]:not([lang='auto']):not([lang=''])",
            ".ryNqvb span",
            ".J0lOec span"
        ]

        for selector in translation_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                for element in elements:
                    translated_text = element.text.strip()
                    # Skip empty matches and elements that only echo the input
                    if translated_text and translated_text != text:
                        print(f"Translation found: {translated_text}")
                        return translated_text
            except Exception:
                # A selector that fails is skipped. Catching Exception rather
                # than using a bare except lets KeyboardInterrupt and
                # SystemExit still stop the run.
                continue

        print("No translation found")
        return None

    except Exception as e:
        print(f"Error: {str(e)}")
        return None

    finally:
        driver.quit()

In [79]:
# Inspect the first sampled record in full
LANGUAGE['SAMPLE'][0]

{'client_id': 'f4d21c57a1db02eab64c39638b9f5efc11233273bcf34e46cd1d15c9221863719b66e17c7feb45e59d1fa90a21e6cde3f3c9cdb7505244fde83f4773589dbf9e',
 'path': 'es_train_0/common_voice_es_19690431.mp3',
 'audio': {'path': 'es_train_0/common_voice_es_19690431.mp3',
  'array': array([ 0.00000000e+00,  1.15004035e-15, -4.52476327e-14, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00], shape=(263808,)),
  'sampling_rate': 48000},
 'sentence': 'La crisis política desatada termina por provocar la renuncia del Presidente a su cargo.',
 'up_votes': 2,
 'down_votes': 0,
 'age': 'twenties',
 'gender': 'male',
 'accent': 'Chileno: Chile, Cuyo',
 'locale': 'es',
 'segment': '',
 'variant': ''}

In [80]:
# Translate the sampled sentence automatically, falling back to a placeholder
# that has to be replaced by hand when no translation could be obtained.
translation = None
try:
    translation = translate(LANGUAGE['ENGLISH_NAME'], LANGUAGE['SAMPLE'][0]['sentence'])
    if translation is None:
        # translate() reports most failures by returning None rather than
        # raising, so that case needs the same fallback as an exception.
        print('No translation was returned, enter it manually.')
except Exception as e:
    print(f'Failed to add translation, enter it manually:\n{e}')

LANGUAGE['SAMPLE'][0]['translation'] = translation if translation is not None else '???????'

LANGUAGE['SAMPLE'][0]['translation']

Entering text...
Entered: La crisis política desatada termina por provocar la renuncia del Presidente a su cargo.
Selecting target language...
Clicked target language dropdown
Selected language: Spanish
Waiting for translation...
Translation found: The political crisis that erupted ultimately led to the President's resignation.


"The political crisis that erupted ultimately led to the President's resignation."

In [81]:
# Play the first sampled clip inline; wave and rate stay available to later cells
clip = LANGUAGE['SAMPLE'][0]['audio']
wave, rate = clip['array'], clip['sampling_rate']
display(Audio(wave, rate = rate))

In [None]:
def find_espeak():
    """Locate the eSpeak executable on PATH.

    ``espeak-ng`` is tried first, then ``espeak``. Returns the path reported by
    ``shutil.which`` and raises ``FileNotFoundError`` when neither is found.
    """
    for candidate in ("espeak-ng", "espeak"):
        located = shutil.which(candidate)
        if located:
            return located
    raise FileNotFoundError("No espeak-ng/espeak binary on PATH")

def phonemize(text: str, lang: str = "en", ipa: bool = False,
              ipa_level: int = 1, keep_stress: bool = True) -> str:
    """Phonemize text using eSpeak (NG).

    Args:
        text: The text to convert.
        lang: eSpeak voice name, passed as ``-v<lang>``.
        ipa: If true, request IPA output (``--ipa``); otherwise request eSpeak's
            own phoneme mnemonics (``-x``).
        ipa_level: Value passed to ``--ipa=``; only used when ``ipa`` is true.
        keep_stress: If false, stress marks are removed from the output. This
            is honoured in both output modes.

    Returns:
        The command's standard output with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If no eSpeak binary is on PATH (from ``find_espeak``).
        subprocess.CalledProcessError: If eSpeak exits with a non-zero status.
    """
    exe = find_espeak()

    # Build the command line; -q asks eSpeak not to play any audio
    args = [exe, "-q", f"-v{lang}"]
    if ipa:
        args.append(f"--ipa={ipa_level}")
    else:
        args.append("-x")       # eSpeak's own symbols
        if not keep_stress:
            # NOTE(review): --sep is documented as a phoneme separator, so the
            # output presumably gains '-' between phonemes. Kept unchanged for
            # existing callers; confirm this is intended.
            args.append("--sep=-")
    args.append(text)

    # Run the command; check=True turns a non-zero exit into an exception
    proc = subprocess.run(
        args,
        text = True, capture_output = True, check = True
    )
    out = proc.stdout.strip()

    if not keep_stress:
        # Previously only the mnemonic mode stripped stress, so keep_stress=False
        # was silently ignored for IPA. IPA marks stress with U+02C8 and U+02CC;
        # the mnemonic mode's primary stress mark is the apostrophe.
        stress_marks = "\u02c8\u02cc" if ipa else "'"
        out = out.translate({ord(mark): None for mark in stress_marks})

    return out

In [65]:
# Phonemize the sampled sentence to IPA with the language's eSpeak voice and
# store the result on the sample record
LANGUAGE['SAMPLE'][0]['IPA'] = phonemize(text = LANGUAGE['SAMPLE'][0]['sentence'], lang = LANGUAGE['ESPEAK'], ipa = True, ipa_level = 1)
print(LANGUAGE['SAMPLE'][0]['IPA'])

il ɛ lə- bɛ̃ʒamˈɛ̃ dy- pˈɛ̃tʁ e skyltˈœʁ pjeʁˈo fɑ̃tastiʃinˈi


# Batched Process

## Create `Game_Data` DataFrame

In [231]:
CSV_PATH = Path('game_data.csv')
START_DAY = date(2025, 6, 13) # day of the very first daily game

# Start from an empty frame when there is no saved data yet; otherwise read it
# back and turn the parsed Timestamps into plain Python dates.
if not CSV_PATH.exists():
    game_df = pd.DataFrame(columns=['date'])
else:
    game_df = pd.read_csv(CSV_PATH, parse_dates = ['date'])
    game_df['date'] = game_df['date'].dt.date

# The next game goes on the day after the latest saved one, or on START_DAY
# when nothing has been saved so far.
if game_df.empty:
    next_day = START_DAY
else:
    latest_day = max(game_df['date'])
    next_day = latest_day + timedelta(days = 1)

In [232]:
# Show which calendar day the next game instance is generated for
# (the %D format renders as MM/DD/YY)
print(f'Generating game instance for date: {next_day:%D}')

Generating game instance for date: 06/17/25


In [None]:
# Handle the audio files and referencing of paths
AUDIO_DIR = Path('assets/audio')
# parents=True so this also works when the 'assets' directory does not exist
# yet; without it mkdir raises FileNotFoundError in that case.
AUDIO_DIR.mkdir(parents = True, exist_ok = True)
# The clip is named after the game date, e.g. 2025-06-13.mp3
fname_base = f"{next_day:%Y-%m-%d}"
file_path = AUDIO_DIR / f"{fname_base}.mp3"
sf.write(file_path, wave, rate, format='MP3')

In [None]:
# build a new row based on the generated LANGUAGE dict from above
new_row = { 'date': next_day,
            'language': LANGUAGE['ENGLISH_NAME'],
            'iso': LANGUAGE['ISO'],
            'cv_code': LANGUAGE['CV'],
            'espeak_code': LANGUAGE['ESPEAK'],
            'lineage': LANGUAGE['LINEAGE'],
            'family_0': LANGUAGE['FAMILY_0'],
            'family_1': LANGUAGE['FAMILY_1'],
            'family_2': LANGUAGE['FAMILY_2'],
            'sentence': LANGUAGE['SAMPLE'][0]['sentence'],
            'translation': LANGUAGE['SAMPLE'][0]['translation'],
            'wave': file_path.name,   # only the file name is stored, not the directory
            'sampling_rate': LANGUAGE['SAMPLE'][0]['audio']['sampling_rate'],
            'IPA': LANGUAGE['SAMPLE'][0]['IPA']
          }

# Re-running this cell used to append a second row for the same day. Drop any
# existing entry for next_day first, so the cell can be executed repeatedly.
# .copy() keeps the column assignments below off a filtered view.
game_df = game_df.loc[game_df['date'] != next_day].copy()

# ensure any new columns are present in the dataframe
missing_cols = set(new_row) - set(game_df.columns)
for c in missing_cols:
    game_df[c] = pd.NA # create blank column for any new field

# append and reset the index
game_df = pd.concat([game_df, pd.DataFrame([new_row])], ignore_index = True)

In [235]:
# save it again
game_df.to_csv(CSV_PATH, index = False)

# The outer string uses double quotes: reusing the same quote character inside
# an f-string expression is a SyntaxError before Python 3.12.
print(f"Added {next_day} with {new_row['language']} ({new_row['iso']}).")
# Show the most recent entries as a check
game_df.tail(10)

Added 2025-06-17 with Malayalam (mal).


Unnamed: 0,date,sampling_rate,lineage,cv_code,family_1,family_0,sentence,language,translation,iso,family_2,wave,espeak_code
0,2025-06-13,48000,"['Sino-Tibetan', 'Sinitic', 'Classical-Middle-...",yue,Sinitic,Sino-Tibetan,睇內容長短嘅,Cantonese,It depends on the length of the content.,yue,Classical-Middle-Modern Sinitic,2025-06-13_yue.mp3,zh-yue
1,2025-06-14,48000,"['Kartvelian', 'Georgian-Zan', 'Georgic', 'Geo...",ka,Georgian-Zan,Kartvelian,მდებარეობს ბოლივიის ცენტრალური კორდილიერის ჩრდ...,Georgian,It is located on the northern edge of the Cent...,kat,Georgic,2025-06-14_kat.mp3,ka
2,2025-06-15,48000,"['Indo-European', 'Classical Indo-European', '...",cy,Classical Indo-European,Indo-European,"Cafodd adroddiad ardderchog, yn adlewyrchu ei ...",Welsh,"He had an excellent report, reflecting his eff...",cym,Celtic,2025-06-15_cym.mp3,cy
3,2025-06-16,48000,"['Indo-European', 'Classical Indo-European', '...",ro,Classical Indo-European,Indo-European,Ce s-a întâmplat cu legislaţia socială?,Romanian,What happened to social legislation?,ron,Italic,2025-06-16_ron.mp3,ro
4,2025-06-17,48000,"[Dravidian, South Dravidian, South Dravidian I...",ml,South Dravidian,Dravidian,ഇല്ല മോനേ,Malayalam,"No, son.",mal,South Dravidian I,2025-06-17_mal.mp3,ml
