# Example Language Generator
Using **Mozilla Common Voice**, **Tatoeba**, **phonemizer/eSpeak‑NG**, **Google Cloud TTS**, and/or **Glottolog**. 

In this notebook, we explore the components needed to present the "evidence" a player uses to guess a language in the Langr game.

In [160]:
import ast
import os
import pathlib
import pprint
from datetime import date, timedelta
from itertools import islice
from pathlib import Path

import pandas as pd
import soundfile as sf
from datasets import load_dataset, Audio as HF_Audio
from IPython.display import Audio
from langcodes import Language
from phonemizer import phonemize

def print_object(obj):
    """Show ``obj`` readably: lists are expanded item by item, dicts are pretty-printed."""
    # Lists are walked recursively so nested lists/dicts get the same treatment.
    if isinstance(obj, list):
        for element in obj:
            print_object(element)
        return
    if isinstance(obj, dict):
        pprint.pprint(obj)
        return
    # Anything else falls back to a plain print.
    print(obj)

# Resolve an English language name to an ISO code
def english_name_to_iso(name: str, three_letter: bool = True) -> str:
    """Look ``name`` up with ``Language.find`` and return a code for the match.

    Returns the result of ``to_alpha3()`` when ``three_letter`` is true,
    otherwise the match's ``language`` attribute.
    """
    match = Language.find(name)
    if three_letter:
        return match.to_alpha3()
    return match.language

# Convert an ISO code to an English language name
def iso_to_english_name(code: str) -> str | None:
    """Return the display name of the language identified by ``code``.

    The code is looked up with ``Language.get`` and the result's
    ``display_name()`` is returned. If the looked-up object is falsy,
    ``None`` is returned instead, hence the optional return type.
    """
    lang = Language.get(code)
    return lang.display_name() if lang else None

In [161]:
# Load the languages dataset from its CSV file
languages_csv = 'languages.csv'
languages_df = pd.read_csv(languages_csv)

In [249]:
# Print the distinct English language names in the dataset, in sorted order
english_names = languages_df['english_name'].unique()
print(sorted(english_names))

['Afrikaans', 'Albanian', 'Armenian', 'Bulgarian', 'Cantonese', 'Catalan', 'Chinese', 'Czech', 'Danish', 'Dutch', 'English', 'Esperanto', 'Estonian', 'Finnish', 'French', 'Georgian', 'German', 'Greek', 'Hindi', 'Hungarian', 'Icelandic', 'Indonesian', 'Irish', 'Italian', 'Latvian', 'Lithuanian', 'Macedonian', 'Malayalam', 'Nepali', 'Persian', 'Polish', 'Portuguese', 'Punjabi', 'Romanian', 'Russian', 'Serbian', 'Slovak', 'Spanish', 'Swahili', 'Swedish', 'Tamil', 'Turkish', 'Vietnamese', 'Welsh']


In [237]:
# Distinct eSpeak codes present in the dataset
languages_df.loc[:, 'espeak_code'].unique()

array(['af', 'bg', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'en-gb',
       'en-sc', 'en-uk-north', 'en-uk-wmids', 'en-us', 'en-wi', 'eo',
       'et', 'fa', 'fa-pin', 'fi', 'fr-be', 'fr-fr', 'ga', 'hi', 'hu',
       'hy', 'hy-west', 'id', 'is', 'it', 'ka', 'lv', 'lt', 'ml', 'mk',
       'ne', 'nl', 'pa', 'pl', 'pt-br', 'pt-pt', 'ro', 'ru', 'sk', 'es',
       'es-la', 'sq', 'sr', 'sw', 'sv', 'ta', 'tr', 'vi', 'vi-hue',
       'vi-sgn', 'zh-yue', 'zh'], dtype=object)

In [224]:
# Select the language for this game instance by its eSpeak code.
LANGUAGE = {
    'ESPEAK_CODE': 'ml'.lower(),
}

# structure: english_name, iso3, cv_code, espeak_code, lineage, family_0, family_1, family_2
matching_rows = languages_df[languages_df['espeak_code'] == LANGUAGE['ESPEAK_CODE']]
if matching_rows.empty:
    # Fail with an explicit message rather than the bare IndexError `.iloc[0]` would raise.
    raise ValueError(
        f"No row with espeak_code {LANGUAGE['ESPEAK_CODE']!r} found in the languages dataset"
    )
# If several rows share the code, the first one wins.
languages_row = matching_rows.iloc[0]

LANGUAGE['ENGLISH_NAME'] = languages_row['english_name']
LANGUAGE['ISO'] = languages_row['iso3']
LANGUAGE['CV'] = languages_row['cv_code']
LANGUAGE['ESPEAK'] = languages_row['espeak_code']
# The lineage column holds the text of a Python list literal; literal_eval parses it
# without executing arbitrary code the way eval() would.
LANGUAGE['LINEAGE'] = ast.literal_eval(languages_row['lineage'])
LANGUAGE['FAMILY_0'] = languages_row['family_0']
LANGUAGE['FAMILY_1'] = languages_row['family_1']
LANGUAGE['FAMILY_2'] = languages_row['family_2']

print_object(LANGUAGE)

{'CV': 'ml',
 'ENGLISH_NAME': 'Malayalam',
 'ESPEAK': 'ml',
 'ESPEAK_CODE': 'ml',
 'FAMILY_0': 'Dravidian',
 'FAMILY_1': 'South Dravidian',
 'FAMILY_2': 'South Dravidian I',
 'ISO': 'mal',
 'LINEAGE': ['Dravidian',
             'South Dravidian',
             'South Dravidian I',
             'Tamil-Kannada',
             'Tamil-Kota',
             'Tamil-Toda',
             'Tamil-Irula',
             'Tamil-Kodagu',
             'Tamil-Malayalam',
             'Malayalamoid',
             'Malayalam']}


In [225]:
# Request a sample from Mozilla's Common Voice
def sample_common_voice(cv_code: str, n_samples: int = 1, version: int = 15):
    """Stream the first ``n_samples`` training examples from Mozilla Common Voice.

    Args:
        cv_code: Common Voice configuration name for the language (e.g. ``'ml'``).
        n_samples: number of examples to take from the start of the stream.
        version: Common Voice major release; selects the
            ``mozilla-foundation/common_voice_{version}_0`` dataset.

    Returns:
        A list of example dicts. On any failure the error is printed and an
        empty list is returned, so callers must handle ``[]``.
    """
    try:
        ds_stream = load_dataset(
            f'mozilla-foundation/common_voice_{version}_0',
            name = cv_code,
            split = 'train',
            streaming = True
        )
        # cast_column returns a new dataset instead of modifying in place,
        # so the result has to be rebound for the cast to take effect.
        ds_stream = ds_stream.cast_column('audio', HF_Audio(decode = True))
        samples = list(ds_stream.take(n_samples))
        print(f'Successfully loaded {len(samples)} sample{"" if len(samples) == 1 else "s"} from Common Voice dataset for language code: {cv_code}')
        return samples

    except Exception as e:
        # Deliberate best-effort: report the problem and let the notebook continue.
        print(f'Failed to sample from the Common Voice dataset:\n{e}')
        return []

In [226]:
# Fetch one Common Voice example for the selected language
LANGUAGE['SAMPLE'] = sample_common_voice(cv_code = LANGUAGE['CV'], n_samples = 1)

Reading metadata...: 1249it [00:00, 7003.39it/s]


Successfully loaded 1 sample from Common Voice dataset for language code: ml


In [227]:
# Here's the sample we get from Common Voice: the list returned by
# sample_common_voice, one dict per example (metadata plus an 'audio' entry).
LANGUAGE['SAMPLE']

[{'client_id': 'e315d6a0031a9543a498a5dc7388a5756cc1e0fb2545b1791a604513673f513f77c8d4dfabe82fe3a2056531f621d3392aaa380c6216932f5e0debd676739b74',
  'path': 'ml_train_0/common_voice_ml_37003897.mp3',
  'audio': {'path': 'ml_train_0/common_voice_ml_37003897.mp3',
   'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          1.33694311e-06, 6.72574970e-07, 1.44026671e-07], shape=(150336,)),
   'sampling_rate': 48000},
  'sentence': 'ഇല്ല മോനേ',
  'up_votes': 4,
  'down_votes': 0,
  'age': '',
  'gender': '',
  'accent': 'Central Kerala',
  'locale': 'ml',
  'segment': '',
  'variant': ''}]

In [228]:
# Show the transcript of the first sample
first_sample = LANGUAGE['SAMPLE'][0]
first_sample['sentence']

'ഇല്ല മോനേ'

In [229]:
# The translation is typed in by hand for now and stored on the first sample
first_sample = LANGUAGE['SAMPLE'][0]
first_sample['translation'] = 'No, son.'
first_sample['translation']

'No, son.'

In [230]:
# Play the first sample's audio inline; `wave` and `rate` are reused when the clip is saved
audio_entry = LANGUAGE['SAMPLE'][0]['audio']
wave, rate = audio_entry['array'], audio_entry['sampling_rate']
display(Audio(wave, rate = rate))

In [157]:
# phonemize(LANGUAGE['SAMPLE'][0]['sentence'], language = LANGUAGE['ESPEAK'], backend = 'espeak', strip = True, preserve_punctuation = True, with_stress = False)

## Create `Game_Data` DataFrame

In [231]:
CSV_PATH = Path('game_data.csv')
START_DAY = date(2025, 6, 13) # first day in the daily games series

# Start from the saved games when the CSV exists, otherwise from an empty frame
if not CSV_PATH.exists():
    game_df = pd.DataFrame(columns=['date'])
else:
    game_df = pd.read_csv(CSV_PATH, parse_dates = ['date'])
    # keep plain Python dates rather than pandas Timestamps
    game_df['date'] = game_df['date'].dt.date

# The next slot is the day after the latest saved game, or START_DAY if there are none
if game_df.empty:
    next_day = START_DAY
else:
    next_day = max(game_df['date']) + timedelta(days = 1)

In [232]:
# Report which date this game instance is being generated for
date_message = f'Generating game instance for date: {next_day:%D}'
print(date_message)

Generating game instance for date: 06/17/25


In [None]:
# Handle the audio files and referencing of paths
AUDIO_DIR = Path('assets/audio')
# parents=True also creates 'assets' when it is missing; without it mkdir raises
# FileNotFoundError if the parent directory does not exist yet.
AUDIO_DIR.mkdir(parents = True, exist_ok = True)
fname_base = f"{next_day:%Y-%m-%d}"
# NOTE(review): rows already shown in game_data.csv use '<date>_<iso>.mp3' file names,
# while this writes '<date>.mp3' — confirm which naming scheme is intended.
file_path  = AUDIO_DIR / f"{fname_base}.mp3"
# Encode the sample's waveform at its own sampling rate as an MP3 file.
sf.write(file_path, wave, rate, format='MP3')

In [234]:
# build a new row based on the generated LANGUAGE dict from above
new_row = { 'date': next_day,
            'language': LANGUAGE['ENGLISH_NAME'],
            'iso': LANGUAGE['ISO'],
            'cv_code': LANGUAGE['CV'],
            'espeak_code': LANGUAGE['ESPEAK'],
            'lineage': LANGUAGE['LINEAGE'],
            'family_0': LANGUAGE['FAMILY_0'],
            'family_1': LANGUAGE['FAMILY_1'],
            'family_2': LANGUAGE['FAMILY_2'],
            'sentence': LANGUAGE['SAMPLE'][0]['sentence'],
            'translation': LANGUAGE['SAMPLE'][0]['translation'],
            # 'wave': LANGUAGE['SAMPLE'][0]['audio']['array'],
            # store only the audio file name; the waveform itself lives on disk
            'wave': file_path.name,
            'sampling_rate': LANGUAGE['SAMPLE'][0]['audio']['sampling_rate'],
        #    'phonemes':
          }

# ensure any new columns are present in the dataframe.
# Walk new_row in insertion order (not a set, whose iteration order is arbitrary)
# so new columns are always added in the same order from run to run.
missing_cols = [c for c in new_row if c not in game_df.columns]
for c in missing_cols:
    game_df[c] = pd.NA # create blank column for any new field

# Drop any row already stored for this date so re-running the cell replaces it
# instead of appending a duplicate, then append and reset the index.
other_days = game_df[game_df['date'] != next_day]
game_df = pd.concat([other_days, pd.DataFrame([new_row])], ignore_index = True)

In [235]:
# save it again (without the index column, so the CSV holds only the game fields)
game_df.to_csv(CSV_PATH, index = False)

# The outer f-string uses double quotes: reusing the same quote character inside the
# replacement fields is a SyntaxError on Python versions before 3.12.
print(f"Added {next_day} with {new_row['language']} ({new_row['iso']}).")
# Show the most recent rows as the cell's output
game_df.tail(10)

Added 2025-06-17 with Malayalam (mal).


Unnamed: 0,date,sampling_rate,lineage,cv_code,family_1,family_0,sentence,language,translation,iso,family_2,wave,espeak_code
0,2025-06-13,48000,"['Sino-Tibetan', 'Sinitic', 'Classical-Middle-...",yue,Sinitic,Sino-Tibetan,睇內容長短嘅,Cantonese,It depends on the length of the content.,yue,Classical-Middle-Modern Sinitic,2025-06-13_yue.mp3,zh-yue
1,2025-06-14,48000,"['Kartvelian', 'Georgian-Zan', 'Georgic', 'Geo...",ka,Georgian-Zan,Kartvelian,მდებარეობს ბოლივიის ცენტრალური კორდილიერის ჩრდ...,Georgian,It is located on the northern edge of the Cent...,kat,Georgic,2025-06-14_kat.mp3,ka
2,2025-06-15,48000,"['Indo-European', 'Classical Indo-European', '...",cy,Classical Indo-European,Indo-European,"Cafodd adroddiad ardderchog, yn adlewyrchu ei ...",Welsh,"He had an excellent report, reflecting his eff...",cym,Celtic,2025-06-15_cym.mp3,cy
3,2025-06-16,48000,"['Indo-European', 'Classical Indo-European', '...",ro,Classical Indo-European,Indo-European,Ce s-a întâmplat cu legislaţia socială?,Romanian,What happened to social legislation?,ron,Italic,2025-06-16_ron.mp3,ro
4,2025-06-17,48000,"[Dravidian, South Dravidian, South Dravidian I...",ml,South Dravidian,Dravidian,ഇല്ല മോനേ,Malayalam,"No, son.",mal,South Dravidian I,2025-06-17_mal.mp3,ml
