# Example Language Generator
Using **Mozilla Common Voice**, **Tatoeba**, **phonemizer/eSpeak‑NG**, **Google Cloud TTS**, and/or **Glottolog**. 

In this notebook, we explore the various components needed to present the "evidences" used to guess a language in the Langr game.

In [73]:
import ast
import pprint
from datetime import date, timedelta
from itertools import islice
from pathlib import Path

import pandas as pd
from datasets import load_dataset, Audio as HF_Audio
from IPython.display import Audio
from langcodes import Language
from phonemizer import phonemize

def print_object(obj):
    """Print `obj` readably: dicts are pretty-printed, lists are printed item by item.

    Lists are walked recursively, so nested lists and dicts inside them get
    the same treatment. Anything else goes straight to `print`.
    """
    if isinstance(obj, list):
        for element in obj:
            print_object(element)
        return

    # A dict gets pprint's layout; every other value uses the plain print.
    emit = pprint.pprint if isinstance(obj, dict) else print
    emit(obj)

def english_name_to_iso(name: str, three_letter: bool = True) -> str:
    """Convert an English language name to an ISO code.

    The name is resolved with `Language.find`. When `three_letter` is true the
    result of `to_alpha3()` is returned; otherwise the `language` attribute of
    the match is returned.
    """
    match = Language.find(name)
    if three_letter:
        return match.to_alpha3()
    return match.language

def iso_to_english_name(code: str) -> str:
    """Convert an ISO language code to its English display name.

    The code is parsed with `Language.get`; if that yields a falsy value,
    None is returned instead of a name.
    """
    parsed = Language.get(code)
    if not parsed:
        return None
    return parsed.display_name()

In [44]:
# Read all languages we have in the languages dataset.
# The path is relative to the notebook's working directory. The lookup cell that
# follows reads these columns: english_name, iso3, cv_code, espeak_code, lineage,
# family_0, family_1, family_2.
languages_df = pd.read_csv('languages.csv')

In [45]:
# Pick the language for this run by its eSpeak code; change the literal to switch language.
# The code is lower-cased before it is compared against the espeak_code column.
LANGUAGE = {
    'ESPEAK_CODE': 'fa'.lower(),
}

# structure: english_name, iso3, cv_code, espeak_code, lineage, family_0, family_1, family_2
matching_rows = languages_df[languages_df['espeak_code'] == LANGUAGE['ESPEAK_CODE']]
if matching_rows.empty:
    # Fail with a readable message instead of the IndexError that .iloc[0] would raise.
    raise ValueError(f"No row in languages.csv has espeak_code == {LANGUAGE['ESPEAK_CODE']!r}")
languages_row = matching_rows.iloc[0]

# Keys are added in this fixed order; 'ESPEAK' repeats the CSV's espeak_code value.
LANGUAGE['ENGLISH_NAME'] = languages_row['english_name']
LANGUAGE['ISO'] = languages_row['iso3']
LANGUAGE['CV'] = languages_row['cv_code']
LANGUAGE['ESPEAK'] = languages_row['espeak_code']
# lineage is stored in the CSV as the text of a Python list literal. literal_eval parses
# literals only, so a tampered CSV cell cannot execute code the way eval() would.
LANGUAGE['LINEAGE'] = ast.literal_eval(languages_row['lineage'])
LANGUAGE['FAMILY_0'] = languages_row['family_0']
LANGUAGE['FAMILY_1'] = languages_row['family_1']
LANGUAGE['FAMILY_2'] = languages_row['family_2']

print_object(LANGUAGE)

{'CV': 'fa',
 'ENGLISH_NAME': 'Persian',
 'ESPEAK': 'fa',
 'ESPEAK_CODE': 'fa',
 'FAMILY_0': 'Indo-European',
 'FAMILY_1': 'Indo-Iranian',
 'FAMILY_2': 'Iranian',
 'ISO': 'fas',
 'LINEAGE': ['Indo-European', 'Indo-Iranian', 'Iranian']}


In [None]:
def sample_common_voice(cv_code: str, n_samples: int = 1, version: int = 15):
    """Stream the first `n_samples` rows of a Common Voice train split.

    Args:
        cv_code: Config name passed to `load_dataset` as `name` (for example 'fa').
        n_samples: Number of rows to take from the start of the stream.
        version: Release number used to build the dataset id
            `mozilla-foundation/common_voice_{version}_0`.

    Returns:
        A list of row dicts. On any failure the error is printed (not raised)
        and an empty list is returned, so callers must handle `[]`.
    """
    try:
        ds_stream = load_dataset(
            f'mozilla-foundation/common_voice_{version}_0',
            name = cv_code,
            split = 'train',
            streaming = True
        )
        # cast_column returns a new dataset instead of modifying the existing one,
        # so the result has to be kept for the cast to take effect.
        ds_stream = ds_stream.cast_column('audio', HF_Audio(decode = True))
        sample = list(ds_stream.take(n_samples))
        print(f'Successfully loaded {len(sample)} sample{"" if len(sample) == 1 else "s"} from Common Voice dataset for language code: {cv_code}')
        return sample

    except Exception as e:
        # Deliberately best-effort: report the problem and hand back an empty result.
        print(f'Failed to sample from the Common Voice dataset:\n{e}')
        return []

In [68]:
# Fetch one Common Voice row for the selected language and keep it on the LANGUAGE dict.
# sample_common_voice returns [] when loading fails, so SAMPLE may be an empty list.
LANGUAGE['SAMPLE'] = sample_common_voice(LANGUAGE['CV'], n_samples = 1)

Reading metadata...: 28756it [00:01, 15452.75it/s]
'(ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3005a4a0-3ddd-4edd-bd6f-002ea6f1598c)')' thrown while requesting GET https://huggingface.co/datasets/mozilla-foundation/common_voice_15_0/resolve/main/audio/fa/train/fa_train_0.tar
Retrying in 1s [Retry 1/5].


Successfully loaded 1 sample from Common Voice dataset for language code: fa


In [69]:
# Show the raw sample rows exactly as they were returned.
LANGUAGE['SAMPLE']

[{'client_id': 'd990ca0c0ef86bfc398a5dcdec5d1c36b91321f3ba88b529104163ce45b6f3021fbf02e1500cf439c72c687ba4a72f0182ee3dd5240cc668f44a4bc2d3752da9',
  'path': 'fa_train_0/common_voice_fa_24550511.mp3',
  'audio': {'path': 'fa_train_0/common_voice_fa_24550511.mp3',
   'array': array([ 5.68434189e-14,  7.38964445e-13,  1.08002496e-12, ...,
          -1.56469042e-08,  4.04977357e-07,  3.26697091e-07], shape=(255744,)),
   'sampling_rate': 48000},
  'sentence': 'با ماهیتابه بر سر شوهرش کوفت.',
  'up_votes': 2,
  'down_votes': 0,
  'age': 'teens',
  'gender': 'male',
  'accent': '',
  'locale': 'fa',
  'segment': '',
  'variant': ''}]

In [70]:
# Show the 'sentence' field of the first sample (raises IndexError if SAMPLE is empty).
LANGUAGE['SAMPLE'][0]['sentence']

'با ماهیتابه بر سر شوهرش کوفت.'

In [71]:
# The sample row has no 'translation' key of its own, so an English translation
# is written in by hand here and then echoed back as the cell output.
LANGUAGE['SAMPLE'][0]['translation'] = 'She hit her husband on the head with a frying pan.'
LANGUAGE['SAMPLE'][0]['translation']

'She hit her husband on the head with a frying pan.'

In [72]:
# Play the first sample: its decoded waveform array is handed to the audio widget
# together with the sampling rate stored next to it in the same 'audio' dict.

display(Audio(LANGUAGE['SAMPLE'][0]['audio']['array'], rate = LANGUAGE['SAMPLE'][0]['audio']['sampling_rate']))

## Create `Game_Data` DataFrame

In [None]:
CSV_PATH = Path('game_data.csv') # where the puzzle table lives; point elsewhere if needed
START_DAY = date(2025, 6, 13) # date given to the very first puzzle

# Start from the saved table when there is one, otherwise from an empty frame
# that only has the 'date' column.
if not CSV_PATH.exists():
    df = pd.DataFrame(columns=['date'])
else:
    df = pd.read_csv(CSV_PATH, parse_dates=['date'])
    # store plain Python dates rather than pandas Timestamps
    df['date'] = df['date'].dt.date

# The next puzzle goes on the day after the latest stored date, or on
# START_DAY when the table has no rows yet.
if df.empty:
    next_day = START_DAY
else:
    next_day = max(df['date']) + timedelta(days = 1)

In [None]:
# Build the new row from the LANGUAGE dict defined earlier in the notebook.
# LANGUAGE's keys are upper-case (e.g. 'ENGLISH_NAME', 'ISO', 'CV') and are
# used unchanged as column names, alongside the 'date' column.
# NOTE(review): LANGUAGE['SAMPLE'] (including the decoded audio array) is carried
# into the row too — confirm that it is really meant to be written to the CSV.

row = {'date': next_day, **LANGUAGE}

# Add any columns the dataframe does not have yet. Walking the row in its own key
# order (instead of iterating a set) gives a new table the same column order on every run.
missing_cols = [c for c in row if c not in df.columns]
for c in missing_cols:
    df[c] = pd.NA # blank column for the new field

In [None]:
# Append the new row and renumber the index from 0.
# Re-running this cell appends the same row a second time.
df = pd.concat([df, pd.DataFrame([row])], ignore_index = True)

In [None]:
# Write the updated table back to disk, without the index column.
df.to_csv(CSV_PATH, index = False)

# The row's language fields come from LANGUAGE, whose keys are upper-case
# ('ENGLISH_NAME', 'ISO'); the lower-case names raised KeyError. Double quotes
# inside the single-quoted f-string keep the line valid on Python < 3.12.
print(f'Added {next_day} with {row["ENGLISH_NAME"]} ({row["ISO"]}).')
df.tail(3)