# Languages List Creator
Using **Mozilla Common Voice**, **Tatoeba**, **phonemizer/eSpeak‑NG**, **Google Cloud TTS**, and/or **Glottolog**. 

In this notebook, we create a dataframe storing information about each language supported across the multiple libraries. We have about 50-60 good languages to choose from programmaticaly.

# Imports

In [1]:
# Requirements:
# !pip install --quiet requests pandas langcodes phonemizer pydub google-cloud-texttospeech

In [None]:
import random, requests, subprocess, pprint, shutil
import pandas as pd
from langcodes import Language
from phonemizer import phonemize
from datasets import get_dataset_config_names, load_dataset
from IPython.display import Audio

# Helpers

In [9]:
def norm(code):
    """Return ISO 639‑3 code for a language, if possible. Otherwise, return the argument."""
    try:
        return Language.get(code).to_alpha3()
    except Exception:
        return code.lower()

In [10]:
def print_object(obj):
    """Print object in a readable format."""
    if isinstance(obj, dict):
        pprint.pprint(obj)
    elif isinstance(obj, list):
        for item in obj:
            print_object(item)
    else:
        print(obj)

# Common Voice
- Speech-audio files
- Sentence text
- other metadata

`path` (filename), `audio` (raw samples), `sentence`, `age`, `gender`, `accent`

## CV Sample

In [11]:
# v = 16, 17 fail
# v = 11, 15 work!
version = 15

try:
    ds_stream = load_dataset(
        f"mozilla-foundation/common_voice_{version}_0",
        name="hi", 
        split="train",
        streaming=True
    )
    sample = list(ds_stream.take(5))
    print(f"Version {version}.0 worked!")
    
except Exception as e:
    print(f"Version {version}.0 failed: {e}")

for i, s in enumerate(sample):
    print(f"Sample {i}:")
    print("Keys:", list(s.keys()))
    print_object(s)

Reading metadata...: 4630it [00:00, 18626.17it/s]


Version 15.0 worked!
Sample 0:
Keys: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant']
{'accent': '',
 'age': '',
 'audio': {'array': array([ 5.49299626e-26, -1.27631384e-25, -1.03397577e-25, ...,
        1.06425901e-07,  4.46417232e-08,  2.61464095e-09], shape=(195264,)),
           'path': 'hi_train_0/common_voice_hi_26008353.mp3',
           'sampling_rate': 48000},
 'client_id': '0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd00383f604e1e17c38d6ed8adf1bd2ccbf927a52c5adefb8ac4b158ce27a7c2ed9581e71202eb302dfb3',
 'down_votes': 0,
 'gender': '',
 'locale': 'hi',
 'path': 'hi_train_0/common_voice_hi_26008353.mp3',
 'segment': '',
 'sentence': 'हमने उसका जन्मदिन मनाया।',
 'up_votes': 2,
 'variant': ''}
Sample 1:
Keys: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant']
{'accent': '',
 'age': '',
 'audio': {'array': array([ 7.10542736e-15

## CV Languages List

In [12]:
def common_voice_langs():
    """
    Returns a DataFrame of Common Voice language configs
    (ISO‑639‑3 where possible, plus the HF config tag).
    """
    # Each HF “config” corresponds to one language.
    configs = get_dataset_config_names(f"mozilla-foundation/common_voice_{version}_0")

    # Normalise with langcodes → ISO‑639‑3
    data = [{"iso3": norm(c), "cv_code": c} for c in configs]
    return pd.DataFrame(data)

cv_df = common_voice_langs()
cv_df

Unnamed: 0,iso3,cv_code
0,eng,en
1,fas,fa
2,fra,fr
3,spa,es
4,slv,sl
...,...,...
109,lao,lo
110,dyu,dyu
111,isl,is
112,zgh,zgh


## CV Querying
Sample sentences, or referencing by ID

In [13]:
def sample_sentence(iso3):
    """
    Get a random sentence from Tatoeba using the current API_v0 REST endpoint.
    
    Args:
        iso3 (str): ISO 639-3 language code (e.g., 'nld' for Dutch, 'fra' for French)
    
    Returns:
        tuple: (sentence_id, sentence_text, translation_text)
    """
    # Use the current Tatoeba API_v0 REST endpoint
    url = (
        "https://tatoeba.org/eng/api_v0/search"
        f"?from={iso3}"
        "&to=eng"
        "&orphans=no"
        "&sort=random"
        "&page=1"
        "&trans_filter=limit"
        "&trans_link=direct"
    )
    
    headers = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0 (compatible; Python script)"
    }
    
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    data = r.json()
    
    # Check if we have results
    if not data.get("results") or len(data["results"]) == 0:
        raise ValueError("No sentence found")
    
    # Get the first result
    sentence_data = data["results"][0]
    
    sent_id = sentence_data["id"]
    sent = sentence_data["text"]
    
    # Get English translation
    translations = sentence_data.get("translations", [])
    if not translations:
        raise ValueError("No English translation found")
    
    # Find the first English translation
    translation = None
    for trans in translations:
        if trans[0].get("lang") == "eng":  # translations are nested arrays
            translation = trans[0]["text"]
            break
    
    if not translation:
        raise ValueError("No English translation found")
    
    return sent_id, sent, translation

def sample_sentence_alternative(iso3):
    """
    Alternative approach - get multiple results and pick one randomly.
    This gives more randomness since Tatoeba's sort=random might not work well.
    """
    url = (
        "https://tatoeba.org/eng/api_v0/search"
        f"?from={iso3}"
        "&to=eng"
        "&orphans=no"
        "&perPage=10"  # Get 10 results
        "&page=1"
        "&trans_filter=limit" 
        "&trans_link=direct"
    )
    
    headers = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0 (compatible; Python script)"
    }
    
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    data = r.json()
    
    if not data.get("results") or len(data["results"]) == 0:
        raise ValueError("No sentence found")
    
    # Pick a random sentence from the results
    sentence_data = random.choice(data["results"])
    
    sent_id = sentence_data["id"]
    sent = sentence_data["text"]
    
    # Get English translation
    translations = sentence_data.get("translations", [])
    if not translations:
        raise ValueError("No English translation found")
    
    # Find the first English translation
    translation = None
    for trans in translations:
        if trans[0].get("lang") == "eng":
            translation = trans[0]["text"]
            break
    
    if not translation:
        raise ValueError("No English translation found")
    
    return sent_id, sent, translation

def get_sentence_by_id(sentence_id):
    """
    Get a specific sentence by ID.
    
    Args:
        sentence_id (int): The Tatoeba sentence ID
        
    Returns:
        dict: Full sentence data
    """
    url = f"https://tatoeba.org/eng/api_v0/sentence/{sentence_id}"
    
    headers = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0 (compatible; Python script)"
    }
    
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    return r.json()

In [14]:
# Test with Dutch (nld)
print("Testing with Dutch (nld):")
sent_id, sentence, translation = sample_sentence("nld")
print(f"ID: {sent_id}")
print(f"Dutch: {sentence}")
print(f"English: {translation}")

Testing with Dutch (nld):
ID: 7900647
Dutch: Doe het snel.
English: Do it quickly.


In [15]:
# Test alternative method
print("\nTesting alternative method:")
sent_id, sentence, translation = sample_sentence_alternative("nld")
print(f"ID: {sent_id}")
print(f"Dutch: {sentence}")
print(f"English: {translation}")


Testing alternative method:
ID: 12872350
Dutch: Loop!
English: Walk!


# Tatoeba
- Parallel sentences in many languages

`id`, `text`, `lang`, `translations` (list)

## T Languages

In [16]:
# def get_tatoeba_languages():
#     """
#     Get all available languages from Tatoeba by making a general search
#     and extracting language information from the results.
    
#     Since Tatoeba doesn't have a direct /languages endpoint, we need to
#     make a search request and examine the available language options.
#     """
    
#     # Method 1: Try to get languages from a broad search
#     # This will return results that show available 'from' and 'to' languages
#     url = "https://tatoeba.org/eng/api_v0/search"
    
#     headers = {
#         "Accept": "application/json",
#         "User-Agent": "Mozilla/5.0 (compatible; Python script)"
#     }
    
#     # Make a general search to get some results
#     params = {
#         "orphans": "no",
#         "sort": "random",
#         "page": "1",
#         "trans_filter": "limit"
#     }
    
#     try:
#         r = requests.get(url, headers=headers, params=params, timeout=30)
#         r.raise_for_status()
#         data = r.json()
        
#         print("Sample API response structure:")
#         print(f"Keys in response: {list(data.keys())}")
        
#         if 'results' in data and len(data['results']) > 0:
#             print(f"Sample result keys: {list(data['results'][0].keys())}")
#             # Look for language information in the results
#             sample_result = data['results'][0]
#             if 'lang' in sample_result:
#                 print(f"Sample language code: {sample_result['lang']}")
        
#         return data
        
#     except requests.RequestException as e:
#         print(f"Error making request: {e}")
#         return None

# def extract_languages_from_search_results():
#     """
#     Alternative approach: Extract language codes from multiple search results
#     to build a comprehensive list of available languages.
#     """
    
#     url = "https://tatoeba.org/eng/api_v0/search"
#     headers = {
#         "Accept": "application/json",
#         "User-Agent": "Mozilla/5.0 (compatible; Python script)"
#     }
    
#     languages = set()
    
#     # Make several searches to collect different language samples
#     for page in range(1, 60):  # Search first 60 pages
#         params = {
#             "orphans": "no",
#             "sort": "random",
#             "page": str(page),
#             "trans_filter": "limit"
#         }
        
#         try:
#             r = requests.get(url, headers=headers, params=params, timeout=30)
#             r.raise_for_status()
#             data = r.json()
            
#             if 'results' in data:
#                 for result in data['results']:
#                     if 'lang' in result:
#                         languages.add(result['lang'])
                    
#                     # Also check translations if they exist
#                     if 'translations' in result:
#                         for translation in result['translations']:
#                             if isinstance(translation, dict) and 'lang' in translation:
#                                 languages.add(translation['lang'])
            
#             # Be nice to the API
#             time.sleep(0.5)
            
#         except requests.RequestException as e:
#             print(f"Error on page {page}: {e}")
#             continue
    
#     return languages

# def create_language_dataframe(language_codes):
#     """
#     Convert language codes to a DataFrame with ISO3 codes and names.
#     """
#     language_data = []
    
#     for code in language_codes:
#         try:
#             # Try to get language info using iso639
#             lang = Language.get(code)
#             language_data.append({
#                 'iso_code': code,
#                 'iso3': lang.to_alpha3() if hasattr(lang, 'to_alpha3') else code,
#                 'name': lang.name if hasattr(lang, 'name') else code
#             })
#         except:
#             # If iso639 doesn't recognize the code, keep it as is
#             language_data.append({
#                 'iso_code': code,
#                 'iso3': code,
#                 'name': code
#             })
    
#     df = pd.DataFrame(language_data)
#     return df.sort_values('name').reset_index(drop=True)

In [17]:
# # print("Testing Tatoeba API structure...")
# # sample_data = get_tatoeba_languages()

# print("\nExtracting languages from search results...")
# language_codes = extract_languages_from_search_results()

# print(f"\nFound {len(language_codes)} unique language codes:")
# print(sorted(language_codes))

# if language_codes:
#     print("\nCreating language DataFrame...")
#     tat_df = create_language_dataframe(language_codes)
#     print(f"\nLanguage DataFrame shape: {tat_df.shape}")
#     print("\nSample languages:")
#     print(tat_df.head(10))
    
#     # Save to CSV for reference
#     tat_df.to_csv('tatoeba_languages.csv', index=False)
#     print("\nSaved to 'tatoeba_languages.csv'")
# else:
#     print("No language codes found. Check API structure.")

# eSpeak-NG
- IPA transcription

In [18]:
def espeak_langs():
    exe = shutil.which("espeak-ng") or shutil.which("espeak")
    if exe is None:
        raise FileNotFoundError("espeak(-ng) binary not on PATH")

    proc = subprocess.run([exe, "--voices"], text=True, capture_output=True, check=True)
    lines = proc.stdout.strip().split("\n")[1:]  # skip header
    codes = [l.split()[1] for l in lines]
    return pd.DataFrame({"iso3": [norm(c) for c in codes],
                         "espeak_code": codes})

es_df = espeak_langs()
print(f"{len(es_df)} eSpeak‑NG voice codes")
es_df.head()

69 eSpeak‑NG voice codes


Unnamed: 0,iso3,espeak_code
0,afr,af
1,arg,an
2,bul,bg
3,bos,bs
4,cat,ca


# Glottolog
- Language metadata: autonym, family, macro-area, latitude/longitude, ISO-code(s)

`glottocode`, `name`, `iso639-3`, `family_name`

## G Languages

In [22]:
gl_df = (
    pd.read_csv("languoid.csv", low_memory=False) # tab-separated
      .query("level == 'language'") # drop families, dialect nodes
      .rename(columns={
          "iso639P3code": "iso3",
          "name": "glottolog_name",
          "family_id": "family",
          "latitude": "lat",
          "longitude": "lon",
      })
)
# normalise ISO codes
gl_df["iso3"] = gl_df["iso3"].str.lower()


## G Lineage Function

In [51]:
gdf = pd.read_csv("languoid.csv", low_memory=False)

PK = "id"
PARENT_COL = "parent_id"
ISO_COL = "iso639P3code"
NAME_COL = "name"
LEVEL_COL = "level"

# Keep only what we need and normalise ISO codes
gdf = gdf[[PK, PARENT_COL, NAME_COL, LEVEL_COL, ISO_COL]].copy()
gdf[ISO_COL] = gdf[ISO_COL].str.lower()

# Build lookup tables
row_by_pk  = gdf.set_index(PK)
pk_by_iso3 = gdf.dropna(subset=[ISO_COL]).set_index(ISO_COL)[PK]

# Method for walking up the lineage tree
def lineage(iso3):
    """
    Return the list ['family', 'subbranch', …, 'language'] for an ISO-639-3 code.
    """
    if iso3 not in pk_by_iso3:
        # f"(iso code {iso3} not in Glottolog)"
        return ["?", "?", "?"]

    chain, pk = [], pk_by_iso3[iso3]
    while pd.notna(pk):
        node = row_by_pk.loc[pk]
        chain.append(node[NAME_COL])
        # if node[LEVEL_COL] == "family": 
        #     break
        pk = node[PARENT_COL]

    return list(reversed(chain))


def top_two(chain):
    return pd.Series(chain[:2] + [""]*(2-len(chain)), index=["family_0", "family_1"])

def top_three(chain):
    return pd.Series(chain[:3] + [""]*(3-len(chain)), index=["family_0", "family_1", "family_2"])

In [48]:
print("Italian's lineage:",  " > ".join(lineage("ita")))
top_three(lineage("ita"))

Italian's lineage: Indo-European > Classical Indo-European > Italic > Latino-Faliscan > Latinic > Imperial Latin > Romance > Italo-Western Romance > Italo-Dalmatian > Italian Romance > Italian


family_0              Indo-European
family_1    Classical Indo-European
family_2                     Italic
dtype: object

# Imports

# Google Cloud TTS
- Synthetic Speech

`AudioConfig`, `audioContent`, `voice.name`, ...

Not strictly necessary... so ignoring

In [None]:
def gcloud_tts_langs():
    from google.cloud import texttospeech
    client = texttospeech.TextToSpeechClient()
    voices = client.list_voices().voices
    codes = set()
    for v in voices:
        codes.update(v.language_codes)
    data = [{'iso3': norm(c), 'gcloud_code': c} for c in codes]
    return pd.DataFrame(data)

# Requires GOOGLE_APPLICATION_CREDENTIALS env var
# gcloud_df = gcloud_tts_langs()
# gcloud_df.head()

# Intersection

In [52]:

# dfs = [cv_df, tat_df, es_df]  # add gcloud_df when ready
dfs = [cv_df, es_df]

iso_sets = [set(df['iso3']) for df in dfs]
common = set.intersection(*iso_sets)
supported_df = pd.DataFrame(sorted(common), columns=['iso3'])
# supported_df = supported_df.merge(tat_df[['iso3','name']], on='iso3', how='left')
supported_df = supported_df.merge(cv_df[['iso3','cv_code']], on='iso3', how='left')
supported_df = supported_df.merge(es_df[['iso3','espeak_code']], on='iso3', how='left')
supported_df['lineage'] = supported_df["iso3"].apply(lineage)
supported_df[["family_0", "family_1", "family_2"]] = supported_df["lineage"].apply(top_three)

supported_df["english_name"] = supported_df["iso3"].apply(
    lambda c: Language.get(c).display_name("en")
)
print(f"Languages supported across {len(dfs)} services: {len(supported_df)}")
display(supported_df)

Languages supported across 2 services: 59


Unnamed: 0,iso3,cv_code,espeak_code,lineage,family_0,family_1,family_2,english_name
0,afr,af,af,"[Indo-European, Classical Indo-European, Germa...",Indo-European,Classical Indo-European,Germanic,Afrikaans
1,bul,bg,bg,"[Indo-European, Classical Indo-European, Balto...",Indo-European,Classical Indo-European,Balto-Slavic,Bulgarian
2,cat,ca,ca,"[Indo-European, Classical Indo-European, Itali...",Indo-European,Classical Indo-European,Italic,Catalan
3,ces,cs,cs,"[Indo-European, Classical Indo-European, Balto...",Indo-European,Classical Indo-European,Balto-Slavic,Czech
4,cym,cy,cy,"[Indo-European, Classical Indo-European, Celti...",Indo-European,Classical Indo-European,Celtic,Welsh
5,dan,da,da,"[Indo-European, Classical Indo-European, Germa...",Indo-European,Classical Indo-European,Germanic,Danish
6,deu,de,de,"[Indo-European, Classical Indo-European, Germa...",Indo-European,Classical Indo-European,Germanic,German
7,ell,el,el,"[Indo-European, Classical Indo-European, Graec...",Indo-European,Classical Indo-European,Graeco-Phrygian,Greek
8,eng,en,en,"[Indo-European, Classical Indo-European, Germa...",Indo-European,Classical Indo-European,Germanic,English
9,eng,en,en-gb,"[Indo-European, Classical Indo-European, Germa...",Indo-European,Classical Indo-European,Germanic,English


In [None]:
# Save to CSV
supported_df.to_csv("supported_languages_cv_espeak_G.csv", index = False)

In [56]:
def get_common_voice_clip(lang_iso3):
    # Naïve fetch using HF raw URLs (for demo only):
    base = f'https://huggingface.co/datasets/mozilla-foundation/common_voice_{version}_0/resolve/main/{lang_iso3}/clips/'
    index_url = base + 'validated.tsv'
    tsv = requests.get(index_url, timeout=30).text.split('\n')
    if len(tsv) < 2:
        raise ValueError('No clips')
    clip_rel = random.choice(tsv[1:]).split('\t')[1]
    audio_url = base + clip_rel
    return audio_url

iso3_to_espeak = dict(zip(es_df["iso3"], es_df["espeak_code"]))
iso3_to_espeak
def ipa(sentence, iso_code):
    code = iso3_to_espeak[iso_code]      # KeyError if not available
    return phonemize(sentence, language=code, backend="espeak", espeak_path="/opt/homebrew/bin/espeak", strip=True)

In [55]:
ipa_test = phonemize(
    "mañana",
    language="es",
    backend="espeak",
    backend_opts={"espeak_path": "/opt/homebrew/bin/espeak"}, 
    strip=True
)
print(ipa_test)   # → maˈɲana

TypeError: phonemize() got an unexpected keyword argument 'backend_opts'

# Testing

In [61]:
lang = random.choice(list(common))
sid, s, t = sample_sentence(lang)
print('ISO3:', lang, '\nSentence:', s, '\nTranslation:', t)
# print('IPA:', ipa(s, lang))
try:
    url = get_common_voice_clip(lang)
    audio = Audio(url)
    display(audio)
except Exception as e:
    print('Audio fetch failed:', e)

ISO3: isl 
Sentence: Er þetta hestur eða meri? 
Translation: Is this a stallion or a mare?
Audio fetch failed: No clips
