In [1]:
import pandas as pd
import subprocess
import json
import urllib.request, json 
from bs4 import BeautifulSoup
from TTS.api import TTS # Coqui
import numpy as np
import languagecodes
import pycountry

# Get meta codes

### ISO 639 codes

- Set 1: Alpha 2 code
- Set 2: Alpha 3 code (deprecated)
- Set 3: Alpha 3 code for comprehensive coverage
- Set 5: Alpha 3 code for language families and groups
- Set 6: Alpha 4 representation for comprehensive coverage of language variants (withdrawn)
  
https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes
https://en.wikipedia.org/wiki/ISO_639
https://iso639-3.sil.org/code_tables/639/

### BCP-47 codes

- The first part (the primary language subtag) is a two-letter ISO 639-1 code or a three-letter ISO 639-2/639-3 code

https://www.w3.org/International/articles/language-tags/
https://stackoverflow.com/questions/26085570/how-to-convert-ietf-bcp-47-language-identifier-to-iso-639-2

### Codes by model

Model | N languages| Codes used | Multiple speakers per language code    
---|---|---|---   
Meta MMS | 4022 | ISO 639-3 | No
Toucan | 7233 | ISO 639-3 for 7225 languages PLUS BCP-47 codes for 8 additional languages | No
espeak_NG | 112 | BCP-47 | No
Piper | 37 | locales (BCP-47?) | Yes
Coqui | 37 | ISO 639-2 | Yes

# Parse doc for individual TTS systems

## Meta MMS


https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html

In [2]:
# Download the Meta MMS language-coverage page and parse it as HTML.
MMS_COVERAGE_URL = "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html"
with urllib.request.urlopen(MMS_COVERAGE_URL) as response:
    soup = BeautifulSoup(response, 'html.parser')

In [3]:
# Extract the table
# Only the first <table> of the parsed page is used.
tables = soup.find_all('table')
table = tables[0]

# One list of <td> texts per <tr>.
cell_texts_per_row = (
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table.find_all('tr')
)
# Rows that yield no <td> cells are skipped.
new_table = [cells for cells in cell_texts_per_row if cells != []]

In [4]:
# Save to dataframe
# Column labels are assigned positionally to the six cells scraped per row.
mms_columns = ["iso_693_3", "language_name", "asr", "tts", "lid", "link"]
meta_mms = pd.DataFrame(data=new_table, columns=mms_columns)

In [5]:
# Export
# Drop the scraped "link" column before saving. Reassigning (rather than using
# inplace=True) together with errors="ignore" keeps this cell re-runnable: a
# second execution no longer raises KeyError because "link" is already gone.
meta_mms = meta_mms.drop(columns=['link'], errors="ignore")
meta_mms.to_csv("language_codes/meta_mms.csv", encoding="utf-8", index=False)

## Toucan

https://github.com/DigitalPhonetics/IMS-Toucan/blob/MassiveScaleToucan/Utility/language_list.md

In [6]:
# Download the IMS-Toucan language list page and parse it as HTML.
TOUCAN_LANGUAGE_LIST_URL = "https://github.com/DigitalPhonetics/IMS-Toucan/blob/MassiveScaleToucan/Utility/language_list.md"
with urllib.request.urlopen(TOUCAN_LANGUAGE_LIST_URL) as response:
    soup = BeautifulSoup(response, 'html.parser')

In [7]:
# Extract the table
# Only the first <table> of the parsed page is used.
tables = soup.find_all('table')
table = tables[0]

# Gather the text of every <td>, row by row.
scraped_rows = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table.find_all('tr')
]
# Keep only the rows that produced at least one cell.
new_table = [cells for cells in scraped_rows if cells != []]

In [8]:
# Save to dataframe
# Each scraped row is labelled positionally as (code, language name).
toucan_columns = ["iso_693_3", "language_name"]
toucan = pd.DataFrame(data=new_table, columns=toucan_columns)

In [9]:
# Export
# Write the Toucan language list to disk, then preview the first two rows.
toucan_csv_path = "language_codes/ims_toucan.csv"
toucan.to_csv(toucan_csv_path, index=False, encoding="utf-8")
toucan.head(2)

Unnamed: 0,iso_693_3,language_name
0,aaa,Ghotuo
1,aab,Alumu-Tesu


In [10]:
# Find the BCP-like codes (those containing a hyphen) that Toucan lists for
# language variants alongside the plain three-letter codes.
is_variant_code = toucan.iso_693_3.str.contains("-")
toucan_bcps = toucan[is_variant_code].iso_693_3.unique()
toucan[is_variant_code]

Unnamed: 0,iso_693_3,language_name
7225,en-us,American English
7226,en-sc,Scottish English
7227,fr-be,Belgian French
7228,fr-sw,Swiss French
7229,pt-br,Brazilian Portuguese
7230,spa-lat,Latin American Spanish
7231,vi-ctr,Central Vietnamese
7232,vi-so,Southern Vietnamese


## espeak NG

https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md

In [11]:
# Download the raw markdown of the espeak-ng language table.
# The Content-Type header is not guaranteed to carry a charset parameter; in
# that case get_content_charset() returns None and bytes.decode(None) raises
# TypeError, so fall back to UTF-8.
with urllib.request.urlopen("https://raw.githubusercontent.com/espeak-ng/espeak-ng/refs/heads/master/docs/languages.md") as url:
    content = url.read().decode(url.headers.get_content_charset(failobj="utf-8"))

In [12]:
# Parse
# Take the fourth blank-line-separated section of the document, strip the
# backticks, and break it into individual lines.
table_section = content.split("\n\n")[3]
table_section = table_section.replace("`", "")
content = table_section.split("\n")

In [13]:
# Convert to data frame
# Every line is split on "|" into stripped cells; the first two lines are skipped.
table_rows = [[cell.strip() for cell in line.split("|")] for line in content]
espeak = pd.DataFrame(table_rows[2:])
# Empty strings become NaN so that columns without any value can be dropped.
espeak = espeak.replace("", np.nan)
espeak = espeak.dropna(how="all", axis=1)
espeak.columns = ["iso_639_5", "bcp_47", "language_family", "language_name", "accent_dialect"]

In [14]:
# Manual mapping from espeak-ng BCP-47 tags to the variant codes used by Toucan.
#
# Tags that already match a Toucan code need no entry here:
#     'en-us', 'fr-be', 'pt-br'
# Tags with no clear mapping are deliberately left out:
#     'en-029', 'en-gb-x-gbclan', 'en-gb-x-rp', 'fa-latn', 'en-gb-x-gbcwmd'
# Tags left out because the mapping is ambiguous:
#     'fa' -> "pes"      (some ambiguity with Farsi, fas)
#     'ru-lv' -> "lav"   (some ambiguity with Latvian vs. Russian Latvian)
corrections = {
    "en-gb-scotland": "en-sc",
    "fr-ch": "fr-sw",
    "es-419": "spa-lat",
    "vi-vn-x-central": "vi-ctr",
    "vi-vn-x-south": "vi-so",
}

In [15]:
# Get iso 693-3 codes
espeak = espeak[espeak.language_family != "Constructed"].copy()  # Remove constructed languages


def _espeak_tag_to_code(tag):
    """Map one espeak-ng BCP-47 tag to the code used to join the TTS tables.

    Resolution order:
      1. tags that Toucan lists verbatim (``toucan_bcps``) are kept as-is;
      2. tags with a manual override in ``corrections`` use that override;
      3. otherwise the part before the first "-" is used: a three-letter prefix
         is taken directly, a two-letter prefix is converted through
         ``pycountry`` (alpha_2 -> alpha_3).

    Returns NaN when the tag is not a string, when its prefix has any other
    length, or when the ``pycountry`` lookup yields no language object
    (previously that case crashed on ``.alpha_3``).
    """
    if not isinstance(tag, str):
        return np.nan
    if tag in toucan_bcps:
        return tag
    if tag in corrections:
        return corrections[tag]

    prefix = tag.split("-")[0]
    if len(prefix) == 3:
        return prefix
    if len(prefix) == 2:
        language = pycountry.languages.get(alpha_2=prefix)
        return getattr(language, "alpha_3", np.nan)
    return np.nan


# Building the column in a single pass avoids first creating a float NaN column
# and then writing strings into it row by row.
espeak["iso_693_3"] = espeak["bcp_47"].map(_espeak_tag_to_code)

In [16]:
# Collapse rows that share an iso_693_3 code, keeping the first occurrence
# (treated as the dominant variant). This removes, for example, Farsi with
# Roman spelling and Caribbean English.

# NOTE(review): the selection below is not the cell's last expression, so the
# notebook does not display it.
duplicate_mask = espeak.iso_693_3.duplicated(keep=False)
espeak[duplicate_mask]
espeak.drop_duplicates(subset='iso_693_3', keep="first", inplace=True)

In [17]:
# Export
# Write the espeak-ng table to disk, then preview the first two rows.
espeak_csv_path = "language_codes/espeak_ng.csv"
espeak.to_csv(espeak_csv_path, index=False, encoding="utf-8")
espeak.head(2)

Unnamed: 0,iso_639_5,bcp_47,language_family,language_name,accent_dialect,iso_693_3
0,gmw,af,West Germanic,Afrikaans,,afr
1,ine,sq,Indo-European,Albanian,,sqi


## Coqui

https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json

In [18]:
# Ask the Coqui `tts` command-line tool for its model listing; the captured
# standard output is kept for parsing in the next step.
list_models_command = ['tts', '--list_models']
model_list = subprocess.check_output(list_models_command)

In [19]:
# Clean the list
# `model_list` holds the output captured from `tts --list_models`.
# Decode it and split with splitlines() so that both "\n" and "\r\n" line
# endings work. (Splitting the bytes repr on a literal "\\r" only produced
# separate entries when the output used "\r\n"; with plain "\n" every line was
# glued into a single entry.) Positions are preserved for "\r\n" output: the
# same leading lines come first, so the entry used as the header downstream
# stays at index 1 and the model rows start at index 2.
if isinstance(model_list, bytes):
    raw_listing = model_list.decode("utf-8", errors="replace")
else:
    raw_listing = str(model_list)
listing_lines = raw_listing.replace(" [already downloaded]", "").splitlines()

# Keep only the text after the last ": " on each line
# (e.g. "tts_models/bg/cv/vits") ...
model_list = [line.strip().split(": ")[-1] for line in listing_lines]
# ... and split it on "/", keeping the full path as the final item.
model_list = [path.split("/") + [path] for path in model_list]

In [20]:
# Save to dataframe
# Entry 1 of the cleaned list supplies the column names (its last item is
# replaced by "path"); entries from index 2 onward are the rows.
coqui_columns = model_list[1][:-1] + ["path"]
coqui = pd.DataFrame(model_list[2:], columns=coqui_columns)
# Keep only the rows whose type is "tts_models".
is_tts_model = coqui['type'] == "tts_models"
coqui = coqui[is_tts_model]

In [21]:
coqui = coqui.rename(columns={"language": "language_code"})

# Drop multilingual models
# TODO: Look into languages with multiple variants and how to support them with this interface
coqui = coqui[~coqui.language_code.isin(['multilingual'])].copy()

# Get iso 693-3 codes
# Keep only the part of the code before any "-" or "_".
coqui['language_code'] = coqui['language_code'].apply(lambda x: x.split("-")[0].split("_")[0])


def _coqui_code_to_iso3(code):
    """Convert a Coqui language code to a three-letter code.

    Three-letter codes are returned unchanged; two-letter codes are converted
    through ``pycountry`` (alpha_2 -> alpha_3). Returns NaN for any other
    length, or when the ``pycountry`` lookup yields no language object
    (previously that case crashed on ``.alpha_3``).
    """
    if len(code) == 3:
        return code
    if len(code) == 2:
        language = pycountry.languages.get(alpha_2=code)
        return getattr(language, "alpha_3", np.nan)
    return np.nan


# Building the column in a single pass avoids first creating a float NaN column
# and then writing strings into it row by row.
coqui['iso_693_3'] = coqui['language_code'].map(_coqui_code_to_iso3)

In [22]:
# Export
# Write the per-model Coqui table to disk, then preview the first two rows.
coqui_csv_path = "language_codes/coqui.csv"
coqui.to_csv(coqui_csv_path, index=False, encoding="utf-8")
coqui.head(2)

Unnamed: 0,type,language_code,dataset,model,path,iso_693_3
4,tts_models,bg,cv,vits,tts_models/bg/cv/vits,bul
5,tts_models,cs,cv,vits,tts_models/cs/cv/vits,ces


In [23]:
# Collapse to one row per code: the model paths sharing a code are gathered
# into a list, stored in a column named "models".
coqui = (
    coqui.groupby('iso_693_3')['path']
    .apply(list)
    .reset_index()
    .rename(columns={'path': 'models'})
)

## Piper

https://huggingface.co/spaces/k2-fsa/text-to-speech/raw/main/model.py

In [24]:
# Download the source of the k2-fsa demo's model.py, which lists the Piper voices.
# The Content-Type header is not guaranteed to carry a charset parameter; in
# that case get_content_charset() returns None and bytes.decode(None) raises
# TypeError, so fall back to UTF-8.
with urllib.request.urlopen("https://huggingface.co/spaces/k2-fsa/text-to-speech/raw/main/model.py") as url:
    content = url.read().decode(url.headers.get_content_charset(failobj="utf-8"))

In [25]:
# Extract content
source_lines = content.split("\n")
# Keep the part before the first ":" of every line that mentions "vits-piper".
piper_keys = [line.split(':')[0] for line in source_lines if "vits-piper" in line]
# Take the first double-quoted string, skipping lines that have a "#" before
# the opening quote.
piper_ids = [key.split('"')[1] for key in piper_keys if '#' not in key.split('"')[0]]
# The first match is discarded.
content = piper_ids[1:]

In [26]:
# Convert to dataframe
# One row per extracted model identifier, in a single "models" column.
piper_columns = ['models']
piper = pd.DataFrame(data=content, columns=piper_columns)

In [27]:
# Get iso 693-3 language codes
# The locale is the third "-"-separated field of the model identifier
# (e.g. "zh_CN" in "csukuangfj/vits-piper-zh_CN-huayan-medium").
piper['language_codes'] = piper['models'].apply(lambda x: x.split("-")[2])

# Locales that map onto a variant code instead of the plain language code.
recodes = {'en_US': 'en-us',  'es_MX': 'spa-lat', 'pt_BR':'pt-br'}


def _piper_locale_to_code(locale):
    """Map a Piper locale such as "en_US" to the code used to join the TTS tables.

    Locales listed in ``recodes`` use that override. Otherwise the part before
    "_" is converted through ``pycountry`` (alpha_2 -> alpha_3). Returns NaN
    when the lookup yields no language object (previously that case crashed on
    ``.alpha_3``).
    """
    if locale in recodes:
        return recodes[locale]
    language = pycountry.languages.get(alpha_2=locale.split("_")[0])
    return getattr(language, "alpha_3", np.nan)


piper['iso_693_3'] = piper['language_codes'].apply(_piper_locale_to_code)

In [28]:
# Export
# Write the per-model Piper table to disk, then preview the first two rows.
piper_csv_path = "language_codes/piper.csv"
piper.to_csv(piper_csv_path, encoding="utf-8", index=False)
piper.head(2)

Unnamed: 0,models,language_codes,iso_693_3
0,csukuangfj/vits-piper-zh_CN-huayan-medium,zh_CN,zho
1,csukuangfj/vits-piper-en_US-glados|1 speaker,en_US,en-us


In [29]:
# Collapse to one row per code: the model identifiers sharing a code are
# gathered into a list.
models_by_code = piper.groupby(['iso_693_3'])['models']
piper = models_by_code.apply(list).reset_index()

https://github.com/rhasspy/piper/blob/master/VOICES.md

(Note: skip the below for now since piper demos are being supplied by k2-fsa site)

## African voices

In [30]:
# ! TODO

# Merge into meta list

In [31]:
# Collect the distinct codes seen in any of the five datasets.
iso_693_3 = set()
for source_frame in (toucan, meta_mms, espeak, coqui, piper):
    iso_693_3.update(source_frame.iso_693_3.unique().tolist())

In [32]:
# Start the combined table with a single column holding every collected code.
all_tts = pd.DataFrame({'iso_693_3': list(iso_693_3)})

In [33]:
# Add suffix to columns of each individual dataset
toucan.columns = [c + "_toucan" for c in toucan.columns]
meta_mms.columns = [c + "_meta_mms" for c in meta_mms.columns]
espeak.columns = [c + "_espeak" for c in espeak.columns]
coqui.columns = [c + "_coqui" for c in coqui.columns]
piper.columns = [c + "_piper" for c in piper.columns]

In [34]:
# Merge
# Left-join each dataset onto the master code list via its own suffixed code column.
merge_plan = (
    (toucan, "iso_693_3_toucan"),
    (meta_mms, "iso_693_3_meta_mms"),
    (espeak, "iso_693_3_espeak"),
    (coqui, "iso_693_3_coqui"),
    (piper, "iso_693_3_piper"),
)
for source_frame, code_column in merge_plan:
    all_tts = all_tts.merge(source_frame, left_on="iso_693_3", right_on=code_column, how="left")

In [35]:
# Locate the identifier for each tts model
# Toucan's identifier has the form "<language name> (<code>)".
toucan_label = all_tts['language_name_toucan'] + " (" + all_tts['iso_693_3_toucan'] + ")"
all_tts['toucan_id'] = toucan_label

# The remaining identifiers are straight copies of an existing column.
id_sources = {
    'meta_mms_id': 'iso_693_3_meta_mms',
    'coqui_id': 'models_coqui',
    'espeak_id': 'bcp_47_espeak',
    'piper_id': 'models_piper',
}
for id_column, source_column in id_sources.items():
    all_tts[id_column] = all_tts[source_column]

In [None]:
# Pick one display name per language: Toucan's name first, then the Meta MMS
# name, then the espeak name.
language_name = all_tts['language_name_toucan'].copy()
for fallback_column in ('language_name_meta_mms', 'language_name_espeak'):
    language_name = language_name.fillna(all_tts[fallback_column])
all_tts['language_name'] = language_name

# Codes that still have no name are filled in by hand.
for code, manual_name in (('twi', 'twi'), ('zho', 'Chinese')):
    still_unnamed = all_tts['language_name'].isna() & (all_tts['iso_693_3'] == code)
    all_tts.loc[still_unnamed, 'language_name'] = manual_name

In [71]:
# Show the 15 most frequent language names; a count above 1 means the same
# name is attached to more than one code.
name_counts = all_tts['language_name'].value_counts()
name_counts.head(15)

Estonian         2
Persian          2
Swahili          2
Albanian         2
Oriya            2
Malay            2
Kurdish          2
Oromo            2
Uzbek            2
Azerbaijani      2
Konkani          2
Maay             1
Bengali          1
Ganza            1
Dendi (Benin)    1
Name: language_name, dtype: int64

In [73]:
# Inspect the rows that share the name "Kurdish".
is_kurdish = all_tts['language_name'] == 'Kurdish'
all_tts.loc[is_kurdish]

Unnamed: 0,iso_693_3,iso_693_3_toucan,language_name_toucan,iso_693_3_meta_mms,language_name_meta_mms,asr_meta_mms,tts_meta_mms,lid_meta_mms,iso_639_5_espeak,bcp_47_espeak,...,iso_693_3_coqui,models_coqui,iso_693_3_piper,models_piper,toucan_id,meta_mms_id,coqui_id,espeak_id,piper_id,language_name
1884,ckb,ckb,Kurdish,ckb,"Kurdish, Central",✔️,,✔️,,,...,,,,,Kurdish (ckb),ckb,,,,Kurdish
4988,kur,,,,,,,,ira,ku,...,,,,,,,,ku,,Kurdish


# Export

In [36]:
# Allow people to look up model identifiers by ISO code
# Keep the code, the display name and one identifier column per TTS system,
# ordered by language name.
lookup_columns = [
    'iso_693_3',
    'language_name',
    'toucan_id',
    'meta_mms_id',
    'coqui_id',
    'piper_id',
    'espeak_id',
]
model_lookups_by_iso = all_tts[lookup_columns].sort_values(by=['language_name'])
model_lookups_by_iso.to_csv("model_lookups_by_iso.csv")

In [39]:
# Display the lookup table as the cell output.
model_lookups_by_iso

Unnamed: 0,iso_693_3,toucan_id,meta_mms_id,coqui_id,piper_id,espeak_id
0,ymm,Maay (ymm),ymm,,,
1,amt,Amto (amt),amt,,,
2,fqs,Fas (fqs),fqs,,,
3,xud,Umiida (xud),,,,
4,chf,Tabasco Chontal (chf),chf,,,
...,...,...,...,...,...,...
7321,ldo,Loo (ldo),ldo,,,
7322,mfj,Mefele (mfj),mfj,,,
7323,swa,,,,[csukuangfj/vits-piper-sw_CD-lanfrica-medium],sw
7324,khg,Khams Tibetan (khg),khg,,,


In [37]:
# Allow for a big-picture analysis of model support by language.
# A platform supports a language when its identifier column is populated, so
# non-missing values are flagged with 1. (isna() flagged the *missing* entries,
# which made n_platforms count the platforms that do NOT cover the language.)
model_support = model_lookups_by_iso.set_index(['iso_693_3']).notna().astype(int)

# Only the per-platform "*_id" columns count towards the total; language_name
# is a label, not a platform, so it is kept as a flag but left out of the sum.
platform_columns = [column for column in model_support.columns if column.endswith("_id")]
model_support['n_platforms'] = model_support[platform_columns].sum(axis=1)

model_support.to_csv("model_support.csv")