In [11]:
import pandas as pd
import numpy as np
import re
from iso639 import Lang

In [12]:
# Per-service language lists; every file is read with its 'No' column as the index.
google_translate, deepl, gpt, gemini, copilot = (
    pd.read_csv(csv_path, index_col='No')
    for csv_path in (
        'data/google_translate.csv',
        'data/DeepL.csv',
        'data/gpt.csv',
        'data/gemini.csv',
        'data/copilot.csv',
    )
)

# Grammatical-gender list, ordered by language name.
gender = pd.read_csv('data/GrammaticalGender.csv').sort_values(by='Language')

In [13]:
def clean_language_name(name):
    """Return `name` without parenthesised qualifiers.

    Each "(...)" segment is removed together with any whitespace directly
    before it, and the result is stripped of leading/trailing whitespace,
    e.g. "Chinese (Simplified)" -> "Chinese".
    """
    without_qualifiers = re.sub(r'\s*\(.*?\)', '', name)
    return without_qualifiers.strip()


# Normalise the language-name column of every service table.
# Note: the Google Translate table names its column 'Languages', the others 'Language'.
for frame, name_column in (
    (google_translate, 'Languages'),
    (deepl, 'Language'),
    (gpt, 'Language'),
    (gemini, 'Language'),
    (copilot, 'Language'),
):
    frame[name_column] = frame[name_column].apply(clean_language_name)

In [14]:
# Union of every language name that appears in at least one service table.
all_language_names = (
    set(google_translate.Languages)
    | set(deepl.Language)
    | set(gpt.Language)
    | set(gemini.Language)
    | set(copilot.Language)
)
languages = np.array(list(all_language_names))
languages.sort()

# 1-based row numbers used as the index of the coverage table.
No = np.arange(1, len(languages) + 1)

df = pd.DataFrame({'No': No, 'Language': languages}).set_index('No')

# One 0/1 indicator column per source: 1 when the language is listed there.
indicator_sources = {
    'GoogleTranslate': google_translate['Languages'],
    'DeepL': deepl['Language'],
    'GPT': gpt['Language'],
    'Gemini': gemini['Language'],
    'Copilot': copilot['Language'],
    'GrammaticalGender': gender['Language'],
}
for indicator, listed_names in indicator_sources.items():
    df[indicator] = df['Language'].isin(listed_names).astype(int)

df = df.sort_values(by='Language')
df

Unnamed: 0_level_0,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,GrammaticalGender
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Abkhaz,1,0,0,0,0,0
2,Acehnese,1,0,0,0,0,0
3,Acholi,1,0,0,0,0,0
4,Afar,1,0,0,0,0,1
5,Afrikaans,1,0,1,1,0,0
...,...,...,...,...,...,...,...
234,Yiddish,1,0,1,0,0,1
235,Yoruba,1,0,1,1,0,0
236,Yucatec Maya,1,0,0,0,0,0
237,Zapotec,1,0,0,0,0,0


In [15]:
# Rows whose language appears in the grammatical-gender list.
has_grammatical_gender = df['GrammaticalGender'].eq(1)
df[has_grammatical_gender]

Unnamed: 0_level_0,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,GrammaticalGender
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,Afar,1,0,0,0,0,1
6,Albanian,1,0,1,1,0,1
8,Amharic,1,0,1,0,0,1
9,Arabic,1,1,1,1,1,1
25,Belarusian,1,0,1,1,0,1
...,...,...,...,...,...,...,...
222,Ukrainian,1,1,1,1,1,1
223,Urdu,1,0,1,1,0,1
227,Venetian,1,0,0,0,0,1
230,Welsh,1,0,1,1,1,1


In [6]:
def _lookup_pt3(language_name):
    """Return `Lang(language_name).pt3`, or None when the lookup raises.

    The lookup is deliberately best-effort: names the `iso639` package does
    not recognise are left without a code and handled manually later.
    """
    try:
        return Lang(language_name).pt3
    except Exception:
        # `except Exception` (not a bare `except`) so that KeyboardInterrupt
        # and SystemExit still propagate while failed lookups are skipped.
        return None


# Guard the insert so re-running this cell does not fail with
# "cannot insert Code, already exists".
if 'Code' not in df.columns:
    df.insert(0, 'Code', None)

# Only look up rows that still lack a code, so codes that are already
# present are preserved when the cell is executed again.
needs_code = df['Code'].isnull()
df.loc[needs_code, 'Code'] = df.loc[needs_code, 'Language'].map(_lookup_pt3)
df

Unnamed: 0_level_0,Code,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,GrammaticalGender
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,,Abkhaz,1,0,0,0,0,0
2,,Acehnese,1,0,0,0,0,0
3,,Acholi,1,0,0,0,0,0
4,aar,Afar,1,0,0,0,0,1
5,afr,Afrikaans,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...
234,yid,Yiddish,1,0,1,0,0,1
235,yor,Yoruba,1,0,1,1,0,0
236,yua,Yucatec Maya,1,0,0,0,0,0
237,zap,Zapotec,1,0,0,0,0,0


In [7]:
# Languages for which no code has been assigned yet.
none_list = df.loc[df['Code'].isnull(), 'Language'].tolist()
(len(none_list), none_list)

(39,
 ['Abkhaz',
  'Acehnese',
  'Acholi',
  'Avar',
  'Buryat',
  'Cantonese',
  'Frisian',
  'Fulani',
  'Greek',
  'Ilocano',
  'Jamaican Patois',
  'Jingpo',
  'Kiga',
  'Kikongo',
  'Kituba',
  'Kokborok',
  'Luganda',
  'Luo',
  'Makassar',
  'Marwadi',
  'Mauritian Creole',
  'Meadow Mari',
  'Meiteilon',
  'Mizo',
  'Myanmar',
  'NKo',
  'Nahuatl',
  'Ndebele',
  'Nepalbhasa',
  'Occitan',
  'Qʼeqchiʼ',
  'Romani',
  'Sami',
  'Scots Gaelic',
  'Sesotho',
  'Seychellois Creole',
  'Tamazight',
  'Tongan',
  'Tuvan'])

In [8]:
# Hand-maintained three-letter codes, keyed by the cleaned language name, for the
# names that did not receive a code from the automatic `Lang(...)` lookup.
# Used further down to fill the remaining null entries of df['Code'].
language_codes = {
    'Abkhaz': 'abk',
    'Acehnese': 'ace',
    'Acholi': 'ach',
    'Avar': 'ava',
    'Buryat': 'bxm',  # NOTE(review): 'bxm' appears to be Mongolia Buriat; 'bua'/'bxr' may be intended — confirm
    'Cantonese': 'yue',
    'Frisian': 'frr',  # NOTE(review): 'frr' appears to be Northern Frisian; West Frisian would be 'fry' — confirm
    'Fulani': 'ful',
    'Greek': 'ell',
    'Ilocano': 'ilo',
    'Jamaican Patois': 'jam',
    'Jingpo': 'kac',
    'Kiga': 'cgg',
    'Kikongo': 'kwy',  # NOTE(review): 'kwy' appears to be one Kongo variety; the macrolanguage is 'kon' — confirm
    'Kituba': 'ktu',
    'Kokborok': 'trp',
    'Luganda': 'lug',
    'Luo': 'luo',
    'Makassar': 'mak',
    'Marwadi': 'rwr',
    'Mauritian Creole': 'mfe',
    'Meadow Mari': 'mhr',
    'Meiteilon': 'mni',
    'Mizo': 'lus',
    'Myanmar': 'mya',
    'NKo': 'nqo',
    'Nahuatl': 'nhe',
    'Ndebele': 'nde',  # NOTE(review): 'nde' appears to be North Ndebele; South Ndebele is 'nbl' — confirm against the source list
    'Nepalbhasa': 'new',
    'Occitan': 'oci',
    'Qʼeqchiʼ': 'kek',
    'Romani': 'rom',
    'Sami': 'sme',
    'Scots Gaelic': 'gla',
    'Sesotho': 'sot',
    'Seychellois Creole': 'crs',
    'Tamazight': 'tzm',
    'Tongan': 'ton',
    'Tuvan': 'tyv'
}


# Keep codes that are already set; take the manual mapping only where Code is missing.
manual_codes = df['Language'].map(language_codes)
df['Code'] = df['Code'].where(df['Code'].notna(), manual_codes)
df

Unnamed: 0_level_0,Code,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,GrammaticalGender
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,abk,Abkhaz,1,0,0,0,0,0
2,ace,Acehnese,1,0,0,0,0,0
3,ach,Acholi,1,0,0,0,0,0
4,aar,Afar,1,0,0,0,0,1
5,afr,Afrikaans,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...
234,yid,Yiddish,1,0,1,0,0,1
235,yor,Yoruba,1,0,1,1,0,0
236,yua,Yucatec Maya,1,0,0,0,0,0
237,zap,Zapotec,1,0,0,0,0,0


In [9]:
# Re-check after the manual fill: this should now report no languages without a code.
code_missing = df['Code'].isnull()
none_list = df['Language'][code_missing].tolist()
(len(none_list), none_list)

(0, [])

### Code-Language-Family

In [10]:
# Source: https://github.com/dw-data/ai-languages/blob/main/README.md
# Read the "Ethnologue" sheet; the column names are on the third row (header=2).
df2 = pd.read_excel('data/Data.xlsx', sheet_name="Ethnologue", header=2)

# Fixed seed so the preview shows the same rows on every Restart & Run All.
df2.sample(10, random_state=0)

Unnamed: 0,users,DLS,family-link,family-link-href,languagecode,languagelink-href,vitality,languagecountry,languagename,lat,long
5032,Less than 10K,Still,Afro-Asiatic,https://www.ethnologue.com/subgroup/31/,piy,https://www.ethnologue.com/language/piy/,Stable,Nigeria,Piya-Kwonci,9.387,10.8618
4592,,Still,Australian,https://www.ethnologue.com/subgroup/87/,nny,https://www.ethnologue.com/language/nny/,Extinct,Australia,Nyangga,-18.7503,142.1521
2700,1M to 1B,Thriving,Japonic,https://www.ethnologue.com/subgroup/1710/,jpn,https://www.ethnologue.com/language/jpn/,Institutional,Japan,Japanese,36.6554,139.2716
5163,Less than 10K,Still,Sign language,https://www.ethnologue.com/subgroup/2/,psg,https://www.ethnologue.com/language/psg/,Endangered,Malaysia,Penang Sign Language,5.3527,100.4659
1432,1M to 1B,Still,Sino-Tibetan,https://www.ethnologue.com/subgroup/236/,czh,https://www.ethnologue.com/language/czh/,Stable,China,"Chinese, Huizhou",29.6246,118.1569
470,10K to 1M,Still,Niger-Congo,https://www.ethnologue.com/subgroup/47/,ayg,https://www.ethnologue.com/language/ayg/,Stable,Togo,Ginyanga,8.2282,0.9529
5790,10K to 1M,Still,Austronesian,https://www.ethnologue.com/subgroup/447/,srv,https://www.ethnologue.com/language/srv/,Stable,Philippines,"Sorsoganon, Southern",12.6795,124.0146
478,Less than 10K,Still,Sepik,https://www.ethnologue.com/subgroup/1590/,ayq,https://www.ethnologue.com/language/ayq/,Stable,Papua New Guinea,Ayi,-3.9587,142.4089
3672,10K to 1M,Still,Nilo-Saharan,https://www.ethnologue.com/subgroup/39/,lwo,https://www.ethnologue.com/language/lwo/,Stable,South Sudan,Luwo,7.4575,28.3935
6303,10K to 1M,Still,Sign language,https://www.ethnologue.com/subgroup/2/,tsq,https://www.ethnologue.com/language/tsq/,Stable,Thailand,Thai Sign Language,13.7837,100.4455


In [None]:
# Build the code / name / family lookup from the Ethnologue sheet (df2).
# The columns 'languagecode', 'languagename' and 'family-link' exist only in df2;
# selecting them from df (the service-coverage table) would raise a KeyError.
langs = (
    df2[['languagecode', 'languagename', 'family-link']]
    .rename(columns={'languagecode': 'Code',
                     'languagename': 'Language',
                     'family-link': 'Family'})
)
langs.to_csv('data/languages.csv', index=False)