In [1]:
import pandas as pd
import numpy as np
import re
from iso639 import Lang

In [2]:
# Per-service language lists. Every file is read the same way, with its
# 'No' column used as the row index.
google_translate, deepl, gpt, gemini, copilot, claude = (
    pd.read_csv(csv_path, index_col='No')
    for csv_path in (
        'data/google_translate.csv',
        'data/DeepL.csv',
        'data/gpt.csv',
        'data/gemini.csv',
        'data/copilot.csv',
        'data/claude.csv',
    )
)

# Grammatical-gender table, sorted by its 'Language' column.
gender = pd.read_csv('data/GrammaticalGender.csv').sort_values(by='Language')

In [3]:
def clean_language_name(name):
    """Return ``name`` without parenthesised qualifiers.

    Every ``(...)`` group, together with any whitespace directly in front
    of it, is removed; the result is then stripped of leading and trailing
    whitespace. For example ``'Chinese (Simplified)'`` becomes ``'Chinese'``.

    Parameters
    ----------
    name : str
        Language name to normalise.

    Returns
    -------
    str
        The normalised language name.
    """
    without_qualifiers = re.sub(r'\s*\(.*?\)', '', name)
    return without_qualifiers.strip()


# Normalise the language-name column of every source table.
# The Google Translate table names its column 'Languages' (plural); the
# others use 'Language'.
for _table, _name_col in (
    (google_translate, 'Languages'),
    (deepl, 'Language'),
    (gpt, 'Language'),
    (gemini, 'Language'),
    (copilot, 'Language'),
    (claude, 'Language'),
):
    _table[_name_col] = _table[_name_col].apply(clean_language_name)

In [4]:
# Every distinct language name that appears in at least one source table.
languages = np.array(list(
    set(google_translate.Languages).union(
        deepl.Language,
        gpt.Language,
        gemini.Language,
        copilot.Language,
        claude.Language,
    )
))
# 1-based running numbers, one per language.
No = np.array(list(range(1, len(languages) + 1)))

languages.sort()

# One row per language; each flag column is 1 when the language name is
# present in the corresponding table and 0 otherwise.
df = (
    pd.DataFrame({'Language': languages})
    .assign(
        GoogleTranslate=lambda t: t['Language'].isin(google_translate['Languages']).astype(int),
        DeepL=lambda t: t['Language'].isin(deepl['Language']).astype(int),
        GPT=lambda t: t['Language'].isin(gpt['Language']).astype(int),
        Gemini=lambda t: t['Language'].isin(gemini['Language']).astype(int),
        Copilot=lambda t: t['Language'].isin(copilot['Language']).astype(int),
        Claude=lambda t: t['Language'].isin(claude['Language']).astype(int),
        GrammaticalGender=lambda t: t['Language'].isin(gender['Language']).astype(int),
    )
    .sort_values(by='Language')
)
df

Unnamed: 0,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,Claude,GrammaticalGender
0,Abkhaz,1,0,0,0,0,0,0
1,Acehnese,1,0,0,0,0,0,0
2,Acholi,1,0,0,0,0,0,0
3,Afar,1,0,0,0,0,0,1
4,Afrikaans,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...
234,Yiddish,1,0,1,0,0,0,1
235,Yoruba,1,0,1,1,0,0,0
236,Yucatec Maya,1,0,0,0,0,0,0
237,Zapotec,1,0,0,0,0,0,0


In [5]:
# Rows whose language is flagged in the grammatical-gender table.
df.loc[df['GrammaticalGender'] == 1]

Unnamed: 0,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,Claude,GrammaticalGender
3,Afar,1,0,0,0,0,0,1
5,Albanian,1,0,1,1,0,0,1
7,Amharic,1,0,1,0,0,0,1
8,Arabic,1,1,1,1,1,1,1
24,Belarusian,1,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...
222,Ukrainian,1,1,1,1,1,1,1
223,Urdu,1,0,1,1,0,1,1
227,Venetian,1,0,0,0,0,0,1
230,Welsh,1,0,1,1,1,0,1


In [6]:
def _pt3_or_none(language_name):
    """Return ``Lang(language_name).pt3``, or ``None`` if the lookup raises.

    The lookup is best-effort: names the ``iso639`` package does not
    recognise are left as ``None`` so they can be filled in by hand later.
    """
    try:
        return Lang(language_name).pt3
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate while failed lookups are skipped.
        return None


# New leading 'Code' column: one lookup per row, built in a single pass
# instead of iterrows() plus a boolean-mask assignment for every row.
df.insert(0, 'Code', [_pt3_or_none(name) for name in df['Language']])
df

Unnamed: 0,Code,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,Claude,GrammaticalGender
0,,Abkhaz,1,0,0,0,0,0,0
1,,Acehnese,1,0,0,0,0,0,0
2,,Acholi,1,0,0,0,0,0,0
3,aar,Afar,1,0,0,0,0,0,1
4,afr,Afrikaans,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
234,yid,Yiddish,1,0,1,0,0,0,1
235,yor,Yoruba,1,0,1,1,0,0,0
236,yua,Yucatec Maya,1,0,0,0,0,0,0
237,zap,Zapotec,1,0,0,0,0,0,0


In [7]:
# Language names that still have no code, printed with their count.
none_list = df.loc[df['Code'].isna(), 'Language'].tolist()
print(len(none_list), none_list)

39 ['Abkhaz', 'Acehnese', 'Acholi', 'Avar', 'Buryat', 'Cantonese', 'Frisian', 'Fulani', 'Greek', 'Ilocano', 'Jamaican Patois', 'Jingpo', 'Kiga', 'Kikongo', 'Kituba', 'Kokborok', 'Luganda', 'Luo', 'Makassar', 'Marwadi', 'Mauritian Creole', 'Meadow Mari', 'Meiteilon', 'Mizo', 'Myanmar', 'NKo', 'Nahuatl', 'Ndebele', 'Nepalbhasa', 'Occitan', 'Qʼeqchiʼ', 'Romani', 'Sami', 'Scots Gaelic', 'Sesotho', 'Seychellois Creole', 'Tamazight', 'Tongan', 'Tuvan']


In [8]:
# Hand-assigned three-letter codes for the language names that the
# automatic lookup left empty (keys are values of the 'Language' column).
language_codes = {
    'Abkhaz': 'abk',
    'Acehnese': 'ace',
    'Acholi': 'ach',
    'Avar': 'ava',
    'Buryat': 'bxm',
    'Cantonese': 'yue',
    # 'fry' is West Frisian, the variety offered as plain "Frisian";
    # the previous value 'frr' is Northern Frisian.
    'Frisian': 'fry',
    'Fulani': 'ful',
    'Greek': 'ell',
    'Ilocano': 'ilo',
    'Jamaican Patois': 'jam',
    'Jingpo': 'kac',
    'Kiga': 'cgg',
    'Kikongo': 'kwy',
    'Kituba': 'ktu',
    'Kokborok': 'trp',
    'Luganda': 'lug',
    'Luo': 'luo',
    'Makassar': 'mak',
    'Marwadi': 'rwr',
    'Mauritian Creole': 'mfe',
    'Meadow Mari': 'mhr',
    'Meiteilon': 'mni',
    'Mizo': 'lus',
    'Myanmar': 'mya',
    'NKo': 'nqo',
    'Nahuatl': 'nhe',
    # NOTE(review): parenthesised qualifiers are stripped from names earlier,
    # so a source entry such as "Ndebele (South)" would also land here;
    # confirm that 'nde' (North Ndebele) is the intended variety.
    'Ndebele': 'nde',
    'Nepalbhasa': 'new',
    'Occitan': 'oci',
    'Qʼeqchiʼ': 'kek',
    'Romani': 'rom',
    'Sami': 'sme',
    'Scots Gaelic': 'gla',
    'Sesotho': 'sot',
    'Seychellois Creole': 'crs',
    'Tamazight': 'tzm',
    'Tongan': 'ton',
    'Tuvan': 'tyv'
}


# Only rows whose Code is still missing are filled; existing codes are kept.
df['Code'] = df['Code'].fillna(df['Language'].map(language_codes))
df

Unnamed: 0,Code,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,Claude,GrammaticalGender
0,abk,Abkhaz,1,0,0,0,0,0,0
1,ace,Acehnese,1,0,0,0,0,0,0
2,ach,Acholi,1,0,0,0,0,0,0
3,aar,Afar,1,0,0,0,0,0,1
4,afr,Afrikaans,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
234,yid,Yiddish,1,0,1,0,0,0,1
235,yor,Yoruba,1,0,1,1,0,0,0
236,yua,Yucatec Maya,1,0,0,0,0,0,0
237,zap,Zapotec,1,0,0,0,0,0,0


In [9]:
# Re-check after the manual fill: names still lacking a code, with count.
none_list = df.loc[df['Code'].isna(), 'Language'].tolist()
print(len(none_list), none_list)

0 []


### Code-Language-Family

In [10]:
# Reference: https://github.com/dw-data/ai-languages/blob/main/README.md
# header=2 takes the column names from the sheet's third row (0-based index 2).
df2 = pd.read_excel(
    'data/Data.xlsx',
    sheet_name="Ethnologue",
    header=2,
)
df2.sample(10)

Unnamed: 0,users,DLS,family-link,family-link-href,languagecode,languagelink-href,vitality,languagecountry,languagename,lat,long
2881,Less than 10K,Still,Sign language,https://www.ethnologue.com/subgroup/2/,kgi,https://www.ethnologue.com/language/kgi/,Endangered,Malaysia,Selangor Sign Language,3.3061,101.4956
3679,10K to 1M,Still,Austro-Asiatic,https://www.ethnologue.com/subgroup/231/,lyg,https://www.ethnologue.com/language/lyg/,Endangered,India,Lyngngam,25.8531,91.8063
5771,,,Sign language,https://www.ethnologue.com/subgroup/2/,sqx,https://www.ethnologue.com/language/sqx/,Stable,Israel,Kufr Qassem Sign Language,31.9175,35.2228
2912,Less than 10K,Still,Trans-New Guinea,https://www.ethnologue.com/subgroup/1038/,khs,https://www.ethnologue.com/language/khs/,Stable,Papua New Guinea,Kasua,-6.7219,142.9542
1996,,Still,Sign language,https://www.ethnologue.com/subgroup/2/,gds,https://www.ethnologue.com/language/gds/,Endangered,Nepal,Ghandruk Sign Language,28.3792,83.8013
3375,10K to 1M,Emerging,Sino-Tibetan,https://www.ethnologue.com/subgroup/236/,lbf,https://www.ethnologue.com/language/lbf/,Endangered,India,Tinani,32.3992,77.487
2655,Less than 10K,Still,Afro-Asiatic,https://www.ethnologue.com/subgroup/31/,jie,https://www.ethnologue.com/language/jie/,Endangered,Nigeria,Jilbe,11.8014,14.5781
1641,Less than 10K,Still,Austronesian,https://www.ethnologue.com/subgroup/447/,dro,https://www.ethnologue.com/language/dro/,Endangered,Malaysia,"Melanau, Daro-Matu",2.4443,111.5872
5426,,Still,Australian,https://www.ethnologue.com/subgroup/87/,rxd,https://www.ethnologue.com/language/rxd/,Extinct,Australia,Ngardi,-21.0231,129.0308
2852,10K to 1M,Still,Dravidian,https://www.ethnologue.com/subgroup/1265/,kfb,https://www.ethnologue.com/language/kfb/,Stable,India,Northwestern Kolami,20.0626,78.0913


In [11]:
# Keep only the code, name and family columns of the Ethnologue sheet and
# give them the column names used by the main table.
langs = df2[['languagecode', 'languagename', 'family-link']].rename(
    columns={
        'languagecode': 'Code',
        'languagename': 'Language',
        'family-link': 'Family',
    }
)
langs.to_csv('data/languages.csv', index=False)
langs.sample(10)

Unnamed: 0,Code,Language,Family
1445,dak,Dakota,Siouan-Catawban
2090,gmm,Gbaya-Mbodomo,Niger-Congo
2334,hmm,"Miao, Central Mashan",Hmong-Mien
6311,tsy,Tebul Sign Language,Sign language
1740,eiv,Askopan,North Bougainville
7077,xwe,"Gbe, Xwela",Niger-Congo
5505,sdn,"Sardinian, Gallurese",Indo-European
1303,cob,Chicomuceltec,Mayan
6370,tvt,"Naga, Tutsa",Sino-Tibetan
4311,nam,Ngan’gityemerri,Australian


In [12]:
# Attach the family of each language by its code. The lookup table is first
# reduced to one row per Code so that the left join can never multiply rows
# of `df` if `langs` lists the same code more than once; codes with no
# match get a missing Family.
df = df.merge(
    langs[['Code', 'Family']].drop_duplicates(subset='Code'),
    on='Code',
    how='left',
)
df

Unnamed: 0,Code,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,Claude,GrammaticalGender,Family
0,abk,Abkhaz,1,0,0,0,0,0,0,Abkhaz-Adyghe
1,ace,Acehnese,1,0,0,0,0,0,0,Austronesian
2,ach,Acholi,1,0,0,0,0,0,0,Nilo-Saharan
3,aar,Afar,1,0,0,0,0,0,1,Afro-Asiatic
4,afr,Afrikaans,1,0,1,1,0,0,0,Indo-European
...,...,...,...,...,...,...,...,...,...,...
234,yid,Yiddish,1,0,1,0,0,0,1,
235,yor,Yoruba,1,0,1,1,0,0,0,Niger-Congo
236,yua,Yucatec Maya,1,0,0,0,0,0,0,Mayan
237,zap,Zapotec,1,0,0,0,0,0,0,


In [13]:
# Codes for which the merge found no family, printed with their count.
none_list = df.loc[df['Family'].isna(), 'Code'].tolist()
print(len(none_list), none_list)

36 ['sqi', 'ara', 'aym', 'aze', 'bal', 'bik', 'zho', 'din', 'doi', 'est', 'ful', 'grn', 'hmn', 'kau', 'kaz', 'kom', 'kok', 'kur', 'kir', 'lav', 'mlg', 'msa', 'mon', 'nep', 'orm', 'pus', 'fas', 'que', 'rom', 'swa', 'tyv', 'twi', 'uzb', 'sah', 'yid', 'zap']


In [14]:
# Hand-assigned families for the codes the merge could not resolve.
# Values are top-level family names, the same granularity as the merged
# 'Family' column (e.g. 'Indo-European', 'Niger-Congo'), not sub-branches.
language_families = {
    'sqi': 'Indo-European',
    'ara': 'Afro-Asiatic',
    'aym': 'Aymaran',
    'aze': 'Turkic',
    'bal': 'Indo-European',
    'bik': 'Austronesian',
    'zho': 'Sino-Tibetan',
    'din': 'Nilo-Saharan',
    'doi': 'Indo-European',
    'est': 'Uralic',
    'ful': 'Niger-Congo',
    'grn': 'Tupian',
    'hmn': 'Hmong-Mien',
    'kau': 'Nilo-Saharan',
    'kaz': 'Turkic',
    'kom': 'Uralic',
    'kok': 'Indo-European',
    'kur': 'Indo-European',
    'kir': 'Turkic',
    'lav': 'Indo-European',
    'mlg': 'Austronesian',
    'msa': 'Austronesian',
    'mon': 'Mongolic',
    'nep': 'Indo-European',
    'orm': 'Afro-Asiatic',
    'pus': 'Indo-European',
    # Was 'Indo-Iranian', a branch of Indo-European; the other Iranian
    # languages in this table ('bal', 'kur', 'pus') already use the top level.
    'fas': 'Indo-European',
    'que': 'Quechuan',
    'rom': 'Indo-European',
    'swa': 'Niger-Congo',
    'tyv': 'Turkic',
    # Was 'Kwa', a branch of Niger-Congo.
    'twi': 'Niger-Congo',
    'uzb': 'Turkic',
    'sah': 'Turkic',
    'yid': 'Indo-European',
    'zap': 'Otomanguean'
}


# Only rows whose Family is still missing are filled; merged values are kept.
df['Family'] = df['Family'].fillna(df['Code'].map(language_families))
df

Unnamed: 0,Code,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,Claude,GrammaticalGender,Family
0,abk,Abkhaz,1,0,0,0,0,0,0,Abkhaz-Adyghe
1,ace,Acehnese,1,0,0,0,0,0,0,Austronesian
2,ach,Acholi,1,0,0,0,0,0,0,Nilo-Saharan
3,aar,Afar,1,0,0,0,0,0,1,Afro-Asiatic
4,afr,Afrikaans,1,0,1,1,0,0,0,Indo-European
...,...,...,...,...,...,...,...,...,...,...
234,yid,Yiddish,1,0,1,0,0,0,1,Indo-European
235,yor,Yoruba,1,0,1,1,0,0,0,Niger-Congo
236,yua,Yucatec Maya,1,0,0,0,0,0,0,Mayan
237,zap,Zapotec,1,0,0,0,0,0,0,Otomanguean


In [15]:
# Re-check after the manual fill: codes still lacking a family, with count.
none_list = df.loc[df['Family'].isna(), 'Code'].tolist()
print(len(none_list), none_list)

0 []


In [16]:
# Subset of the final table: rows flagged with GrammaticalGender == 1.
gender_df = df.loc[df['GrammaticalGender'] == 1]
gender_df

Unnamed: 0,Code,Language,GoogleTranslate,DeepL,GPT,Gemini,Copilot,Claude,GrammaticalGender,Family
3,aar,Afar,1,0,0,0,0,0,1,Afro-Asiatic
5,sqi,Albanian,1,0,1,1,0,0,1,Indo-European
7,amh,Amharic,1,0,1,0,0,0,1,Afro-Asiatic
8,ara,Arabic,1,1,1,1,1,1,1,Afro-Asiatic
24,bel,Belarusian,1,0,1,1,0,0,1,Indo-European
...,...,...,...,...,...,...,...,...,...,...
222,ukr,Ukrainian,1,1,1,1,1,1,1,Indo-European
223,urd,Urdu,1,0,1,1,0,1,1,Indo-European
227,vec,Venetian,1,0,0,0,0,0,1,Indo-European
230,cym,Welsh,1,0,1,1,1,0,1,Indo-European


In [17]:
# Write the full table and the gender subset to CSV, without the row index.
for _frame, _csv_path in (
    (df, 'data/result.csv'),
    (gender_df, 'data/result_gender.csv'),
):
    _frame.to_csv(_csv_path, index=False)