In [1]:
import pandas as pd
from iso639 import Lang

In [2]:
df = pd.read_csv('source_data/grammatical-gender.csv') #From Wikipedia
df = df.sort_values(by='Language').reset_index(drop=True)

df

Unnamed: 0,Language
0,Afar
1,Agaw
2,Alamblak
3,Albanian
4,Amharic
...,...
85,Welsh
86,Worrorra
87,Yiddish
88,Zande


In [3]:
df.insert(0, 'Code', None)
for index, row in df.iterrows():
    if row['Code'] is None:  
        try:
            temp = Lang(row['Language']).pt3
            df.loc[df['Language'] == row['Language'], 'Code'] = temp
        except:
            continue  # Skip to the next iteration if there's an error
df

Unnamed: 0,Code,Language
0,aar,Afar
1,,Agaw
2,amp,Alamblak
3,sqi,Albanian
4,amh,Amharic
...,...,...
85,cym,Welsh
86,wro,Worrorra
87,yid,Yiddish
88,,Zande


In [4]:
none_list = df[df['Code'].isnull()]['Language'].tolist()
print(len(none_list), none_list)

9 ['Agaw', 'Friulan', 'Greek', 'Kalaw Lagaw Ya', 'Occitan', 'Romani', 'Tamazight', 'Tuareg', 'Zande']


In [5]:
language_codes = {'Agaw': 'awn',
                 'Friulan': 'fur',
                 'Greek': 'ell',
                 'Kalaw Lagaw Ya': 'mwp',
                 'Occitan': 'oci',
                 'Romani': 'rom',
                 'Tamazight': 'zgh',
                 'Tuareg': 'tmh',
                 'Zande': 'zne'}

In [6]:
df['Code'] = df['Code'].fillna(df['Language'].map(language_codes))
df

Unnamed: 0,Code,Language
0,aar,Afar
1,awn,Agaw
2,amp,Alamblak
3,sqi,Albanian
4,amh,Amharic
...,...,...
85,cym,Welsh
86,wro,Worrorra
87,yid,Yiddish
88,zne,Zande


In [7]:
df.to_csv('output_data/gramm_gend_code.csv', index=False)

In [8]:
# https://github.com/dw-data/ai-languages/blob/main/README.md
df2 = pd.read_excel('source_data/Data.xlsx', sheet_name="Ethnologue", header=2)
df2.sample(10)

Unnamed: 0,users,DLS,family-link,family-link-href,languagecode,languagelink-href,vitality,languagecountry,languagename,lat,long
2800,Less than 10K,Still,Afro-Asiatic,https://www.ethnologue.com/subgroup/31/,kcx,https://www.ethnologue.com/language/kcx/,Stable,Ethiopia,Kachama-Ganjule,6.392,37.9304
7128,10K to 1M,Still,Niger-Congo,https://www.ethnologue.com/subgroup/47/,ybl,https://www.ethnologue.com/language/ybl/,Stable,Nigeria,Yukuben,6.8956,9.7954
1368,,Still,Miwok-Costanoan,https://www.ethnologue.com/subgroup/3852/,csi,https://www.ethnologue.com/language/csi/,Endangered,United States,"Miwok, Coast",38.0005,-122.7255
1023,Less than 10K,Still,East Geelvink Bay,https://www.ethnologue.com/subgroup/1397/,bvz,https://www.ethnologue.com/language/bvz/,Stable,Indonesia,Bauzi,-2.3979,137.5845
5108,10K to 1M,Still,Mayan,https://www.ethnologue.com/subgroup/268/,poc,https://www.ethnologue.com/language/poc/,Endangered,Guatemala,Poqomam,14.7325,-89.738
7372,10K to 1M,Still,Niger-Congo,https://www.ethnologue.com/subgroup/47/,zin,https://www.ethnologue.com/language/zin/,Endangered,Tanzania,Zinza,-2.6722,32.213
720,Less than 10K,Emerging,Austronesian,https://www.ethnologue.com/subgroup/447/,bjl,https://www.ethnologue.com/language/bjl/,Stable,Papua New Guinea,Bulu,-5.0601,150.0947
4205,Less than 10K,Still,Austronesian,https://www.ethnologue.com/subgroup/447/,mvt,https://www.ethnologue.com/language/mvt/,Endangered,Vanuatu,Mpotovoro,-15.9128,167.2081
4619,10K to 1M,Still,Uto-Aztecan,https://www.ethnologue.com/subgroup/1066/,npl,https://www.ethnologue.com/language/npl/,Stable,Mexico,"Nahuatl, Southeastern Puebla",18.3769,-97.2526
7202,Less than 10K,Still,Torricelli,https://www.ethnologue.com/subgroup/2211/,yll,https://www.ethnologue.com/language/yll/,Stable,Papua New Guinea,Yil,-3.5043,142.2036


In [9]:
langs = df2[['languagecode', 'family-link', 'users', 'vitality', 'languagecountry', 'DLS']].copy()
langs.rename(columns={'languagecode': 'Code',  
                      'family-link': 'Family',
                      'users': 'Speaker',
                      'vitality': 'Vitality',
                      'languagecountry': 'Country',
                      'DLS': 'Digital Language Support'}, inplace=True)
langs.to_csv('output_data/languages.csv', index=False)
langs.sample(10)

Unnamed: 0,Code,Family,Speaker,Vitality,Country,Digital Language Support
4688,nua,Austronesian,Less than 10K,Endangered,New Caledonia,Still
3293,kxo,Language isolate,,Endangered,Brazil,Still
6168,tmd,Piawi,Less than 10K,Stable,Papua New Guinea,Still
5984,tcl,Sino-Tibetan,,Endangered,Myanmar,Still
6859,wxa,Unclassified,10K to 1M,Endangered,China,Still
496,bac,Austronesian,10K to 1M,Stable,Indonesia,Still
4472,nij,Austronesian,10K to 1M,Institutional,Indonesia,Still
4539,nlo,Niger-Congo,Less than 10K,Stable,Democratic Republic of the Congo,Still
2830,kef,Niger-Congo,Less than 10K,Stable,Togo,Still
6547,utr,Niger-Congo,10K to 1M,Stable,Nigeria,Still


In [10]:
df = df.merge(langs, on='Code', how='left')
df

Unnamed: 0,Code,Language,Family,Speaker,Vitality,Country,Digital Language Support
0,aar,Afar,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Emerging
1,awn,Agaw,Afro-Asiatic,10K to 1M,Stable,Ethiopia,Emerging
2,amp,Alamblak,Sepik,Less than 10K,Endangered,Papua New Guinea,Still
3,sqi,Albanian,,,,,
4,amh,Amharic,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Vital
...,...,...,...,...,...,...,...
85,cym,Welsh,Indo-European,10K to 1M,Institutional,United Kingdom,Vital
86,wro,Worrorra,Australian,,Endangered,Australia,Still
87,yid,Yiddish,,,,,
88,zne,Zande,Niger-Congo,1M to 1B,Institutional,Democratic Republic of the Congo,Still


In [11]:
nan_info = df.isna().sum()
nan_info

Code                         0
Language                     0
Family                      13
Speaker                     18
Vitality                    13
Country                     13
Digital Language Support    13
dtype: int64

In [12]:
none_list = df[df['Family'].isna()]['Code'].tolist()
print(len(none_list), none_list)

13 ['sqi', 'ara', 'kok', 'kur', 'lav', 'orm', 'pus', 'rom', 'srd', 'swa', 'tmh', 'yid', 'zza']


In [13]:
mapping = {
    'sqi': {'Family': 'Indo-European', 'Speaker': '1M to 1B', 'Vitality': 'Stable', 'Country': 'Albania', 'Digital Language Support': 'Emerging'},
    'ara': {'Family': 'Afro-Asiatic', 'Speaker': 'More than 1B', 'Vitality': 'Institutional', 'Country': 'Egypt', 'Digital Language Support': 'Thriving'},
    'kok': {'Family': 'Indo-European', 'Speaker': '1M to 1B', 'Vitality': 'Endangered', 'Country': 'India', 'Digital Language Support': 'Emerging'},
    'kur': {'Family': 'Indo-European', 'Speaker': '1M to 1B', 'Vitality': 'Endangered', 'Country': 'Turkey', 'Digital Language Support': 'Still'},
    'lav': {'Family': 'Indo-European', 'Speaker': '1M to 1B', 'Vitality': 'Stable', 'Country': 'Latvia', 'Digital Language Support': 'Ascending'},
    'orm': {'Family': 'Afro-Asiatic', 'Speaker': '1M to 1B', 'Vitality': 'Stable', 'Country': 'Ethiopia', 'Digital Language Support': 'Emerging'},
    'pus': {'Family': 'Indo-European', 'Speaker': '1M to 1B', 'Vitality': 'Stable', 'Country': 'Afghanistan', 'Digital Language Support': 'Emerging'},
    'rom': {'Family': 'Indo-European', 'Speaker': '10K to 1M', 'Vitality': 'Endangered', 'Country': 'Romania', 'Digital Language Support': 'Still'},
    'srd': {'Family': 'Indo-European', 'Speaker': '10K to 1M', 'Vitality': 'Endangered', 'Country': 'Italy', 'Digital Language Support': 'Still'},
    'swa': {'Family': 'Niger-Congo', 'Speaker': '1M to 1B', 'Vitality': 'Institutional', 'Country': 'Tanzania', 'Digital Language Support': 'Vital'},
    'tmh': {'Family': 'Afro-Asiatic', 'Speaker': '10K to 1M', 'Vitality': 'Endangered', 'Country': 'Mali', 'Digital Language Support': 'Still'},
    'yid': {'Family': 'Indo-European', 'Speaker': '10K to 1M', 'Vitality': 'Endangered', 'Country': 'USA', 'Digital Language Support': 'Emerging'},
    'zza': {'Family': 'Indo-European', 'Speaker': '10K to 1M', 'Vitality': 'Endangered', 'Country': 'Turkey', 'Digital Language Support': 'Still'},
}


In [14]:
for code, values in mapping.items():
    for column, value in values.items():
        df.loc[df['Code'] == code, column] = value

df

Unnamed: 0,Code,Language,Family,Speaker,Vitality,Country,Digital Language Support
0,aar,Afar,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Emerging
1,awn,Agaw,Afro-Asiatic,10K to 1M,Stable,Ethiopia,Emerging
2,amp,Alamblak,Sepik,Less than 10K,Endangered,Papua New Guinea,Still
3,sqi,Albanian,Indo-European,1M to 1B,Stable,Albania,Emerging
4,amh,Amharic,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Vital
...,...,...,...,...,...,...,...
85,cym,Welsh,Indo-European,10K to 1M,Institutional,United Kingdom,Vital
86,wro,Worrorra,Australian,,Endangered,Australia,Still
87,yid,Yiddish,Indo-European,10K to 1M,Endangered,USA,Emerging
88,zne,Zande,Niger-Congo,1M to 1B,Institutional,Democratic Republic of the Congo,Still


In [15]:
rows_with_nan = df[df.isna().any(axis=1)]

# Get unique codes
unique_codes_with_nan = rows_with_nan['Code'].unique()

# Display result
df.loc[df['Code'].isin(unique_codes_with_nan)]

Unnamed: 0,Code,Language,Family,Speaker,Vitality,Country,Digital Language Support
13,cop,Coptic,Afro-Asiatic,,Endangered,Egypt,Emerging
41,lld,Ladin,Indo-European,,Endangered,Italy,Emerging
42,lat,Latin,Indo-European,,Endangered,Vatican State,Ascending
80,zgh,Tamazight,Afro-Asiatic,,Institutional,Morocco,Ascending
86,wro,Worrorra,Australian,,Endangered,Australia,Still


In [16]:
print(df.groupby('Vitality').size())
print('-'*50)
print(df.groupby('Speaker').size())
print('-'*50)
print(df.groupby('Family').size())
print('-'*50)
print(df.groupby('Digital Language Support').size())

Vitality
Endangered       22
Institutional    55
Stable           13
dtype: int64
--------------------------------------------------
Speaker
10K to 1M        21
1M to 1B         58
Less than 10K     5
More than 1B      1
dtype: int64
--------------------------------------------------
Family
Afro-Asiatic         13
Australian            2
Dravidian             1
Indo-European        65
Language isolate      1
Matacoan              1
Nakh-Daghestanian     1
Niger-Congo           5
Sepik                 1
dtype: int64
--------------------------------------------------
Digital Language Support
Ascending    23
Emerging     14
Still        11
Thriving     27
Vital        15
dtype: int64


### Digital Language Support 
**Still** — this language shows no signs of digital support \
**Emerging** — the language has some content in digital form and/or encoding tools \
**Ascending** — the language has some spell checking or localized tools or machine translation as well \
**Vital** — the language is supported by multiple tools in all of the above categories and as well as some speech processing \
**Thriving** — the language has all of the above plus virtual assistants

### Vitality 
**Institutional** — The language has been developed to the point that it is used and sustained by institutions beyond the home and community. \
**Stable** — The language is not being sustained by formal institutions, but it is still the norm in the home and community that all children learn and use the language. \
**Endangered** — It is no longer the norm that children learn and use this language. \
**Extinct** - The language is no longer used and no one retains a sense of ethnic identity associated with the language.

### Exclude languages with NaN users (language has no users) even if there is digital language support

In [17]:
df = df.dropna(subset=['Speaker'])
df

Unnamed: 0,Code,Language,Family,Speaker,Vitality,Country,Digital Language Support
0,aar,Afar,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Emerging
1,awn,Agaw,Afro-Asiatic,10K to 1M,Stable,Ethiopia,Emerging
2,amp,Alamblak,Sepik,Less than 10K,Endangered,Papua New Guinea,Still
3,sqi,Albanian,Indo-European,1M to 1B,Stable,Albania,Emerging
4,amh,Amharic,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Vital
...,...,...,...,...,...,...,...
84,vec,Venetian,Indo-European,1M to 1B,Stable,Italy,Ascending
85,cym,Welsh,Indo-European,10K to 1M,Institutional,United Kingdom,Vital
87,yid,Yiddish,Indo-European,10K to 1M,Endangered,USA,Emerging
88,zne,Zande,Niger-Congo,1M to 1B,Institutional,Democratic Republic of the Congo,Still


In [18]:
print(df.groupby('Vitality').size())
print('-'*50)
print(df.groupby('Speaker').size())
print('-'*50)
print(df.groupby('Family').size())
print('-'*50)
print(df.groupby('Digital Language Support').size())

Vitality
Endangered       18
Institutional    54
Stable           13
dtype: int64
--------------------------------------------------
Speaker
10K to 1M        21
1M to 1B         58
Less than 10K     5
More than 1B      1
dtype: int64
--------------------------------------------------
Family
Afro-Asiatic         11
Australian            1
Dravidian             1
Indo-European        63
Language isolate      1
Matacoan              1
Nakh-Daghestanian     1
Niger-Congo           5
Sepik                 1
dtype: int64
--------------------------------------------------
Digital Language Support
Ascending    21
Emerging     12
Still        10
Thriving     27
Vital        15
dtype: int64


### Exclude languages that show no signs of digital support

In [28]:
df = df[df['Digital Language Support'] != 'Still']
df

Unnamed: 0,Code,Language,Family,Speaker,Vitality,Country,Digital Language Support
0,aar,Afar,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Emerging
1,awn,Agaw,Afro-Asiatic,10K to 1M,Stable,Ethiopia,Emerging
3,sqi,Albanian,Indo-European,1M to 1B,Stable,Albania,Emerging
4,amh,Amharic,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Vital
5,ara,Arabic,Afro-Asiatic,More than 1B,Institutional,Egypt,Thriving
...,...,...,...,...,...,...,...
82,ukr,Ukrainian,Indo-European,1M to 1B,Institutional,Ukraine,Thriving
83,urd,Urdu,Indo-European,1M to 1B,Institutional,Pakistan,Vital
84,vec,Venetian,Indo-European,1M to 1B,Stable,Italy,Ascending
85,cym,Welsh,Indo-European,10K to 1M,Institutional,United Kingdom,Vital


In [29]:
df.to_csv('output_data/dataset_v1.csv', index=False)

In [24]:
print(df.groupby('Vitality').size ())
print('-'*50) 
print(df.groupby('Speaker').size())
print('-'*50)
# print(df.groupby('Family').size())
# print('-'*50)
print(df.groupby('Digital Language Support').size())

Vitality
Endangered       11
Institutional    53
Stable           11
dtype: int64
--------------------------------------------------
Speaker
10K to 1M        16
1M to 1B         56
Less than 10K     2
More than 1B      1
dtype: int64
--------------------------------------------------
Digital Language Support
Ascending    21
Emerging     12
Thriving     27
Vital        15
dtype: int64


### Digital Language Support 
**Still** — this language shows no signs of digital support \
**Emerging** — the language has some content in digital form and/or encoding tools \
**Ascending** — the language has some spell checking or localized tools or machine translation as well \
**Vital** — the language is supported by multiple tools in all of the above categories and as well as some speech processing \
**Thriving** — the language has all of the above plus virtual assistants

### Vitality 
**Institutional** — The language has been developed to the point that it is used and sustained by institutions beyond the home and community. \
**Stable** — The language is not being sustained by formal institutions, but it is still the norm in the home and community that all children learn and use the language. \
**Endangered** — It is no longer the norm that children learn and use this language. \
**Extinct** - The language is no longer used and no one retains a sense of ethnic identity associated with the language.

In [23]:
df[df.Speaker == 'Less than 10K']

Unnamed: 0,Code,Language,Family,Speaker,Vitality,Country,Digital Language Support
14,cor,Cornish,Indo-European,Less than 10K,Endangered,United Kingdom,Ascending
49,glv,Manx,Indo-European,Less than 10K,Endangered,Isle of Man,Ascending


In [27]:
df[df['Digital Language Support'] == 'Emerging']

Unnamed: 0,Code,Language,Family,Speaker,Vitality,Country,Digital Language Support
0,aar,Afar,Afro-Asiatic,1M to 1B,Institutional,Ethiopia,Emerging
1,awn,Agaw,Afro-Asiatic,10K to 1M,Stable,Ethiopia,Emerging
3,sqi,Albanian,Indo-European,1M to 1B,Stable,Albania,Emerging
7,bej,Beja,Afro-Asiatic,1M to 1B,Institutional,Sudan,Emerging
12,che,Chechen,Nakh-Daghestanian,1M to 1B,Stable,Russian Federation,Emerging
39,kok,Konkani,Indo-European,1M to 1B,Endangered,India,Emerging
55,orm,Oromo,Afro-Asiatic,1M to 1B,Stable,Ethiopia,Emerging
56,pus,Pashto,Indo-European,1M to 1B,Stable,Afghanistan,Emerging
63,run,Rundi,Niger-Congo,1M to 1B,Institutional,Burundi,Emerging
65,rue,Rusyn,Indo-European,10K to 1M,Stable,Ukraine,Emerging
