In [1]:
# Define the gpu  on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

import pandas as pd
import numpy as np
import json

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


## Lang2vec

1) Querying the URIEL database, as well as the trained language vectors from Malaviya et al, 2017. The main operation is `get_features(languages, feature_sets, header=False, minimal=False)`, which returns a dictionary with the feature vector for every language in languages for the feature_sets.

2) Returning pre-computed distances between languages, based on some typological information. The main operation here is `distance(distance, language1, language2)`, which returns a float distance.

In [2]:
#!pip3 install lang2vec
import lang2vec.lang2vec as l2v

In [4]:
# Available feature sets: the names lang2vec exposes in l2v.FEATURE_SETS.
# `features` is iterated further down to query every set for our languages.
features = l2v.FEATURE_SETS
features

['syntax_wals',
 'phonology_wals',
 'syntax_sswl',
 'syntax_ethnologue',
 'phonology_ethnologue',
 'inventory_ethnologue',
 'inventory_phoible_aa',
 'inventory_phoible_gm',
 'inventory_phoible_saphon',
 'inventory_phoible_spa',
 'inventory_phoible_ph',
 'inventory_phoible_ra',
 'inventory_phoible_upsid',
 'syntax_knn',
 'phonology_knn',
 'inventory_knn',
 'syntax_average',
 'phonology_average',
 'inventory_average',
 'fam',
 'id',
 'geo',
 'learned']

In [10]:
# Print every language code of interest that lang2vec does not list in l2v.LANGUAGES
candidate_codes = ["slv", "hrv", "srp", "mkd", "bul", "isl", "tur", "ukr", "cat", "ell", "mlt", "alb", "hbs", "cnr", "bos"]
for code in (c for c in candidate_codes if c not in l2v.LANGUAGES):
	print(code)

hbs
cnr


Montenegrin (`cnr`) is not supported. The check also reports `hbs` as missing from `l2v.LANGUAGES`.

In [37]:
# For each feature set, collect the feature vector of every language of interest,
# so that the coverage of each set can be compared across languages.
# Result layout: lang_features[feature_set][language] -> list of feature values.
lang_features = {}

for feature in features:
	current_feature_dict = {}
	for lang in ["slv", "hrv", "srp", "hbs", "mkd", "bul", "isl", "tur", "ukr", "cat", "ell", "mlt", "alb", "eng"]:
		try:
			current_feature_dict[lang] = l2v.get_features(lang, feature)[lang]
		except Exception as error:
			# Best effort: a language/feature-set pair that cannot be retrieved is
			# recorded with the "NaN" placeholder instead of aborting the whole scan.
			# Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupts
			# working, and the message makes the failure visible instead of silent.
			print(f"{feature}/{lang}: features could not be retrieved ({error})")
			current_feature_dict[lang] = "NaN"
	lang_features[feature] = current_feature_dict

lang_features

{'syntax_wals': {'slv': [1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   1.0,
   0.0,
   1.0,
   1.0,
   1.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   '--',
   '--',
   '--',
   '--',
   '--',
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   0.0,
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   0.0,
   1.0,
   '--',
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   1.0,
   0.0,
   '--',
   '--',
   '--',
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--'],
  'hrv': ['--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--',
   '--

In [65]:
# Tabulate the "geo" vectors (one column per language) and turn the "--"
# placeholders into NaN in a single chained expression.
df_lang_features = pd.DataFrame(lang_features["geo"]).replace("--", np.nan)

# Summary statistics first, then the table itself as the cell output
geo_summary = df_lang_features.describe()
display(geo_summary)

df_lang_features

Unnamed: 0,slv,hrv,srp,hbs,mkd,bul,isl,tur,ukr,cat,ell,mlt,alb,eng
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,0.499099,0.499141,0.499153,0.49916,0.499191,0.499159,0.498743,0.499239,0.49907,0.499115,0.499179,0.49929,0.499193,0.49894
std,0.217936,0.217945,0.217886,0.217936,0.217921,0.217866,0.217886,0.217888,0.217838,0.217997,0.217888,0.217898,0.217933,0.217938
min,0.0176,0.0239,0.0343,0.0317,0.0196,0.0297,0.0243,0.0299,0.0334,0.0023,0.0239,0.027,0.0256,0.0141
25%,0.33,0.33295,0.33825,0.3329,0.33835,0.3331,0.32985,0.32805,0.338,0.3334,0.335,0.33165,0.33425,0.33085
50%,0.5013,0.4988,0.498,0.499,0.5019,0.4955,0.5003,0.4934,0.4989,0.4934,0.4976,0.5014,0.501,0.5008
75%,0.6609,0.66625,0.66775,0.6676,0.6688,0.6669,0.66555,0.66545,0.6619,0.67065,0.67005,0.66415,0.6712,0.6706
max,0.9811,0.9893,0.98,0.9894,0.9787,0.968,0.9618,0.9775,0.9609,0.9694,0.968,0.9722,0.9859,0.9762


Unnamed: 0,slv,hrv,srp,hbs,mkd,bul,isl,tur,ukr,cat,ell,mlt,alb,eng
0,0.7239,0.7176,0.7115,0.7121,0.6964,0.7056,0.8322,0.6851,0.7324,0.7035,0.6999,0.6666,0.7007,0.7665
1,0.7704,0.7665,0.7662,0.7640,0.7513,0.7637,0.8328,0.7510,0.7892,0.7366,0.7579,0.7138,0.7539,0.7924
2,0.7753,0.7675,0.7572,0.7601,0.7428,0.7486,0.9000,0.7212,0.7750,0.7640,0.7433,0.7201,0.7482,0.8278
3,0.6888,0.6839,0.6818,0.6804,0.6667,0.6784,0.7795,0.6647,0.7044,0.6607,0.6726,0.6314,0.6698,0.7214
4,0.8341,0.8295,0.8269,0.8259,0.8117,0.8225,0.8857,0.8040,0.8490,0.8018,0.8167,0.7769,0.8151,0.8569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,0.2387,0.2468,0.2579,0.2545,0.2721,0.2668,0.1085,0.2950,0.2407,0.2471,0.2721,0.2931,0.2665,0.1835
295,0.2960,0.3006,0.3021,0.3037,0.3171,0.3051,0.2111,0.3181,0.2793,0.3257,0.3110,0.3532,0.3142,0.2659
296,0.1912,0.1969,0.2017,0.2016,0.2168,0.2072,0.1115,0.2279,0.1804,0.2177,0.2130,0.2488,0.2129,0.1579
297,0.2688,0.2755,0.2824,0.2814,0.2975,0.2889,0.1566,0.3109,0.2621,0.2874,0.2946,0.3259,0.2930,0.2240


In [32]:
# Print, as a markdown table, only the rows for which Maltese ("mlt") has a value:
# a comparison with NaN is False, so `> -1` drops the rows where "mlt" is missing.
# NOTE(review): this presumes no feature value is <= -1 - confirm for the set shown.
print(df_lang_features[df_lang_features["mlt"] > -1].to_markdown())

|    |   slv |   hrv |   srp |   hbs |   mkd |   bul |   isl |   tur |   ukr |   cat |   ell |   mlt |   alb |
|---:|------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|
| 12 |   nan |   nan |   nan |   nan |   nan |   nan |     1 |     0 |     1 |   nan |     1 |     1 |   nan |
| 13 |   nan |   nan |   nan |   nan |   nan |   nan |     1 |     0 |     1 |   nan |     1 |     1 |   nan |
| 66 |   nan |   nan |   nan |   nan |   nan |   nan |     1 |     0 |   nan |     0 |     0 |     1 |     0 |
| 69 |   nan |   nan |   nan |   nan |   nan |     1 |     0 |     1 |   nan |   nan |     1 |     1 |   nan |
| 70 |   nan |   nan |   nan |   nan |   nan |     1 |     1 |     1 |   nan |   nan |     1 |     1 |   nan |
| 71 |   nan |   nan |   nan |   nan |   nan |     0 |     0 |     1 |   nan |   nan |     0 |     0 |   nan |


syntax_wals: For Serbian (sr) and Croatian (hr), we need to use the "hbs" language code to get any values. For Maltese, we get only 6 values, and none of them is present for Slovenian, HBS or Macedonian. So there is no syntax_wals feature that has a value for all of our languages.

In [34]:
# List the feature sets that were collected in lang_features
lang_features.keys()

dict_keys(['syntax_wals', 'phonology_wals', 'syntax_sswl', 'syntax_ethnologue', 'phonology_ethnologue', 'inventory_ethnologue', 'inventory_phoible_aa', 'inventory_phoible_gm', 'inventory_phoible_saphon', 'inventory_phoible_spa', 'inventory_phoible_ph', 'inventory_phoible_ra', 'inventory_phoible_upsid', 'syntax_knn', 'phonology_knn', 'inventory_knn', 'syntax_average', 'phonology_average', 'inventory_average', 'fam', 'id', 'geo', 'learned'])

In [8]:
# Save the lang_features
# NOTE(review): the file name spells "featues" (sic). It is left untouched here
# because renaming it would change where the data is written.
with open("datasets/lang2vec_featues.json", "w") as json_file:
	# Serialise the whole feature set -> language -> vector dictionary as JSON
	json.dump(lang_features, json_file)

phonology_wals does not cover Icelandic, Ukrainian and any South Slavic language, except for Bulgarian. syntax_sswl does not cover Macedonian, Maltese and Albanian. syntax_ethnologue does not cover Macedonian, Ukrainian, Catalan and Maltese. phonology_ethnologue covers only Serbian and HBS out of our languages. inventory_ethnologue, inventory_phoible_aa, inventory_phoible_gm, inventory_phoible_saphon and inventory_phoible_ra do not cover any of our languages. inventory_phoible_spa does not cover Ukrainian, Catalan and any South Slavic language except Bulgarian. inventory_phoible_ph does not cover Serbian, Bulgarian, Icelandic, Turkish, Greek, Maltese and Albanian. inventory_phoible_upsid does not cover Icelandic, Ukrainian, Catalan, Maltese and any South Slavic language except Bulgarian.

In [16]:
# Inspect one feature set at a time: replace "geo" with any other key of
# lang_features (see lang_features.keys()) to look at a different set.
# The "--" placeholders become NaN so that the columns are numeric.
df_lang_features = pd.DataFrame(lang_features["geo"]).replace("--", np.nan)

# Rounded summary statistics per language
display(df_lang_features.describe().round(2))

# Row-wise mean over all language columns
df_lang_features["average"] = df_lang_features.mean(axis=1)

# Keep only the rows whose mean is neither exactly 0 nor exactly 1. Such a mean is
# taken to indicate that the languages share the same value for that feature
# (this presumes 0/1-valued features).
row_mean = df_lang_features["average"]
df_lang_features = df_lang_features[(row_mean != 0) & (row_mean != 1)]

display(df_lang_features.head(5))

df_lang_features.shape

Unnamed: 0,slv,hrv,srp,hbs,mkd,bul,isl,tur,ukr,cat,ell,mlt,alb
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
std,0.22,0.22,0.22,0.22,0.22,0.22,0.22,0.22,0.22,0.22,0.22,0.22,0.22
min,0.02,0.02,0.03,0.03,0.02,0.03,0.02,0.03,0.03,0.0,0.02,0.03,0.03
25%,0.33,0.33,0.34,0.33,0.34,0.33,0.33,0.33,0.34,0.33,0.34,0.33,0.33
50%,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.49,0.5,0.49,0.5,0.5,0.5
75%,0.66,0.67,0.67,0.67,0.67,0.67,0.67,0.67,0.66,0.67,0.67,0.66,0.67
max,0.98,0.99,0.98,0.99,0.98,0.97,0.96,0.98,0.96,0.97,0.97,0.97,0.99


Unnamed: 0,slv,hrv,srp,hbs,mkd,bul,isl,tur,ukr,cat,ell,mlt,alb,average
0,0.7239,0.7176,0.7115,0.7121,0.6964,0.7056,0.8322,0.6851,0.7324,0.7035,0.6999,0.6666,0.7007,0.714423
1,0.7704,0.7665,0.7662,0.764,0.7513,0.7637,0.8328,0.751,0.7892,0.7366,0.7579,0.7138,0.7539,0.762869
2,0.7753,0.7675,0.7572,0.7601,0.7428,0.7486,0.9,0.7212,0.775,0.764,0.7433,0.7201,0.7482,0.763331
3,0.6888,0.6839,0.6818,0.6804,0.6667,0.6784,0.7795,0.6647,0.7044,0.6607,0.6726,0.6314,0.6698,0.681777
4,0.8341,0.8295,0.8269,0.8259,0.8117,0.8225,0.8857,0.804,0.849,0.8018,0.8167,0.7769,0.8151,0.823062


(299, 14)

In [12]:
# Show only the rows where Maltese ("mlt") has a value: a comparison with NaN is
# False, so `> -1` filters out the rows in which "mlt" is missing.
df_lang_features[df_lang_features["mlt"] > -1]

Unnamed: 0,slv,hrv,srp,hbs,mkd,bul,isl,tur,ukr,cat,ell,mlt,alb,average
12,,,1.0,1.0,,,1.0,0.0,1.0,,1.0,1.0,,0.857143
13,,,,,,,1.0,0.0,1.0,,1.0,1.0,,0.8
66,,,,,,,1.0,0.0,,0.0,0.0,1.0,0.0,0.333333
69,,,,,,1.0,0.0,1.0,,,1.0,1.0,,0.8
71,,,,,,0.0,0.0,1.0,,,0.0,0.0,,0.2


# Calculating correlation

Tips from the authors of lang2vec: In general, users will probably want to use the union or average of relevant sources, or use the knn predictions.

Promising features:
- syntax_knn: 55 features where languages are different
- inventory_knn: 57 features where languages are different
- fam: 42 features where languages are different
- inventory_average: 63 features where languages are different
- geo: 299 features where languages are different

K-nearest neighbors approach: taking average of features of k-nearest neighbors for the language for which the features are missing (accuracy is said to be 93%).

Average: averages of sets.

phonology_knn has only 7 features where the values differ between our languages.
syntax_average is not useful because it does not have even one feature that is present for all languages.

In [10]:
# Compare cosine similarities of vectors
def cosine_similarity(x, y):
    """Return the cosine similarity of two equally long vectors.

    Args:
        x: First vector; any array-like of numbers (list, tuple, NumPy array).
        y: Second vector; must have the same length as ``x``.

    Returns:
        ``dot(x, y) / (||x|| * ||y||)`` as a float, ``None`` when the two
        vectors differ in length, or NaN when either vector has zero
        magnitude (the similarity is undefined in that case).
    """
    # Accept plain Python sequences as well: `x**2` below is only defined for arrays
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Ensure length of x and y are the same
    if len(x) != len(y):
        return None

    # Compute the dot product between x and y
    dot_product = np.dot(x, y)

    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x**2))
    magnitude_y = np.sqrt(np.sum(y**2))
    denominator = magnitude_x * magnitude_y

    # A zero vector has no direction: return NaN explicitly instead of
    # dividing by zero (which yields NaN together with a RuntimeWarning)
    if denominator == 0:
        return np.nan

    # Compute the cosine similarity
    return dot_product / denominator

Create a dictionary with only the promising features.

In [4]:
# Rebuild lang_features, restricted to the promising feature sets: for each of
# them, collect the feature vector of every language of interest.
# Result layout: lang_features[feature_set][language] -> list of feature values.
lang_features = {}
features = ["syntax_knn", "inventory_knn", "fam", "inventory_average", "geo"]

for feature in features:
	current_feature_dict = {}
	for lang in ["slv", "hrv", "srp", "hbs", "mkd", "bul", "isl", "tur", "ukr", "cat", "ell", "mlt", "alb", "eng"]:
		try:
			current_feature_dict[lang] = l2v.get_features(lang, feature)[lang]
		except Exception as error:
			# Best effort: a language/feature-set pair that cannot be retrieved is
			# recorded with the "NaN" placeholder instead of aborting the whole scan.
			# Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupts
			# working, and the message makes the failure visible instead of silent.
			print(f"{feature}/{lang}: features could not be retrieved ({error})")
			current_feature_dict[lang] = "NaN"
	lang_features[feature] = current_feature_dict

lang_features

{'syntax_knn': {'slv': [1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   1.0,
   1.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   1.0,
   1.0,
   0.0,
   0.0,
   1.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   1.0,
   1.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0],
  'hrv': [1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   0.0,
   1.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   

In [7]:
# Save the dict of promising feature sets (feature set -> language -> vector) as JSON
with open("datasets/lang2vec_features_promising_selection.json", "w") as json_file:
	json.dump(lang_features, json_file)

In [8]:
# Quick look at the "geo" vectors as a table with one column per language
pd.DataFrame(lang_features["geo"])

Unnamed: 0,slv,hrv,srp,hbs,mkd,bul,isl,tur,ukr,cat,ell,mlt,alb,eng
0,0.7239,0.7176,0.7115,0.7121,0.6964,0.7056,0.8322,0.6851,0.7324,0.7035,0.6999,0.6666,0.7007,0.7665
1,0.7704,0.7665,0.7662,0.7640,0.7513,0.7637,0.8328,0.7510,0.7892,0.7366,0.7579,0.7138,0.7539,0.7924
2,0.7753,0.7675,0.7572,0.7601,0.7428,0.7486,0.9000,0.7212,0.7750,0.7640,0.7433,0.7201,0.7482,0.8278
3,0.6888,0.6839,0.6818,0.6804,0.6667,0.6784,0.7795,0.6647,0.7044,0.6607,0.6726,0.6314,0.6698,0.7214
4,0.8341,0.8295,0.8269,0.8259,0.8117,0.8225,0.8857,0.8040,0.8490,0.8018,0.8167,0.7769,0.8151,0.8569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,0.2387,0.2468,0.2579,0.2545,0.2721,0.2668,0.1085,0.2950,0.2407,0.2471,0.2721,0.2931,0.2665,0.1835
295,0.2960,0.3006,0.3021,0.3037,0.3171,0.3051,0.2111,0.3181,0.2793,0.3257,0.3110,0.3532,0.3142,0.2659
296,0.1912,0.1969,0.2017,0.2016,0.2168,0.2072,0.1115,0.2279,0.1804,0.2177,0.2130,0.2488,0.2129,0.1579
297,0.2688,0.2755,0.2824,0.2814,0.2975,0.2889,0.1566,0.3109,0.2621,0.2874,0.2946,0.3259,0.2930,0.2240


In [12]:
# For each feature set and language, compute the cosine similarity of the language
# vector to a) the Slovenian vector and b) the English vector, plus their mean.
# Result layout: lang_similarities_dict[feature_set][language_code] -> dict of scores.
lang_similarities_dict = {}

# Maps the language codes used in lang_features to the codes used as keys in
# lang_similarities_dict. Defined once, since it does not change between feature sets.
lang_dict = {'slv': "sl", 'hrv':"hr", 'srp':"sr", 'hbs':"hbs", 'mkd':"mk", 'bul':"bg", 'isl':"is", 'tur':"tr", 'ukr':"uk", 'cat':"ca",'ell':"el", 'mlt':"mt", 'alb':"sq"}


def lang_similarity(lang1, lang2, df=None):
	"""Print and return the cosine similarity of two language columns of ``df``.

	Args:
		lang1: Column name of the first language.
		lang2: Column name of the second language.
		df: Table with one column per language. When omitted, the module-level
			``feature_df`` (the most recently processed feature set) is used.

	Returns:
		Whatever ``cosine_similarity`` returns for the two columns.
	"""
	if df is None:
		df = feature_df
	x = df[lang1].to_list()
	y = df[lang2].to_list()
	result = cosine_similarity(np.array(x), np.array(y))
	print(result)
	return result


for feature in ["syntax_knn", "inventory_knn", "fam", "inventory_average", "geo"]:
	# One column per language; the "--" placeholders become NaN
	feature_df = pd.DataFrame(lang_features[feature]).replace("--", np.nan)

	# Add average value of all features in a row
	feature_df["average"] = feature_df.mean(axis=1)

	# Keep only rows where the features are different - average is different than 0 or 1
	feature_df = feature_df[feature_df["average"] != 0]
	feature_df = feature_df[feature_df["average"] != 1]

	lang_similarities_dict[feature] = {}

	# Iterating the mapping visits the languages in the order in which they are listed above
	for lang in lang_dict:
		print(feature)
		print(lang)
		similarity_to_sl = lang_similarity("slv", lang, feature_df)
		similarity_to_en = lang_similarity("eng", lang, feature_df)
		avg_similarity = np.mean([similarity_to_en, similarity_to_sl])
		current_dict = {"similarity-to-sl": similarity_to_sl, "similarity-to-en": similarity_to_en, "avg_similarity": avg_similarity}
		lang_similarities_dict[feature][lang_dict[lang]] = current_dict
		print(f"Similarity to sl: {similarity_to_sl}, similarity to eng: {similarity_to_en}, mean: {avg_similarity}")
		print("----------------------")

syntax_knn
slv
1.0
0.7071067811865476
Similarity to sl: 1.0, similarity to eng: 0.7071067811865476, mean: 0.8535533905932737
----------------------
syntax_knn
hrv
0.9259259259259259
0.7463904912524668
Similarity to sl: 0.9259259259259259, similarity to eng: 0.7463904912524668, mean: 0.8361582085891963
----------------------
syntax_knn
srp
0.9819805060619657
0.6943650748294136
Similarity to sl: 0.9819805060619657, similarity to eng: 0.6943650748294136, mean: 0.8381727904456897
----------------------
syntax_knn
hbs
0.9629629629629629
0.6678230711206282
Similarity to sl: 0.9629629629629629, similarity to eng: 0.6678230711206282, mean: 0.8153930170417956
----------------------
syntax_knn
mkd
0.9456108576893003
0.7715167498104595
Similarity to sl: 0.9456108576893003, similarity to eng: 0.7715167498104595, mean: 0.85856380374988
----------------------
syntax_knn
bul
0.8888888888888888
0.7856742013183862
Similarity to sl: 0.8888888888888888, similarity to eng: 0.7856742013183862, mean: 0.8372

In [14]:
# Save the dict
with open("datasets/lang2vec_cosine_similarities.json", "w") as json_file:
	json.dump(lang_similarities_dict, json_file)