In [8]:
# Pin this kernel to a single GPU on the GPU machine: enumerate CUDA devices
# in PCI bus order and make only device 0 visible.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

import pandas as pd


env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


## Lang2vec

1) Querying the URIEL database, as well as the trained language vectors from Malaviya et al. (2017). The main operation is `get_features(languages, feature_sets, header=False, minimal=False)`, which returns a dictionary with the feature vector for every language in `languages` for the requested `feature_sets`.

2) Returning pre-computed distances between languages, based on some typological information. The main operation here is `distance(distance, language1, language2)`, which returns a float distance.

In [2]:
#!pip3 install lang2vec
import lang2vec.lang2vec as l2v

In [3]:
# Names of the feature sets lang2vec exposes; iterated over later to query
# every feature set for each language.
features = l2v.FEATURE_SETS

In [4]:
# Print each language code of interest that is missing from lang2vec's language list
wanted_codes = ("slv", "hrv", "srp", "mkd", "bul", "isl", "tur", "ukr", "cat", "ell", "mlt", "alb")
missing_codes = [code for code in wanted_codes if code not in l2v.LANGUAGES]
for code in missing_codes:
	print(code)

In [16]:
# Example query: the "syntax_wals" feature vector for "hbs".
# The result is a dict keyed by the language code; in the output below the entries are
# 1.0 / 0.0, with '--' appearing where no numeric value is given.
l2v.get_features("hbs", "syntax_wals")

{'hbs': [1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  '--',
  '--',
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  '--',
  '--',
  '--',
  '--',
  '--',
  1.0,
  0.0,
  0.0,
  0.0,
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  0.0,
  1.0,
  0.0,
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  0.0,
  1.0,
  '--',
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  0.0,
  0.0,
  1.0,
  0.0,
  '--',
  '--',
  '--',
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--',
  '--']}

In [19]:
# Collect every feature set for each language of interest.
# Layout: lang_features[lang][feature_set] holds whatever l2v.get_features returns
# (a dict keyed by the language code), or the string "NaN" when the lookup failed.
lang_features = {}

for lang in ["slv", "hrv", "srp", "hbs", "mkd", "bul", "isl", "tur", "ukr", "cat", "ell", "mlt", "alb"]:
	lang_features_list = {}
	for feature in features:
		try:
			lang_features_list[feature] = l2v.get_features(lang, feature)
		except Exception as err:
			# Best-effort collection: keep going with the "NaN" placeholder, but
			# report the failure instead of hiding it. Catching Exception (not a
			# bare except) lets KeyboardInterrupt / SystemExit still stop the cell.
			print(f"get_features failed for {lang!r} / {feature!r}: {err!r}")
			lang_features_list[feature] = "NaN"
	lang_features[lang] = lang_features_list

lang_features

In [18]:
# Tabulate the nested dict: outer keys (language codes) become the columns,
# inner keys (feature-set names) become the row index.
df_lang = pd.DataFrame.from_dict(lang_features, orient="columns")
df_lang

Unnamed: 0,slv,hrv,srp,hbs,mkd,bul,isl,tur,ukr,cat,ell,mlt,alb
syntax_wals,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
phonology_wals,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
syntax_sswl,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
syntax_ethnologue,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
phonology_ethnologue,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
inventory_ethnologue,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
inventory_phoible_aa,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
inventory_phoible_gm,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
inventory_phoible_saphon,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}
inventory_phoible_spa,{'slv': []},{'hrv': []},{'srp': []},{'hbs': []},{'mkd': []},{'bul': []},{'isl': []},{'tur': []},{'ukr': []},{'cat': []},{'ell': []},{'mlt': []},{'alb': []}


In [15]:
# Save as JSON Lines: one record per feature set (row), one key per language.
# pandas accepts lines=True only together with orient="records" (any other
# orient raises ValueError). "records" drops the index, so the feature-set
# names are first moved into a regular "feature_set" column.
(
	df_lang
	.rename_axis("feature_set")
	.reset_index()
	.to_json("lang2vec.jsonl", orient="records", lines=True)
)