In [10]:
from sklearn.linear_model import Ridge, Lasso
import pickle
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler


In [40]:
# Location of the trained regression model if model already present
dataPath = '../corpus/regmodel/allLanguages_regression_model_averageLanguageStability.pkl'
with open(dataPath,'rb') as pickleFile:
    model = pickle.load(pickleFile)
    

In [114]:
# Train models via script
# SET THESE VARIABLES

# Location of file with average stability for each language (generated from Get Average Stabilities.ipynb)
# File should have the format of a csv file with columns "language" and
# "averageStability", where language is the Bible code for the language
# and averageStability is the average stability of that language
average_stabilities_path = '../corpus/average_stabilities_allLanguages.csv'

# Location of WALS features for each language for regression model (generated by Get Good WALS Values.ipynb)
# For each language, should be formatted as a pickle file with name
# {wals_features_path}{language}.pkl, where pickle file contains a list
# of WALS feature values for that language
wals_features_path = '../corpus/wals/allLanguages_language_features_small_wals_'

# Location to save each of the 1000 bootstrapped regression models
# Will be saved as {regression_model_path}_{bootstrap_iteration}.pkl, where each pickle file is a pickled sklearn Ridge Regression model
regression_model_path = '../corpus/regmodel/allLanguages_regression_model_averageLanguageStability_bootstrapping_'

# Location to save R^2 scores for each of the 1000 bootstrapped regression models
# Will be formatted as a pickle file, where the pickle is a list of 1000 float scores
regression_scores_path = '../corpus/regmodel/allLanguages_regression_model_averageLanguageStability_bootstrapping_scores.pkl'

# Set of languages to use in regression model (can adjust if needed, or leave the same)
languages = ['eng', 'rus', 'fin', 'hun', 'spa', 'tur', 'ind', 'mnd', 'jpn', 'kor', 'prs', 'hin', 'vie', 'heb', 'may', 'tha', 'lav', 'lat', 'hmo', 'cmn', 'pol', 'som', 'bul', 'ita', 'lit', 'swe', 'hat', 'nor', 'poh', 'est', 'mam', 'por', 'ukr', 'ben', 'che', 'lnd', 'mad']


In [115]:
print('Reading in all features...')
all_features = [] # All features for regression model
all_target = [] # All predicted output for regression model
average_stabilities = pd.read_csv(average_stabilities_path)
print(average_stabilities)

Reading in all features...
    Unnamed: 0 language  averageStability
0            0      cze          1.436686
1            1      guj          1.836723
2            2      qui          1.506698
3            3      far          1.314015
4            4      ind          1.850155
..         ...      ...               ...
90          90      kor          1.240248
91          91      cro          1.109676
92          92      tel          0.793152
93          93      pol          1.349616
94          94      cre          3.382265

[95 rows x 3 columns]


In [116]:
# Load language features and target output
for language in tqdm(languages):
	if language not in average_stabilities.language.values:
		continue        
	with open(wals_features_path+language+'.pkl','rb') as pickleFile:
		if len(all_features) == 0:
			all_features = [pickle.load(pickleFile)]
		else:
			features = [pickle.load(pickleFile)]
			all_features += features
		all_target.append(list(average_stabilities.loc[average_stabilities.language==language]['averageStability'])[0]) # Target output = average stability of language


100%|██████████| 37/37 [00:00<00:00, 1740.79it/s]


In [None]:
scores = [] # List of 1000 bootstrapped R2 scores
coef_vec = []
for iteration in range(0, 1000): # Bootstrap 1000 iterations
	print('iteration',iteration)
	indices = np.random.choice([i for i in range(len(all_features))],size=len(all_features),replace=True) # Randomly choose input features

	model = Lasso(random_state=42,alpha=0.01) # Ridge regression model
	train_features = [all_features[i] for i in indices]
	target_features = [all_target[i] for i in indices]
	model.fit(train_features,target_features)
	coef_vec.append(model.coef_)
#Mean value of model weights
coef = np.mean(np.array(coef_vec),axis=0)

In [119]:
# Get WALS category
cat = []
for ele in ft:
    c = int(ele.split(':')[0][:-1])
    if c < 20:
        cat.append('P')
    elif c<30:
        cat.append('M')
    elif c<58:
        cat.append('NC')
    elif c<65:
        cat.append('NS')
    elif c<81:
        cat.append('VC')
    elif c<98 or c>142:
        cat.append('VC')
    elif c<122:
        cat.append('SC')
    elif c<129:
        cat.append('SC')
    elif c<139:
        cat.append('L')
    elif c<141:
        cat.append('SL')
    else:
        cat.append('O')

In [120]:
coef_pair = list(zip(ft,coef,cat))

In [121]:
# Top 10 most negative
sorted(coef_pair, key=lambda x: x[1])[:10]

[('23A: Locus of Marking in the Clause__2.0', -0.12463866568061516, 'M'),
 ('68A: The Perfect__4.0', -0.08638707751960895, 'VC'),
 ('129A: Hand and Arm__2.0', -0.056172925783133584, 'L'),
 ('69A: Position of Tense-Aspect Affixes__2.0', -0.05430493671011963, 'VC'),
 ('98A: Alignment of Case Marking of Full Noun Phrases__2.0',
  -0.05376091954240658,
  'SC'),
 ('26A: Prefixing vs. Suffixing in Inflectional Morphology__2.0',
  -0.050444677896594306,
  'M'),
 ('106A: Reciprocal Constructions__3.0', -0.05031063216018384, 'SC'),
 ('41A: Distance Contrasts in Demonstratives__2.0',
  -0.04999370568531641,
  'NC'),
 ('100A: Alignment of Verbal Person Marking__2.0',
  -0.044566041212140424,
  'SC'),
 ('66A: The Past Tense__1.0', -0.040453337370293094, 'VC')]

In [122]:
# Top 10 most positive
sorted(coef_pair, key=lambda x: x[1],reverse=True)[:10]

[('69A: Position of Tense-Aspect Affixes__5.0', 0.5145264122863455, 'VC'),
 ('98A: Alignment of Case Marking of Full Noun Phrases__1.0',
  0.19748855647176836,
  'SC'),
 ('36A: The Associative Plural__4.0', 0.13446311701901434, 'NC'),
 ('40A: Inclusive/Exclusive Distinction in Verbal Inflection__1.0',
  0.1279187449083829,
  'NC'),
 ('57A: Position of Pronominal Possessive Affixes__4.0',
  0.07512107158593032,
  'NC'),
 ('92A: Position of Polar Question Particles__2.0', 0.0730202141676225, 'VC'),
 ('4A: Voicing in Plosives and Fricatives__2.0', 0.06607416682946288, 'P'),
 ('22A: Inflectional Synthesis of the Verb__2.0', 0.06139023884560894, 'M'),
 ('118A: Predicative Adjectives__1.0', 0.057464093478604164, 'SC'),
 ('37A: Definite Articles__4.0', 0.03903715553868932, 'NC')]