# Factors of Flexibility

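Investigate which factors (corpus frequency, word length, and majority part of speech) predict whether a lemma is flexible, i.e. regularly used as both a noun and a verb.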

In [1]:
import sys
sys.path.append('../')

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import src.corpus
import src.partial

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Load the corpus

In [2]:
# Relative path to the pre-processed corpus pickle (resolved against the kernel's
# working directory).
data_file = "../data/wiki/processed/en.pkl"

# NOTE(review): the loader's name indicates it unpickles this file. Unpickling can
# execute arbitrary code, so only point data_file at pickles produced by this project.
corpus = src.corpus.POSCorpus.create_from_pickle(data_file_path=data_file)

In [3]:
# Per-lemma usage statistics (noun/verb counts, majority tag, flexibility flag).
lemma_count_df = corpus.get_per_lemma_stats()

# Keep only lemmas with at least 100 total occurrences. The explicit .copy() makes
# the filtered frame independent of the unfiltered one, so later cells can add
# columns to it without triggering pandas' SettingWithCopyWarning.
lemma_count_df = lemma_count_df.loc[lemma_count_df.total_count >= 100].copy()

# Preview the most frequent lemmas.
lemma_count_df.sort_values('total_count', ascending=False).head(20)

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
59,use,4849,21363,VERB,26212,4849,0.184992,True
89,have,1061,25004,VERB,26065,1061,0.040706,False
204,know,8873,8949,VERB,17822,8873,0.497868,True
79,be,2177,15285,VERB,17462,2177,0.124671,True
187,include,27,15618,VERB,15645,27,0.001726,False
40,year,14239,4,NOUN,14243,4,0.000281,False
125,time,13678,24,NOUN,13702,24,0.001752,False
121,make,338,12584,VERB,12922,338,0.026157,False
62,become,8,10962,VERB,10970,8,0.000729,False
341,state,7528,2830,NOUN,10358,2830,0.273219,True


## Logistic regression to predict flexibility

In [4]:
# Derive the two numeric predictors used below:
#   log_freq - natural log of the lemma's total occurrence count
#   length   - number of characters in the lemma
# .assign returns a new frame rather than writing columns into the filtered frame
# in place, which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
lemma_count_df = lemma_count_df.assign(
    log_freq=np.log(lemma_count_df.total_count),
    length=lemma_count_df.lemma.str.len(),
)

In [5]:
import statsmodels.discrete.discrete_model

# Design matrix: the two numeric predictors plus a dummy for the majority tag.
# drop_first=True drops the first tag level so it becomes the reference category,
# leaving the single majority_tag_VERB indicator.
# dtype=float keeps every column numeric: recent pandas versions emit boolean
# dummies, and a bool/float mix is converted to an object array that statsmodels
# refuses to fit.
logit_exog = pd.get_dummies(
    lemma_count_df[['majority_tag', 'log_freq', 'length']],
    drop_first=True,
    dtype=float,
)

# NOTE(review): no intercept column is added here, and statsmodels does not add one
# automatically, so the model is fitted through the origin. If an intercept is
# wanted, wrap logit_exog with statsmodels.tools.add_constant and re-run the cells
# below (their recorded outputs will change).
model = statsmodels.discrete.discrete_model.Logit(
    lemma_count_df.is_flexible.astype(int),  # explicit 0/1 response
    logit_exog,
)
lr = model.fit()

Optimization terminated successfully.
         Current function value: 0.517724
         Iterations 6


In [6]:
# Show the fitted model's coefficient table and fit statistics.
lr.summary()

0,1,2,3
Dep. Variable:,is_flexible,No. Observations:,3525.0
Model:,Logit,Df Residuals:,3522.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 08 Apr 2020",Pseudo R-squ.:,0.1213
Time:,01:05:15,Log-Likelihood:,-1825.0
converged:,True,LL-Null:,-2076.8
Covariance Type:,nonrobust,LLR p-value:,4.314e-110

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
log_freq,0.0876,0.018,4.939,0.000,0.053,0.122
length,-0.2858,0.017,-17.164,0.000,-0.318,-0.253
majority_tag_VERB,1.2855,0.090,14.228,0.000,1.108,1.463


In [7]:
# Fitted coefficients as a plain list, in design-matrix column order:
# log_freq, length, majority_tag_VERB.
lr.params.tolist()

[0.08763256400787484, -0.2858476856536388, 1.2855102571058359]

## Partial correlation

In [8]:
# Same predictors as the regression, plus the outcome column, with the majority tag
# dummy-encoded.
partial_inputs = pd.get_dummies(
    lemma_count_df[['majority_tag', 'log_freq', 'length', 'is_flexible']],
    drop_first=True,
)
partials = src.partial.calculate_partial_correlation(partial_inputs)

# Print the is_flexible entry for each predictor, then display the full matrix.
for predictor in ('log_freq', 'length', 'majority_tag_VERB'):
    print(partials['is_flexible'][predictor])
partials

0.033071615654940205
-0.2505545210308878
0.24698277418377984


Unnamed: 0,log_freq,length,is_flexible,majority_tag_VERB
log_freq,1.0,-0.176088,0.033072,0.120647
length,-0.176088,1.0,-0.250555,0.004645
is_flexible,0.033072,-0.250555,1.0,0.246983
majority_tag_VERB,0.120647,0.004645,0.246983,1.0
