In [72]:
import pandas as pd

# Minimum share of languages (0..1) that must have a value for a feature to be kept.
threshold_pct_features = 0.5
# Minimum share of the kept features (0..1) that a language must have values for.
# NOTE: the name is misspelled ("langauges") but later cells reference it as-is.
threshold_pct_langauges = 1.0


# Analysis
### Features
The idea is to compute statistics at the <i>feature</i> level to identify the features with the highest number of non-NaN values.<br>
Note that this is the first step of the analysis; it is followed by a similar analysis at the <i>language</i> level.

In [73]:
# Load the denormalized feature table; the LANG_ID column becomes the row index.
df_denorm_features = pd.read_csv(
    'data/vals_transformed.csv',
    sep=',',
    index_col='LANG_ID',
)

In [74]:
# Feature values only:
#   1. replace the LANG_ID index with a plain positional index,
#   2. cut off the last 7 *columns* (language metadata, per the original note).
lang_positional = df_denorm_features.reset_index(drop=True)
df_features = lang_positional.iloc[:, :-7]

# df_features

In [75]:
# Total number of languages, i.e. rows of the feature table (62 in the output shown below).
num_languages = len(df_features)

# Per-feature coverage statistics:
#   cnt_not_given - languages with NaN for the feature
#   cnt_given     - languages with a value for the feature
#   pct           - cnt_given as a share of all languages
coverage_per_feature = (
    df_features.isna().sum()
    .to_frame(name='cnt_not_given')
    .assign(
        cnt_given=lambda stats: num_languages - stats['cnt_not_given'],
        pct=lambda stats: stats['cnt_given'] / num_languages,
    )
)

# Keep only features whose coverage reaches the configured threshold.
df_feat_nan_dist = coverage_per_feature.loc[coverage_per_feature['pct'] >= threshold_pct_features]

# Best-covered features first, for convenience when reading the table.
df_result_feature_analysis = df_feat_nan_dist.sort_values(by='pct', ascending=False)

df_result_feature_analysis.head(50)

Unnamed: 0,cnt_not_given,cnt_given,pct
53A,16,46,0.741935
97A,25,37,0.596774
83A,25,37,0.596774
82A,25,37,0.596774
81A,25,37,0.596774
87A,25,37,0.596774
95A,25,37,0.596774
143G,26,36,0.580645
143F,26,36,0.580645
138A,26,36,0.580645


### Language
Same idea: count the non-NaN values per language (over the features selected above) and keep only the languages whose coverage is at least `threshold_pct_langauges`.

In [76]:
# Labels of the features that passed the coverage filter above
# (coverage of at least threshold_pct_features across the languages in scope).
lst = list(df_result_feature_analysis.index)

# Restrict the full table to those feature columns; rows stay indexed by LANG_ID.
df_vals = df_denorm_features.loc[:, lst]
df_vals

Unnamed: 0_level_0,53A,97A,83A,82A,81A,87A,95A,143G,143F,138A,...,51A,26A,72A,37A,92A,33A,101A,38A,90C,94A
LANG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alb,6.0,4.0,2.0,1.0,2.0,2.0,4.0,4.0,4.0,1.0,...,1.0,2.0,1.0,3.0,1.0,2.0,2.0,1.0,1.0,1.0
alt,,,,,,,,,,2.0,...,,,,,,,,,,
ast,,,,,,,,,,2.0,...,,,,,,,,,,
blr,7.0,5.0,3.0,3.0,7.0,1.0,5.0,4.0,4.0,1.0,...,1.0,2.0,,,1.0,,,,1.0,
bos,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
urm,,,,,,,,,,,...,,,,,,,,,,
vep,7.0,,,,,,,,,,...,,,,,,,,,,
vot,7.0,,,,,,5.0,,,,...,,,1.0,,,,,,,
wel,7.0,4.0,2.0,2.0,3.0,2.0,4.0,4.0,4.0,2.0,...,9.0,2.0,4.0,1.0,1.0,2.0,,4.0,1.0,1.0


In [77]:
# Number of features that survived the feature-level filter (columns of df_vals).
num_features = len(df_vals.columns)

# Per-language coverage statistics, best-covered languages first:
#   cnt_not_given - selected features with NaN for the language
#   cnt_given     - selected features with a value for the language
#   pct           - cnt_given as a share of all selected features
lang_coverage = (
    df_vals.isnull().sum(axis=1)
    .to_frame(name='cnt_not_given')
    .assign(
        cnt_given=lambda stats: num_features - stats['cnt_not_given'],
        pct=lambda stats: stats['cnt_given'] / num_features,
    )
    .sort_values(by='pct', ascending=False)
)

# Keep only languages whose coverage reaches threshold_pct_langauges
# (set to 1.0 in the config cell, i.e. every selected feature must be present).
df_lang_nan_dist = lang_coverage.loc[lang_coverage['pct'] >= threshold_pct_langauges]

df_lang_nan_dist.head(10)


Unnamed: 0_level_0,cnt_not_given,cnt_given,pct
LANG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rus,0,37,1.0
swe,0,37,1.0
iri,0,37,1.0
fre,0,37,1.0
fin,0,37,1.0
ita,0,37,1.0
est,0,37,1.0
eng,0,37,1.0
dut,0,37,1.0
dsh,0,37,1.0


# Create set of training data

In [78]:
# Build the training set from the full table by keeping
#   - rows:    the languages that passed the language-level coverage filter
#   - columns: the features that passed the feature-level coverage filter
selected_languages = df_lang_nan_dist.index
selected_features = df_result_feature_analysis.index

df_training_set = (
    df_denorm_features
    .filter(items=selected_languages, axis=0)
    .filter(items=selected_features, axis=1)
)

# Persist the result for later steps, then preview it.
df_training_set.to_csv('data/training_set.csv', sep=',')
df_training_set.head(20)

Unnamed: 0_level_0,53A,97A,83A,82A,81A,87A,95A,143G,143F,138A,...,51A,26A,72A,37A,92A,33A,101A,38A,90C,94A
LANG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rus,7.0,3.0,2.0,1.0,2.0,1.0,4.0,4.0,4.0,1.0,...,1.0,2.0,4.0,5.0,3.0,2.0,1.0,5.0,1.0,1.0
swe,7.0,3.0,2.0,1.0,2.0,1.0,4.0,4.0,1.0,2.0,...,9.0,2.0,4.0,3.0,6.0,2.0,1.0,2.0,1.0,1.0
iri,7.0,4.0,2.0,2.0,3.0,2.0,4.0,4.0,4.0,2.0,...,5.0,4.0,1.0,1.0,1.0,6.0,6.0,4.0,1.0,1.0
fre,7.0,4.0,2.0,1.0,2.0,2.0,4.0,4.0,1.0,2.0,...,7.0,2.0,4.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
fin,7.0,3.0,2.0,1.0,2.0,1.0,3.0,4.0,4.0,2.0,...,1.0,2.0,1.0,5.0,3.0,2.0,6.0,5.0,1.0,1.0
ita,7.0,4.0,2.0,3.0,2.0,2.0,4.0,4.0,4.0,2.0,...,9.0,2.0,4.0,1.0,6.0,2.0,2.0,2.0,1.0,1.0
est,7.0,3.0,2.0,1.0,2.0,1.0,3.0,4.0,4.0,2.0,...,1.0,2.0,1.0,5.0,5.0,2.0,2.0,5.0,1.0,1.0
eng,7.0,3.0,2.0,1.0,2.0,1.0,4.0,4.0,4.0,2.0,...,9.0,2.0,4.0,1.0,6.0,2.0,1.0,1.0,1.0,1.0
dut,6.0,5.0,3.0,1.0,7.0,1.0,5.0,4.0,1.0,2.0,...,9.0,2.0,4.0,1.0,6.0,2.0,1.0,1.0,1.0,1.0
dsh,7.0,3.0,2.0,1.0,2.0,1.0,4.0,4.0,1.0,2.0,...,1.0,2.0,4.0,3.0,6.0,2.0,1.0,1.0,1.0,1.0
