### In this notebook, we experiment with (1) a classifier trained on dataset_2 plus a part of the college data, and (2) removing features whose distributions differ too much between training and test, such as max_pearson_difference when the test set in question is college.

### First, let's create this initial classifier.

In [39]:
import pandas as pd
dataset_2 = pd.read_csv('training-simplified-data-generation.csv')
# Label each row: 1 when gain_in_r2_score is strictly positive, -1 otherwise
# (missing gains compare as False and therefore get -1, as before).
# The comparison is vectorized instead of looping over rows with iterrows().
dataset_2['class'] = (dataset_2['gain_in_r2_score'] > 0).map({True: 1, False: -1})

In [40]:
college = pd.read_csv('college-debt-records-features-single-column-w-class')
# Label each row: 1 when gain_in_r2_score is strictly positive, -1 otherwise
# (missing gains compare as False and therefore get -1, as before).
# The comparison is vectorized instead of looping over rows with iterrows().
college['class'] = (college['gain_in_r2_score'] > 0).map({True: 1, False: -1})

In [41]:
# Move 110 college rows into the training pool. The fixed seed keeps the split,
# and every metric computed from it, reproducible across kernel restarts.
college_train = college.sample(110, random_state=42)

# Every row that was not sampled becomes the college test set. Excluding by index
# replaces concat + drop_duplicates(keep=False), which also discarded any rows of
# `college` that were exact duplicates of each other, whether sampled or not.
# NOTE(review): relies on `college` having a unique index (the read_csv default).
college_test = college.drop(index=college_train.index)

In [42]:
# Columns removed from both training frames.
_shared_dropped = ['decrease_in_mae', 'decrease_in_mse', 'decrease_in_medae',
                   'r2_score_before', 'r2_score_after']
# Extra columns removed from the college sample only.
_college_only_dropped = ['p(gain)', 'p(loss)', 'eval']

college_train = college_train.drop(columns=_shared_dropped + _college_only_dropped)
dataset_2 = dataset_2.drop(columns=_shared_dropped)

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Model inputs. The order is kept fixed because it determines the column order
# of every matrix handed to the classifiers below.
FEATURES = [
    # query dataset statistics
    'query_num_of_columns',
    'query_num_of_rows',
    'query_row_column_ratio',
    'query_max_skewness',
    'query_max_kurtosis',
    'query_max_unique',
    # candidate dataset statistics
    'candidate_num_rows',
    'candidate_row_column_ratio',
    'candidate_max_skewness',
    'candidate_max_kurtosis',
    'candidate_max_unique',
    # query columns vs. target
    'query_target_max_pearson',
    'query_target_max_spearman',
    'query_target_max_covariance',
    'query_target_max_mutual_info',
    # candidate columns vs. target
    'candidate_target_max_pearson',
    'candidate_target_max_spearman',
    'candidate_target_max_covariance',
    'candidate_target_max_mutual_info',
    # combined features
    'max_pearson_difference',
    'containment_fraction',
]

# Mixed training set: all of dataset_2 followed by the sampled college rows.
training = pd.concat([dataset_2, college_train], axis=0)

In [44]:
# Random forest trained on the mixed (dataset_2 + college sample) training set.
rf_mixed = RandomForestClassifier(random_state=42, n_estimators=100)
rf_mixed.fit(X=training[FEATURES], y=training['class'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [45]:
# Predict the class of every college_test row with the mixed model.
college_preds = rf_mixed.predict(college_test[FEATURES])

In [46]:
# Per-class precision / recall / f1 of rf_mixed on college_test.
print(classification_report(college_test['class'], college_preds))

              precision    recall  f1-score   support

          -1       0.98      0.98      0.98       876
           1       0.84      0.82      0.83       117

   micro avg       0.96      0.96      0.96       993
   macro avg       0.91      0.90      0.90       993
weighted avg       0.96      0.96      0.96       993



### Note that just by adding 10% of the college data to the training set, the results for college_test get much better. There's one caveat, though: there may be some "leakage" from training to test, because the query features for college are always the same. Now, let's look at some false positive (fp), true positive (tp), false negative (fn), and true negative (tn) examples.

In [66]:
# Attach the predictions, then split college_test into the four confusion-matrix cells.
college_test['pred'] = college_preds

actual_gain = college_test['class'] == 1
actual_loss = college_test['class'] == -1
predicted_gain = college_test['pred'] == 1
predicted_loss = college_test['pred'] == -1

false_positive = college_test.loc[actual_loss & predicted_gain]
false_negative = college_test.loc[actual_gain & predicted_loss]
true_positive = college_test.loc[actual_gain & predicted_gain]
true_negative = college_test.loc[actual_loss & predicted_loss]

In [52]:
# (rows, columns) of the college_test false positives.
false_positive.shape

(18, 40)

In [67]:
# (rows, columns) of the college_test false negatives.
false_negative.shape

(21, 40)

In [54]:
# (rows, columns) of the college_test true positives.
true_positive.shape

(96, 40)

In [55]:
# (rows, columns) of the college_test true negatives.
true_negative.shape

(858, 40)

In [70]:
# Pick two examples from each confusion-matrix cell to explain below.
# A fixed seed makes the same rows come back on every run, so the explanations
# shown in the following cells stay tied to the same examples.
sample_fp = false_positive.sample(2, random_state=42)
sample_tp = true_positive.sample(2, random_state=42)
sample_fn = false_negative.sample(2, random_state=42)
sample_tn = true_negative.sample(2, random_state=42)

In [58]:
import eli5


def no_bias(feature_name, feature_value):
    """Feature filter for eli5: keep every entry except the one named '<BIAS>'."""
    return feature_name != '<BIAS>'


# Explain the prediction for the first sampled false positive.
eli5.show_prediction(
    rf_mixed,
    sample_fp.iloc[0][FEATURES],
    feature_names=FEATURES,
    show_feature_values=True,
    feature_filter=no_bias,
)

Contribution?,Feature,Value
0.124,containment_fraction,0.94
0.079,candidate_target_max_spearman,0.086
0.024,candidate_target_max_covariance,0.048
0.02,query_max_unique,3512.0
0.016,candidate_max_kurtosis,177.178
0.013,candidate_row_column_ratio,7703.0
0.012,candidate_max_skewness,13.384
0.007,query_row_column_ratio,415.833
0.005,query_target_max_mutual_info,0.668
0.004,query_num_of_rows,4990.0


In [59]:
# Explain rf_mixed's prediction for the second sampled college false positive ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_fp.iloc[1][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.084,containment_fraction,0.934
0.075,candidate_target_max_mutual_info,0.24
0.022,candidate_target_max_spearman,0.072
0.019,candidate_max_skewness,20.493
0.019,query_row_column_ratio,415.833
0.016,candidate_target_max_covariance,0.024
0.016,candidate_row_column_ratio,7703.0
0.015,query_max_unique,3512.0
0.009,candidate_max_unique,388.0
0.009,query_target_max_mutual_info,0.668


In [60]:
# Explain rf_mixed's prediction for the first sampled college true positive ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_tp.iloc[0][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.072,candidate_target_max_spearman,0.265
0.068,candidate_target_max_pearson,0.264
0.067,max_pearson_difference,0.195
0.023,query_max_unique,3512.0
0.016,query_target_max_spearman,0.457
0.015,query_row_column_ratio,415.833
0.015,containment_fraction,1.0
0.01,candidate_max_kurtosis,1.444
0.009,candidate_target_max_covariance,6.459
0.004,candidate_row_column_ratio,7703.0


In [61]:
# Explain rf_mixed's prediction for the second sampled college true positive ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_tp.iloc[1][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.103,candidate_target_max_spearman,0.268
0.063,max_pearson_difference,0.281
0.031,candidate_target_max_covariance,11359.018
0.029,query_row_column_ratio,415.833
0.028,candidate_max_unique,1641.0
0.026,candidate_target_max_mutual_info,0.371
0.02,candidate_target_max_pearson,0.178
0.013,query_max_unique,3512.0
0.013,query_target_max_spearman,0.457
0.01,candidate_max_skewness,0.511


In [71]:
# Explain rf_mixed's prediction for the first sampled college false negative ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_fn.iloc[0][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.095,containment_fraction,0.059
0.051,query_target_max_spearman,0.457
0.035,query_num_of_columns,12.0
0.034,query_target_max_pearson,0.46
0.034,candidate_max_unique,9.0
0.032,candidate_target_max_mutual_info,0.056
0.021,query_target_max_mutual_info,0.668
0.019,candidate_target_max_pearson,0.058
0.018,query_max_kurtosis,50.966
0.013,query_target_max_covariance,23744.186


In [72]:
# Explain rf_mixed's prediction for the second sampled college false negative ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_fn.iloc[1][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.139,max_pearson_difference,0.456
0.049,candidate_target_max_spearman,0.005
0.04,candidate_target_max_pearson,0.004
0.022,query_num_of_columns,12.0
0.02,candidate_target_max_covariance,0.002
0.017,query_target_max_pearson,0.46
0.014,query_target_max_spearman,0.457
0.011,query_max_kurtosis,50.966
0.009,query_target_max_covariance,23744.186
0.008,query_target_max_mutual_info,0.668


In [64]:
# Explain rf_mixed's prediction for the first sampled college true negative ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_tn.iloc[0][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.128,max_pearson_difference,0.46
0.094,candidate_target_max_spearman,0.0
0.089,containment_fraction,0.0
0.088,candidate_target_max_mutual_info,1.551
0.028,candidate_target_max_pearson,0.0
0.023,query_num_of_columns,12.0
0.019,query_target_max_pearson,0.46
0.018,candidate_target_max_covariance,0.0
0.016,query_target_max_spearman,0.457
0.012,query_max_kurtosis,50.966


In [65]:
# Explain rf_mixed's prediction for the second sampled college true negative ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_tn.iloc[1][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.181,max_pearson_difference,0.456
0.1,containment_fraction,0.001
0.062,candidate_target_max_spearman,0.0
0.047,candidate_target_max_pearson,0.004
0.032,query_target_max_pearson,0.46
0.028,query_num_of_columns,12.0
0.027,candidate_target_max_mutual_info,0.018
0.022,query_max_kurtosis,50.966
0.018,query_target_max_spearman,0.457
0.011,query_target_max_covariance,23744.186


### The results above indicate the importance of containment fraction as something that determines a positive prediction whenever it is large. Which features are the most important in this model?

In [73]:
# Display the global feature weights eli5 reports for rf_mixed.
eli5.show_weights(rf_mixed, feature_names=FEATURES)

Weight,Feature
0.0798  ± 0.0379,candidate_target_max_spearman
0.0738  ± 0.0402,candidate_target_max_pearson
0.0715  ± 0.0274,max_pearson_difference
0.0620  ± 0.0201,candidate_target_max_covariance
0.0612  ± 0.0152,query_row_column_ratio
0.0569  ± 0.0124,candidate_max_skewness
0.0555  ± 0.0125,candidate_max_kurtosis
0.0529  ± 0.0139,query_num_of_columns
0.0518  ± 0.0160,query_max_kurtosis
0.0497  ± 0.0153,query_target_max_covariance


### Apparently, though, containment fraction is not a very important feature overall: it does not appear among the features listed above. Let's see what the predictions are over synth_test:

In [74]:
synth_test = pd.read_csv('test-simplified-data-generation.csv')
# Label each row: 1 when gain_in_r2_score is strictly positive, -1 otherwise
# (missing gains compare as False and therefore get -1, as before).
# The comparison is vectorized instead of looping over rows with iterrows().
synth_test['class'] = (synth_test['gain_in_r2_score'] > 0).map({True: 1, False: -1})

# Score the synthetic test set with the mixed model and keep the predictions
# on the frame so later cells can split it into fp / fn / tp / tn.
preds = rf_mixed.predict(synth_test[FEATURES])
synth_test['pred'] = preds
print(classification_report(synth_test['class'], synth_test['pred']))

              precision    recall  f1-score   support

          -1       0.57      0.52      0.54      1780
           1       0.68      0.72      0.70      2496

   micro avg       0.63      0.63      0.63      4276
   macro avg       0.62      0.62      0.62      4276
weighted avg       0.63      0.63      0.63      4276



### For the record, these are the results we get when we train over dataset_2 alone:

In [75]:
# Baseline: the same forest configuration, trained on dataset_2 only.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    dataset_2[FEATURES], dataset_2['class']
)
preds = rf.predict(synth_test[FEATURES])

In [76]:
# Per-class precision / recall / f1 on synth_test for the predictions currently held in `preds`.
print(classification_report(synth_test['class'], preds))

              precision    recall  f1-score   support

          -1       0.57      0.53      0.55      1780
           1       0.68      0.71      0.70      2496

   micro avg       0.64      0.64      0.64      4276
   macro avg       0.62      0.62      0.62      4276
weighted avg       0.63      0.64      0.63      4276



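### Note that the results are fairly similar, so let's stick to the model built over dataset_2 + college_train. Below we explain a few false positive (fp), true positive (tp), false negative (fn), and true negative (tn) examples from synth_test: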

In [77]:
# Split synth_test into the four confusion-matrix cells using the 'pred' column
# stored on the frame, and pick two examples from each cell to explain below.
# A fixed seed makes the same rows come back on every run, so the explanations
# in the following cells stay tied to the same examples.
false_positive = synth_test.loc[(synth_test['class'] == -1) & (synth_test['pred'] == 1)]
sample_fp = false_positive.sample(2, random_state=42)
false_negative = synth_test.loc[(synth_test['class'] == 1) & (synth_test['pred'] == -1)]
sample_fn = false_negative.sample(2, random_state=42)
true_positive = synth_test.loc[(synth_test['class'] == 1) & (synth_test['pred'] == 1)]
sample_tp = true_positive.sample(2, random_state=42)
true_negative = synth_test.loc[(synth_test['class'] == -1) & (synth_test['pred'] == -1)]
sample_tn = true_negative.sample(2, random_state=42)

In [78]:
# Explain rf_mixed's prediction for the first sampled synth_test false positive ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_fp.iloc[0][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.044,query_target_max_covariance,38634.756
0.026,candidate_target_max_spearman,0.055
0.024,candidate_target_max_mutual_info,0.677
0.024,query_target_max_mutual_info,0.946
0.014,candidate_max_unique,60.0
0.011,containment_fraction,1.0
0.001,candidate_target_max_covariance,0.018
0.001,query_target_max_pearson,0.422
-0.004,candidate_max_skewness,2.366
-0.005,candidate_row_column_ratio,13750.0


In [79]:
# Explain rf_mixed's prediction for the second sampled synth_test false positive ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_fp.iloc[1][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.082,candidate_target_max_spearman,0.255
0.035,max_pearson_difference,-0.062
0.034,query_target_max_spearman,0.342
0.028,query_num_of_columns,4.0
0.027,candidate_target_max_pearson,0.28
0.023,query_target_max_pearson,0.341
0.016,candidate_max_kurtosis,8.14
0.01,query_row_column_ratio,86.25
0.007,candidate_target_max_covariance,9.392
0.005,query_max_skewness,3.063


In [80]:
# Explain rf_mixed's prediction for the first sampled synth_test true positive ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_tp.iloc[0][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.026,candidate_max_skewness,9.269
0.019,candidate_num_rows,313.0
0.015,query_max_skewness,1.365
0.013,candidate_target_max_mutual_info,0.039
0.01,candidate_max_kurtosis,94.667
0.009,candidate_target_max_spearman,0.065
0.006,query_num_of_rows,249.0
0.004,query_target_max_spearman,0.467
0.004,containment_fraction,1.0
0.003,query_target_max_pearson,0.478


In [81]:
# Explain rf_mixed's prediction for the second sampled synth_test true positive ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_tp.iloc[1][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.095,candidate_target_max_spearman,0.563
0.091,candidate_target_max_pearson,0.391
0.035,query_num_of_columns,3.0
0.032,candidate_target_max_mutual_info,0.559
0.027,max_pearson_difference,-0.072
0.022,candidate_target_max_covariance,746.448
0.021,query_target_max_pearson,0.462
0.01,query_target_max_spearman,0.566
0.009,candidate_num_rows,145.0
0.005,containment_fraction,1.0


In [82]:
# Explain rf_mixed's prediction for the first sampled synth_test false negative ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_fn.iloc[0][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.049,max_pearson_difference,-0.104
0.035,query_max_unique,933.0
0.016,candidate_max_skewness,0.0
0.015,query_max_skewness,14.683
0.009,candidate_max_unique,1.0
0.008,query_row_column_ratio,37.586
0.008,query_max_kurtosis,213.989
0.007,query_target_max_pearson,0.104
0.007,candidate_target_max_covariance,0.0
0.005,containment_fraction,1.0


In [83]:
# Explain rf_mixed's prediction for the second sampled synth_test false negative ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_fn.iloc[1][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.061,candidate_target_max_spearman,0.0
0.043,candidate_target_max_pearson,0.002
0.024,query_num_of_columns,11.0
0.022,candidate_target_max_covariance,0.0
0.021,query_target_max_mutual_info,0.305
0.019,query_target_max_spearman,0.688
0.018,max_pearson_difference,-0.703
0.01,query_max_kurtosis,1.277
0.006,query_target_max_pearson,0.705
0.002,query_target_max_covariance,26.087


In [84]:
# Explain rf_mixed's prediction for the first sampled synth_test true negative ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_tn.iloc[0][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.048,candidate_target_max_pearson,0.013
0.033,containment_fraction,0.052
0.031,candidate_target_max_covariance,0.001
0.028,query_max_kurtosis,1825.553
0.025,candidate_target_max_spearman,0.014
0.019,candidate_target_max_mutual_info,0.026
0.019,query_max_unique,10.0
0.017,query_target_max_mutual_info,0.017
0.015,candidate_max_unique,282.0
0.014,query_row_column_ratio,213.522


In [85]:
# Explain rf_mixed's prediction for the second sampled synth_test true negative ('<BIAS>' filtered out).
eli5.show_prediction(rf_mixed, sample_tn.iloc[1][FEATURES], 
                     feature_names=FEATURES, 
                     show_feature_values=True, feature_filter=no_bias)

Contribution?,Feature,Value
0.072,query_num_of_columns,21.0
0.058,candidate_target_max_spearman,0.008
0.053,max_pearson_difference,-0.697
0.032,candidate_target_max_pearson,0.008
0.025,candidate_max_unique,7997.0
0.021,query_target_max_spearman,0.727
0.019,candidate_num_rows,8192.0
0.017,candidate_max_kurtosis,13.048
0.009,query_target_max_mutual_info,0.292
0.004,query_row_column_ratio,306.429
