### In this notebook, we train random forest models using different definitions of 'gain' and 'loss' classes to see which one leads to the best generalization.

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
import warnings; warnings.simplefilter('ignore')
from sklearn.preprocessing import StandardScaler

ALL_FEATURES = ['query_num_of_columns', 'query_num_of_rows', 'query_row_column_ratio',
                'query_max_skewness', 'query_max_kurtosis', 'query_max_unique', 
                'candidate_num_rows', 'candidate_max_skewness', 'candidate_max_kurtosis',
                'candidate_max_unique', 'query_target_max_pearson', 
                'query_target_max_spearman', 'query_target_max_covariance', 
                'query_target_max_mutual_info', 'candidate_target_max_pearson', 
                'candidate_target_max_spearman', 'candidate_target_max_covariance', 
                'candidate_target_max_mutual_info', 'containment_fraction']

### Initially, we'll consider 'containment_fraction' as a feature.

In [104]:
openml_training = pd.read_csv('training-simplified-data-generation-many-candidates-different-class-definitions.csv')
openml_training = openml_training.drop(['median_based_class_x'], axis=1)
openml_training = openml_training.rename(columns={'median_based_class_y':'median_based_class'})
openml_test = pd.read_csv('test-simplified-data-generation-many-candidates-different-class-definitions.csv')
college_debt = pd.read_csv('college-debt-different-class-definitions.csv')
taxi_collision = pd.read_csv('taxi-vehicle-collision-different-class-definitions.csv')
poverty_estimation = pd.read_csv('poverty-estimation-different-class-definitions.csv')

In [3]:
def train_model(features, classes):
    '''
    Builds a model using features to predict associated classes
    '''

    # normalizing data first
    # although it makes no difference in the shape of the forest,
    # the features for the case studies and/or test data might be in 
    # totally different scales...

    feature_scaler = StandardScaler()
    features_train = feature_scaler.fit_transform(features)

    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(features_train, classes)
    return rf

In [4]:
model_pos_neg = train_model(openml_training[ALL_FEATURES], openml_training['class_pos_neg'])

model_harsh_grad = train_model(openml_training[ALL_FEATURES], openml_training['harsh_grad_class'])

model_2nd_grad_drop = train_model(openml_training[ALL_FEATURES], openml_training['2th_grad_drop_class'])

model_order_of_mag_drop = train_model(openml_training[ALL_FEATURES], openml_training['order_of_mag_drop_class'])

model_median_based = train_model(openml_training[ALL_FEATURES], openml_training['median_based_class'])

In [5]:
def normalize_features(features):
    '''
    This function normalizes features using sklearn's StandardScaler
    '''
    feature_scaler = StandardScaler()
    return feature_scaler.fit_transform(features)

In [6]:
test_pos_neg_preds = model_pos_neg.predict(normalize_features(openml_test[ALL_FEATURES]))
test_pos_neg_preds_probs = model_pos_neg.predict_proba(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['class_pos_neg'], test_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.59      0.37      0.45     37058
        loss       0.55      0.76      0.64     38680

    accuracy                           0.57     75738
   macro avg       0.57      0.56      0.55     75738
weighted avg       0.57      0.57      0.55     75738



In [7]:
test_harsh_grad_preds = model_harsh_grad.predict(normalize_features(openml_test[ALL_FEATURES]))
test_harsh_grad_preds_probs = model_harsh_grad.predict_proba(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['harsh_grad_class'], test_harsh_grad_preds))

              precision    recall  f1-score   support

        gain       0.82      0.05      0.10      2602
        loss       0.97      1.00      0.98     73136

    accuracy                           0.97     75738
   macro avg       0.90      0.53      0.54     75738
weighted avg       0.96      0.97      0.95     75738



In [8]:
test_2nd_grad_drop_preds = model_2nd_grad_drop.predict(normalize_features(openml_test[ALL_FEATURES]))
test_2nd_grad_drop_preds_probs = model_2nd_grad_drop.predict_proba(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['2th_grad_drop_class'], test_2nd_grad_drop_preds))

              precision    recall  f1-score   support

        gain       0.70      0.05      0.10      5590
        loss       0.93      1.00      0.96     70148

    accuracy                           0.93     75738
   macro avg       0.81      0.52      0.53     75738
weighted avg       0.91      0.93      0.90     75738



In [9]:
test_order_of_mag_drop_preds = model_order_of_mag_drop.predict(normalize_features(openml_test[ALL_FEATURES]))
test_order_of_mag_drop_preds_probs = model_order_of_mag_drop.predict_proba(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['order_of_mag_drop_class'], test_order_of_mag_drop_preds))

              precision    recall  f1-score   support

        gain       0.53      0.06      0.11     13648
        loss       0.83      0.99      0.90     62090

    accuracy                           0.82     75738
   macro avg       0.68      0.52      0.51     75738
weighted avg       0.77      0.82      0.76     75738



In [10]:
openml_test = openml_test.drop(['median_based_class_x'], axis=1)
openml_test = openml_test.rename(columns={'median_based_class_y':'median_based_class'})

test_median_based_preds = model_median_based.predict(normalize_features(openml_test[ALL_FEATURES]))
test_median_based_preds_probs = model_median_based.predict_proba(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['median_based_class'], test_median_based_preds))

              precision    recall  f1-score   support

        gain       0.50      0.22      0.31     26548
        loss       0.68      0.88      0.77     49190

    accuracy                           0.65     75738
   macro avg       0.59      0.55      0.54     75738
weighted avg       0.61      0.65      0.60     75738



### Well... Results were not very good for this example of test, with the best class strategy being the positive/negative one (f1-score of 0.45 for class 'gain')... Let's see how it works for the case studies.

In [11]:
college_debt = college_debt.rename(columns={'harsh_grad_class_y':'harsh_grad_class', 
                                       '2th_grad_drop_class_y': '2th_grad_drop_class', 
                                       'order_of_mag_drop_class_y': 'order_of_mag_drop_class',
                                       'median_based_class_y': 'median_based_class'})

college_debt_pos_neg_preds = model_pos_neg.predict(normalize_features(college_debt[ALL_FEATURES]))
college_debt_pos_neg_preds_probs = model_pos_neg.predict_proba(normalize_features(college_debt[ALL_FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(college_debt['class_pos_neg'], college_debt_pos_neg_preds))

college_debt_harsh_grad_preds = model_harsh_grad.predict(normalize_features(college_debt[ALL_FEATURES]))
college_debt_harsh_grad_preds_probs = model_harsh_grad.predict_proba(normalize_features(college_debt[ALL_FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(college_debt['harsh_grad_class'], college_debt_harsh_grad_preds))

college_debt_2nd_grad_drop_preds = model_2nd_grad_drop.predict(normalize_features(college_debt[ALL_FEATURES]))
college_debt_2nd_grad_drop_preds_probs = model_2nd_grad_drop.predict_proba(normalize_features(college_debt[ALL_FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(college_debt['2th_grad_drop_class'], college_debt_2nd_grad_drop_preds))

college_debt_order_of_mag_drop_preds = model_order_of_mag_drop.predict(normalize_features(college_debt[ALL_FEATURES]))
college_debt_order_of_mag_drop_preds_probs = model_order_of_mag_drop.predict_proba(normalize_features(college_debt[ALL_FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(college_debt['order_of_mag_drop_class'], college_debt_order_of_mag_drop_preds))

college_debt_median_based_preds = model_median_based.predict(normalize_features(college_debt[ALL_FEATURES]))
college_debt_median_based_preds_probs = model_median_based.predict_proba(normalize_features(college_debt[ALL_FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(college_debt['median_based_class'], college_debt_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       0.14      1.00      0.25       130
        loss       1.00      0.18      0.30       973

    accuracy                           0.28      1103
   macro avg       0.57      0.59      0.27      1103
weighted avg       0.90      0.28      0.30      1103

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         1
        loss       1.00      0.96      0.98      1102

    accuracy                           0.96      1103
   macro avg       0.50      0.48      0.49      1103
weighted avg       1.00      0.96      0.98      1103

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.05      0.67      0.09         3
        loss       1.00      0.97      0.98      1100

    accuracy                           0.96      1103
   macro avg  

In [105]:
taxi_collision_pos_neg_preds = model_pos_neg.predict(normalize_features(taxi_collision[ALL_FEATURES]))
taxi_collision_pos_neg_preds_probs = model_pos_neg.predict_proba(normalize_features(taxi_collision[ALL_FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(taxi_collision['class_pos_neg'], taxi_collision_pos_neg_preds))

taxi_collision_harsh_grad_preds = model_harsh_grad.predict(normalize_features(taxi_collision[ALL_FEATURES]))
taxi_collision_harsh_grad_preds_probs = model_harsh_grad.predict_proba(normalize_features(taxi_collision[ALL_FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision['harsh_grad_class'], taxi_collision_harsh_grad_preds))

taxi_collision_2nd_grad_drop_preds = model_2nd_grad_drop.predict(normalize_features(taxi_collision[ALL_FEATURES]))
taxi_collision_2nd_grad_drop_preds_probs = model_2nd_grad_drop.predict_proba(normalize_features(taxi_collision[ALL_FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision['2th_grad_drop_class'], taxi_collision_2nd_grad_drop_preds))

taxi_collision_order_of_mag_drop_preds = model_order_of_mag_drop.predict(normalize_features(taxi_collision[ALL_FEATURES]))
taxi_collision_order_of_mag_drop_preds_probs = model_order_of_mag_drop.predict_proba(normalize_features(taxi_collision[ALL_FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(taxi_collision['order_of_mag_drop_class'], taxi_collision_order_of_mag_drop_preds))

taxi_collision_median_based_preds = model_median_based.predict(normalize_features(taxi_collision[ALL_FEATURES]))
taxi_collision_median_based_preds_probs = model_median_based.predict_proba(normalize_features(taxi_collision[ALL_FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(taxi_collision['median_based_class'], taxi_collision_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.87      0.93       447
        loss       0.00      0.00      0.00         0

    accuracy                           0.87       447
   macro avg       0.50      0.44      0.47       447
weighted avg       1.00      0.87      0.93       447

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         3
        loss       0.99      0.94      0.96       444

    accuracy                           0.93       447
   macro avg       0.50      0.47      0.48       447
weighted avg       0.99      0.93      0.96       447

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.04      0.50      0.07         4
        loss       0.99      0.88      0.93       443

    accuracy                           0.87       447
   macro avg  

In [13]:
poverty_estimation_pos_neg_preds = model_pos_neg.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
poverty_estimation_pos_neg_preds_probs = model_pos_neg.predict_proba(normalize_features(poverty_estimation[ALL_FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(poverty_estimation['class_pos_neg'], poverty_estimation_pos_neg_preds))

poverty_estimation_harsh_grad_preds = model_harsh_grad.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
poverty_estimation_harsh_grad_preds_probs = model_harsh_grad.predict_proba(normalize_features(poverty_estimation[ALL_FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation['harsh_grad_class'], poverty_estimation_harsh_grad_preds))

poverty_estimation_2nd_grad_drop_preds = model_2nd_grad_drop.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
poverty_estimation_2nd_grad_drop_preds_probs = model_2nd_grad_drop.predict_proba(normalize_features(poverty_estimation[ALL_FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation['2th_grad_drop_class'], poverty_estimation_2nd_grad_drop_preds))

poverty_estimation_order_of_mag_drop_preds = model_order_of_mag_drop.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
poverty_estimation_order_of_mag_drop_preds_probs = model_order_of_mag_drop.predict_proba(normalize_features(poverty_estimation[ALL_FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(poverty_estimation['order_of_mag_drop_class'], poverty_estimation_order_of_mag_drop_preds))

poverty_estimation_median_based_preds = model_median_based.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
poverty_estimation_median_based_preds_probs = model_median_based.predict_proba(normalize_features(poverty_estimation[ALL_FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(poverty_estimation['median_based_class'], poverty_estimation_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       0.09      1.00      0.16     11526
        loss       1.00      0.00      0.00    119402

    accuracy                           0.09    130928
   macro avg       0.54      0.50      0.08    130928
weighted avg       0.92      0.09      0.01    130928

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         1
        loss       1.00      0.97      0.98    130927

    accuracy                           0.97    130928
   macro avg       0.50      0.48      0.49    130928
weighted avg       1.00      0.97      0.98    130928

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00        13
        loss       1.00      0.92      0.96    130915

    accuracy                           0.92    130928
   macro avg  

### Again, results are not good for case studies when we use models created over openml-train (with many candidates per query)... Low containment rates and mean imputation strategies might be creating problems here. Note that the results get worse when there are *very few* examples in class "gain".

### Before we move on to experiments in which we only consider high containment rates, let's see which features were most important for each one of the models we created.

In [14]:
def show_feature_importances(feature_names, gini_indices):
    '''
    Given the names of the features and their Gini indices 
    for a given random forest model, this function prints the 
    features' importances in order.
    '''
    return sorted([(name, importance) for name, importance in zip(feature_names, gini_indices)],
                  key=lambda x: x[1],
                  reverse=True)

In [15]:
print('*** FEATURE IMPORTANCES -- model_pos_neg')
print(show_feature_importances(ALL_FEATURES, model_pos_neg.feature_importances_))

*** FEATURE IMPORTANCES -- model_pos_neg
[('candidate_target_max_spearman', 0.09327154524434599), ('candidate_target_max_pearson', 0.09269963717573605), ('candidate_max_skewness', 0.08560987541738488), ('candidate_max_kurtosis', 0.08484251778868), ('candidate_target_max_mutual_info', 0.07243872886740163), ('query_row_column_ratio', 0.06428677375367618), ('query_num_of_columns', 0.06208084094257545), ('candidate_max_unique', 0.05894097014923773), ('candidate_num_rows', 0.05747051085668549), ('query_max_kurtosis', 0.04813827898098861), ('query_target_max_spearman', 0.04759890774884748), ('query_target_max_pearson', 0.0473440303159694), ('query_max_skewness', 0.045533674264868707), ('query_target_max_mutual_info', 0.03869156670888208), ('query_max_unique', 0.03807734956417052), ('containment_fraction', 0.034187330841214066), ('query_num_of_rows', 0.025831986522149214), ('candidate_target_max_covariance', 0.0019964131553971217), ('query_target_max_covariance', 0.0009590617017893569)]


In [16]:
print('*** FEATURE IMPORTANCES -- model_harsh_grad')
print(show_feature_importances(ALL_FEATURES, model_harsh_grad.feature_importances_))

*** FEATURE IMPORTANCES -- model_harsh_grad
[('candidate_target_max_pearson', 0.12845309121785464), ('candidate_target_max_spearman', 0.12679667282820742), ('candidate_max_skewness', 0.08268012547662078), ('candidate_max_kurtosis', 0.07772789793934655), ('query_row_column_ratio', 0.06956562485914608), ('candidate_target_max_mutual_info', 0.06752709580716859), ('query_num_of_columns', 0.05514659112429107), ('candidate_max_unique', 0.0483843050297933), ('query_target_max_pearson', 0.04773428644279254), ('query_target_max_spearman', 0.047406711792386196), ('query_max_kurtosis', 0.04532020057709763), ('query_max_skewness', 0.04450310005850299), ('candidate_num_rows', 0.038461371090134026), ('query_target_max_mutual_info', 0.03518462271998167), ('query_max_unique', 0.0345012678217499), ('query_num_of_rows', 0.024834090082620462), ('containment_fraction', 0.02045612683734786), ('candidate_target_max_covariance', 0.00456069437512219), ('query_target_max_covariance', 0.0007561239198360735)]


In [17]:
print('*** FEATURE IMPORTANCES -- model_2nd_grad_drop')
print(show_feature_importances(ALL_FEATURES, model_2nd_grad_drop.feature_importances_))

*** FEATURE IMPORTANCES -- model_2nd_grad_drop
[('candidate_target_max_spearman', 0.12317088163047275), ('candidate_target_max_pearson', 0.11635787055716527), ('candidate_max_skewness', 0.08462908918814994), ('candidate_max_kurtosis', 0.08270441142206873), ('query_row_column_ratio', 0.07181546141796596), ('candidate_target_max_mutual_info', 0.06623251675126447), ('query_num_of_columns', 0.05865295134027073), ('candidate_max_unique', 0.05077788936991163), ('query_target_max_pearson', 0.048565333480530624), ('query_target_max_spearman', 0.04843698831217798), ('query_max_kurtosis', 0.04772405351829744), ('query_max_skewness', 0.045602710883535444), ('candidate_num_rows', 0.0414059222273426), ('query_target_max_mutual_info', 0.032218037311034835), ('query_max_unique', 0.03029921398912687), ('containment_fraction', 0.023901042213157416), ('query_num_of_rows', 0.021233675750339454), ('candidate_target_max_covariance', 0.004252139384316361), ('query_target_max_covariance', 0.00201981125287148

In [18]:
print('*** FEATURE IMPORTANCES -- model_order_of_mag_drop')
print(show_feature_importances(ALL_FEATURES, model_pos_neg.feature_importances_))

*** FEATURE IMPORTANCES -- model_order_of_mag_drop
[('candidate_target_max_spearman', 0.09327154524434599), ('candidate_target_max_pearson', 0.09269963717573605), ('candidate_max_skewness', 0.08560987541738488), ('candidate_max_kurtosis', 0.08484251778868), ('candidate_target_max_mutual_info', 0.07243872886740163), ('query_row_column_ratio', 0.06428677375367618), ('query_num_of_columns', 0.06208084094257545), ('candidate_max_unique', 0.05894097014923773), ('candidate_num_rows', 0.05747051085668549), ('query_max_kurtosis', 0.04813827898098861), ('query_target_max_spearman', 0.04759890774884748), ('query_target_max_pearson', 0.0473440303159694), ('query_max_skewness', 0.045533674264868707), ('query_target_max_mutual_info', 0.03869156670888208), ('query_max_unique', 0.03807734956417052), ('containment_fraction', 0.034187330841214066), ('query_num_of_rows', 0.025831986522149214), ('candidate_target_max_covariance', 0.0019964131553971217), ('query_target_max_covariance', 0.00095906170178935

In [19]:
print('*** FEATURE IMPORTANCES -- model_median_based')
print(show_feature_importances(ALL_FEATURES, model_pos_neg.feature_importances_))

*** FEATURE IMPORTANCES -- model_median_based
[('candidate_target_max_spearman', 0.09327154524434599), ('candidate_target_max_pearson', 0.09269963717573605), ('candidate_max_skewness', 0.08560987541738488), ('candidate_max_kurtosis', 0.08484251778868), ('candidate_target_max_mutual_info', 0.07243872886740163), ('query_row_column_ratio', 0.06428677375367618), ('query_num_of_columns', 0.06208084094257545), ('candidate_max_unique', 0.05894097014923773), ('candidate_num_rows', 0.05747051085668549), ('query_max_kurtosis', 0.04813827898098861), ('query_target_max_spearman', 0.04759890774884748), ('query_target_max_pearson', 0.0473440303159694), ('query_max_skewness', 0.045533674264868707), ('query_target_max_mutual_info', 0.03869156670888208), ('query_max_unique', 0.03807734956417052), ('containment_fraction', 0.034187330841214066), ('query_num_of_rows', 0.025831986522149214), ('candidate_target_max_covariance', 0.0019964131553971217), ('query_target_max_covariance', 0.0009590617017893569)]


### It's undeniable that features candidate_target_max_spearman and candidate_target_max_pearson are absolutely crucial for all the models... As a matter of fact, the ordering across the features does not vary a lot for the different models.

### Let's check how results change when we consider higher containment ratios alone (and stop using containment ratio as a feature). For now, just to make sure that this orthogonal issue of "missing data imputation" gets out of the way, I'll only consider instances (for training and test) with containment ratio $\theta = 1$.

In [20]:
FEATURES = ['query_num_of_columns', 'query_num_of_rows', 'query_row_column_ratio',
            'query_max_skewness', 'query_max_kurtosis', 'query_max_unique', 
            'candidate_num_rows', 'candidate_max_skewness', 'candidate_max_kurtosis',
            'candidate_max_unique', 'query_target_max_pearson', 
            'query_target_max_spearman', 'query_target_max_covariance', 
            'query_target_max_mutual_info', 'candidate_target_max_pearson', 
            'candidate_target_max_spearman', 'candidate_target_max_covariance', 
            'candidate_target_max_mutual_info']
THETA = 1

openml_training_high_containment = openml_training.loc[openml_training['containment_fraction'] >= THETA]

model_pos_neg_high_containment = train_model(openml_training_high_containment[FEATURES], 
                                             openml_training_high_containment['class_pos_neg'])

model_harsh_grad_high_containment = train_model(openml_training_high_containment[FEATURES], 
                                                openml_training_high_containment['harsh_grad_class'])

model_2nd_grad_drop_high_containment = train_model(openml_training_high_containment[FEATURES], 
                                                   openml_training_high_containment['2th_grad_drop_class'])

model_order_of_mag_drop_high_containment = train_model(openml_training_high_containment[FEATURES], 
                                                       openml_training_high_containment['order_of_mag_drop_class'])

model_median_based_high_containment = train_model(openml_training_high_containment[FEATURES], 
                                                  openml_training_high_containment['median_based_class'])

In [21]:
openml_test_high_containment = openml_test.loc[openml_test['containment_fraction'] >= THETA]

test_high_containment_pos_neg_preds = model_pos_neg_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_pos_neg_preds_probs = model_pos_neg_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.54      0.47      0.50     19748
        loss       0.53      0.60      0.56     20028

    accuracy                           0.54     39776
   macro avg       0.54      0.53      0.53     39776
weighted avg       0.54      0.54      0.53     39776



In [22]:
test_high_containment_harsh_grad_preds = model_harsh_grad_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_harsh_grad_preds_probs = model_harsh_grad_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['harsh_grad_class'], test_high_containment_harsh_grad_preds))

              precision    recall  f1-score   support

        gain       0.92      0.08      0.14      1807
        loss       0.96      1.00      0.98     37969

    accuracy                           0.96     39776
   macro avg       0.94      0.54      0.56     39776
weighted avg       0.96      0.96      0.94     39776



In [23]:
test_high_containment_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['2th_grad_drop_class'], test_high_containment_2nd_grad_drop_preds))

              precision    recall  f1-score   support

        gain       0.83      0.05      0.10      3509
        loss       0.92      1.00      0.96     36267

    accuracy                           0.92     39776
   macro avg       0.87      0.53      0.53     39776
weighted avg       0.91      0.92      0.88     39776



In [24]:
test_high_containment_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['order_of_mag_drop_class'], test_high_containment_order_of_mag_drop_preds))

              precision    recall  f1-score   support

        gain       0.70      0.07      0.12      8221
        loss       0.80      0.99      0.89     31555

    accuracy                           0.80     39776
   macro avg       0.75      0.53      0.50     39776
weighted avg       0.78      0.80      0.73     39776



In [25]:
test_high_containment_median_based_preds = model_median_based_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_median_based_preds_probs = model_median_based_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['median_based_class'], test_high_containment_median_based_preds))

              precision    recall  f1-score   support

        gain       0.56      0.20      0.29     14622
        loss       0.66      0.91      0.77     25154

    accuracy                           0.65     39776
   macro avg       0.61      0.55      0.53     39776
weighted avg       0.62      0.65      0.59     39776



### Again, results seem to be better for positive/negative definition of class... Let's see how it goes to the case studies, now also with higher containment.

In [26]:
college_debt_high_containment = college_debt.loc[college_debt['containment_fraction'] >= THETA]

college_debt_high_containment_pos_neg_preds = model_pos_neg_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_pos_neg_preds_probs = model_pos_neg_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_pos_neg_preds))

college_debt_high_containment_harsh_grad_preds = model_harsh_grad_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_harsh_grad_preds_probs = model_harsh_grad_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(college_debt_high_containment['harsh_grad_class'], college_debt_high_containment_harsh_grad_preds))

college_debt_high_containment_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(college_debt_high_containment['2th_grad_drop_class'], college_debt_high_containment_2nd_grad_drop_preds))

college_debt_high_containment_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(college_debt_high_containment['order_of_mag_drop_class'], college_debt_high_containment_order_of_mag_drop_preds))

college_debt_high_containment_median_based_preds = model_median_based_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_median_based_preds_probs = model_median_based_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(college_debt_high_containment['median_based_class'], college_debt_high_containment_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      1.00      1.00         6
        loss       1.00      1.00      1.00         3

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         1
        loss       0.89      1.00      0.94         8

    accuracy                           0.89         9
   macro avg       0.44      0.50      0.47         9
weighted avg       0.79      0.89      0.84         9

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         1
        loss       0.89      1.00      0.94         8

    accuracy                           0.89         9
   macro avg  

In [106]:
taxi_collision_high_containment = taxi_collision.loc[taxi_collision['containment_fraction'] >= THETA]

taxi_collision_high_containment_pos_neg_preds = model_pos_neg_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_pos_neg_preds_probs = model_pos_neg_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_pos_neg_preds))

taxi_collision_high_containment_harsh_grad_preds = model_harsh_grad_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_harsh_grad_preds_probs = model_harsh_grad_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['harsh_grad_class'], taxi_collision_high_containment_harsh_grad_preds))

taxi_collision_high_containment_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['2th_grad_drop_class'], taxi_collision_high_containment_2nd_grad_drop_preds))

taxi_collision_high_containment_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['order_of_mag_drop_class'], taxi_collision_high_containment_order_of_mag_drop_preds))

taxi_collision_high_containment_median_based_preds = model_median_based_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_median_based_preds_probs = model_median_based_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(taxi_collision_high_containment['median_based_class'], taxi_collision_high_containment_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.22      0.36        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.22        18
   macro avg       0.50      0.11      0.18        18
weighted avg       1.00      0.22      0.36        18

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

**** ORDER OF

In [28]:
poverty_estimation_high_containment = poverty_estimation.loc[poverty_estimation['containment_fraction'] >= THETA]

poverty_estimation_high_containment_pos_neg_preds = model_pos_neg_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_pos_neg_preds_probs = model_pos_neg_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_pos_neg_preds))

poverty_estimation_high_containment_harsh_grad_preds = model_harsh_grad_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_harsh_grad_preds_probs = model_harsh_grad_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['harsh_grad_class'], poverty_estimation_high_containment_harsh_grad_preds))

poverty_estimation_high_containment_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['2th_grad_drop_class'], poverty_estimation_high_containment_2nd_grad_drop_preds))

poverty_estimation_high_containment_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['order_of_mag_drop_class'], poverty_estimation_high_containment_order_of_mag_drop_preds))

poverty_estimation_high_containment_median_based_preds = model_median_based_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_median_based_preds_probs = model_median_based_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(poverty_estimation_high_containment['median_based_class'], poverty_estimation_high_containment_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.36      0.53        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.36        11
   macro avg       0.50      0.18      0.27        11
weighted avg       1.00      0.36      0.53        11

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

**** ORDER OF

### Results do seem to get better when no mean data imputation is performed. What if we keep the models with $\theta = 1$ but accept a certain level of missing data imputation in test and use cases (say $\theta = 0.5$)?

In [29]:
THETA = 0.5

openml_test_high_containment = openml_test.loc[openml_test['containment_fraction'] >= THETA]

test_high_containment_pos_neg_preds = model_pos_neg_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_pos_neg_preds_probs = model_pos_neg_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_pos_neg_preds))

test_high_containment_harsh_grad_preds = model_harsh_grad_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_harsh_grad_preds_probs = model_harsh_grad_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['harsh_grad_class'], test_high_containment_harsh_grad_preds))

test_high_containment_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['2th_grad_drop_class'], test_high_containment_2nd_grad_drop_preds))

test_high_containment_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['order_of_mag_drop_class'], test_high_containment_order_of_mag_drop_preds))

test_high_containment_median_based_preds = model_median_based_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
test_high_containment_median_based_preds_probs = model_median_based_high_containment.predict_proba(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['median_based_class'], test_high_containment_median_based_preds))

              precision    recall  f1-score   support

        gain       0.56      0.43      0.49     23212
        loss       0.53      0.65      0.59     23169

    accuracy                           0.54     46381
   macro avg       0.54      0.54      0.54     46381
weighted avg       0.54      0.54      0.54     46381

              precision    recall  f1-score   support

        gain       0.89      0.07      0.13      2114
        loss       0.96      1.00      0.98     44267

    accuracy                           0.96     46381
   macro avg       0.93      0.54      0.55     46381
weighted avg       0.95      0.96      0.94     46381

              precision    recall  f1-score   support

        gain       0.82      0.05      0.09      4237
        loss       0.91      1.00      0.95     42144

    accuracy                           0.91     46381
   macro avg       0.87      0.52      0.52     46381
weighted avg       0.90      0.91      0.87     46381

              preci

In [30]:
college_debt_high_containment = college_debt.loc[college_debt['containment_fraction'] >= THETA]

college_debt_high_containment_pos_neg_preds = model_pos_neg_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_pos_neg_preds_probs = model_pos_neg_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_pos_neg_preds))

college_debt_high_containment_harsh_grad_preds = model_harsh_grad_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_harsh_grad_preds_probs = model_harsh_grad_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(college_debt_high_containment['harsh_grad_class'], college_debt_high_containment_harsh_grad_preds))

college_debt_high_containment_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(college_debt_high_containment['2th_grad_drop_class'], college_debt_high_containment_2nd_grad_drop_preds))

college_debt_high_containment_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(college_debt_high_containment['order_of_mag_drop_class'], college_debt_high_containment_order_of_mag_drop_preds))

college_debt_high_containment_median_based_preds = model_median_based_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
college_debt_high_containment_median_based_preds_probs = model_median_based_high_containment.predict_proba(normalize_features(college_debt_high_containment[FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(college_debt_high_containment['median_based_class'], college_debt_high_containment_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       0.95      0.41      0.58        92
        loss       0.30      0.92      0.45        25

    accuracy                           0.52       117
   macro avg       0.62      0.67      0.51       117
weighted avg       0.81      0.52      0.55       117

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         1
        loss       0.99      0.98      0.99       116

    accuracy                           0.97       117
   macro avg       0.50      0.49      0.49       117
weighted avg       0.98      0.97      0.98       117

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.17      0.67      0.27         3
        loss       0.99      0.91      0.95       114

    accuracy                           0.91       117
   macro avg  

In [107]:
taxi_collision_high_containment = taxi_collision.loc[taxi_collision['containment_fraction'] >= THETA]

taxi_collision_high_containment_pos_neg_preds = model_pos_neg_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_pos_neg_preds_probs = model_pos_neg_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_pos_neg_preds))

taxi_collision_high_containment_harsh_grad_preds = model_harsh_grad_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_harsh_grad_preds_probs = model_harsh_grad_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['harsh_grad_class'], taxi_collision_high_containment_harsh_grad_preds))

taxi_collision_high_containment_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['2th_grad_drop_class'], taxi_collision_high_containment_2nd_grad_drop_preds))

taxi_collision_high_containment_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['order_of_mag_drop_class'], taxi_collision_high_containment_order_of_mag_drop_preds))

taxi_collision_high_containment_median_based_preds = model_median_based_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
taxi_collision_high_containment_median_based_preds_probs = model_median_based_high_containment.predict_proba(normalize_features(taxi_collision_high_containment[FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(taxi_collision_high_containment['median_based_class'], taxi_collision_high_containment_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.22      0.36        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.22        18
   macro avg       0.50      0.11      0.18        18
weighted avg       1.00      0.22      0.36        18

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

**** ORDER OF

In [32]:
poverty_estimation_high_containment = poverty_estimation.loc[poverty_estimation['containment_fraction'] >= THETA]

poverty_estimation_high_containment_pos_neg_preds = model_pos_neg_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_pos_neg_preds_probs = model_pos_neg_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_pos_neg_preds))

poverty_estimation_high_containment_harsh_grad_preds = model_harsh_grad_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_harsh_grad_preds_probs = model_harsh_grad_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['harsh_grad_class'], poverty_estimation_high_containment_harsh_grad_preds))

poverty_estimation_high_containment_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['2th_grad_drop_class'], poverty_estimation_high_containment_2nd_grad_drop_preds))

poverty_estimation_high_containment_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['order_of_mag_drop_class'], poverty_estimation_high_containment_order_of_mag_drop_preds))

poverty_estimation_high_containment_median_based_preds = model_median_based_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
poverty_estimation_high_containment_median_based_preds_probs = model_median_based_high_containment.predict_proba(normalize_features(poverty_estimation_high_containment[FEATURES]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(poverty_estimation_high_containment['median_based_class'], poverty_estimation_high_containment_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.36      0.53        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.36        11
   macro avg       0.50      0.18      0.27        11
weighted avg       1.00      0.36      0.53        11

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

**** ORDER OF

### Now with a few more candidates, we still can say that the results are not very good, but the POSITIVE/NEGATIVE and the MEDIAN-BASED policy seem to be the best.

### For all the next experiments, we'll use $\theta = 1$ for building models and consider $\theta = 0.5$ for test data and case studies.

### Maybe some of the features we've been using are adding problems to the data? To check this out, we'll take a minimalistic approach and create models that use only EITHER candidate_target_max_spearman OR candidate_target_max_pearson.

In [33]:
PEARSON = ['candidate_target_max_pearson']
SPEARMAN = ['candidate_target_max_spearman']

model_pos_neg_high_containment_pearson = train_model(openml_training_high_containment[PEARSON],
                                                     openml_training_high_containment['class_pos_neg'])

model_harsh_grad_high_containment_pearson = train_model(openml_training_high_containment[PEARSON],
                                                        openml_training_high_containment['harsh_grad_class'])

model_2nd_grad_drop_high_containment_pearson = train_model(openml_training_high_containment[PEARSON],
                                                           openml_training_high_containment['2th_grad_drop_class'])

model_order_of_mag_drop_high_containment_pearson = train_model(openml_training_high_containment[PEARSON],
                                                               openml_training_high_containment['order_of_mag_drop_class'])

model_median_based_high_containment_pearson = train_model(openml_training_high_containment[PEARSON],
                                                          openml_training_high_containment['median_based_class'])





model_pos_neg_high_containment_spearman = train_model(openml_training_high_containment[SPEARMAN],
                                                      openml_training_high_containment['class_pos_neg'])

model_harsh_grad_high_containment_spearman = train_model(openml_training_high_containment[SPEARMAN],
                                                         openml_training_high_containment['harsh_grad_class'])

model_2nd_grad_drop_high_containment_spearman = train_model(openml_training_high_containment[SPEARMAN],
                                                            openml_training_high_containment['2th_grad_drop_class'])

model_order_of_mag_drop_high_containment_spearman = train_model(openml_training_high_containment[SPEARMAN],
                                                                openml_training_high_containment['order_of_mag_drop_class'])

model_median_based_high_containment_spearman = train_model(openml_training_high_containment[SPEARMAN],
                                                          openml_training_high_containment['median_based_class'])

### Let's start with Pearson.

In [34]:
test_high_containment_pearson_pos_neg_preds = model_pos_neg_high_containment_pearson.predict(normalize_features(openml_test_high_containment[PEARSON]))
test_high_containment_pearson_pos_neg_preds_probs = model_pos_neg_high_containment_pearson.predict_proba(normalize_features(openml_test_high_containment[PEARSON]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_pearson_pos_neg_preds))

test_high_containment_pearson_harsh_grad_preds = model_harsh_grad_high_containment_pearson.predict(normalize_features(openml_test_high_containment[PEARSON]))
test_high_containment_pearson_harsh_grad_preds_probs = model_harsh_grad_high_containment_pearson.predict_proba(normalize_features(openml_test_high_containment[PEARSON]))
print(classification_report(openml_test_high_containment['harsh_grad_class'], test_high_containment_pearson_harsh_grad_preds))

test_high_containment_pearson_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment_pearson.predict(normalize_features(openml_test_high_containment[PEARSON]))
test_high_containment_pearson_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment_pearson.predict_proba(normalize_features(openml_test_high_containment[PEARSON]))
print(classification_report(openml_test_high_containment['2th_grad_drop_class'], test_high_containment_pearson_2nd_grad_drop_preds))

test_high_containment_pearson_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment_pearson.predict(normalize_features(openml_test_high_containment[PEARSON]))
test_high_containment_pearson_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment_pearson.predict_proba(normalize_features(openml_test_high_containment[PEARSON]))
print(classification_report(openml_test_high_containment['order_of_mag_drop_class'], test_high_containment_pearson_order_of_mag_drop_preds))

test_high_containment_pearson_median_based_preds = model_median_based_high_containment_pearson.predict(normalize_features(openml_test_high_containment[PEARSON]))
test_high_containment_pearson_median_based_preds_probs = model_median_based_high_containment_pearson.predict_proba(normalize_features(openml_test_high_containment[PEARSON]))
print(classification_report(openml_test_high_containment['median_based_class'], test_high_containment_pearson_median_based_preds))

              precision    recall  f1-score   support

        gain       0.51      0.47      0.49     23212
        loss       0.50      0.54      0.52     23169

    accuracy                           0.51     46381
   macro avg       0.51      0.51      0.51     46381
weighted avg       0.51      0.51      0.51     46381

              precision    recall  f1-score   support

        gain       0.18      0.05      0.08      2114
        loss       0.96      0.99      0.97     44267

    accuracy                           0.95     46381
   macro avg       0.57      0.52      0.52     46381
weighted avg       0.92      0.95      0.93     46381

              precision    recall  f1-score   support

        gain       0.19      0.05      0.08      4237
        loss       0.91      0.98      0.94     42144

    accuracy                           0.89     46381
   macro avg       0.55      0.51      0.51     46381
weighted avg       0.85      0.89      0.86     46381

              preci

In [35]:
college_debt_high_containment_pearson_pos_neg_preds = model_pos_neg_high_containment_pearson.predict(normalize_features(college_debt_high_containment[PEARSON]))
college_debt_high_containment_pearson_pos_neg_preds_probs = model_pos_neg_high_containment_pearson.predict_proba(normalize_features(college_debt_high_containment[PEARSON]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_pearson_pos_neg_preds))

college_debt_high_containment_pearson_harsh_grad_preds = model_harsh_grad_high_containment_pearson.predict(normalize_features(college_debt_high_containment[PEARSON]))
college_debt_high_containment_pearson_harsh_grad_preds_probs = model_harsh_grad_high_containment_pearson.predict_proba(normalize_features(college_debt_high_containment[PEARSON]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(college_debt_high_containment['harsh_grad_class'], college_debt_high_containment_pearson_harsh_grad_preds))

college_debt_high_containment_pearson_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment_pearson.predict(normalize_features(college_debt_high_containment[PEARSON]))
college_debt_high_containment_pearson_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment_pearson.predict_proba(normalize_features(college_debt_high_containment[PEARSON]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(college_debt_high_containment['2th_grad_drop_class'], college_debt_high_containment_pearson_2nd_grad_drop_preds))

college_debt_high_containment_pearson_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment_pearson.predict(normalize_features(college_debt_high_containment[PEARSON]))
college_debt_high_containment_pearson_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment_pearson.predict_proba(normalize_features(college_debt_high_containment[PEARSON]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(college_debt_high_containment['order_of_mag_drop_class'], college_debt_high_containment_pearson_order_of_mag_drop_preds))

college_debt_high_containment_pearson_median_based_preds = model_median_based_high_containment_pearson.predict(normalize_features(college_debt_high_containment[PEARSON]))
college_debt_high_containment_pearson_median_based_preds_probs = model_median_based_high_containment_pearson.predict_proba(normalize_features(college_debt_high_containment[PEARSON]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(college_debt_high_containment['median_based_class'], college_debt_high_containment_pearson_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       0.89      0.37      0.52        92
        loss       0.27      0.84      0.40        25

    accuracy                           0.47       117
   macro avg       0.58      0.60      0.46       117
weighted avg       0.76      0.47      0.50       117

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         1
        loss       0.99      0.96      0.97       116

    accuracy                           0.95       117
   macro avg       0.50      0.48      0.49       117
weighted avg       0.98      0.95      0.97       117

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         3
        loss       0.97      0.96      0.96       114

    accuracy                           0.93       117
   macro avg  

In [108]:
taxi_collision_high_containment_pearson_pos_neg_preds = model_pos_neg_high_containment_pearson.predict(normalize_features(taxi_collision_high_containment[PEARSON]))
taxi_collision_high_containment_pearson_pos_neg_preds_probs = model_pos_neg_high_containment_pearson.predict_proba(normalize_features(taxi_collision_high_containment[PEARSON]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_pearson_pos_neg_preds))

taxi_collision_high_containment_pearson_harsh_grad_preds = model_harsh_grad_high_containment_pearson.predict(normalize_features(taxi_collision_high_containment[PEARSON]))
taxi_collision_high_containment_pearson_harsh_grad_preds_probs = model_harsh_grad_high_containment_pearson.predict_proba(normalize_features(taxi_collision_high_containment[PEARSON]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['harsh_grad_class'], taxi_collision_high_containment_pearson_harsh_grad_preds))

taxi_collision_high_containment_pearson_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment_pearson.predict(normalize_features(taxi_collision_high_containment[PEARSON]))
taxi_collision_high_containment_pearson_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment_pearson.predict_proba(normalize_features(taxi_collision_high_containment[PEARSON]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['2th_grad_drop_class'], taxi_collision_high_containment_pearson_2nd_grad_drop_preds))

taxi_collision_high_containment_pearson_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment_pearson.predict(normalize_features(taxi_collision_high_containment[PEARSON]))
taxi_collision_high_containment_pearson_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment_pearson.predict_proba(normalize_features(taxi_collision_high_containment[PEARSON]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['order_of_mag_drop_class'], taxi_collision_high_containment_pearson_order_of_mag_drop_preds))

taxi_collision_high_containment_pearson_median_based_preds = model_median_based_high_containment_pearson.predict(normalize_features(taxi_collision_high_containment[PEARSON]))
taxi_collision_high_containment_pearson_median_based_preds_probs = model_median_based_high_containment_pearson.predict_proba(normalize_features(taxi_collision_high_containment[PEARSON]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(taxi_collision_high_containment['median_based_class'], taxi_collision_high_containment_pearson_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.50      0.67        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.50        18
   macro avg       0.50      0.25      0.33        18
weighted avg       1.00      0.50      0.67        18

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

**** ORDER OF

In [37]:
poverty_estimation_high_containment_pearson_pos_neg_preds = model_pos_neg_high_containment_pearson.predict(normalize_features(poverty_estimation_high_containment[PEARSON]))
poverty_estimation_high_containment_pearson_pos_neg_preds_probs = model_pos_neg_high_containment_pearson.predict_proba(normalize_features(poverty_estimation_high_containment[PEARSON]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_pearson_pos_neg_preds))

poverty_estimation_high_containment_pearson_harsh_grad_preds = model_harsh_grad_high_containment_pearson.predict(normalize_features(poverty_estimation_high_containment[PEARSON]))
poverty_estimation_high_containment_pearson_harsh_grad_preds_probs = model_harsh_grad_high_containment_pearson.predict_proba(normalize_features(poverty_estimation_high_containment[PEARSON]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['harsh_grad_class'], poverty_estimation_high_containment_pearson_harsh_grad_preds))

poverty_estimation_high_containment_pearson_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment_pearson.predict(normalize_features(poverty_estimation_high_containment[PEARSON]))
poverty_estimation_high_containment_pearson_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment_pearson.predict_proba(normalize_features(poverty_estimation_high_containment[PEARSON]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['2th_grad_drop_class'], poverty_estimation_high_containment_pearson_2nd_grad_drop_preds))

poverty_estimation_high_containment_pearson_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment_pearson.predict(normalize_features(poverty_estimation_high_containment[PEARSON]))
poverty_estimation_high_containment_pearson_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment_pearson.predict_proba(normalize_features(poverty_estimation_high_containment[PEARSON]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['order_of_mag_drop_class'], poverty_estimation_high_containment_pearson_order_of_mag_drop_preds))

poverty_estimation_high_containment_pearson_median_based_preds = model_median_based_high_containment_pearson.predict(normalize_features(poverty_estimation_high_containment[PEARSON]))
poverty_estimation_high_containment_pearson_median_based_preds_probs = model_median_based_high_containment_pearson.predict_proba(normalize_features(poverty_estimation_high_containment[PEARSON]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(poverty_estimation_high_containment['median_based_class'], poverty_estimation_high_containment_pearson_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.36      0.53        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.36        11
   macro avg       0.50      0.18      0.27        11
weighted avg       1.00      0.36      0.53        11

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

**** ORDER OF

### Now let's do the same with Spearman alone.

In [38]:
test_high_containment_spearman_pos_neg_preds = model_pos_neg_high_containment_spearman.predict(normalize_features(openml_test_high_containment[SPEARMAN]))
test_high_containment_spearman_pos_neg_preds_probs = model_pos_neg_high_containment_spearman.predict_proba(normalize_features(openml_test_high_containment[SPEARMAN]))
test_high_containment_spearman_pos_neg_preds_probs = model_pos_neg_high_containment_spearman.predict_proba(normalize_features(openml_test_high_containment[SPEARMAN]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_spearman_pos_neg_preds))

test_high_containment_spearman_harsh_grad_preds = model_harsh_grad_high_containment_spearman.predict(normalize_features(openml_test_high_containment[SPEARMAN]))
test_high_containment_spearman_harsh_grad_preds_probs = model_harsh_grad_high_containment_spearman.predict_proba(normalize_features(openml_test_high_containment[SPEARMAN]))
print(classification_report(openml_test_high_containment['harsh_grad_class'], test_high_containment_spearman_harsh_grad_preds))

test_high_containment_spearman_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment_spearman.predict(normalize_features(openml_test_high_containment[SPEARMAN]))
test_high_containment_spearman_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment_spearman.predict_proba(normalize_features(openml_test_high_containment[SPEARMAN]))
print(classification_report(openml_test_high_containment['2th_grad_drop_class'], test_high_containment_spearman_2nd_grad_drop_preds))

test_high_containment_spearman_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment_spearman.predict(normalize_features(openml_test_high_containment[SPEARMAN]))
test_high_containment_spearman_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment_spearman.predict_proba(normalize_features(openml_test_high_containment[SPEARMAN]))
print(classification_report(openml_test_high_containment['order_of_mag_drop_class'], test_high_containment_spearman_order_of_mag_drop_preds))

test_high_containment_spearman_median_based_preds = model_median_based_high_containment_spearman.predict(normalize_features(openml_test_high_containment[SPEARMAN]))
test_high_containment_spearman_median_based_preds_probs = model_median_based_high_containment_spearman.predict_proba(normalize_features(openml_test_high_containment[SPEARMAN]))
print(classification_report(openml_test_high_containment['median_based_class'], test_high_containment_spearman_median_based_preds))

              precision    recall  f1-score   support

        gain       0.51      0.46      0.48     23212
        loss       0.51      0.56      0.53     23169

    accuracy                           0.51     46381
   macro avg       0.51      0.51      0.51     46381
weighted avg       0.51      0.51      0.51     46381

              precision    recall  f1-score   support

        gain       0.11      0.03      0.05      2114
        loss       0.96      0.99      0.97     44267

    accuracy                           0.95     46381
   macro avg       0.53      0.51      0.51     46381
weighted avg       0.92      0.95      0.93     46381

              precision    recall  f1-score   support

        gain       0.14      0.04      0.06      4237
        loss       0.91      0.98      0.94     42144

    accuracy                           0.89     46381
   macro avg       0.53      0.51      0.50     46381
weighted avg       0.84      0.89      0.86     46381

              preci

In [39]:
college_debt_high_containment_spearman_pos_neg_preds = model_pos_neg_high_containment_spearman.predict(normalize_features(college_debt_high_containment[SPEARMAN]))
college_debt_high_containment_spearman_pos_neg_preds_probs = model_pos_neg_high_containment_spearman.predict_proba(normalize_features(college_debt_high_containment[SPEARMAN]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_spearman_pos_neg_preds))

college_debt_high_containment_spearman_harsh_grad_preds = model_harsh_grad_high_containment_spearman.predict(normalize_features(college_debt_high_containment[SPEARMAN]))
college_debt_high_containment_spearman_harsh_grad_preds_probs = model_harsh_grad_high_containment_spearman.predict_proba(normalize_features(college_debt_high_containment[SPEARMAN]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(college_debt_high_containment['harsh_grad_class'], college_debt_high_containment_spearman_harsh_grad_preds))

college_debt_high_containment_spearman_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment_spearman.predict(normalize_features(college_debt_high_containment[SPEARMAN]))
college_debt_high_containment_spearman_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment_spearman.predict_proba(normalize_features(college_debt_high_containment[SPEARMAN]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(college_debt_high_containment['2th_grad_drop_class'], college_debt_high_containment_spearman_2nd_grad_drop_preds))

college_debt_high_containment_spearman_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment_spearman.predict(normalize_features(college_debt_high_containment[SPEARMAN]))
college_debt_high_containment_spearman_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment_spearman.predict_proba(normalize_features(college_debt_high_containment[SPEARMAN]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(college_debt_high_containment['order_of_mag_drop_class'], college_debt_high_containment_spearman_order_of_mag_drop_preds))

college_debt_high_containment_spearman_median_based_preds = model_median_based_high_containment_spearman.predict(normalize_features(college_debt_high_containment[SPEARMAN]))
college_debt_high_containment_spearman_median_based_preds_probs = model_median_based_high_containment_spearman.predict_proba(normalize_features(college_debt_high_containment[SPEARMAN]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(college_debt_high_containment['median_based_class'], college_debt_high_containment_spearman_median_based_preds))



**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       0.85      0.36      0.50        92
        loss       0.24      0.76      0.37        25

    accuracy                           0.44       117
   macro avg       0.54      0.56      0.44       117
weighted avg       0.72      0.44      0.47       117

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       1.00      1.00      1.00         1
        loss       1.00      1.00      1.00       116

    accuracy                           1.00       117
   macro avg       1.00      1.00      1.00       117
weighted avg       1.00      1.00      1.00       117

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.33      0.50         3
        loss       0.98      1.00      0.99       114

    accuracy                           0.98       117
   macro avg  

In [109]:
taxi_collision_high_containment_spearman_pos_neg_preds = model_pos_neg_high_containment_spearman.predict(normalize_features(taxi_collision_high_containment[SPEARMAN]))
taxi_collision_high_containment_spearman_pos_neg_preds_probs = model_pos_neg_high_containment_spearman.predict_proba(normalize_features(taxi_collision_high_containment[SPEARMAN]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_spearman_pos_neg_preds))

taxi_collision_high_containment_spearman_harsh_grad_preds = model_harsh_grad_high_containment_spearman.predict(normalize_features(taxi_collision_high_containment[SPEARMAN]))
taxi_collision_high_containment_spearman_harsh_grad_preds_probs = model_harsh_grad_high_containment_spearman.predict_proba(normalize_features(taxi_collision_high_containment[SPEARMAN]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['harsh_grad_class'], taxi_collision_high_containment_spearman_harsh_grad_preds))

taxi_collision_high_containment_spearman_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment_spearman.predict(normalize_features(taxi_collision_high_containment[SPEARMAN]))
taxi_collision_high_containment_spearman_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment_spearman.predict_proba(normalize_features(taxi_collision_high_containment[SPEARMAN]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['2th_grad_drop_class'], taxi_collision_high_containment_spearman_2nd_grad_drop_preds))

taxi_collision_high_containment_spearman_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment_spearman.predict(normalize_features(taxi_collision_high_containment[SPEARMAN]))
taxi_collision_high_containment_spearman_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment_spearman.predict_proba(normalize_features(taxi_collision_high_containment[SPEARMAN]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(taxi_collision_high_containment['order_of_mag_drop_class'], taxi_collision_high_containment_spearman_order_of_mag_drop_preds))

taxi_collision_high_containment_spearman_median_based_preds = model_median_based_high_containment_spearman.predict(normalize_features(taxi_collision_high_containment[SPEARMAN]))
taxi_collision_high_containment_spearman_median_based_preds_probs = model_median_based_high_containment_spearman.predict_proba(normalize_features(taxi_collision_high_containment[SPEARMAN]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(taxi_collision_high_containment['median_based_class'], taxi_collision_high_containment_spearman_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.33      0.50        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.33        18
   macro avg       0.50      0.17      0.25        18
weighted avg       1.00      0.33      0.50        18

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

**** ORDER OF

In [41]:
poverty_estimation_high_containment_spearman_pos_neg_preds = model_pos_neg_high_containment_spearman.predict(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
poverty_estimation_high_containment_spearman_pos_neg_preds_probs = model_pos_neg_high_containment_spearman.predict_proba(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
print('**** POSITIVE AND NEGATIVE POLICY ****')
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_spearman_pos_neg_preds))

poverty_estimation_high_containment_spearman_harsh_grad_preds = model_harsh_grad_high_containment_spearman.predict(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
poverty_estimation_high_containment_spearman_harsh_grad_preds_probs = model_harsh_grad_high_containment_spearman.predict_proba(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
print('**** HARSH GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['harsh_grad_class'], poverty_estimation_high_containment_spearman_harsh_grad_preds))

poverty_estimation_high_containment_spearman_2nd_grad_drop_preds = model_2nd_grad_drop_high_containment_spearman.predict(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
poverty_estimation_high_containment_spearman_2nd_grad_drop_preds_probs = model_2nd_grad_drop_high_containment_spearman.predict_proba(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
print('**** SECOND GRADIENT DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['2th_grad_drop_class'], poverty_estimation_high_containment_spearman_2nd_grad_drop_preds))

poverty_estimation_high_containment_spearman_order_of_mag_drop_preds = model_order_of_mag_drop_high_containment_spearman.predict(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
poverty_estimation_high_containment_spearman_order_of_mag_drop_preds_probs = model_order_of_mag_drop_high_containment_spearman.predict_proba(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
print('**** ORDER OF MAGNITUDE DROP POLICY ****')
print(classification_report(poverty_estimation_high_containment['order_of_mag_drop_class'], poverty_estimation_high_containment_spearman_order_of_mag_drop_preds))

poverty_estimation_high_containment_spearman_median_based_preds = model_median_based_high_containment_spearman.predict(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
poverty_estimation_high_containment_spearman_median_based_preds_probs = model_median_based_high_containment_spearman.predict_proba(normalize_features(poverty_estimation_high_containment[SPEARMAN]))
print('**** MEDIAN-BASED POLICY ****')
print(classification_report(poverty_estimation_high_containment['median_based_class'], poverty_estimation_high_containment_spearman_median_based_preds))

**** POSITIVE AND NEGATIVE POLICY ****
              precision    recall  f1-score   support

        gain       1.00      0.64      0.78        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.64        11
   macro avg       0.50      0.32      0.39        11
weighted avg       1.00      0.64      0.78        11

**** HARSH GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

**** SECOND GRADIENT DROP POLICY ****
              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

**** ORDER OF

### Again, results with positive/negative and median-based policies worked best (especially positive/negative). Note that results do get a bit worse when we use just these two features (one at a time)...

### For now on, let's use FEATURES again (i.e., ALL_FEATURES - ['containment_ratio']).

### What if, instead of using a random forest classifier, we use SVM variations?

In [42]:
from sklearn.svm import LinearSVC


def train_linear_svm(features, classes):
    '''
    Builds a model using features to predict associated classes
    '''

    feature_scaler = StandardScaler()
    features_train = feature_scaler.fit_transform(features)
    
    clf = LinearSVC(random_state=42, tol=1e-5)
    clf.fit(features_train, classes)

    return clf

In [43]:
linear_svm_pos_neg_high_containment = train_linear_svm(openml_training_high_containment[FEATURES],
                                                       openml_training_high_containment['class_pos_neg'])

In [45]:
linear_svm_harsh_grad_high_containment = train_linear_svm(openml_training_high_containment[FEATURES],
                                                          openml_training_high_containment['harsh_grad_class'])

linear_svm_2nd_grad_drop_high_containment = train_linear_svm(openml_training_high_containment[FEATURES],
                                                             openml_training_high_containment['2th_grad_drop_class'])

linear_svm_order_of_mag_drop_high_containment = train_linear_svm(openml_training_high_containment[FEATURES],
                                                                 openml_training_high_containment['order_of_mag_drop_class'])

linear_svm_median_based_high_containment = train_linear_svm(openml_training_high_containment[FEATURES],
                                                              openml_training_high_containment['median_based_class'])

In [46]:
test_high_containment_linear_svm_pos_neg_preds = linear_svm_pos_neg_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_linear_svm_pos_neg_preds))

test_high_containment_linear_svm_harsh_grad_preds = linear_svm_harsh_grad_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['harsh_grad_class'], test_high_containment_linear_svm_harsh_grad_preds))

test_high_containment_linear_svm_2nd_grad_drop_preds = linear_svm_2nd_grad_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['2th_grad_drop_class'], test_high_containment_linear_svm_2nd_grad_drop_preds))

test_high_containment_linear_svm_order_of_mag_drop_preds = linear_svm_order_of_mag_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['order_of_mag_drop_class'], test_high_containment_linear_svm_order_of_mag_drop_preds))

test_high_containment_linear_svm_median_based_preds = linear_svm_median_based_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['median_based_class'], test_high_containment_linear_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.57      0.46      0.51     23212
        loss       0.55      0.66      0.60     23169

    accuracy                           0.56     46381
   macro avg       0.56      0.56      0.55     46381
weighted avg       0.56      0.56      0.55     46381

              precision    recall  f1-score   support

        gain       1.00      0.00      0.00      2114
        loss       0.95      1.00      0.98     44267

    accuracy                           0.95     46381
   macro avg       0.98      0.50      0.49     46381
weighted avg       0.96      0.95      0.93     46381

              precision    recall  f1-score   support

        gain       0.92      0.00      0.01      4237
        loss       0.91      1.00      0.95     42144

    accuracy                           0.91     46381
   macro avg       0.92      0.50      0.48     46381
weighted avg       0.91      0.91      0.87     46381

              preci

In [47]:
college_debt_high_containment_linear_svm_pos_neg_preds = linear_svm_pos_neg_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_linear_svm_pos_neg_preds))

college_debt_high_containment_linear_svm_harsh_grad_preds = linear_svm_harsh_grad_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['harsh_grad_class'], college_debt_high_containment_linear_svm_harsh_grad_preds))

college_debt_high_containment_linear_svm_2nd_grad_drop_preds = linear_svm_2nd_grad_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['2th_grad_drop_class'], college_debt_high_containment_linear_svm_2nd_grad_drop_preds))

college_debt_high_containment_linear_svm_order_of_mag_drop_preds = linear_svm_order_of_mag_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['order_of_mag_drop_class'], college_debt_high_containment_linear_svm_order_of_mag_drop_preds))

college_debt_high_containment_linear_svm_median_based_preds = linear_svm_median_based_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['median_based_class'], college_debt_high_containment_linear_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.15      0.26        92
        loss       0.24      1.00      0.39        25

    accuracy                           0.33       117
   macro avg       0.62      0.58      0.33       117
weighted avg       0.84      0.33      0.29       117

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         1
        loss       0.99      1.00      1.00       116

    accuracy                           0.99       117
   macro avg       0.50      0.50      0.50       117
weighted avg       0.98      0.99      0.99       117

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         3
        loss       0.97      1.00      0.99       114

    accuracy                           0.97       117
   macro avg       0.49      0.50      0.49       117
weighted avg       0.95      0.97      0.96       117

              preci

In [110]:
taxi_collision_high_containment_linear_svm_pos_neg_preds = linear_svm_pos_neg_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_linear_svm_pos_neg_preds))

taxi_collision_high_containment_linear_svm_harsh_grad_preds = linear_svm_harsh_grad_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['harsh_grad_class'], taxi_collision_high_containment_linear_svm_harsh_grad_preds))

taxi_collision_high_containment_linear_svm_2nd_grad_drop_preds = linear_svm_2nd_grad_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['2th_grad_drop_class'], taxi_collision_high_containment_linear_svm_2nd_grad_drop_preds))

taxi_collision_high_containment_linear_svm_order_of_mag_drop_preds = linear_svm_order_of_mag_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['order_of_mag_drop_class'], taxi_collision_high_containment_linear_svm_order_of_mag_drop_preds))

taxi_collision_high_containment_linear_svm_median_based_preds = linear_svm_median_based_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['median_based_class'], taxi_collision_high_containment_linear_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.11      0.20        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.11        18
   macro avg       0.50      0.06      0.10        18
weighted avg       1.00      0.11      0.20        18

              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        18

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00        15
        loss      

In [49]:
poverty_estimation_high_containment_linear_svm_pos_neg_preds = linear_svm_pos_neg_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_linear_svm_pos_neg_preds))

poverty_estimation_high_containment_linear_svm_harsh_grad_preds = linear_svm_harsh_grad_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['harsh_grad_class'], poverty_estimation_high_containment_linear_svm_harsh_grad_preds))

poverty_estimation_high_containment_linear_svm_2nd_grad_drop_preds = linear_svm_2nd_grad_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['2th_grad_drop_class'], poverty_estimation_high_containment_linear_svm_2nd_grad_drop_preds))

poverty_estimation_high_containment_linear_svm_order_of_mag_drop_preds = linear_svm_order_of_mag_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['order_of_mag_drop_class'], poverty_estimation_high_containment_linear_svm_order_of_mag_drop_preds))

poverty_estimation_high_containment_linear_svm_median_based_preds = linear_svm_median_based_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['median_based_class'], poverty_estimation_high_containment_linear_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.09      0.17        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.09        11
   macro avg       0.50      0.05      0.08        11
weighted avg       1.00      0.09      0.17        11

              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00        10
        loss      

### Let's experiment with an RBF variation.

In [50]:
from sklearn.svm import SVC


def train_rbf_svm(features, classes):
    '''
    Builds a model using features to predict associated classes
    '''

    feature_scaler = StandardScaler()
    features_train = feature_scaler.fit_transform(features)
    
    clf = SVC(max_iter=1000, gamma='auto')
    clf.fit(features_train, classes)

    return clf

In [51]:
rbf_svm_pos_neg_high_containment = train_rbf_svm(openml_training_high_containment[FEATURES],
                                                 openml_training_high_containment['class_pos_neg'])

In [52]:
rbf_svm_harsh_grad_high_containment = train_rbf_svm(openml_training_high_containment[FEATURES],
                                                    openml_training_high_containment['harsh_grad_class'])

rbf_svm_2nd_grad_drop_high_containment = train_rbf_svm(openml_training_high_containment[FEATURES],
                                                       openml_training_high_containment['2th_grad_drop_class'])

rbf_svm_order_of_mag_drop_high_containment = train_rbf_svm(openml_training_high_containment[FEATURES],
                                                           openml_training_high_containment['order_of_mag_drop_class'])

rbf_svm_median_based_high_containment = train_rbf_svm(openml_training_high_containment[FEATURES],
                                                        openml_training_high_containment['median_based_class'])

In [53]:
test_high_containment_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_rbf_svm_pos_neg_preds))

test_high_containment_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['harsh_grad_class'], test_high_containment_rbf_svm_harsh_grad_preds))

test_high_containment_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['2th_grad_drop_class'], test_high_containment_rbf_svm_2nd_grad_drop_preds))

test_high_containment_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['order_of_mag_drop_class'], test_high_containment_rbf_svm_order_of_mag_drop_preds))

test_high_containment_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['median_based_class'], test_high_containment_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.48      0.39      0.43     23212
        loss       0.49      0.57      0.53     23169

    accuracy                           0.48     46381
   macro avg       0.48      0.48      0.48     46381
weighted avg       0.48      0.48      0.48     46381

              precision    recall  f1-score   support

        gain       0.07      0.26      0.11      2114
        loss       0.96      0.83      0.89     44267

    accuracy                           0.80     46381
   macro avg       0.51      0.55      0.50     46381
weighted avg       0.92      0.80      0.85     46381

              precision    recall  f1-score   support

        gain       0.10      0.35      0.15      4237
        loss       0.91      0.67      0.77     42144

    accuracy                           0.64     46381
   macro avg       0.50      0.51      0.46     46381
weighted avg       0.84      0.64      0.72     46381

              preci

In [54]:
college_debt_high_containment_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_rbf_svm_pos_neg_preds))

college_debt_high_containment_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['harsh_grad_class'], college_debt_high_containment_rbf_svm_harsh_grad_preds))

college_debt_high_containment_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['2th_grad_drop_class'], college_debt_high_containment_rbf_svm_2nd_grad_drop_preds))

college_debt_high_containment_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['order_of_mag_drop_class'], college_debt_high_containment_rbf_svm_order_of_mag_drop_preds))

college_debt_high_containment_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['median_based_class'], college_debt_high_containment_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.74      0.76      0.75        92
        loss       0.00      0.00      0.00        25

    accuracy                           0.60       117
   macro avg       0.37      0.38      0.37       117
weighted avg       0.58      0.60      0.59       117

              precision    recall  f1-score   support

        gain       0.02      1.00      0.04         1
        loss       1.00      0.60      0.75       116

    accuracy                           0.61       117
   macro avg       0.51      0.80      0.40       117
weighted avg       0.99      0.61      0.75       117

              precision    recall  f1-score   support

        gain       0.06      1.00      0.12         3
        loss       1.00      0.60      0.75       114

    accuracy                           0.61       117
   macro avg       0.53      0.80      0.43       117
weighted avg       0.98      0.61      0.73       117

              preci

In [111]:
taxi_collision_high_containment_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_rbf_svm_pos_neg_preds))

taxi_collision_high_containment_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['harsh_grad_class'], taxi_collision_high_containment_rbf_svm_harsh_grad_preds))

taxi_collision_high_containment_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['2th_grad_drop_class'], taxi_collision_high_containment_rbf_svm_2nd_grad_drop_preds))

taxi_collision_high_containment_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['order_of_mag_drop_class'], taxi_collision_high_containment_rbf_svm_order_of_mag_drop_preds))

taxi_collision_high_containment_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['median_based_class'], taxi_collision_high_containment_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.33      0.50        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.33        18
   macro avg       0.50      0.17      0.25        18
weighted avg       1.00      0.33      0.50        18

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         0
        loss       1.00      0.72      0.84        18

    accuracy                           0.72        18
   macro avg       0.50      0.36      0.42        18
weighted avg       1.00      0.72      0.84        18

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         0
        loss       1.00      0.72      0.84        18

    accuracy                           0.72        18
   macro avg       0.50      0.36      0.42        18
weighted avg       1.00      0.72      0.84        18

              preci

In [56]:
poverty_estimation_high_containment_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_rbf_svm_pos_neg_preds))

poverty_estimation_high_containment_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['harsh_grad_class'], poverty_estimation_high_containment_rbf_svm_harsh_grad_preds))

poverty_estimation_high_containment_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['2th_grad_drop_class'], poverty_estimation_high_containment_rbf_svm_2nd_grad_drop_preds))

poverty_estimation_high_containment_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['order_of_mag_drop_class'], poverty_estimation_high_containment_rbf_svm_order_of_mag_drop_preds))

poverty_estimation_high_containment_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['median_based_class'], poverty_estimation_high_containment_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.55      0.71        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.55        11
   macro avg       0.50      0.27      0.35        11
weighted avg       1.00      0.55      0.71        11

              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         0
        loss       1.00      0.82      0.90        11

    accuracy                           0.82        11
   macro avg       0.50      0.41      0.45        11
weighted avg       1.00      0.82      0.90        11

              precision    recall  f1-score   support

        gain      

### It looks like the SVM-RBF classifier was in fact the best for our problem! This is not in line with the results we had from the AutoML pipeline, but probably because of the class policy we were using, and because of the data we were using. 

### Let's see how SVM-RBF works for when $\theta = 1$ not only for the training data but also for the test data and case studies.

In [57]:
THETA = 1
openml_test_high_containment = openml_test.loc[openml_test['containment_fraction'] >= THETA]
college_debt_high_containment = college_debt.loc[college_debt['containment_fraction'] >= THETA]
taxi_collision_high_containment = taxi_collision.loc[taxi_collision['containment_fraction'] >= THETA]
poverty_estimation_high_containment = poverty_estimation.loc[poverty_estimation['containment_fraction'] >= THETA]

In [58]:
test_high_containment_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_rbf_svm_pos_neg_preds))

test_high_containment_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['harsh_grad_class'], test_high_containment_rbf_svm_harsh_grad_preds))

test_high_containment_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['2th_grad_drop_class'], test_high_containment_rbf_svm_2nd_grad_drop_preds))

test_high_containment_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['order_of_mag_drop_class'], test_high_containment_rbf_svm_order_of_mag_drop_preds))

test_high_containment_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['median_based_class'], test_high_containment_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.48      0.40      0.43     19748
        loss       0.49      0.57      0.53     20028

    accuracy                           0.48     39776
   macro avg       0.48      0.48      0.48     39776
weighted avg       0.48      0.48      0.48     39776

              precision    recall  f1-score   support

        gain       0.07      0.28      0.12      1807
        loss       0.96      0.83      0.89     37969

    accuracy                           0.80     39776
   macro avg       0.52      0.56      0.50     39776
weighted avg       0.92      0.80      0.85     39776

              precision    recall  f1-score   support

        gain       0.10      0.37      0.15      3509
        loss       0.92      0.66      0.77     36267

    accuracy                           0.63     39776
   macro avg       0.51      0.51      0.46     39776
weighted avg       0.84      0.63      0.71     39776

              preci

In [59]:
college_debt_high_containment_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_rbf_svm_pos_neg_preds))

college_debt_high_containment_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['harsh_grad_class'], college_debt_high_containment_rbf_svm_harsh_grad_preds))

college_debt_high_containment_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['2th_grad_drop_class'], college_debt_high_containment_rbf_svm_2nd_grad_drop_preds))

college_debt_high_containment_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['order_of_mag_drop_class'], college_debt_high_containment_rbf_svm_order_of_mag_drop_preds))

college_debt_high_containment_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['median_based_class'], college_debt_high_containment_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.57      0.67      0.62         6
        loss       0.00      0.00      0.00         3

    accuracy                           0.44         9
   macro avg       0.29      0.33      0.31         9
weighted avg       0.38      0.44      0.41         9

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         1
        loss       0.89      1.00      0.94         8

    accuracy                           0.89         9
   macro avg       0.44      0.50      0.47         9
weighted avg       0.79      0.89      0.84         9

              precision    recall  f1-score   support

        gain       0.25      1.00      0.40         1
        loss       1.00      0.62      0.77         8

    accuracy                           0.67         9
   macro avg       0.62      0.81      0.58         9
weighted avg       0.92      0.67      0.73         9

              preci

In [112]:
taxi_collision_high_containment_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_rbf_svm_pos_neg_preds))

taxi_collision_high_containment_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['harsh_grad_class'], taxi_collision_high_containment_rbf_svm_harsh_grad_preds))

taxi_collision_high_containment_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['2th_grad_drop_class'], taxi_collision_high_containment_rbf_svm_2nd_grad_drop_preds))

taxi_collision_high_containment_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['order_of_mag_drop_class'], taxi_collision_high_containment_rbf_svm_order_of_mag_drop_preds))

taxi_collision_high_containment_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['median_based_class'], taxi_collision_high_containment_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.33      0.50        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.33        18
   macro avg       0.50      0.17      0.25        18
weighted avg       1.00      0.33      0.50        18

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         0
        loss       1.00      0.72      0.84        18

    accuracy                           0.72        18
   macro avg       0.50      0.36      0.42        18
weighted avg       1.00      0.72      0.84        18

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         0
        loss       1.00      0.72      0.84        18

    accuracy                           0.72        18
   macro avg       0.50      0.36      0.42        18
weighted avg       1.00      0.72      0.84        18

              preci

In [61]:
poverty_estimation_high_containment_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_rbf_svm_pos_neg_preds))

poverty_estimation_high_containment_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['harsh_grad_class'], poverty_estimation_high_containment_rbf_svm_harsh_grad_preds))

poverty_estimation_high_containment_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['2th_grad_drop_class'], poverty_estimation_high_containment_rbf_svm_2nd_grad_drop_preds))

poverty_estimation_high_containment_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['order_of_mag_drop_class'], poverty_estimation_high_containment_rbf_svm_order_of_mag_drop_preds))

poverty_estimation_high_containment_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['median_based_class'], poverty_estimation_high_containment_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.55      0.71        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.55        11
   macro avg       0.50      0.27      0.35        11
weighted avg       1.00      0.55      0.71        11

              precision    recall  f1-score   support

        loss       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         0
        loss       1.00      0.82      0.90        11

    accuracy                           0.82        11
   macro avg       0.50      0.41      0.45        11
weighted avg       1.00      0.82      0.90        11

              precision    recall  f1-score   support

        gain      

### How about using this SVM-RBF model trained with $\theta = 1$ over test data and use cases with $\theta = 0$ (i.e., over all the test and use case data)? Here, 'containment ratio' is not a feature again.

In [62]:
test_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(openml_test[FEATURES]))
print(classification_report(openml_test['class_pos_neg'], test_rbf_svm_pos_neg_preds))

test_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(openml_test[FEATURES]))
print(classification_report(openml_test['harsh_grad_class'], test_rbf_svm_harsh_grad_preds))

test_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['2th_grad_drop_class'], test_high_containment_rbf_svm_2nd_grad_drop_preds))

test_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(openml_test[FEATURES]))
print(classification_report(openml_test['order_of_mag_drop_class'], test_rbf_svm_order_of_mag_drop_preds))

test_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(openml_test[FEATURES]))
print(classification_report(openml_test['median_based_class'], test_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.45      0.40      0.42     37058
        loss       0.48      0.54      0.51     38680

    accuracy                           0.47     75738
   macro avg       0.47      0.47      0.47     75738
weighted avg       0.47      0.47      0.47     75738

              precision    recall  f1-score   support

        gain       0.04      0.24      0.07      2602
        loss       0.97      0.81      0.88     73136

    accuracy                           0.79     75738
   macro avg       0.51      0.52      0.48     75738
weighted avg       0.94      0.79      0.86     75738

              precision    recall  f1-score   support

        gain       0.10      0.37      0.15      3509
        loss       0.92      0.66      0.77     36267

    accuracy                           0.63     39776
   macro avg       0.51      0.51      0.46     39776
weighted avg       0.84      0.63      0.71     39776

              preci

In [63]:
college_debt_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(college_debt[FEATURES]))
print(classification_report(college_debt['class_pos_neg'], college_debt_rbf_svm_pos_neg_preds))

college_debt_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(college_debt[FEATURES]))
print(classification_report(college_debt['harsh_grad_class'], college_debt_rbf_svm_harsh_grad_preds))

college_debt_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(college_debt[FEATURES]))
print(classification_report(college_debt['2th_grad_drop_class'], college_debt_rbf_svm_2nd_grad_drop_preds))

college_debt_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(college_debt[FEATURES]))
print(classification_report(college_debt['order_of_mag_drop_class'], college_debt_rbf_svm_order_of_mag_drop_preds))

college_debt_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(college_debt[FEATURES]))
print(classification_report(college_debt['median_based_class'], college_debt_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.37      0.94      0.53       130
        loss       0.99      0.78      0.88       973

    accuracy                           0.80      1103
   macro avg       0.68      0.86      0.70      1103
weighted avg       0.92      0.80      0.83      1103

              precision    recall  f1-score   support

        gain       0.00      1.00      0.00         1
        loss       1.00      0.63      0.77      1102

    accuracy                           0.63      1103
   macro avg       0.50      0.81      0.39      1103
weighted avg       1.00      0.63      0.77      1103

              precision    recall  f1-score   support

        gain       0.01      1.00      0.01         3
        loss       1.00      0.47      0.64      1100

    accuracy                           0.47      1103
   macro avg       0.50      0.73      0.32      1103
weighted avg       1.00      0.47      0.64      1103

              preci

In [113]:
taxi_collision_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print(classification_report(taxi_collision['class_pos_neg'], taxi_collision_rbf_svm_pos_neg_preds))

taxi_collision_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print(classification_report(taxi_collision['harsh_grad_class'], taxi_collision_rbf_svm_harsh_grad_preds))

taxi_collision_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print(classification_report(taxi_collision['2th_grad_drop_class'], taxi_collision_rbf_svm_2nd_grad_drop_preds))

taxi_collision_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print(classification_report(taxi_collision['order_of_mag_drop_class'], taxi_collision_rbf_svm_order_of_mag_drop_preds))

taxi_collision_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print(classification_report(taxi_collision['median_based_class'], taxi_collision_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.71      0.83       447
        loss       0.00      0.00      0.00         0

    accuracy                           0.71       447
   macro avg       0.50      0.36      0.42       447
weighted avg       1.00      0.71      0.83       447

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         3
        loss       0.99      0.88      0.93       444

    accuracy                           0.87       447
   macro avg       0.50      0.44      0.47       447
weighted avg       0.99      0.87      0.93       447

              precision    recall  f1-score   support

        gain       0.00      0.00      0.00         4
        loss       0.99      0.68      0.80       443

    accuracy                           0.67       447
   macro avg       0.49      0.34      0.40       447
weighted avg       0.98      0.67      0.80       447

              preci

In [65]:
poverty_estimation_rbf_svm_pos_neg_preds = rbf_svm_pos_neg_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print(classification_report(poverty_estimation['class_pos_neg'], poverty_estimation_rbf_svm_pos_neg_preds))

poverty_estimation_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print(classification_report(poverty_estimation['harsh_grad_class'], poverty_estimation_rbf_svm_harsh_grad_preds))

poverty_estimation_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print(classification_report(poverty_estimation['2th_grad_drop_class'], poverty_estimation_rbf_svm_2nd_grad_drop_preds))

poverty_estimation_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print(classification_report(poverty_estimation['order_of_mag_drop_class'], poverty_estimation_rbf_svm_order_of_mag_drop_preds))

poverty_estimation_rbf_svm_median_based_preds = rbf_svm_median_based_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print(classification_report(poverty_estimation['median_based_class'], poverty_estimation_rbf_svm_median_based_preds))


              precision    recall  f1-score   support

        gain       0.09      0.23      0.13     11526
        loss       0.91      0.77      0.84    119402

    accuracy                           0.72    130928
   macro avg       0.50      0.50      0.48    130928
weighted avg       0.84      0.72      0.77    130928

              precision    recall  f1-score   support

        gain       0.00      1.00      0.00         1
        loss       1.00      0.88      0.93    130927

    accuracy                           0.88    130928
   macro avg       0.50      0.94      0.47    130928
weighted avg       1.00      0.88      0.93    130928

              precision    recall  f1-score   support

        gain       0.00      0.08      0.00        13
        loss       1.00      0.77      0.87    130915

    accuracy                           0.77    130928
   macro avg       0.50      0.42      0.44    130928
weighted avg       1.00      0.77      0.87    130928

              preci

### How about if we train an ordinary SVM-RBF model over openml_training using ALL_FEATURES and $\theta = 0$ and see how it works over all test and use case data?

In [66]:
rbf_svm_pos_neg = train_rbf_svm(openml_training[ALL_FEATURES],
                                openml_training['class_pos_neg'])

rbf_svm_harsh_grad = train_rbf_svm(openml_training[ALL_FEATURES],
                                   openml_training['harsh_grad_class'])

rbf_svm_2nd_grad_drop = train_rbf_svm(openml_training[ALL_FEATURES],
                                      openml_training['2th_grad_drop_class'])

rbf_svm_order_of_mag_drop = train_rbf_svm(openml_training[ALL_FEATURES],
                                          openml_training['order_of_mag_drop_class'])

rbf_svm_median_based = train_rbf_svm(openml_training[ALL_FEATURES],
                                     openml_training['median_based_class'])

In [67]:
test_rbf_svm_pos_neg_preds = rbf_svm_pos_neg.predict(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['class_pos_neg'], test_rbf_svm_pos_neg_preds))

test_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad.predict(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['harsh_grad_class'], test_rbf_svm_harsh_grad_preds))

test_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop.predict(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['2th_grad_drop_class'], test_rbf_svm_2nd_grad_drop_preds))

test_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop.predict(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['order_of_mag_drop_class'], test_rbf_svm_order_of_mag_drop_preds))

test_rbf_svm_median_based_preds = rbf_svm_median_based.predict(normalize_features(openml_test[ALL_FEATURES]))
print(classification_report(openml_test['median_based_class'], test_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.47      0.46      0.46     37058
        loss       0.49      0.51      0.50     38680

    accuracy                           0.48     75738
   macro avg       0.48      0.48      0.48     75738
weighted avg       0.48      0.48      0.48     75738

              precision    recall  f1-score   support

        gain       0.05      0.33      0.09      2602
        loss       0.97      0.80      0.88     73136

    accuracy                           0.78     75738
   macro avg       0.51      0.56      0.48     75738
weighted avg       0.94      0.78      0.85     75738

              precision    recall  f1-score   support

        gain       0.09      0.43      0.14      5590
        loss       0.93      0.63      0.76     70148

    accuracy                           0.62     75738
   macro avg       0.51      0.53      0.45     75738
weighted avg       0.87      0.62      0.71     75738

              preci

In [68]:
college_debt_rbf_svm_pos_neg_preds = rbf_svm_pos_neg.predict(normalize_features(college_debt[ALL_FEATURES]))
print(classification_report(college_debt['class_pos_neg'], college_debt_rbf_svm_pos_neg_preds))

college_debt_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad.predict(normalize_features(college_debt[ALL_FEATURES]))
print(classification_report(college_debt['harsh_grad_class'], college_debt_rbf_svm_harsh_grad_preds))

college_debt_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop.predict(normalize_features(college_debt[ALL_FEATURES]))
print(classification_report(college_debt['2th_grad_drop_class'], college_debt_rbf_svm_2nd_grad_drop_preds))

college_debt_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop.predict(normalize_features(college_debt[ALL_FEATURES]))
print(classification_report(college_debt['order_of_mag_drop_class'], college_debt_rbf_svm_order_of_mag_drop_preds))

college_debt_rbf_svm_median_based_preds = rbf_svm_median_based.predict(normalize_features(college_debt[ALL_FEATURES]))
print(classification_report(college_debt['median_based_class'], college_debt_rbf_svm_median_based_preds))


              precision    recall  f1-score   support

        gain       0.26      1.00      0.41       130
        loss       1.00      0.61      0.76       973

    accuracy                           0.66      1103
   macro avg       0.63      0.81      0.58      1103
weighted avg       0.91      0.66      0.72      1103

              precision    recall  f1-score   support

        gain       0.00      1.00      0.01         1
        loss       1.00      0.74      0.85      1102

    accuracy                           0.74      1103
   macro avg       0.50      0.87      0.43      1103
weighted avg       1.00      0.74      0.85      1103

              precision    recall  f1-score   support

        gain       0.01      1.00      0.03         3
        loss       1.00      0.80      0.89      1100

    accuracy                           0.80      1103
   macro avg       0.51      0.90      0.46      1103
weighted avg       1.00      0.80      0.89      1103

              preci

In [114]:
taxi_collision_rbf_svm_pos_neg_preds = rbf_svm_pos_neg.predict(normalize_features(taxi_collision[ALL_FEATURES]))
print(classification_report(taxi_collision['class_pos_neg'], taxi_collision_rbf_svm_pos_neg_preds))

taxi_collision_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad.predict(normalize_features(taxi_collision[ALL_FEATURES]))
print(classification_report(taxi_collision['harsh_grad_class'], taxi_collision_rbf_svm_harsh_grad_preds))

taxi_collision_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop.predict(normalize_features(taxi_collision[ALL_FEATURES]))
print(classification_report(taxi_collision['2th_grad_drop_class'], taxi_collision_rbf_svm_2nd_grad_drop_preds))

taxi_collision_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop.predict(normalize_features(taxi_collision[ALL_FEATURES]))
print(classification_report(taxi_collision['order_of_mag_drop_class'], taxi_collision_rbf_svm_order_of_mag_drop_preds))

taxi_collision_rbf_svm_median_based_preds = rbf_svm_median_based.predict(normalize_features(taxi_collision[ALL_FEATURES]))
print(classification_report(taxi_collision['median_based_class'], taxi_collision_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       1.00      0.68      0.81       447
        loss       0.00      0.00      0.00         0

    accuracy                           0.68       447
   macro avg       0.50      0.34      0.41       447
weighted avg       1.00      0.68      0.81       447

              precision    recall  f1-score   support

        gain       0.02      0.67      0.04         3
        loss       1.00      0.81      0.89       444

    accuracy                           0.81       447
   macro avg       0.51      0.74      0.47       447
weighted avg       0.99      0.81      0.89       447

              precision    recall  f1-score   support

        gain       0.04      1.00      0.07         4
        loss       1.00      0.77      0.87       443

    accuracy                           0.77       447
   macro avg       0.52      0.88      0.47       447
weighted avg       0.99      0.77      0.86       447

              preci

In [70]:
poverty_estimation_rbf_svm_pos_neg_preds = rbf_svm_pos_neg.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
print(classification_report(poverty_estimation['class_pos_neg'], poverty_estimation_rbf_svm_pos_neg_preds))

poverty_estimation_rbf_svm_harsh_grad_preds = rbf_svm_harsh_grad.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
print(classification_report(poverty_estimation['harsh_grad_class'], poverty_estimation_rbf_svm_harsh_grad_preds))

poverty_estimation_rbf_svm_2nd_grad_drop_preds = rbf_svm_2nd_grad_drop.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
print(classification_report(poverty_estimation['2th_grad_drop_class'], poverty_estimation_rbf_svm_2nd_grad_drop_preds))

poverty_estimation_rbf_svm_order_of_mag_drop_preds = rbf_svm_order_of_mag_drop.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
print(classification_report(poverty_estimation['order_of_mag_drop_class'], poverty_estimation_rbf_svm_order_of_mag_drop_preds))

poverty_estimation_rbf_svm_median_based_preds = rbf_svm_median_based.predict(normalize_features(poverty_estimation[ALL_FEATURES]))
print(classification_report(poverty_estimation['median_based_class'], poverty_estimation_rbf_svm_median_based_preds))

              precision    recall  f1-score   support

        gain       0.11      0.32      0.16     11526
        loss       0.92      0.74      0.82    119402

    accuracy                           0.70    130928
   macro avg       0.51      0.53      0.49    130928
weighted avg       0.85      0.70      0.76    130928

              precision    recall  f1-score   support

        gain       0.00      1.00      0.00         1
        loss       1.00      0.80      0.89    130927

    accuracy                           0.80    130928
   macro avg       0.50      0.90      0.44    130928
weighted avg       1.00      0.80      0.89    130928

              precision    recall  f1-score   support

        gain       0.00      0.31      0.00        13
        loss       1.00      0.64      0.78    130915

    accuracy                           0.64    130928
   macro avg       0.50      0.48      0.39    130928
weighted avg       1.00      0.64      0.78    130928

              preci

### Well, it looks like not even RBF is capable of working well when no restriction is made on $\theta$...

### Focusing on SVM-RBF trained over instances with $\theta = 1$ and tested over instances with $\theta = 0.5$, what happens when we changed our training data to something that has only one candidate per query? 

### Naturally, in this case the only policy that makes sense is positive-negative, as there's only one candidate per query in the training data.

### How does SVM-RBF behave in this case? And how does it compare with Random Forests in the same scenario?

In [73]:
openml_training_single = pd.read_csv('../classification/training-simplified-data-generation.csv')
openml_training_single['class_pos_neg'] = ['gain' if row['gain_in_r2_score'] > 0 else 'loss' for index, row in openml_training_single.iterrows()]

In [74]:
THETA = 1
openml_training_single_high_containment = openml_training_single.loc[openml_training_single['containment_fraction'] >= THETA]
openml_training_single_high_containment.shape

(7566, 36)

### Rather small dataset... But let's see.

In [75]:
rbf_svm_single_pos_neg_high_containment = train_rbf_svm(openml_training_single_high_containment[FEATURES], 
                                                        openml_training_single_high_containment['class_pos_neg'])

In [76]:
THETA = 0.5
openml_test_high_containment = openml_test.loc[openml_test['containment_fraction'] >= THETA]
college_debt_high_containment = college_debt.loc[college_debt['containment_fraction'] >= THETA]
taxi_collision_high_containment = taxi_collision.loc[taxi_collision['containment_fraction'] >= THETA]
poverty_estimation_high_containment = poverty_estimation.loc[poverty_estimation['containment_fraction'] >= THETA]

In [77]:
test_high_containment_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.49      0.40      0.44     23212
        loss       0.50      0.59      0.54     23169

    accuracy                           0.49     46381
   macro avg       0.49      0.49      0.49     46381
weighted avg       0.49      0.49      0.49     46381



In [79]:
college_debt_high_containment_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.79      1.00      0.88        92
        loss       0.00      0.00      0.00        25

    accuracy                           0.79       117
   macro avg       0.39      0.50      0.44       117
weighted avg       0.62      0.79      0.69       117



In [115]:
taxi_collision_high_containment_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       1.00      0.67      0.80        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.67        18
   macro avg       0.50      0.33      0.40        18
weighted avg       1.00      0.67      0.80        18



In [81]:
poverty_estimation_high_containment_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       1.00      0.73      0.84        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.73        11
   macro avg       0.50      0.36      0.42        11
weighted avg       1.00      0.73      0.84        11



### Now let's see how random forest trained over the same dataset works:

In [82]:
model_single_pos_neg_high_containment = train_model(openml_training_single_high_containment[FEATURES], 
                                                    openml_training_single_high_containment['class_pos_neg'])

In [116]:
test_high_containment_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(openml_test_high_containment[FEATURES]))
print(classification_report(openml_test_high_containment['class_pos_neg'], test_high_containment_model_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.51      0.89      0.65     23212
        loss       0.58      0.15      0.24     23169

    accuracy                           0.52     46381
   macro avg       0.55      0.52      0.44     46381
weighted avg       0.55      0.52      0.44     46381



In [84]:
college_debt_high_containment_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(college_debt_high_containment[FEATURES]))
print(classification_report(college_debt_high_containment['class_pos_neg'], college_debt_high_containment_model_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.84      0.98      0.90        92
        loss       0.80      0.32      0.46        25

    accuracy                           0.84       117
   macro avg       0.82      0.65      0.68       117
weighted avg       0.83      0.84      0.81       117



In [117]:
taxi_collision_high_containment_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(taxi_collision_high_containment[FEATURES]))
print(classification_report(taxi_collision_high_containment['class_pos_neg'], taxi_collision_high_containment_model_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       1.00      0.89      0.94        18
        loss       0.00      0.00      0.00         0

    accuracy                           0.89        18
   macro avg       0.50      0.44      0.47        18
weighted avg       1.00      0.89      0.94        18



In [86]:
poverty_estimation_high_containment_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(poverty_estimation_high_containment[FEATURES]))
print(classification_report(poverty_estimation_high_containment['class_pos_neg'], poverty_estimation_high_containment_model_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       1.00      0.82      0.90        11
        loss       0.00      0.00      0.00         0

    accuracy                           0.82        11
   macro avg       0.50      0.41      0.45        11
weighted avg       1.00      0.82      0.90        11



### Well, it looks like we get our best results when we train over this "simpler" dataset! And in this case, random forests behave best again... 

### What if we stick to random forests and this dataset, train with $\theta = 1$ and test over the entire data (i.e., $\theta = 0$)? How does it behave and how does it compare against an SVM-RBF trained over this simpler dataset with the same configuration of $\theta$ values?

In [87]:
test_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(openml_test[FEATURES]))
print(classification_report(openml_test['class_pos_neg'], test_model_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.51      0.71      0.60     37058
        loss       0.56      0.36      0.44     38680

    accuracy                           0.53     75738
   macro avg       0.54      0.53      0.52     75738
weighted avg       0.54      0.53      0.52     75738



In [89]:
college_debt_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(college_debt[FEATURES]))
print(classification_report(college_debt['class_pos_neg'], college_debt_model_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.12      1.00      0.21       130
        loss       1.00      0.01      0.03       973

    accuracy                           0.13      1103
   macro avg       0.56      0.51      0.12      1103
weighted avg       0.90      0.13      0.05      1103



In [118]:
taxi_collision_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print(classification_report(taxi_collision['class_pos_neg'], taxi_collision_model_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       1.00      0.85      0.92       447
        loss       0.00      0.00      0.00         0

    accuracy                           0.85       447
   macro avg       0.50      0.42      0.46       447
weighted avg       1.00      0.85      0.92       447



In [91]:
poverty_estimation_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print(classification_report(poverty_estimation['class_pos_neg'], poverty_estimation_model_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.09      1.00      0.17     11526
        loss       1.00      0.05      0.10    119402

    accuracy                           0.13    130928
   macro avg       0.55      0.53      0.13    130928
weighted avg       0.92      0.13      0.10    130928



### Now let's see how the SVM-RBF behaves...

In [92]:
test_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(openml_test[FEATURES]))
print(classification_report(openml_test['class_pos_neg'], test_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.46      0.38      0.42     37058
        loss       0.49      0.57      0.52     38680

    accuracy                           0.48     75738
   macro avg       0.47      0.47      0.47     75738
weighted avg       0.47      0.48      0.47     75738



In [93]:
college_debt_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(college_debt[FEATURES]))
print(classification_report(college_debt['class_pos_neg'], college_debt_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.37      0.89      0.52       130
        loss       0.98      0.79      0.88       973

    accuracy                           0.81      1103
   macro avg       0.67      0.84      0.70      1103
weighted avg       0.91      0.81      0.84      1103



In [119]:
taxi_collision_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print(classification_report(taxi_collision['class_pos_neg'], taxi_collision_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       1.00      0.44      0.61       447
        loss       0.00      0.00      0.00         0

    accuracy                           0.44       447
   macro avg       0.50      0.22      0.30       447
weighted avg       1.00      0.44      0.61       447



In [95]:
poverty_estimation_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print(classification_report(poverty_estimation['class_pos_neg'], poverty_estimation_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.11      0.36      0.17     11526
        loss       0.92      0.72      0.81    119402

    accuracy                           0.69    130928
   macro avg       0.52      0.54      0.49    130928
weighted avg       0.85      0.69      0.76    130928



### Ok, so it looks like training over this simpler dataset is the way to go. How about the main decision between whether to use SVM-RBF or Random Forest? It looks like the former behaves consistently better over case studies, but the latter performs better for openml_test. 

### Let me try comparing both models over a different instance of openml_test (openml_test_single).

In [96]:
openml_test_single = pd.read_csv('../classification/test-simplified-data-generation.csv')
openml_test_single['class_pos_neg'] = ['gain' if row['gain_in_r2_score'] > 0 else 'loss' for index, row in openml_test_single.iterrows()]

THETA = 0.5
openml_test_single_high_containment = openml_test_single.loc[openml_test_single['containment_fraction'] >= THETA]


In [97]:
# first, random forest.

single_test_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(openml_test_single[FEATURES]))
print(classification_report(openml_test_single['class_pos_neg'], single_test_model_single_pos_neg_preds))

# then, svm-rbf.
single_test_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(openml_test_single[FEATURES]))
print(classification_report(openml_test_single['class_pos_neg'], single_test_rbf_svm_single_pos_neg_preds))

              precision    recall  f1-score   support

        gain       0.62      0.74      0.67      2496
        loss       0.50      0.36      0.42      1780

    accuracy                           0.58      4276
   macro avg       0.56      0.55      0.55      4276
weighted avg       0.57      0.58      0.57      4276

              precision    recall  f1-score   support

        gain       0.62      0.50      0.56      2496
        loss       0.45      0.57      0.50      1780

    accuracy                           0.53      4276
   macro avg       0.54      0.54      0.53      4276
weighted avg       0.55      0.53      0.53      4276



### Yea... Well, it seems like Random Forest overfits more than SVM-RBF, but both are somewhat comparable. What if we used an ensemble and got the prediction as the "maximum"?

### To this end, let's check how xgboost behaves first. We'll start training a positive/negative xgboost model setting $\theta = 1$ for the training data.

In [120]:
from xgboost import XGBClassifier

In [121]:
def train_xgboost(features, classes):
    '''
    Builds a xgboost classifier using features to predict associated classes
    '''

    feature_scaler = StandardScaler()
    features_train = feature_scaler.fit_transform(features)
    
    # using the standard xgbc for now
    xgbc = XGBClassifier()
    xgbc.fit(features_train, classes)

    return xgbc

In [123]:
xgboost_pos_neg_high_containment = train_xgboost(openml_training_high_containment[FEATURES],
                                                 openml_training_high_containment['class_pos_neg'])

In [126]:
test_xgboost_training_single_pos_neg_preds = xgboost_pos_neg_high_containment.predict(normalize_features(openml_test[FEATURES]))
print('******* XGBOOST --- OPENML_TRAINING_SINLE --- OPENML_TEST --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(openml_test['class_pos_neg'], test_xgboost_training_single_pos_neg_preds))

test_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(openml_test[FEATURES]))
print('******* SVM-RBF --- OPENML_TRAINING_SINLE --- OPENML_TEST --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(openml_test['class_pos_neg'], test_rbf_svm_single_pos_neg_preds))

test_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(openml_test[FEATURES]))
print('******* RANDOM FOREST --- OPENML_TRAINING_SINLE --- OPENML_TEST --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(openml_test['class_pos_neg'], test_model_single_pos_neg_preds))


******* XGBOOST --- OPENML_TRAINING_SINLE --- OPENML_TEST --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   support

        gain       0.55      0.13      0.22     37058
        loss       0.52      0.89      0.66     38680

    accuracy                           0.52     75738
   macro avg       0.53      0.51      0.44     75738
weighted avg       0.53      0.52      0.44     75738

******* SVM-RBF --- OPENML_TRAINING_SINLE --- OPENML_TEST --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   support

        gain       0.46      0.38      0.42     37058
        loss       0.49      0.57      0.52     38680

    accuracy                           0.48     75738
   macro avg       0.47      0.47      0.47     75738
weighted avg       0.47      0.48      0.47     75738

******* RANDOM FOREST --- OPENML_TRAINING_SINLE --- OPENML_TEST --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   sup

### Whoah, xgboost is *very* bad over openml_test. Let's see how it compares with the other classifiers over the case studies.

In [128]:
college_debt_xgboost_training_single_pos_neg_preds = xgboost_pos_neg_high_containment.predict(normalize_features(college_debt[FEATURES]))
print('******* XGBOOST --- OPENML_TRAINING_SINLE --- COLLEGE_DEBT --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(college_debt['class_pos_neg'], college_debt_xgboost_training_single_pos_neg_preds))

college_debt_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(college_debt[FEATURES]))
print('******* SVM-RBF --- OPENML_TRAINING_SINLE --- COLLEGE_DEBT --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(college_debt['class_pos_neg'], college_debt_rbf_svm_single_pos_neg_preds))

college_debt_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(college_debt[FEATURES]))
print('******* RANDOM FOREST --- OPENML_TRAINING_SINLE --- COLLEGE_DEBT --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(college_debt['class_pos_neg'], college_debt_model_single_pos_neg_preds))

******* XGBOOST --- OPENML_TRAINING_SINLE --- COLLEGE_DEBT --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   support

        gain       0.18      1.00      0.30       130
        loss       1.00      0.39      0.56       973

    accuracy                           0.46      1103
   macro avg       0.59      0.69      0.43      1103
weighted avg       0.90      0.46      0.53      1103

******* SVM-RBF --- OPENML_TRAINING_SINLE --- COLLEGE_DEBT --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   support

        gain       0.37      0.89      0.52       130
        loss       0.98      0.79      0.88       973

    accuracy                           0.81      1103
   macro avg       0.67      0.84      0.70      1103
weighted avg       0.91      0.81      0.84      1103

******* RANDOM FOREST --- OPENML_TRAINING_SINLE --- COLLEGE_DEBT --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   

In [130]:
taxi_collision_xgboost_training_single_pos_neg_preds = xgboost_pos_neg_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print('******* XGBOOST --- OPENML_TRAINING_SINLE --- TAXI_COLLISION --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(taxi_collision['class_pos_neg'], taxi_collision_xgboost_training_single_pos_neg_preds))

taxi_collision_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print('******* SVM-RBF --- OPENML_TRAINING_SINLE --- TAXI_COLLISION --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(taxi_collision['class_pos_neg'], taxi_collision_rbf_svm_single_pos_neg_preds))

taxi_collision_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(taxi_collision[FEATURES]))
print('******* RANDOM FOREST --- OPENML_TRAINING_SINLE --- TAXI_COLLISION --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(taxi_collision['class_pos_neg'], taxi_collision_model_single_pos_neg_preds))


******* XGBOOST --- OPENML_TRAINING_SINLE --- TAXI_COLLISION --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   support

        gain       1.00      0.22      0.37       447
        loss       0.00      0.00      0.00         0

    accuracy                           0.22       447
   macro avg       0.50      0.11      0.18       447
weighted avg       1.00      0.22      0.37       447

******* SVM-RBF --- OPENML_TRAINING_SINLE --- TAXI_COLLISION --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   support

        gain       1.00      0.44      0.61       447
        loss       0.00      0.00      0.00         0

    accuracy                           0.44       447
   macro avg       0.50      0.22      0.30       447
weighted avg       1.00      0.44      0.61       447

******* RANDOM FOREST --- OPENML_TRAINING_SINLE --- TAXI_COLLISION --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-sc

In [131]:
poverty_estimation_xgboost_training_single_pos_neg_preds = xgboost_pos_neg_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print('******* XGBOOST --- OPENML_TRAINING_SINLE --- POVERTY_ESTIMATION --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(poverty_estimation['class_pos_neg'], poverty_estimation_xgboost_training_single_pos_neg_preds))

poverty_estimation_rbf_svm_single_pos_neg_preds = rbf_svm_single_pos_neg_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print('******* SVM-RBF --- OPENML_TRAINING_SINLE --- POVERTY_ESTIMATION --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(poverty_estimation['class_pos_neg'], poverty_estimation_rbf_svm_single_pos_neg_preds))

poverty_estimation_model_single_pos_neg_preds = model_single_pos_neg_high_containment.predict(normalize_features(poverty_estimation[FEATURES]))
print('******* RANDOM FOREST --- OPENML_TRAINING_SINLE --- POVERTY_ESTIMATION --- POSITIVE/NEGATIVE CLASSES ********')
print(classification_report(poverty_estimation['class_pos_neg'], poverty_estimation_model_single_pos_neg_preds))


******* XGBOOST --- OPENML_TRAINING_SINLE --- POVERTY_ESTIMATION --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   support

        gain       0.13      0.24      0.17     11526
        loss       0.92      0.85      0.88    119402

    accuracy                           0.80    130928
   macro avg       0.53      0.54      0.53    130928
weighted avg       0.85      0.80      0.82    130928

******* SVM-RBF --- OPENML_TRAINING_SINLE --- POVERTY_ESTIMATION --- POSITIVE/NEGATIVE CLASSES ********
              precision    recall  f1-score   support

        gain       0.11      0.36      0.17     11526
        loss       0.92      0.72      0.81    119402

    accuracy                           0.69    130928
   macro avg       0.52      0.54      0.49    130928
weighted avg       0.85      0.69      0.76    130928

******* RANDOM FOREST --- OPENML_TRAINING_SINLE --- POVERTY_ESTIMATION --- POSITIVE/NEGATIVE CLASSES ********
              precision    r