## Imports

In [1]:
import math
import os
import pickle

import numpy as np
import pandas as pd

from imblearn.datasets import fetch_datasets

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import *

from tqdm.notebook import tqdm

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter can also hide NumPy RuntimeWarnings, e.g. the
# 0/0 division that calc_f1 may hit when precision and recall are both zero.
warnings.filterwarnings('ignore')

## Load Data

In [3]:
# Load the dataset collection provided by imblearn.datasets; train_and_evaluate
# looks individual datasets up in it by name.
datasets = fetch_datasets()

In [4]:
# Names of the 13 datasets evaluated in this notebook. Each name is used as a
# key into `datasets`, and results are reported in this order.
DATASETS_13 = [
    'ecoli',
    'satimage',
    'abalone',
    'us_crime',
    'yeast_ml8',
    'scene',
    'coil_2000',
    'solar_flare_m0',
    'oil',
    'wine_quality',
    'yeast_me2',
    'ozone_level',
    'abalone_19'
]

## Model

In [5]:
def resample_train_X_y(X_train, y_train, corruption_ratio, rng=None):
    """Balance a binary training set by oversampling the minority class with zero-out noise.

    Rows labelled 1 are drawn with replacement until there are as many of them
    as rows labelled 0. Every feature value of the drawn rows is then replaced
    by 0 independently with probability ``corruption_ratio``. Rows labelled 0
    are passed through unchanged.

    Parameters
    ----------
    X_train : np.ndarray
        2-D feature matrix, one row per sample.
    y_train : np.ndarray
        Labels aligned with ``X_train``; 1 marks the class to oversample and
        0 marks the class that sets the target size.
    corruption_ratio : float
        Probability in [0, 1] that a single feature value is zeroed.
    rng : object with a NumPy-style ``choice`` method, optional
        For example ``np.random.RandomState(seed)`` or
        ``np.random.default_rng(seed)``. When omitted, the global ``np.random``
        state is used, which matches the behaviour before this parameter
        existed.

    Returns
    -------
    X_res : np.ndarray
        The corrupted oversampled label-1 rows followed by the label-0 rows.
    y_res : np.ndarray
        Matching labels: ones for the first part, zeros for the second.
    """
    if rng is None:
        rng = np.random

    minority_data = X_train[y_train == 1]
    majority_data = X_train[y_train == 0]

    # Draw as many minority rows (with repetition) as there are majority rows.
    picked_rows = rng.choice(minority_data.shape[0], size=len(majority_data), replace=True)
    oversampled_minority_data = minority_data[picked_rows]

    # 1 marks a cell to zero out, 0 marks a cell to keep.
    zero_mask = rng.choice(
        [1, 0],
        size=oversampled_minority_data.shape,
        p=[corruption_ratio, 1 - corruption_ratio],
    )

    # np.where writes exact zeros, so a masked NaN/inf does not survive as it
    # would with a multiply-by-zero formulation.
    resampled_minority_data = np.where(zero_mask == 1, 0.0, oversampled_minority_data)

    X_res = np.concatenate((resampled_minority_data, majority_data))
    y_res = np.concatenate((np.ones(len(resampled_minority_data)), np.zeros(len(majority_data))))

    return X_res, y_res

In [6]:
def run_model(X, y, corruption_ratio, random_state, n_folds=5):
    """Produce cross-validated class-1 probabilities from a random forest.

    The data is split with a shuffled ``StratifiedKFold``. In each fold the
    training part is rebalanced with ``resample_train_X_y``, a
    ``RandomForestClassifier`` is fitted on the rebalanced data, and its
    class-1 probability for the held-out rows is stored at those rows'
    positions.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, indexable by integer index arrays.
    y : np.ndarray
        Binary labels (0/1) aligned with ``X``.
    corruption_ratio : float
        Forwarded to ``resample_train_X_y``.
    random_state : int
        Seed for the fold splitter and the forest. When it is an integer it
        also seeds the resampling step (see the comment below).
    n_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    np.ndarray
        One probability per row of ``X``.
    """
    pred_probas = np.zeros(len(X))

    # resample_train_X_y draws from the global NumPy RNG by default, so without
    # seeding it here a fixed random_state would still give different results
    # on every call. Seed it for the duration of this call and put the caller's
    # RNG state back afterwards. Non-integer seeds (e.g. None) are left alone
    # so that unseeded calls keep advancing the global stream.
    seed_global_rng = isinstance(random_state, (int, np.integer))
    if seed_global_rng:
        saved_rng_state = np.random.get_state()
        np.random.seed(random_state)

    try:
        folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
        for train_idx, valid_idx in folds.split(X, y):
            X_res, y_res = resample_train_X_y(X[train_idx], y[train_idx], corruption_ratio)

            model = RandomForestClassifier(random_state=random_state)
            model = model.fit(X_res, y_res)

            # Column 1 of predict_proba is the probability of class 1.
            pred_probas[valid_idx] += model.predict_proba(X[valid_idx])[:, 1]
    finally:
        if seed_global_rng:
            np.random.set_state(saved_rng_state)

    return pred_probas

In [7]:
def calc_gmeans(y, pred_probas):
    """Return the largest sqrt(TPR * (1 - FPR)) found along the ROC curve.

    ``y`` holds the true binary labels and ``pred_probas`` the predicted
    scores; both are passed straight to ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, pred_probas)
    geometric_means = np.sqrt(true_pos_rate * (1 - false_pos_rate))
    best = np.argmax(geometric_means)
    return geometric_means[best]

def calc_f1(y, pred_probas):
    """Return the largest F1 score found along the precision-recall curve.

    ``y`` holds the true binary labels and ``pred_probas`` the predicted
    scores; both are passed straight to ``precision_recall_curve``. Points
    whose F1 is not a number in [0, 1] (e.g. NaN from 0/0 when precision and
    recall are both zero) are ignored.
    """
    precision, recall, _ = precision_recall_curve(y, pred_probas)

    # Element-wise F1 for every point on the curve.
    f1_all = (2 * precision * recall) / (precision + recall)

    # Comparisons against NaN are False, so undefined scores drop out here.
    in_range = (f1_all >= 0.0) & (f1_all <= 1.0)
    candidates = f1_all[in_range]

    return candidates[np.argmax(candidates)]

def train_and_evaluate(dataset_name, corruption_ratio, num_runs=10):
    """Score one dataset over several seeded runs.

    Looks ``dataset_name`` up in the module-level ``datasets`` collection and
    calls ``run_model`` once per run, using the run index (0 .. num_runs-1) as
    ``random_state``. Returns two lists with one entry per run: the G-mean
    scores and the F1 scores.
    """
    bunch = datasets[dataset_name]
    features = bunch.data
    # Targets equal to -1 become class 0; every other target becomes class 1.
    labels = np.where(bunch.target == -1, 0, 1)

    gmeans_scores = []
    f1_scores = []
    for seed in range(num_runs):
        run_probas = run_model(features, labels, corruption_ratio, random_state=seed)
        gmeans_scores.append(calc_gmeans(labels, run_probas))
        f1_scores.append(calc_f1(labels, run_probas))

    return gmeans_scores, f1_scores

## Train and Evaluate

In [8]:
# Directory that receives one pickle of summary statistics per corruption ratio.
# Created up front so a missing folder fails fast instead of after the long runs.
RESULTS_DIR = "results"
os.makedirs(RESULTS_DIR, exist_ok=True)

for corruption_ratio in (0.1, 0.25, 0.5):
    print("\n====================")
    print("Corruption Ratio %.2f" % corruption_ratio)
    print("====================")

    # dataset name -> (G-mean mean, G-mean std, F1 mean, F1 std) for this ratio.
    results = {}

    for i, dataset_name in enumerate(DATASETS_13):
        print("Set #%d: %s" % ((i+1), dataset_name))
        gmeans_scores, f1_scores = train_and_evaluate(dataset_name, corruption_ratio)
        results[dataset_name] = (np.mean(gmeans_scores), np.std(gmeans_scores), np.mean(f1_scores), np.std(f1_scores))

    print("====================")
    print("Dataset\t\tGMeans (mean)\tGMeans(std)\tF1 (mean)\tF1(std)")
    for dataset, r in results.items():
        print("%-15s\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" % (dataset, *(r)))

    # Build the path with os.path.join rather than a literal "results\..." string:
    # "\e" is an invalid escape sequence, and outside Windows the backslash would
    # become part of the file name instead of separating directory and file.
    results_path = os.path.join(RESULTS_DIR, "ex05_1_zero_out_%.2f.pk" % corruption_ratio)
    with open(results_path, "wb") as f:
        pickle.dump(results, f)


Corruption Ratio 0.10
Set #1: ecoli
Set #2: satimage
Set #3: abalone
Set #4: us_crime
Set #5: yeast_ml8
Set #6: scene
Set #7: coil_2000
Set #8: solar_flare_m0
Set #9: oil
Set #10: wine_quality
Set #11: yeast_me2
Set #12: ozone_level
Set #13: abalone_19
Dataset		GMeans (mean)	GMeans(std)	F1 (mean)	F1(std)
ecoli          	0.8637		0.0177		0.6067		0.0291
satimage       	0.8865		0.0027		0.6810		0.0025
abalone        	0.7812		0.0046		0.3964		0.0072
us_crime       	0.8573		0.0051		0.5460		0.0174
yeast_ml8      	0.5866		0.0155		0.1743		0.0084
scene          	0.6679		0.0042		0.2740		0.0109
coil_2000      	0.6245		0.0092		0.1754		0.0039
solar_flare_m0 	0.7192		0.0199		0.2363		0.0193
oil            	0.8737		0.0182		0.5865		0.0316
wine_quality   	0.8099		0.0082		0.4469		0.0189
yeast_me2      	0.8537		0.0132		0.4120		0.0370
ozone_level    	0.8189		0.0119		0.3788		0.0134
abalone_19     	0.6807		0.0348		0.0705		0.0187

Corruption Ratio 0.25
Set #1: ecoli
Set #2: satimage
Set #3: abalone
Set #4: us_c