## Imports

In [1]:
import math
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from imblearn.datasets import fetch_datasets

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import *

from tqdm.notebook import tqdm

In [2]:
import warnings
# Install a catch-all "ignore" filter so no warning is shown by later cells.
# NOTE(review): this is a blanket filter -- it also hides numerical
# RuntimeWarnings and library deprecation notices; consider narrowing it to
# specific categories once the expected warnings are known.
warnings.filterwarnings('ignore')

## Load Data

In [3]:
# Load the imbalanced-learn benchmark collection. Individual datasets are
# looked up from this object by name, and each entry is read through its
# `.data` and `.target` attributes further down.
datasets = fetch_datasets()

In [4]:
# Names of the 13 benchmark datasets evaluated in this notebook, in the order
# they are processed. Each name is used as a key into `datasets`.
DATASETS_13 = [
    'ecoli',
    'satimage',
    'abalone',
    'us_crime',
    'yeast_ml8',
    'scene',
    'coil_2000',
    'solar_flare_m0',
    'oil',
    'wine_quality',
    'yeast_me2',
    'ozone_level',
    'abalone_19'
]

## Model

In [5]:
def resample_train_X_y(X_train, y_train, random_state=None):
    """Balance a binary training set by randomly oversampling the minority class.

    Rows labelled 1 (minority) are drawn with replacement until there are as
    many of them as rows labelled 0 (majority). The majority rows are kept
    unchanged.

    Parameters
    ----------
    X_train : np.ndarray
        Feature matrix; rows are selected with a boolean mask built from
        ``y_train``.
    y_train : np.ndarray
        Labels aligned with ``X_train``: 1 marks the minority class and 0 the
        majority class.
    random_state : None, int, or generator object, optional
        Source of randomness for the draw. ``None`` (the default) uses NumPy's
        global generator. An int seeds a private ``np.random.RandomState`` so
        the draw is reproducible and the global generator is left untouched.
        An object that already exposes ``choice`` (e.g. a ``RandomState``) is
        used as-is.

    Returns
    -------
    X_res : np.ndarray
        The oversampled minority rows followed by all majority rows.
    y_res : np.ndarray
        Float labels matching ``X_res``: ones first, then zeros.

    Raises
    ------
    ValueError
        If there are majority rows but no minority rows to sample from.
    """
    minority_data = X_train[y_train == 1]
    majority_data = X_train[y_train == 0]
    n_minority, n_majority = len(minority_data), len(majority_data)

    # Fail with an explicit message instead of the opaque error that sampling
    # from an empty population would produce.
    if n_minority == 0 and n_majority > 0:
        raise ValueError(
            "cannot oversample: y_train contains no minority (label 1) samples"
        )

    if random_state is None:
        rng = np.random  # module-level functions share the global generator
    elif callable(getattr(random_state, "choice", None)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Draw as many minority row indices (with replacement) as there are
    # majority rows, so both classes end up the same size.
    picked = rng.choice(n_minority, size=n_majority, replace=True)
    oversampled_minority_data = minority_data[picked]

    X_res = np.concatenate((oversampled_minority_data, majority_data))
    y_res = np.concatenate((np.ones(n_majority), np.zeros(n_majority)))

    return X_res, y_res

In [6]:
def run_model(X, y, random_state, n_folds=5):
    """Return out-of-fold positive-class probabilities from a random forest.

    The data are split with a shuffled ``StratifiedKFold``. For every fold the
    training part is rebalanced with ``resample_train_X_y``, a
    ``RandomForestClassifier`` is fitted on the rebalanced data, and its
    predicted probability for class 1 is stored for the held-out rows.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, indexed by integer index arrays.
    y : np.ndarray
        Binary labels (0 / 1) aligned with ``X``.
    random_state : int
        Seed used for the fold shuffling, for the forest, and (when it is an
        int) for the oversampling step.
    n_folds : int, optional
        Number of cross-validation folds (default 5).

    Returns
    -------
    np.ndarray
        One probability per row of ``X``, each predicted by the model that did
        not see that row during training.
    """
    pred_probas = np.zeros(len(X))

    # The oversampling helper draws from NumPy's global generator. Seed that
    # generator for the duration of this call so `random_state` also controls
    # the resampling, then restore the caller's generator state so the
    # function has no lasting effect on global randomness.
    seed_global = isinstance(random_state, (int, np.integer))
    if seed_global:
        saved_state = np.random.get_state()
        np.random.seed(random_state)

    try:
        folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
        for train_idx, valid_idx in folds.split(X, y):
            # Only the training part is rebalanced; validation rows keep their
            # original class distribution.
            X_res, y_res = resample_train_X_y(X[train_idx], y[train_idx])

            model = RandomForestClassifier(random_state=random_state)
            model.fit(X_res, y_res)

            # Column 1 holds the probability of the positive class (label 1).
            # Every row belongs to exactly one validation fold, so each slot
            # is written once.
            pred_probas[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
    finally:
        if seed_global:
            np.random.set_state(saved_state)

    return pred_probas

In [7]:
def calc_gmeans(y, pred_probas):
    """Return the largest G-mean reachable over all ROC thresholds.

    For every threshold reported by ``roc_curve`` the geometric mean of the
    true-positive rate and the true-negative rate (``1 - fpr``) is computed;
    the maximum of those values is returned.

    Parameters
    ----------
    y : array-like
        True binary labels.
    pred_probas : array-like
        Predicted scores for the positive class.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, pred_probas)
    true_neg_rate = 1 - false_pos_rate
    return np.sqrt(true_pos_rate * true_neg_rate).max()

def calc_f1(y, pred_probas):
    """Return the largest F1 score reachable over all precision-recall thresholds.

    Parameters
    ----------
    y : array-like
        True binary labels.
    pred_probas : array-like
        Predicted scores for the positive class.

    Returns
    -------
    float
        Maximum of ``2 * p * r / (p + r)`` over the points returned by
        ``precision_recall_curve``. Points where ``p + r`` is zero (or not a
        number) have no defined F1 and are scored as 0.
    """
    precision, recall, _ = precision_recall_curve(y, pred_probas)

    numerator = 2 * precision * recall
    denominator = precision + recall

    # Divide only where the denominator is strictly positive. Everywhere else
    # the pre-filled 0.0 is kept, so no NaN is produced and no division
    # warning is raised.
    fscore = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )
    return fscore.max()

def train_and_evaluate(dataset_name, num_runs=10):
    """Evaluate the oversampling + random-forest pipeline on one dataset.

    The dataset is looked up by name in the module-level ``datasets``
    collection. The pipeline is then run ``num_runs`` times, using the run
    index (0, 1, ...) as the random seed, and each run is scored on its
    out-of-fold predictions.

    Parameters
    ----------
    dataset_name : str
        Key of the dataset inside ``datasets``.
    num_runs : int, optional
        Number of repetitions (default 10).

    Returns
    -------
    gmeans_scores : list
        Best G-mean of every run, in run order.
    f1_scores : list
        Best F1 score of every run, in run order.
    """
    bunch = datasets[dataset_name]
    features = bunch.data
    # Recode the target: -1 becomes 0, every other value becomes 1.
    labels = np.where(bunch.target == -1, 0, 1)

    per_run = []
    for seed in range(num_runs):
        oof_probas = run_model(features, labels, random_state=seed)
        per_run.append((calc_gmeans(labels, oof_probas), calc_f1(labels, oof_probas)))

    gmeans_scores = [gmean for gmean, _ in per_run]
    f1_scores = [f1 for _, f1 in per_run]
    return gmeans_scores, f1_scores

## Train and Evaluate

In [8]:
# Per-dataset summary, keyed by dataset name:
# (G-mean mean, G-mean std, F1 mean, F1 std) across the repeated runs.
results = {}

for set_number, dataset_name in enumerate(DATASETS_13, start=1):
    print("Set #%d: %s" % (set_number, dataset_name))
    gmeans_scores, f1_scores = train_and_evaluate(dataset_name)
    results[dataset_name] = tuple(
        stat(scores)
        for scores in (gmeans_scores, f1_scores)
        for stat in (np.mean, np.std)
    )

Set #1: ecoli
Set #2: satimage
Set #3: abalone
Set #4: us_crime
Set #5: yeast_ml8
Set #6: scene
Set #7: coil_2000
Set #8: solar_flare_m0
Set #9: oil
Set #10: wine_quality
Set #11: yeast_me2
Set #12: ozone_level
Set #13: abalone_19


## Display Results

In [9]:
# Tab-separated summary table: a header line, then one row per dataset with
# every statistic rounded to four decimals.
print("Dataset\t\tGMeans (mean)\tGMeans(std)\tF1 (mean)\tF1(std)")
for dataset, (gmeans_mean, gmeans_std, f1_mean, f1_std) in results.items():
    row = (dataset, gmeans_mean, gmeans_std, f1_mean, f1_std)
    print("%-15s\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" % row)

Dataset		GMeans (mean)	GMeans(std)	F1 (mean)	F1(std)
ecoli          	0.8818		0.0090		0.6212		0.0255
satimage       	0.8987		0.0028		0.6920		0.0035
abalone        	0.7816		0.0063		0.4013		0.0090
us_crime       	0.8533		0.0057		0.5249		0.0105
yeast_ml8      	0.5764		0.0119		0.1705		0.0070
scene          	0.7350		0.0081		0.3178		0.0133
coil_2000      	0.6591		0.0046		0.2139		0.0031
solar_flare_m0 	0.7118		0.0192		0.2111		0.0215
oil            	0.8728		0.0126		0.6037		0.0247
wine_quality   	0.8108		0.0069		0.4264		0.0184
yeast_me2      	0.8531		0.0120		0.4592		0.0342
ozone_level    	0.8153		0.0110		0.3696		0.0109
abalone_19     	0.6666		0.0330		0.0894		0.0198


## Save Results

In [10]:
# Build the output path from its components instead of a backslash-separated
# literal: "\e" is not a valid string escape, and a hard-coded backslash only
# acts as a directory separator on Windows.
results_path = Path("results") / "ex03_minority_oversample.pk"

# Create the output directory on first use so a fresh checkout can run the
# notebook end to end.
results_path.parent.mkdir(parents=True, exist_ok=True)

with results_path.open("wb") as f:
    pickle.dump(results, f)