## Imports

In [1]:
import math
import numpy as np
import pandas as pd

from imblearn.datasets import fetch_datasets

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import *

from tqdm.notebook import tqdm
import pickle

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [3]:
# Benchmark collection from imblearn. Later cells look entries up by dataset
# name (`datasets[name]`, `datasets.keys()`) and read each entry's `.data`
# (features) and `.target` (labels).
datasets = fetch_datasets()

## Model

In [4]:
def resample_train_X_y(X_train, y_train):
    """Resampling hook for a training fold; this version applies no resampling.

    Parameters
    ----------
    X_train : features of the current training fold.
    y_train : labels of the current training fold.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` -- the very same objects that were passed in.
    """
    untouched_fold = (X_train, y_train)
    return untouched_fold

In [5]:
def run_model(X, y, random_state, n_folds=5):
    """Collect out-of-fold class-1 probabilities from a random forest.

    The data is split with a shuffled ``StratifiedKFold``. For every fold a
    fresh ``RandomForestClassifier`` is fitted on the (optionally resampled)
    training part and its predictions for the validation part are written at
    that fold's indices.

    Parameters
    ----------
    X : indexable with an integer index array (``X[idx]``) and supports ``len``.
    y : labels, indexable the same way as ``X``.
    random_state : seed used for both the fold splitter and the classifier.
    n_folds : number of cross-validation splits (default 5).

    Returns
    -------
    numpy.ndarray
        One value per row of ``X``: column 1 of ``predict_proba`` for the fold
        in which that row was held out.
    """
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    oof_probas = np.zeros(len(X))

    for train_idx, valid_idx in splitter.split(X, y):
        # Only the training part of the fold goes through the resampling hook;
        # the validation part is always scored as-is.
        X_fit, y_fit = resample_train_X_y(X[train_idx], y[train_idx])

        forest = RandomForestClassifier(random_state=random_state).fit(X_fit, y_fit)

        # Column 1 is the second class in the fitted model's class ordering
        # (label 1 when y is 0/1-encoded).
        oof_probas[valid_idx] += forest.predict_proba(X[valid_idx])[:, 1]

    return oof_probas

In [6]:
def calc_gmeans(y, pred_probas):
    """Return the best G-mean reachable along the ROC curve.

    For every ROC operating point the geometric mean
    ``sqrt(TPR * (1 - FPR))`` is computed, and the largest one is returned.

    Parameters
    ----------
    y : true binary labels.
    pred_probas : predicted scores for the positive class.

    Returns
    -------
    The maximum G-mean over all thresholds reported by ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, pred_probas)
    # 1 - FPR is the true-negative rate, so this is sqrt(sensitivity * specificity).
    geometric_means = np.sqrt(true_pos_rate * (1 - false_pos_rate))
    best = geometric_means.argmax()
    return geometric_means[best]

def calc_f1(y, pred_probas):
    """Return the best F1 score over all points of the precision-recall curve.

    Parameters
    ----------
    y : true binary labels.
    pred_probas : predicted scores for the positive class.

    Returns
    -------
    The maximum of ``2 * p * r / (p + r)`` over the (precision, recall) pairs
    returned by ``precision_recall_curve``.

    Notes
    -----
    F1 is undefined (0/0) at points where precision and recall are both zero.
    Those points are scored as 0 explicitly instead of producing NaN and a
    RuntimeWarning. They can never be the maximum, so the returned value is the
    same as dropping them.
    """
    precision, recall, _ = precision_recall_curve(y, pred_probas)

    denominator = precision + recall
    # Divide only where the denominator is positive; every other slot keeps the
    # 0.0 it was pre-filled with through `out`.
    f1_scores = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator > 0,
    )
    return f1_scores.max()

def train_and_evaluate(dataset_name, num_runs=10):
    """Run the cross-validated model several times on one dataset and score each run.

    Parameters
    ----------
    dataset_name : key into the module-level ``datasets`` collection.
    num_runs : number of repetitions; run ``k`` uses ``random_state=k``.

    Returns
    -------
    tuple of two lists
        ``(gmeans_scores, f1_scores)`` with one entry per run, in run order.
    """
    entry = datasets[dataset_name]
    features = entry.data
    # Binarise the labels: -1 becomes 0, every other value becomes 1.
    labels = np.where(entry.target == -1, 0, 1)

    per_run_scores = []
    for seed in range(num_runs):
        probas = run_model(features, labels, random_state=seed)
        per_run_scores.append((calc_gmeans(labels, probas), calc_f1(labels, probas)))

    gmeans_scores = [gmean for gmean, _ in per_run_scores]
    f1_scores = [f1 for _, f1 in per_run_scores]
    return gmeans_scores, f1_scores

## Train and Evaluate

In [7]:
# dataset name -> (G-mean mean, G-mean std, F1 mean, F1 std) across the runs
results = {}

for i, dataset_name in enumerate(datasets.keys()):
    # Progress line, numbered from 1.
    progress_line = "Set #%d: %s" % (i + 1, dataset_name)
    print(progress_line)

    gmeans_scores, f1_scores = train_and_evaluate(dataset_name)

    summary = (
        np.mean(gmeans_scores),
        np.std(gmeans_scores),
        np.mean(f1_scores),
        np.std(f1_scores),
    )
    results[dataset_name] = summary

Set #1: ecoli
Set #2: optical_digits
Set #3: satimage
Set #4: pen_digits
Set #5: abalone
Set #6: sick_euthyroid
Set #7: spectrometer
Set #8: car_eval_34
Set #9: isolet
Set #10: us_crime
Set #11: yeast_ml8
Set #12: scene
Set #13: libras_move
Set #14: thyroid_sick
Set #15: coil_2000
Set #16: arrhythmia
Set #17: solar_flare_m0
Set #18: oil
Set #19: car_eval_4
Set #20: wine_quality
Set #21: letter_img
Set #22: yeast_me2
Set #23: webpage
Set #24: ozone_level
Set #25: mammography
Set #26: protein_homo
Set #27: abalone_19


## Display Results

In [8]:
# Plain-text table: one row per dataset, the four summary statistics to 4 decimals.
header = "Dataset\t\tGMeans (mean)\tGMeans(std)\tF1 (mean)\tF1(std)"
row_format = "%-15s\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f"

print(header)
for dataset, r in results.items():
    gmeans_mean, gmeans_std, f1_mean, f1_std = r
    print(row_format % (dataset, gmeans_mean, gmeans_std, f1_mean, f1_std))

Dataset		GMeans (mean)	GMeans(std)	F1 (mean)	F1(std)
ecoli          	0.8795		0.0108		0.6666		0.0327
optical_digits 	0.9845		0.0009		0.9526		0.0026
satimage       	0.8911		0.0033		0.6932		0.0047
pen_digits     	0.9965		0.0003		0.9913		0.0013
abalone        	0.7876		0.0042		0.3996		0.0062
sick_euthyroid 	0.9531		0.0014		0.8798		0.0067
spectrometer   	0.9558		0.0028		0.8829		0.0099
car_eval_34    	0.9921		0.0021		0.9327		0.0073
isolet         	0.9623		0.0024		0.8656		0.0050
us_crime       	0.8581		0.0073		0.5490		0.0111
yeast_ml8      	0.5574		0.0157		0.1608		0.0053
scene          	0.6914		0.0080		0.3015		0.0166
libras_move    	0.9433		0.0158		0.7993		0.0258
thyroid_sick   	0.9724		0.0045		0.8842		0.0059
coil_2000      	0.6530		0.0055		0.2079		0.0055
arrhythmia     	0.9114		0.0181		0.6320		0.0632
solar_flare_m0 	0.7257		0.0244		0.2542		0.0175
oil            	0.8448		0.0201		0.5733		0.0190
car_eval_4     	0.9967		0.0011		0.9515		0.0156
wine_quality   	0.8137		0.0090		0.4520		0.0102
letter_

## Save Results

In [9]:
# Persist the per-dataset summary dict with pickle.
# A forward slash is used as the separator: a backslash before "e" is an
# unrecognised string escape (a warning on current Python) and is only a
# directory separator on Windows, whereas "/" is accepted on every platform.
# NOTE(review): assumes the "results" directory already exists -- confirm.
with open("results/ex01_baseline.pk", "wb") as f:
    pickle.dump(results, f)