In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc
from imblearn.datasets import fetch_datasets
from pathlib import Path
import os
import sys
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.metrics import geometric_mean_score, classification_report_imbalanced

from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import (recall_score, roc_auc_score, confusion_matrix, precision_score, precision_recall_curve,
                             f1_score, balanced_accuracy_score, accuracy_score, average_precision_score)

In [16]:
def load_data(name) :
    
    dataset =  fetch_datasets()[name]
    X,y,title = dataset['data'], dataset['target'], dataset['DESCR']
    y_enc = LabelEncoder().fit_transform(y)
        
    return X,y_enc,title

def test_metrics(dataset, clf, random_state=5) :

    # get back the train test split
    X,y, title = load_data(dataset)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, stratify = y, random_state = random_state)
    
    res_df = pd.DataFrame({
        "dataset":title, 
        "oversampler" : "NoOversampling"
        }, index=[0])

    res_df['classifier'] = clf

    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    probas = clf.predict_proba(X_test)

    # get metrics
    res_df['balanced_accuracy'] = balanced_accuracy_score(y_test,prediction)
    res_df['geometric_mean'] = geometric_mean_score(y_test, prediction)
    res_df['f1_score'] = f1_score(y_test, prediction)
    res_df['precision'] = precision_score(y_test,prediction)
    res_df['recall'] = recall_score(y_test,prediction)
    res_df['avg_precision (AUPRC)'] = average_precision_score(y_test, prediction)

    return res_df


def base_case(dataset, clf) :
    
    X,y,title = load_data(dataset)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.35, stratify=y, random_state=5)
    
    clf.fit(X_train,y_train)
    
    probas = clf.predict_proba(X_test)[:,1]
    
    precision, recall, _ = precision_recall_curve(y_test, probas)
    #aucscore = auc(recall, precision)
    
    return precision, recall, aucscore

In [17]:
datasets = list(fetch_datasets().keys())
clf = xgb.XGBClassifier(verbosity=0, random_state = 0, use_label_encoder=False)
dfs = []
for dataset in datasets :
    print(dataset)
    dfs.append(test_metrics(dataset, clf))
    
df = pd.concat(dfs)

ecoli
optical_digits
satimage
pen_digits
abalone
sick_euthyroid
spectrometer
car_eval_34
isolet
us_crime
yeast_ml8


  _warn_prf(average, modifier, msg_start, len(result))


scene
libras_move
thyroid_sick
coil_2000
arrhythmia
solar_flare_m0
oil
car_eval_4
wine_quality
letter_img
yeast_me2
webpage
ozone_level
mammography
protein_homo
abalone_19


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
df.to_csv("base_case_xgb.csv", index=False)

In [19]:
df

Unnamed: 0,dataset,oversampler,classifier,balanced_accuracy,geometric_mean,f1_score,precision,recall,avg_precision (AUPRC)
0,ecoli,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.786616,0.759895,0.7,0.875,0.583333,0.555462
0,optical_digits,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.960852,0.960125,0.952113,0.982558,0.923497,0.914937
0,satimage,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.82205,0.806286,0.724868,0.80117,0.661836,0.563199
0,pen_digits,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.986917,0.986834,0.985465,0.997059,0.974138,0.973754
0,abalone,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.539616,0.344201,0.159204,0.222222,0.124031,0.109506
0,sick_euthyroid,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.922958,0.921198,0.84,0.815534,0.865979,0.718688
0,spectrometer,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.896894,0.891645,0.857143,0.923077,0.8,0.755507
0,car_eval_34,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.97158,0.971431,0.913043,0.875,0.954545,0.83873
0,isolet,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.878788,0.871077,0.836565,0.92638,0.762626,0.724742
0,us_crime,NoOversampling,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.719327,0.670955,0.534884,0.638889,0.46,0.33486
