In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("krkopt.data", header=None)

In [3]:
data.columns = ["wkc", "wkr", "wrc", "wrr", "bkc", "bkr", "opt rank" ]

In [4]:
data

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr,opt rank
0,a,1,b,3,c,2,draw
1,a,1,c,1,c,2,draw
2,a,1,c,1,d,1,draw
3,a,1,c,1,d,2,draw
4,a,1,c,2,c,1,draw
...,...,...,...,...,...,...,...
28051,b,1,g,7,e,5,sixteen
28052,b,1,g,7,e,6,sixteen
28053,b,1,g,7,e,7,sixteen
28054,b,1,g,7,f,5,sixteen


In [5]:
X = data.iloc[:, 0:6]
y = data['opt rank']

In [6]:
X

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr
0,a,1,b,3,c,2
1,a,1,c,1,c,2
2,a,1,c,1,d,1
3,a,1,c,1,d,2
4,a,1,c,2,c,1
...,...,...,...,...,...,...
28051,b,1,g,7,e,5
28052,b,1,g,7,e,6
28053,b,1,g,7,e,7
28054,b,1,g,7,f,5


In [7]:
X["wkc"]=X["wkc"].astype('category')
X["wrc"]=X["wrc"].astype('category')
X["bkc"]=X["bkc"].astype('category')

In [8]:
X["wkc"]=X["wkc"].cat.codes
X["wrc"]=X["wrc"].cat.codes
X["bkc"]=X["bkc"].cat.codes


In [9]:
X

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr
0,0,1,1,3,2,2
1,0,1,2,1,2,2
2,0,1,2,1,3,1
3,0,1,2,1,3,2
4,0,1,2,2,2,1
...,...,...,...,...,...,...
28051,1,1,6,7,4,5
28052,1,1,6,7,4,6
28053,1,1,6,7,4,7
28054,1,1,6,7,5,5


In [10]:
y = y.astype('category')
y = y.cat.codes

In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [12]:
X

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr
0,0,1,1,3,2,2
1,0,1,2,1,2,2
2,0,1,2,1,3,1
3,0,1,2,1,3,2
4,0,1,2,2,2,1
...,...,...,...,...,...,...
28051,1,1,6,7,4,5
28052,1,1,6,7,4,6
28053,1,1,6,7,4,7
28054,1,1,6,7,5,5


In [13]:
cat_cols = list(X.columns)
cat_cols

['wkc', 'wkr', 'wrc', 'wrr', 'bkc', 'bkr']

In [14]:
X = pd.get_dummies(X,columns=cat_cols)

In [15]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.utils import to_categorical


In [16]:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y, test_size=0.2,
                                                   random_state = 1)

from imblearn.over_sampling import SMOTE
oversample = SMOTE()


X_smote, y_smote = oversample.fit_resample(X, y)

X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote,
                                                   y_smote, test_size=0.2,
                                                   random_state = 1)



y_train = to_categorical(y_train)
y_test  = to_categorical(y_test)
y_train_smote = to_categorical(y_train_smote)
y_test_smote = to_categorical(y_test_smote)

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

scaler = StandardScaler().fit(X_train_smote)
X_train_smote = scaler.transform(X_train_smote)
X_test_smote = scaler.transform(X_test_smote)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


checkpoint_filepath = '/tmp/checkpoint'


In [19]:
from sklearn.metrics import confusion_matrix, precision_score, \
f1_score, cohen_kappa_score, recall_score

In [20]:
def make_my_model_multi( units_per_layer, input_s, output_s, activation_='relu'):
    model = Sequential()
    depth = len(units_per_layer)
    model.add(Dense(units_per_layer[0], activation=activation_, input_shape=(input_s,)))
    for i in range(1, depth):
        model.add(Dense(units_per_layer[i], activation=activation_))
    model.add(Dense(output_s, activation = 'softmax'))   
    
    return model




#En este caso, usamos categorical_crossentropy como función de coste y 
#permitimos elegir otro batch para poder hacer uso de 
def compile_fit_multiclass(modelo, X_train, X_test, y_train, batch, epochs, verbose=0):
    modelo.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])
    
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=True)

    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                        save_weights_only=True,
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        verbose=False)

    modelo.fit(X_train, y_train, epochs=epochs, batch_size=batch, verbose=verbose, validation_split=0.2, callbacks = [early_stopping, model_checkpoint])
    model.load_weights(checkpoint_filepath)
    predictions = modelo.predict(X_test)
    return predictions

def compute_metrics_multiclass(y_test, y_pred):
    results=[]
    results.append(precision_score(y_test, np.round(y_pred), average="micro"))
    results.append(recall_score(y_test, np.round(y_pred), average="micro"))
    results.append(f1_score(y_test, np.round(y_pred), average="micro"))
    results.append(cohen_kappa_score(y_test, np.round(y_pred)))
    return results

In [21]:
from tensorflow.keras.layers import Dropout

def make_my_model_multi_dropout( units_per_layer, input_s, output_s, activation_='relu'):
    model = Sequential()
    depth = len(units_per_layer)
    model.add(Dense(units_per_layer[0], activation=activation_, input_shape=(input_s,)))
    for i in range(1, depth):
        if isinstance(units_per_layer[i], str):
            a = units_per_layer[i]
            dropout_r = float(a)
            model.add(Dropout(dropout_r))
        else:
            model.add(Dense(units_per_layer[i], activation=activation_))
    model.add(Dense(output_s, activation = 'softmax'))   
    
    return model

In [22]:
y.shape

(28056,)

In [23]:
import imblearn
from collections import Counter

In [24]:
counter = Counter(y)
for k,v in counter.items():
	per = v / len(y) * 100
	print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=0, n=2796 (9.966%)
Class=17, n=27 (0.096%)
Class=8, n=78 (0.278%)
Class=16, n=246 (0.877%)
Class=14, n=81 (0.289%)
Class=5, n=198 (0.706%)
Class=4, n=471 (1.679%)
Class=10, n=592 (2.110%)
Class=9, n=683 (2.434%)
Class=1, n=1433 (5.108%)
Class=7, n=1712 (6.102%)
Class=12, n=1985 (7.075%)
Class=2, n=2854 (10.173%)
Class=15, n=3597 (12.821%)
Class=13, n=4194 (14.949%)
Class=6, n=4553 (16.228%)
Class=3, n=2166 (7.720%)
Class=11, n=390 (1.390%)


In [25]:
X_train.shape, np.argmax(y_train, axis=1).shape

((22444, 40), (22444,))

In [26]:
X_train_smote.shape, y_train_smote.shape

((65563, 40), (65563, 18))

In [27]:
X_train.shape, X_train_smote.shape, y_train.shape, y_train_smote.shape, X_test.shape, y_test.shape

((22444, 40), (65563, 40), (22444, 18), (65563, 18), (5612, 40), (5612, 18))

In [None]:
y_pred_smote_dropout

About

If you use imbalanced-learn in a scientific publication, we would appreciate citations to the following paper:

@article{JMLR:v18:16-365,
author  = {Guillaume  Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas},
title   = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning},
journal = {Journal of Machine Learning Research},
year    = {2017},
volume  = {18},
number  = {17},
pages   = {1-5},
url     = {http://jmlr.org/papers/v18/16-365}
}

Most classification algorithms will only perform optimally when the number of samples of each class is roughly the same. Highly skewed datasets, where the minority is heavily outnumbered by one or more classes, have proven to be a challenge while at the same time becoming more and more common.

One way of addressing this issue is by re-sampling the dataset as to offset this imbalance with the hope of arriving at a more robust and fair decision boundary than you would otherwise.

In [47]:
import tensorflow as tf
tf.version.VERSION

'2.4.1'

# Pruebas

In [48]:
results = []
seed = 1

In [49]:
size_config = [50, 100, 150, 200, 250]
for size in size_config:
    layer_config = [[size], [size]*2, [size]*3, [size]*4, [size]*5, [size]*6]
    for layers in layer_config:
        np.random.seed(seed)
        tf.random.set_seed(seed)
        print(layers)
        model = make_my_model_multi(layers, 40, 18, activation_='relu' )
        preds = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 300, verbose=0)
        metrics = compute_metrics_multiclass(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
        confusion = confusion_matrix(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
        aux = { "layer config" : layers,
               #"Model": model,
               "Predictions" : preds,
               "Metrics" : metrics,
               "Confusion" : confusion

        }
        print(metrics)
        results.append(aux)
    

[50]
[0.7132929436920884, 0.7132929436920884, 0.7132929436920883, 0.6795940399725571]
[50, 50]
Epoch 00118: early stopping
[0.7359230220955096, 0.7359230220955096, 0.7359230220955095, 0.7051213677686434]
[50, 50, 50]
Epoch 00075: early stopping
[0.7464362081254454, 0.7464362081254454, 0.7464362081254454, 0.7169690264820239]
[50, 50, 50, 50]
Epoch 00059: early stopping
[0.7293300071275838, 0.7293300071275838, 0.7293300071275838, 0.6976471559645523]
[50, 50, 50, 50, 50]
Epoch 00053: early stopping
[0.7323592302209551, 0.7323592302209551, 0.7323592302209551, 0.7011602714040706]
[50, 50, 50, 50, 50, 50]
Epoch 00054: early stopping
[0.7245188880969351, 0.7245188880969351, 0.7245188880969351, 0.6923109986216298]
[100]
Epoch 00176: early stopping
[0.7405559515324305, 0.7405559515324305, 0.7405559515324305, 0.7101055066873646]
[100, 100]
Epoch 00053: early stopping
[0.7624732715609408, 0.7624732715609408, 0.7624732715609408, 0.7348246606533331]
[100, 100, 100]
Epoch 00042: early stopping
[0.78

In [50]:
import joblib
 
joblib.dump(results, 'results_1_onehot_joblib')

['results_1_onehot_joblib']

In [66]:
#aux = joblib.load("results_1_onehot_joblib")

# Lo mismo pero con smote

In [51]:
results_smote = []
seed = 1

In [52]:
size_config = [50, 100, 150, 200, 250]
for size in size_config:
    layer_config = [[size], [size]*2, [size]*3, [size]*4, [size]*5, [size]*6]
    for layers in layer_config:
        np.random.seed(seed)
        tf.random.set_seed(seed)
        print(layers)
        model = make_my_model_multi(layers, 40, 18, activation_='relu' )
        preds = compile_fit_multiclass(model, X_train_smote, X_test, y_train_smote, 256, 300, verbose=0)
        metrics = compute_metrics_multiclass(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
        confusion = confusion_matrix(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
        aux = { "layer config" : layers,
               #"Model": model,
               "Predictions" : preds,
               "Metrics" : metrics,
               "Confusion" : confusion

        }
        print(metrics)
        results_smote.append(aux)
    

[50]
[0.09853884533143265, 0.09853884533143265, 0.09853884533143265, 0.05990480607973814]
[50, 50]
Epoch 00270: early stopping
[0.06931575196008553, 0.06931575196008553, 0.06931575196008553, 0.037006904026981036]
[50, 50, 50]
Epoch 00155: early stopping
[0.06967213114754098, 0.06967213114754098, 0.06967213114754098, 0.03441175321042145]
[50, 50, 50, 50]
Epoch 00153: early stopping
[0.07448325017818959, 0.07448325017818959, 0.07448325017818959, 0.04118489587487906]
[50, 50, 50, 50, 50]
Epoch 00120: early stopping
[0.06789023521026372, 0.06789023521026372, 0.06789023521026372, 0.028956419894087593]
[50, 50, 50, 50, 50, 50]
Epoch 00200: early stopping
[0.05434782608695652, 0.05434782608695652, 0.05434782608695652, 0.018471319233337335]
[100]
[0.11101211689237349, 0.11101211689237349, 0.11101211689237349, 0.07134215115338971]
[100, 100]
Epoch 00285: early stopping
[0.1097647897362794, 0.1097647897362794, 0.1097647897362794, 0.07580671664360639]
[100, 100, 100]
Epoch 00226: early stopping
[

In [53]:
import joblib
 
joblib.dump(results_smote, 'results_smote_onehot_joblib')

['results_smote_onehot_joblib']

# Lo mismo pero con DROPOUT

# SIN SMOTE

In [54]:
results_dropout = []
seed = 1

In [55]:
size_config = [50, 100, 150, 200, 250]
dropout_rate = ["0.1", "0.2", "0.3"]

for size in size_config:
    for size_d in (dropout_rate):
        layer_config_dense = [[size], [size]*2, [size]*3, [size]*4, [size]*5, [size]*6]
        layer_config_dropout = [[size_d], [size_d]*2, [size_d]*3, [size_d]*4, [size_d]*5, [size_d]*6]
        for layers_dense, layers_dropout in zip(layer_config_dense, layer_config_dropout):
            final_design = [None]*(len(layers_dense)+len(layers_dropout))
            final_design[::2] = layers_dense
            final_design[1::2] = layers_dropout
            np.random.seed(seed)
            tf.random.set_seed(seed)
            print(final_design)
            model = make_my_model_multi_dropout(final_design, 40, 18, activation_='relu' )
            preds = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 300, verbose=0)
            metrics = compute_metrics_multiclass(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
            confusion = confusion_matrix(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
            aux = { "layer config" : final_design,
                   #"Model": model,
                   "Predictions" : preds,
                   "Metrics" : metrics,
                   "Confusion" : confusion

            }
            print(metrics)
            results_dropout.append(aux)
    

[50, '0.1']
[0.6911974340698503, 0.6911974340698503, 0.6911974340698503, 0.6544424729231852]
[50, '0.1', 50, '0.1']
Epoch 00249: early stopping
[0.7549893086243763, 0.7549893086243763, 0.7549893086243763, 0.7260805176671765]
[50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00244: early stopping
[0.7685317177476836, 0.7685317177476836, 0.7685317177476836, 0.7412666342935919]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00204: early stopping
[0.7628296507483963, 0.7628296507483963, 0.7628296507483963, 0.7349489199206065]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00204: early stopping
[0.755167498218104, 0.755167498218104, 0.755167498218104, 0.726566737550184]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00138: early stopping
[0.7446543121881682, 0.7446543121881682, 0.7446543121881682, 0.7145851936293055]
[50, '0.2']
[0.6797933000712758, 0.6797933000712758, 0.6797933000712758, 0.6415147506179972]
[50, '0.2', 50, '0.2']
Epoch 00194: early stoppi

In [56]:
joblib.dump(results_dropout, 'results_dropout_one_hot')

['results_dropout_one_hot']

# CON SMOTE

In [28]:
results_dropout_smote = []
seed = 1

In [29]:
size_config = [50, 100, 150, 200, 250]
dropout_rate = ["0.1", "0.2", "0.3"]

for size in size_config:
    for size_d in (dropout_rate):
        layer_config_dense = [[size], [size]*2, [size]*3, [size]*4, [size]*5, [size]*6]
        layer_config_dropout = [[size_d], [size_d]*2, [size_d]*3, [size_d]*4, [size_d]*5, [size_d]*6]
        for layers_dense, layers_dropout in zip(layer_config_dense, layer_config_dropout):
            final_design = [None]*(len(layers_dense)+len(layers_dropout))
            final_design[::2] = layers_dense
            final_design[1::2] = layers_dropout
            np.random.seed(seed)
            tf.random.set_seed(seed)
            print(final_design)
            model = make_my_model_multi_dropout(final_design, 40, 18, activation_='relu' )
            preds = compile_fit_multiclass(model, X_train_smote, X_test, y_train_smote, 256, 300, verbose=0)
            metrics = compute_metrics_multiclass(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
            confusion = confusion_matrix(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
            aux = { "layer config" : final_design,
                   #"Model": model,
                   "Predictions" : preds,
                   "Metrics" : metrics,
                   "Confusion" : confusion

            }
            print(metrics)
            results_dropout_smote.append(aux)
    

[50, '0.1']
[0.17925873129009265, 0.17925873129009265, 0.17925873129009265, 0.12232832383335368]
[50, '0.1', 50, '0.1']
[0.1261582323592302, 0.1261582323592302, 0.1261582323592302, 0.08089709192596017]
[50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00248: early stopping
[0.0650392017106201, 0.0650392017106201, 0.0650392017106201, 0.03040751666739172]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00189: early stopping
[0.042230933713471135, 0.042230933713471135, 0.042230933713471135, 0.01270048583862049]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00194: early stopping
[0.038132573057733425, 0.038132573057733425, 0.038132573057733425, 0.005452686722108413]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00232: early stopping
[0.0345687811831789, 0.0345687811831789, 0.0345687811831789, 0.008312049551169154]
[50, '0.2']
[0.1626870990734141, 0.1626870990734141, 0.1626870990734141, 0.10466572690499598]
[50, '0.2', 50, '0.2']
Epoch 00255: early stoppi

In [30]:
import joblib
joblib.dump(results_dropout_smote, 'results_dropout_smote_one_hot')

['results_dropout_smote_one_hot']