In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("krkopt.data", header=None)

In [3]:
data.columns = ["wkc", "wkr", "wrc", "wrr", "bkc", "bkr", "opt rank" ]

In [4]:
data

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr,opt rank
0,a,1,b,3,c,2,draw
1,a,1,c,1,c,2,draw
2,a,1,c,1,d,1,draw
3,a,1,c,1,d,2,draw
4,a,1,c,2,c,1,draw
...,...,...,...,...,...,...,...
28051,b,1,g,7,e,5,sixteen
28052,b,1,g,7,e,6,sixteen
28053,b,1,g,7,e,7,sixteen
28054,b,1,g,7,f,5,sixteen


In [5]:
X = data.iloc[:, 0:6]
y = data['opt rank']

In [6]:
X

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr
0,a,1,b,3,c,2
1,a,1,c,1,c,2
2,a,1,c,1,d,1
3,a,1,c,1,d,2
4,a,1,c,2,c,1
...,...,...,...,...,...,...
28051,b,1,g,7,e,5
28052,b,1,g,7,e,6
28053,b,1,g,7,e,7
28054,b,1,g,7,f,5


In [7]:
X["wkc"]=X["wkc"].astype('category')
X["wrc"]=X["wrc"].astype('category')
X["bkc"]=X["bkc"].astype('category')

In [8]:
X["wkc"]=X["wkc"].cat.codes
X["wrc"]=X["wrc"].cat.codes
X["bkc"]=X["bkc"].cat.codes


In [9]:
X

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr
0,0,1,1,3,2,2
1,0,1,2,1,2,2
2,0,1,2,1,3,1
3,0,1,2,1,3,2
4,0,1,2,2,2,1
...,...,...,...,...,...,...
28051,1,1,6,7,4,5
28052,1,1,6,7,4,6
28053,1,1,6,7,4,7
28054,1,1,6,7,5,5


In [10]:
y = y.astype('category')
y = y.cat.codes

In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [262]:
import keras
import tensorflow as tf
from tensorflow.keras.utils import to_categorical


In [263]:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y, test_size=0.2,
                                                   random_state = 1)

from imblearn.over_sampling import SMOTE
oversample = SMOTE()


X_smote, y_smote = oversample.fit_resample(X, y)

X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote,
                                                   y_smote, test_size=0.2,
                                                   random_state = 1)



y_train = to_categorical(y_train)
y_test  = to_categorical(y_test)
y_train_smote = to_categorical(y_train_smote)
y_test_smote = to_categorical(y_test_smote)

In [264]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

scaler = StandardScaler().fit(X_train_smote)
X_train_smote = scaler.transform(X_train_smote)
X_test_smote = scaler.transform(X_test_smote)

In [265]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


checkpoint_filepath = '/tmp/checkpoint'


In [285]:
def make_my_model_multi( units_per_layer, input_s, output_s, activation_='relu'):
    model = Sequential()
    depth = len(units_per_layer)
    model.add(Dense(units_per_layer[0], activation=activation_, input_shape=(input_s,)))
    for i in range(1, depth):
        model.add(Dense(units_per_layer[i], activation=activation_))
    model.add(Dense(output_s, activation = 'softmax'))   
    
    return model

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=True)

model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=True)


#En este caso, usamos categorical_crossentropy como función de coste y 
#permitimos elegir otro batch para poder hacer uso de 
def compile_fit_multiclass(modelo, X_train, X_test, y_train, batch, epochs, verbose=0):
    modelo.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])
    modelo.fit(X_train, y_train, epochs=epochs, batch_size=batch, verbose=verbose, validation_split=0.2, callbacks = [early_stopping, model_checkpoint])
    #model.load_weights(checkpoint_filepath)
    predictions = modelo.predict(X_test)
    return predictions

def compute_metrics_multiclass(y_test, y_pred):
    results=[]
    results.append(precision_score(y_test, np.round(y_pred), average="micro"))
    results.append(recall_score(y_test, np.round(y_pred), average="micro"))
    results.append(f1_score(y_test, np.round(y_pred), average="micro"))
    results.append(cohen_kappa_score(y_test, np.round(y_pred)))
    return results

In [286]:
y.shape

(28056,)

In [287]:
seed = 1
np.random.seed(seed)
model = make_my_model_multi([100, 100, 100, 100], 6, 18, activation_='relu' )

In [307]:
modelo = make_my_model_multi([100, 100, 100, 100], 6, 18, activation_='relu' )
modelo.summary()
compile_fit_multiclass(modelo, X_train, X_test, y_train, 256, 200, verbose=1)

Model: "sequential_49"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_274 (Dense)            (None, 100)               700       
_________________________________________________________________
dense_275 (Dense)            (None, 100)               10100     
_________________________________________________________________
dense_276 (Dense)            (None, 100)               10100     
_________________________________________________________________
dense_277 (Dense)            (None, 100)               10100     
_________________________________________________________________
dense_278 (Dense)            (None, 18)                1818      
Total params: 32,818
Trainable params: 32,818
Non-trainable params: 0
_________________________________________________________________
Epoch 1/200

Epoch 00001: val_loss did not improve from 0.56067
Epoch 2/200

Epoch 00002: val_loss did not improve 

KeyboardInterrupt: 

In [288]:
y_pred = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 200, verbose=1)

Epoch 1/200

Epoch 00001: val_loss improved from inf to 1.88884, saving model to /tmp/checkpoint
Epoch 2/200

Epoch 00002: val_loss improved from 1.88884 to 1.60093, saving model to /tmp/checkpoint
Epoch 3/200

Epoch 00003: val_loss improved from 1.60093 to 1.41441, saving model to /tmp/checkpoint
Epoch 4/200

Epoch 00004: val_loss improved from 1.41441 to 1.37815, saving model to /tmp/checkpoint
Epoch 5/200

Epoch 00005: val_loss improved from 1.37815 to 1.27830, saving model to /tmp/checkpoint
Epoch 6/200

Epoch 00006: val_loss improved from 1.27830 to 1.23596, saving model to /tmp/checkpoint
Epoch 7/200

Epoch 00007: val_loss improved from 1.23596 to 1.20539, saving model to /tmp/checkpoint
Epoch 8/200

Epoch 00008: val_loss improved from 1.20539 to 1.18942, saving model to /tmp/checkpoint
Epoch 9/200

Epoch 00009: val_loss improved from 1.18942 to 1.13699, saving model to /tmp/checkpoint
Epoch 10/200

Epoch 00010: val_loss improved from 1.13699 to 1.10972, saving model to /tmp/chec

In [289]:
#PRUEBA
y_pred = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 200, verbose=1)

Epoch 1/200

Epoch 00001: val_loss did not improve from 0.59014
Epoch 2/200

Epoch 00002: val_loss did not improve from 0.59014
Epoch 3/200

Epoch 00003: val_loss improved from 0.59014 to 0.58417, saving model to /tmp/checkpoint
Epoch 4/200

Epoch 00004: val_loss did not improve from 0.58417
Epoch 5/200

Epoch 00005: val_loss did not improve from 0.58417
Epoch 6/200

Epoch 00006: val_loss did not improve from 0.58417
Epoch 7/200

Epoch 00007: val_loss did not improve from 0.58417
Epoch 8/200

Epoch 00008: val_loss did not improve from 0.58417
Epoch 9/200

Epoch 00009: val_loss did not improve from 0.58417
Epoch 10/200

Epoch 00010: val_loss did not improve from 0.58417
Epoch 11/200

Epoch 00011: val_loss improved from 0.58417 to 0.57727, saving model to /tmp/checkpoint
Epoch 12/200

Epoch 00012: val_loss did not improve from 0.57727
Epoch 13/200

Epoch 00013: val_loss did not improve from 0.57727
Epoch 14/200

Epoch 00014: val_loss did not improve from 0.57727
Epoch 15/200

Epoch 00015

In [290]:
from sklearn.metrics import confusion_matrix, precision_score, \
f1_score, cohen_kappa_score, recall_score

In [291]:

confusion = confusion_matrix(np.argmax(y_pred, axis = 1), np.argmax(y_test, axis = 1))
print(confusion)

[[575   0   1   1   0   0   1   0   0   2   2   0   0   0   0   0   0   0]
 [  1 202   2   0   0   0   0  31   3  58   0   0   5   0   0   0   0   1]
 [  3   6 493   0   0   0   0  21   0   3   0   0 141  11   0 115   0   0]
 [  3   0   1 366   0   0  99   0   0   0   0   9   0   4   0   0   0   0]
 [  0   1   0   0  63   9   0   0   1   3   6   0   0   0   0   0   0   0]
 [  0   0   0   0   1  31   0   0   0   1   1   0   0   0   4   0   0   0]
 [  0   0   0  38   0   0 664   0   0   0   0   0   0  51   0  11   0   0]
 [  0  37   4   0   1   0   0 240   0   3   1   0  29   0   0   0   1   0]
 [  2   0   0   0   0   0   0   0   3   1   0   0   0   0   0   0   0   2]
 [  0  18   0   0   2   0   0   2   0  77  20   0   2   0   0   0   0   0]
 [  0   1   0   0  10   1   0   1   0  14  91   0   1   0   1   0   0   0]
 [  1   0   0  34   0   0   0   0   0   0   0  60   0   0   0   0   0   0]
 [  1   6  16   0   0   0   0  36   0   1   0   0 198   0   0   3   0   0]
 [  1   0  11   0   0   0

In [292]:
compute_metrics_multiclass(np.argmax(y_pred, axis = 1), np.argmax(y_test, axis = 1))

[0.766928011404134, 0.766928011404134, 0.766928011404134, 0.7398246040107403]

In [293]:
import imblearn
from collections import Counter

In [294]:
counter = Counter(y)
for k,v in counter.items():
	per = v / len(y) * 100
	print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=0, n=2796 (9.966%)
Class=17, n=27 (0.096%)
Class=8, n=78 (0.278%)
Class=16, n=246 (0.877%)
Class=14, n=81 (0.289%)
Class=5, n=198 (0.706%)
Class=4, n=471 (1.679%)
Class=10, n=592 (2.110%)
Class=9, n=683 (2.434%)
Class=1, n=1433 (5.108%)
Class=7, n=1712 (6.102%)
Class=12, n=1985 (7.075%)
Class=2, n=2854 (10.173%)
Class=15, n=3597 (12.821%)
Class=13, n=4194 (14.949%)
Class=6, n=4553 (16.228%)
Class=3, n=2166 (7.720%)
Class=11, n=390 (1.390%)


In [276]:
X_train.shape, np.argmax(y_train, axis=1).shape

((22444, 6), (22444,))

In [277]:
X_train_smote.shape, y_train_smote.shape

((65563, 6), (65563, 18))

In [278]:
X_train.shape, X_train_smote.shape, y_train.shape, y_train_smote.shape, X_test.shape, y_test.shape

((22444, 6), (65563, 6), (22444, 18), (65563, 18), (5612, 6), (5612, 18))

In [299]:
np.random.seed(seed)
model = make_my_model_multi([100, 100, 100, 100], 6, 18, activation_='relu' )
model.summary()

Model: "sequential_42"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_239 (Dense)            (None, 100)               700       
_________________________________________________________________
dense_240 (Dense)            (None, 100)               10100     
_________________________________________________________________
dense_241 (Dense)            (None, 100)               10100     
_________________________________________________________________
dense_242 (Dense)            (None, 100)               10100     
_________________________________________________________________
dense_243 (Dense)            (None, 18)                1818      
Total params: 32,818
Trainable params: 32,818
Non-trainable params: 0
_________________________________________________________________


In [300]:
#tf.random.set_seed(seed)
y_pred = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 200, verbose=1)

Epoch 1/200

Epoch 00001: val_loss did not improve from 0.56067
Epoch 2/200

Epoch 00002: val_loss did not improve from 0.56067
Epoch 3/200

Epoch 00003: val_loss did not improve from 0.56067
Epoch 4/200

Epoch 00004: val_loss did not improve from 0.56067
Epoch 5/200

Epoch 00005: val_loss did not improve from 0.56067
Epoch 6/200

Epoch 00006: val_loss did not improve from 0.56067
Epoch 7/200

Epoch 00007: val_loss did not improve from 0.56067
Epoch 8/200

Epoch 00008: val_loss did not improve from 0.56067
Epoch 9/200

Epoch 00009: val_loss did not improve from 0.56067
Epoch 10/200

Epoch 00010: val_loss did not improve from 0.56067
Epoch 11/200

Epoch 00011: val_loss did not improve from 0.56067
Epoch 12/200

Epoch 00012: val_loss did not improve from 0.56067
Epoch 13/200

Epoch 00013: val_loss did not improve from 0.56067
Epoch 14/200

Epoch 00014: val_loss did not improve from 0.56067
Epoch 15/200

Epoch 00015: val_loss did not improve from 0.56067
Epoch 16/200

Epoch 00016: val_los

In [281]:
confusion = confusion_matrix(np.argmax(y_pred, axis = 1), np.argmax(y_test, axis = 1))
print(confusion)

[[581   0   1   0   0   1   2   0   0   1   1   0   0   1   0   0   0   0]
 [  0 165   0   0   0   1   0  19   1  40   3   0   1   0   0   0   0   0]
 [  0   8 321   0   0   0   0  20   0   0   0   0  65   3   0  38   0   0]
 [  1   0   0 392   0   0  88   0   0   0   0  18   0   1   0   0   0   0]
 [  0   2   0   0  63   8   0   0   1   1  11   0   0   0   2   0   0   0]
 [  0   0   0   0   1  32   0   0   0   0   0   0   0   0   4   0   0   0]
 [  1   0   1  33   0   0 734   0   0   0   0   0   0  92   0  13   0   0]
 [  1  50  10   0   1   0   0 191   0   8   1   0  29   0   0   0   0   0]
 [  1   0   0   0   0   0   0   0   3   1   0   0   0   0   0   0   0   0]
 [  0  27   0   0   3   0   0   1   0  95  21   0   2   0   0   0   0   1]
 [  1   1   0   0  10   0   0   3   0  14  83   0   0   0   0   0   0   0]
 [  0   0   0  14   0   0   0   0   0   0   0  51   0   0   0   0   0   0]
 [  0  16  54   0   1   0   0  94   0   3   1   0 265   2   0   5   0   0]
 [  1   0  21   0   0   0

In [250]:
compute_metrics_multiclass(np.argmax(y_pred, axis = 1), np.argmax(y_test, axis = 1))

[0.7972202423378475,
 0.7972202423378475,
 0.7972202423378475,
 0.7730948505233155]

In [251]:
np.argmax(y_pred_smote, axis=1)

array([ 0,  9,  7, ...,  6, 12,  7])

In [252]:
y_pred_smote.shape, y_test.shape

((5612, 18), (5612, 18))

In [182]:
#model.save("de_momento_mejor.h5")

# SMOTE NO VA BIEN

# Seguimos probando con con dropout

In [253]:
from keras.layers import Dropout

def make_my_model_multi_dropout( units_per_layer, input_s, output_s, activation_='relu', dropout_r=0.2):
    model = Sequential()
    depth = len(units_per_layer)
    model.add(Dense(units_per_layer[0], activation=activation_, input_shape=(input_s,)))
    for i in range(1, depth):
        model.add(Dropout(dropout_r))
        model.add(Dense(units_per_layer[i], activation=activation_))
    model.add(Dense(output_s, activation = 'softmax'))   
    
    return model

In [254]:
seed = 1
np.random.seed(seed)
model = make_my_model_multi_dropout([100, 100, 100, 100, 100, 100, 100], 6, 18, activation_='relu', dropout_r=0.2)
model.summary()
tf.random.set_seed(
    seed
)

Model: "sequential_35"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_201 (Dense)            (None, 100)               700       
_________________________________________________________________
dropout_63 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_202 (Dense)            (None, 100)               10100     
_________________________________________________________________
dropout_64 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_203 (Dense)            (None, 100)               10100     
_________________________________________________________________
dropout_65 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_204 (Dense)            (None, 100)             

In [255]:
y_pred_dropout = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 100, verbose=1)

Epoch 1/100

Epoch 00001: val_loss did not improve from 0.53002
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.53002
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.53002
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.53002
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.53002
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.53002
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.53002
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.53002
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.53002
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.53002
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.53002
Epoch 12/100

Epoch 00012: val_loss did not improve from 0.53002
Epoch 13/100

Epoch 00013: val_loss did not improve from 0.53002
Epoch 14/100

Epoch 00014: val_loss did not improve from 0.53002
Epoch 15/100

Epoch 00015: val_loss did not improve from 0.53002
Epoch 16/100

Epoch 00016: val_los

In [256]:
compute_metrics_multiclass(np.argmax(y_pred_dropout, axis = 1), np.argmax(y_test, axis = 1))

[0.7026015680684248,
 0.7026015680684248,
 0.7026015680684248,
 0.6672738422513143]

In [257]:
confusion = confusion_matrix(np.argmax(y_pred_dropout, axis = 1), np.argmax(y_test, axis = 1))
print(confusion)

[[587   0   5   1   3   1   3   4   0   4   2   0   2   3   1   2   1   0]
 [  0 176   2   0   0   0   0  31   3  39   2   0   4   0   0   0   0   1]
 [  0   5 333   0   0   0   0  22   0   1   0   0 125   3   0  48   0   0]
 [  0   0   0 322   0   0  84   0   0   0   0  20   0   1   0   0   0   0]
 [  1   4   1   0  53  21   0   1   1   2   6   0   0   0   4   0   2   1]
 [  0   0   0   0   1  18   0   0   0   0   1   0   0   0   4   0   0   0]
 [  0   0   0  66   0   0 713   0   0   0   0   0   0 128   0   5   0   0]
 [  0  62   3   0   0   0   0 185   0  11   0   0  32   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   8   1   0   4   0   0   6   0  83  24   0   4   0   0   0   0   0]
 [  0   4   0   0  19   3   0   1   0  22  86   0   0   0   0   0   0   0]
 [  0   0   0  50   0   0   0   0   0   0   0  49   0   0   0   0   0   0]
 [  0  10  47   0   0   0   1  81   0   1   0   0 201   0   0   4   0   0]
 [  0   0   6   0   0   0

In [258]:
y_pred_smote_dropout

array([[2.8154516e-01, 6.0741701e-03, 8.9084560e-20, ..., 2.3139848e-27,
        2.2271243e-24, 7.2761255e-17],
       [7.8917797e-15, 9.4136043e-04, 3.3395847e-25, ..., 2.2663237e-32,
        7.6817718e-33, 0.0000000e+00],
       [2.3555582e-05, 4.7595045e-04, 2.0985033e-04, ..., 2.5210764e-07,
        3.3643008e-37, 5.2167481e-33],
       ...,
       [4.4545148e-36, 3.0350266e-22, 1.1636992e-07, ..., 3.6213912e-06,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 2.4361512e-05, 1.2092740e-01, ..., 1.7107483e-08,
        6.8622674e-30, 0.0000000e+00],
       [0.0000000e+00, 8.7710114e-06, 4.4293178e-04, ..., 3.3236392e-07,
        3.3713820e-16, 0.0000000e+00]], dtype=float32)

About

If you use imbalanced-learn in a scientific publication, we would appreciate citations to the following paper:

@article{JMLR:v18:16-365,
author  = {Guillaume  Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas},
title   = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning},
journal = {Journal of Machine Learning Research},
year    = {2017},
volume  = {18},
number  = {17},
pages   = {1-5},
url     = {http://jmlr.org/papers/v18/16-365}
}

Most classification algorithms will only perform optimally when the number of samples of each class is roughly the same. Highly skewed datasets, where the minority is heavily outnumbered by one or more classes, have proven to be a challenge while at the same time becoming more and more common.

One way of addressing this issue is by re-sampling the dataset as to offset this imbalance with the hope of arriving at a more robust and fair decision boundary than you would otherwise.

In [308]:
import tensorflow as tf
tf.version.VERSION

'2.4.1'