In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw table; header=None because the file is read without a header row.
data = pd.read_csv("krkopt.data", header=None)

In [3]:
# Name the seven columns: six feature columns followed by the target, "opt rank".
# NOTE(review): the names look like white-king / white-rook / black-king column
# and row coordinates — confirm against the dataset description.
data.columns = ["wkc", "wkr", "wrc", "wrr", "bkc", "bkr", "opt rank" ]

In [4]:
data

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr,opt rank
0,a,1,b,3,c,2,draw
1,a,1,c,1,c,2,draw
2,a,1,c,1,d,1,draw
3,a,1,c,1,d,2,draw
4,a,1,c,2,c,1,draw
...,...,...,...,...,...,...,...
28051,b,1,g,7,e,5,sixteen
28052,b,1,g,7,e,6,sixteen
28053,b,1,g,7,e,7,sixteen
28054,b,1,g,7,f,5,sixteen


In [5]:
# Features are the first six columns; the target is the "opt rank" label.
# .copy() gives X its own data, so the column re-assignments done in the
# following cells modify X only and do not trigger pandas'
# SettingWithCopyWarning (X would otherwise be a slice of `data`).
X = data.iloc[:, 0:6].copy()
y = data['opt rank']

In [6]:
X

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr
0,a,1,b,3,c,2
1,a,1,c,1,c,2
2,a,1,c,1,d,1
3,a,1,c,1,d,2
4,a,1,c,2,c,1
...,...,...,...,...,...,...
28051,b,1,g,7,e,5
28052,b,1,g,7,e,6
28053,b,1,g,7,e,7
28054,b,1,g,7,f,5


In [7]:
# Convert the three letter-valued columns to pandas' categorical dtype so that
# they can be replaced by integer category codes afterwards.
X["wkc"]=X["wkc"].astype('category')
X["wrc"]=X["wrc"].astype('category')
X["bkc"]=X["bkc"].astype('category')

In [8]:
# Replace each letter column by its integer category code.
# The cast to 'category' is repeated here so the cell is safe to re-run: on a
# second run the columns already hold integer codes, and `.cat` raises
# AttributeError on a non-categorical column. Re-encoding the codes 0..k-1
# yields the same codes, so the result is unchanged.
X["wkc"]=X["wkc"].astype('category').cat.codes
X["wrc"]=X["wrc"].astype('category').cat.codes
X["bkc"]=X["bkc"].astype('category').cat.codes


In [9]:
X

Unnamed: 0,wkc,wkr,wrc,wrr,bkc,bkr
0,0,1,1,3,2,2
1,0,1,2,1,2,2
2,0,1,2,1,3,1
3,0,1,2,1,3,2
4,0,1,2,2,2,1
...,...,...,...,...,...,...
28051,1,1,6,7,4,5
28052,1,1,6,7,4,6
28053,1,1,6,7,4,7
28054,1,1,6,7,5,5


In [10]:
# Encode the target labels as integer category codes (one code per distinct label).
y = y.astype('category').cat.codes

In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [12]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.utils import to_categorical


In [13]:

# Hold out 20% of the original data for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1)

from imblearn.over_sampling import SMOTE

# Oversample ONLY the training split. Resampling the full dataset before
# splitting lets synthetic points interpolated from test rows leak into the
# training set, and makes the evaluation run on synthetic instead of real rows.
# random_state makes the synthetic samples repeatable.
oversample = SMOTE(random_state=1)
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)

# The SMOTE experiment is evaluated on the same untouched hold-out rows.
X_test_smote, y_test_smote = X_test.copy(), y_test.copy()

# One-hot encode the integer targets. The width is taken from the full label
# set so that every split gets the same number of columns even if a rare class
# happens to be missing from it.
n_classes = int(y.max()) + 1
y_train = to_categorical(y_train, num_classes=n_classes)
y_test  = to_categorical(y_test, num_classes=n_classes)
y_train_smote = to_categorical(y_train_smote, num_classes=n_classes)
y_test_smote = to_categorical(y_test_smote, num_classes=n_classes)

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardise the plain split: the scaler is fitted on the training rows only
# and the same transformation is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Same procedure for the SMOTE split. `scaler` is rebound here, so after this
# cell it refers to the scaler fitted on X_train_smote.
scaler = StandardScaler()
X_train_smote = scaler.fit_transform(X_train_smote)
X_test_smote = scaler.transform(X_test_smote)

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# Path handed to ModelCheckpoint and to load_weights by the training helper.
# NOTE(review): fixed location under /tmp that every training run in this
# notebook shares, so each run overwrites the previous checkpoint.
checkpoint_filepath = '/tmp/checkpoint'


In [44]:
from sklearn.metrics import confusion_matrix, precision_score, \
f1_score, cohen_kappa_score, recall_score

In [37]:
def make_my_model_multi( units_per_layer, input_s, output_s, activation_='relu'):
    """Build an (uncompiled) fully connected softmax classifier.

    Args:
        units_per_layer: sequence with the number of units of each hidden
            Dense layer, in order; must contain at least one entry.
        input_s: number of input features.
        output_s: number of units of the final softmax layer.
        activation_: activation used by every hidden layer.

    Returns:
        The assembled Sequential model.
    """
    net = Sequential()
    # Only the first layer declares the input shape.
    net.add(Dense(units_per_layer[0], activation=activation_, input_shape=(input_s,)))
    for width in units_per_layer[1:]:
        net.add(Dense(width, activation=activation_))
    net.add(Dense(output_s, activation='softmax'))
    return net




# Here categorical_crossentropy is used as the loss function, and the batch
# size is a parameter so that callers can choose it.
# (The original Spanish comment was cut off mid-sentence at this point.)
def compile_fit_multiclass(modelo, X_train, X_test, y_train, batch, epochs, verbose=0):
    """Compile and train `modelo`, then return its predictions for `X_test`.

    The model is compiled with categorical cross-entropy, the Adam optimizer
    and accuracy as metric. Keras holds out 20% of the training data
    (`validation_split=0.2`); EarlyStopping monitors `val_loss` with a patience
    of 10, and ModelCheckpoint writes the weights with the lowest `val_loss`
    to the module-level `checkpoint_filepath`. Those weights are loaded back
    into `modelo` before predicting.

    Args:
        modelo: Keras model to compile and fit (modified in place).
        X_train: training features.
        X_test: features to predict after training.
        y_train: training targets (one-hot, as required by the loss).
        batch: batch size passed to `fit`.
        epochs: maximum number of epochs.
        verbose: verbosity passed to `fit`.

    Returns:
        The result of `modelo.predict(X_test)`.
    """
    modelo.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=True)

    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                        save_weights_only=True,
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        verbose=False)

    modelo.fit(X_train, y_train, epochs=epochs, batch_size=batch, verbose=verbose,
               validation_split=0.2, callbacks=[early_stopping, model_checkpoint])
    # Restore the best checkpoint into the model that was passed in. The
    # previous code used the global name `model` here, which only worked when
    # a global `model` happened to be the very same object.
    modelo.load_weights(checkpoint_filepath)
    predictions = modelo.predict(X_test)
    return predictions

def compute_metrics_multiclass(y_test, y_pred):
    """Score predictions against reference labels.

    `y_pred` is rounded with `np.round` before scoring.

    Args:
        y_test: reference labels.
        y_pred: predicted labels.

    Returns:
        A list `[precision, recall, f1, kappa]`, where the first three are
        micro-averaged and the last is Cohen's kappa.
    """
    rounded = np.round(y_pred)
    micro_scorers = (precision_score, recall_score, f1_score)
    scores = [scorer(y_test, rounded, average="micro") for scorer in micro_scorers]
    scores.append(cohen_kappa_score(y_test, rounded))
    return scores

In [21]:
y.shape

(28056,)

In [22]:
# Build a network with four hidden layers of 100 units (6 inputs, 18 outputs)
# and train it; the returned predictions are only displayed, not stored.
model = make_my_model_multi([100, 100, 100, 100], 6, 18, activation_='relu' )
compile_fit_multiclass(model, X_train, X_test, y_train, 256, 200, verbose=1)

Epoch 1/200

Epoch 00001: val_loss improved from inf to 1.88759, saving model to /tmp/checkpoint
Epoch 2/200

Epoch 00002: val_loss improved from 1.88759 to 1.57163, saving model to /tmp/checkpoint
Epoch 3/200

Epoch 00003: val_loss improved from 1.57163 to 1.41950, saving model to /tmp/checkpoint
Epoch 4/200

Epoch 00004: val_loss improved from 1.41950 to 1.30558, saving model to /tmp/checkpoint
Epoch 5/200

Epoch 00005: val_loss improved from 1.30558 to 1.23875, saving model to /tmp/checkpoint
Epoch 6/200

Epoch 00006: val_loss improved from 1.23875 to 1.20205, saving model to /tmp/checkpoint
Epoch 7/200

Epoch 00007: val_loss improved from 1.20205 to 1.16663, saving model to /tmp/checkpoint
Epoch 8/200

Epoch 00008: val_loss improved from 1.16663 to 1.13999, saving model to /tmp/checkpoint
Epoch 9/200

Epoch 00009: val_loss improved from 1.13999 to 1.12300, saving model to /tmp/checkpoint
Epoch 10/200

Epoch 00010: val_loss improved from 1.12300 to 1.09925, saving model to /tmp/chec

array([[9.9997938e-01, 3.8969483e-07, 1.1432915e-06, ..., 9.3834991e-08,
        7.5836556e-23, 9.8305384e-16],
       [9.9865711e-01, 1.2328293e-03, 1.7691092e-10, ..., 8.6936827e-15,
        1.1686669e-15, 4.1706679e-16],
       [1.0000000e+00, 3.9300362e-11, 2.9170282e-09, ..., 2.4407842e-10,
        4.1138622e-33, 2.7843786e-23],
       ...,
       [1.2893009e-23, 9.0897445e-27, 2.5235700e-10, ..., 5.2984404e-07,
        1.2759539e-37, 0.0000000e+00],
       [1.0234097e-15, 9.8983740e-08, 3.5829630e-01, ..., 6.3416368e-01,
        2.4048834e-17, 2.1154063e-18],
       [2.0043516e-31, 6.8536350e-15, 6.0135624e-03, ..., 9.0714496e-01,
        4.0428989e-23, 8.0205815e-27]], dtype=float32)

In [23]:
# Discard the existing model, build a fresh one with the same architecture and
# train it, this time keeping the test-set predictions in y_pred.
# NOTE: `del model` requires that a `model` already exists in the kernel.
del model
model = make_my_model_multi([100, 100, 100, 100], 6, 18, activation_='relu' )
model.summary()
y_pred = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 200, verbose=1)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 100)               700       
_________________________________________________________________
dense_16 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_17 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_18 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_19 (Dense)             (None, 18)                1818      
Total params: 32,818
Trainable params: 32,818
Non-trainable params: 0
_________________________________________________________________
Epoch 1/200

Epoch 00001: val_loss improved from inf to 1.89837, saving model to /tmp/checkpoint
Epoch 2/200

Epoch 

In [None]:
# TEST: run compile + fit again on the existing `model` and overwrite y_pred.
y_pred = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 200, verbose=1)

In [None]:

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# produced the transposed matrix.
confusion = confusion_matrix(np.argmax(y_test, axis = 1), np.argmax(y_pred, axis = 1))
print(confusion)

In [None]:
# Pass the arguments in the order the helper declares them: (y_test, y_pred).
compute_metrics_multiclass(np.argmax(y_test, axis = 1), np.argmax(y_pred, axis = 1))

In [None]:
import imblearn
from collections import Counter

In [None]:
# Print how many samples each encoded class has and its share of the dataset.
counter = Counter(y)
for k,v in counter.items():
	# Percentage of all samples that belong to class k.
	per = v / len(y) * 100
	print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

In [None]:
X_train.shape, np.argmax(y_train, axis=1).shape

In [None]:
X_train_smote.shape, y_train_smote.shape

In [None]:
X_train.shape, X_train_smote.shape, y_train.shape, y_train_smote.shape, X_test.shape, y_test.shape

In [None]:
# `seed` is not assigned anywhere before this cell, so define it here to keep
# the cell runnable on a fresh kernel. NumPy and TensorFlow are both seeded
# before the model is built so that weight initialisation is repeatable.
seed = 1
np.random.seed(seed)
tf.random.set_seed(seed)
model = make_my_model_multi([100, 100, 100, 100], 6, 18, activation_='relu' )
model.summary()

In [None]:
#tf.random.set_seed(seed)
# Show the architecture again, then train the current `model` and keep its
# test-set predictions in y_pred.
model.summary()
y_pred = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 200, verbose=1)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# produced the transposed matrix.
confusion = confusion_matrix(np.argmax(y_test, axis = 1), np.argmax(y_pred, axis = 1))
print(confusion)

In [None]:
# Pass the arguments in the order the helper declares them: (y_test, y_pred).
compute_metrics_multiclass(np.argmax(y_test, axis = 1), np.argmax(y_pred, axis = 1))

In [None]:
np.argmax(y_pred, axis=1)

In [None]:
#model.save("de_momento_mejor.h5")

# SMOTE NO VA BIEN

# Seguimos probando con dropout

In [None]:
from tensorflow.keras.layers import Dropout

def make_my_model_multi_dropout( units_per_layer, input_s, output_s, activation_='relu', dropout_r=0.2):
    """Build an (uncompiled) fully connected softmax classifier with dropout.

    A Dropout layer is inserted before every hidden layer except the first;
    no Dropout is placed directly before the output layer.

    Args:
        units_per_layer: sequence with the number of units of each hidden
            Dense layer, in order; must contain at least one entry.
        input_s: number of input features.
        output_s: number of units of the final softmax layer.
        activation_: activation used by every hidden layer.
        dropout_r: rate passed to each Dropout layer.

    Returns:
        The assembled Sequential model.
    """
    net = Sequential()
    # Only the first layer declares the input shape.
    net.add(Dense(units_per_layer[0], activation=activation_, input_shape=(input_s,)))
    for width in units_per_layer[1:]:
        net.add(Dropout(dropout_r))
        net.add(Dense(width, activation=activation_))
    net.add(Dense(output_s, activation='softmax'))
    return net

In [None]:
# Seed NumPy and TensorFlow BEFORE building the model: the layers draw their
# initial weights when they are created, so seeding TensorFlow afterwards
# (as was done previously) leaves the initialisation unseeded.
seed = 1
np.random.seed(seed)
tf.random.set_seed(seed)
model = make_my_model_multi_dropout([100, 100, 100, 100, 100, 100, 100], 6, 18, activation_='relu', dropout_r=0.2)
model.summary()

In [None]:
y_pred_dropout = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 100, verbose=1)

In [None]:
# Pass the arguments in the order the helper declares them: (y_test, y_pred).
compute_metrics_multiclass(np.argmax(y_test, axis = 1), np.argmax(y_pred_dropout, axis = 1))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# produced the transposed matrix.
confusion = confusion_matrix(np.argmax(y_test, axis = 1), np.argmax(y_pred_dropout, axis = 1))
print(confusion)

In [None]:
# NOTE(review): `y_pred_smote_dropout` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel — confirm whether
# the cell that produced it was deleted, or whether `y_pred_dropout` was meant.
y_pred_smote_dropout

About

If you use imbalanced-learn in a scientific publication, we would appreciate citations to the following paper:

@article{JMLR:v18:16-365,
author  = {Guillaume  Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas},
title   = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning},
journal = {Journal of Machine Learning Research},
year    = {2017},
volume  = {18},
number  = {17},
pages   = {1-5},
url     = {http://jmlr.org/papers/v18/16-365}
}

Most classification algorithms will only perform optimally when the number of samples of each class is roughly the same. Highly skewed datasets, where the minority is heavily outnumbered by one or more classes, have proven to be a challenge while at the same time becoming more and more common.

One way of addressing this issue is by re-sampling the dataset so as to offset this imbalance, with the hope of arriving at a more robust and fair decision boundary than you would otherwise.

In [None]:
import tensorflow as tf
tf.version.VERSION

# Pruebas

In [47]:
# Accumulator for the per-architecture result records, and the seed used to
# reseed NumPy and TensorFlow before each training run of the grid search.
results = []
seed = 1

In [54]:
# Grid search over network width and depth: for every width in size_config,
# train networks with 1 to 6 hidden layers of that width and record the
# predictions, metrics and confusion matrix of each one in `results`.
size_config = [50, 100, 150, 200, 250]
for size in size_config:
    layer_config = [[size] * depth for depth in range(1, 7)]
    for layers in layer_config:
        # Release the Keras state of the previous model so memory does not
        # accumulate across runs, then reseed so each run starts identically.
        tf.keras.backend.clear_session()
        np.random.seed(seed)
        tf.random.set_seed(seed)
        print(layers)
        model = make_my_model_multi(layers, 6, 18, activation_='relu' )
        preds = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 300, verbose=0)
        # Collapse one-hot targets / class probabilities to class indices once.
        y_true_labels = np.argmax(y_test, axis = 1)
        y_pred_labels = np.argmax(preds, axis = 1)
        # Both helpers take the reference labels first and the predictions
        # second; the arguments were previously swapped, which transposed the
        # confusion matrix.
        metrics = compute_metrics_multiclass(y_true_labels, y_pred_labels)
        confusion = confusion_matrix(y_true_labels, y_pred_labels)
        aux = { "layer config" : layers,
               #"Model": model,
               "Predictions" : preds,
               "Metrics" : metrics,
               "Confusion" : confusion

        }
        print(metrics)
        results.append(aux)
    

[50]
[0.5563079116179616, 0.5563079116179616, 0.5563079116179616, 0.5014778486804247]
[50, 50]
Epoch 00278: early stopping
[0.6776550249465432, 0.6776550249465432, 0.6776550249465432, 0.6394872591777784]
[50, 50, 50]
Epoch 00189: early stopping
[0.7004632929436921, 0.7004632929436921, 0.7004632929436921, 0.66527050745813]
[50, 50, 50, 50]
Epoch 00135: early stopping
[0.7033143264433357, 0.7033143264433357, 0.7033143264433357, 0.6683596780441357]
[50, 50, 50, 50, 50]
Epoch 00083: early stopping
[0.7140057020669993, 0.7140057020669993, 0.7140057020669993, 0.6802859773047576]
[50, 50, 50, 50, 50, 50]
Epoch 00101: early stopping
[0.7325374198146828, 0.7325374198146828, 0.7325374198146829, 0.7014741703016625]
[100]
[0.5841054882394868, 0.5841054882394868, 0.5841054882394868, 0.532926380750063]
[100, 100]
Epoch 00226: early stopping
[0.7145402708481825, 0.7145402708481825, 0.7145402708481825, 0.6810448690167336]
[100, 100, 100]
Epoch 00145: early stopping
[0.7808267997148967, 0.7808267997148

In [62]:
import pickle

In [65]:
import joblib
 
joblib.dump(results, 'results_1_joblib')

['results_1_joblib']

In [66]:
aux = joblib.load("results_1_joblib")