Repetimos los mismos pasos que en el apartado anterior la diferencia de que codificamos los datos de las variables predictoras usando _dummy variables_ o codificación _one-hot_. Así, compararemos el rendimiento de los modelos.

In [6]:
import numpy as np
import pandas as pd
data = pd.read_csv("krkopt.data", header=None)
data.columns = ["wkc", "wkr", "wrc", "wrr", "bkc", "bkr", "opt rank" ]
X = data.iloc[:, 0:6]
y = data['opt rank']
X["wkc"]=X["wkc"].astype('category')
X["wrc"]=X["wrc"].astype('category')
X["bkc"]=X["bkc"].astype('category')
X["wkc"]=X["wkc"].cat.codes
X["wrc"]=X["wrc"].cat.codes
X["bkc"]=X["bkc"].cat.codes
y = y.astype('category')
y = y.cat.codes

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

cat_cols = list(X.columns)

Codificamos usando _get_dummies_ de pandas. Veremos que ahora tenemos 40 variables predictoras.

In [7]:
X = pd.get_dummies(X,columns=cat_cols)
X

Unnamed: 0,wkc_0,wkc_1,wkc_2,wkc_3,wkr_1,wkr_2,wkr_3,wkr_4,wrc_0,wrc_1,...,bkc_6,bkc_7,bkr_1,bkr_2,bkr_3,bkr_4,bkr_5,bkr_6,bkr_7,bkr_8
0,1,0,0,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28051,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
28052,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
28053,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
28054,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [15]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y, test_size=0.2,
                                                   random_state = 1)

from imblearn.over_sampling import SMOTE
oversample = SMOTE()


X_smote, y_smote = oversample.fit_resample(X, y)

X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote,
                                                   y_smote, test_size=0.2,
                                                   random_state = 1)



y_train = to_categorical(y_train)
y_test  = to_categorical(y_test)
y_train_smote = to_categorical(y_train_smote)
y_test_smote = to_categorical(y_test_smote)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

scaler = StandardScaler().fit(X_train_smote)
X_train_smote = scaler.transform(X_train_smote)
X_test_smote = scaler.transform(X_test_smote)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


checkpoint_filepath = '/tmp/checkpoint'

from sklearn.metrics import confusion_matrix, precision_score, \
f1_score, cohen_kappa_score, recall_score

### Pruebas
Todos los casos son los mismos que en los datos con la anterior codificación. Guardaremos en archivos todos los objetos de cada conjunto de experimentos.

#### Datos  _one_hot_

In [48]:
results = []
seed = 1

In [49]:
size_config = [50, 100, 150, 200, 250]
for size in size_config:
    layer_config = [[size], [size]*2, [size]*3, [size]*4, [size]*5, [size]*6]
    for layers in layer_config:
        np.random.seed(seed)
        tf.random.set_seed(seed)
        print(layers)
        model = make_my_model_multi(layers, 40, 18, activation_='relu' )
        preds = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 300, verbose=0)
        metrics = compute_metrics_multiclass(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
        confusion = confusion_matrix(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
        aux = { "layer config" : layers,
               #"Model": model,
               "Predictions" : preds,
               "Metrics" : metrics,
               "Confusion" : confusion

        }
        print(metrics)
        results.append(aux)
    

[50]
[0.7132929436920884, 0.7132929436920884, 0.7132929436920883, 0.6795940399725571]
[50, 50]
Epoch 00118: early stopping
[0.7359230220955096, 0.7359230220955096, 0.7359230220955095, 0.7051213677686434]
[50, 50, 50]
Epoch 00075: early stopping
[0.7464362081254454, 0.7464362081254454, 0.7464362081254454, 0.7169690264820239]
[50, 50, 50, 50]
Epoch 00059: early stopping
[0.7293300071275838, 0.7293300071275838, 0.7293300071275838, 0.6976471559645523]
[50, 50, 50, 50, 50]
Epoch 00053: early stopping
[0.7323592302209551, 0.7323592302209551, 0.7323592302209551, 0.7011602714040706]
[50, 50, 50, 50, 50, 50]
Epoch 00054: early stopping
[0.7245188880969351, 0.7245188880969351, 0.7245188880969351, 0.6923109986216298]
[100]
Epoch 00176: early stopping
[0.7405559515324305, 0.7405559515324305, 0.7405559515324305, 0.7101055066873646]
[100, 100]
Epoch 00053: early stopping
[0.7624732715609408, 0.7624732715609408, 0.7624732715609408, 0.7348246606533331]
[100, 100, 100]
Epoch 00042: early stopping
[0.78

In [50]:
import joblib
 
joblib.dump(results, 'results_1_onehot_joblib')

['results_1_onehot_joblib']

### Datos _one_hot_ con _SMOTE_

In [51]:
results_smote = []
seed = 1

In [52]:
size_config = [50, 100, 150, 200, 250]
for size in size_config:
    layer_config = [[size], [size]*2, [size]*3, [size]*4, [size]*5, [size]*6]
    for layers in layer_config:
        np.random.seed(seed)
        tf.random.set_seed(seed)
        print(layers)
        model = make_my_model_multi(layers, 40, 18, activation_='relu' )
        preds = compile_fit_multiclass(model, X_train_smote, X_test, y_train_smote, 256, 300, verbose=0)
        metrics = compute_metrics_multiclass(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
        confusion = confusion_matrix(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
        aux = { "layer config" : layers,
               #"Model": model,
               "Predictions" : preds,
               "Metrics" : metrics,
               "Confusion" : confusion

        }
        print(metrics)
        results_smote.append(aux)
    

[50]
[0.09853884533143265, 0.09853884533143265, 0.09853884533143265, 0.05990480607973814]
[50, 50]
Epoch 00270: early stopping
[0.06931575196008553, 0.06931575196008553, 0.06931575196008553, 0.037006904026981036]
[50, 50, 50]
Epoch 00155: early stopping
[0.06967213114754098, 0.06967213114754098, 0.06967213114754098, 0.03441175321042145]
[50, 50, 50, 50]
Epoch 00153: early stopping
[0.07448325017818959, 0.07448325017818959, 0.07448325017818959, 0.04118489587487906]
[50, 50, 50, 50, 50]
Epoch 00120: early stopping
[0.06789023521026372, 0.06789023521026372, 0.06789023521026372, 0.028956419894087593]
[50, 50, 50, 50, 50, 50]
Epoch 00200: early stopping
[0.05434782608695652, 0.05434782608695652, 0.05434782608695652, 0.018471319233337335]
[100]
[0.11101211689237349, 0.11101211689237349, 0.11101211689237349, 0.07134215115338971]
[100, 100]
Epoch 00285: early stopping
[0.1097647897362794, 0.1097647897362794, 0.1097647897362794, 0.07580671664360639]
[100, 100, 100]
Epoch 00226: early stopping
[

In [53]:
joblib.dump(results_smote, 'results_smote_onehot_joblib')

['results_smote_onehot_joblib']

### Datos _one_hot_ con _dropout_

In [54]:
results_dropout = []
seed = 1

In [55]:
size_config = [50, 100, 150, 200, 250]
dropout_rate = ["0.1", "0.2", "0.3"]

for size in size_config:
    for size_d in (dropout_rate):
        layer_config_dense = [[size], [size]*2, [size]*3, [size]*4, [size]*5, [size]*6]
        layer_config_dropout = [[size_d], [size_d]*2, [size_d]*3, [size_d]*4, [size_d]*5, [size_d]*6]
        for layers_dense, layers_dropout in zip(layer_config_dense, layer_config_dropout):
            final_design = [None]*(len(layers_dense)+len(layers_dropout))
            final_design[::2] = layers_dense
            final_design[1::2] = layers_dropout
            np.random.seed(seed)
            tf.random.set_seed(seed)
            print(final_design)
            model = make_my_model_multi_dropout(final_design, 40, 18, activation_='relu' )
            preds = compile_fit_multiclass(model, X_train, X_test, y_train, 256, 300, verbose=0)
            metrics = compute_metrics_multiclass(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
            confusion = confusion_matrix(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
            aux = { "layer config" : final_design,
                   #"Model": model,
                   "Predictions" : preds,
                   "Metrics" : metrics,
                   "Confusion" : confusion

            }
            print(metrics)
            results_dropout.append(aux)
    

[50, '0.1']
[0.6911974340698503, 0.6911974340698503, 0.6911974340698503, 0.6544424729231852]
[50, '0.1', 50, '0.1']
Epoch 00249: early stopping
[0.7549893086243763, 0.7549893086243763, 0.7549893086243763, 0.7260805176671765]
[50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00244: early stopping
[0.7685317177476836, 0.7685317177476836, 0.7685317177476836, 0.7412666342935919]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00204: early stopping
[0.7628296507483963, 0.7628296507483963, 0.7628296507483963, 0.7349489199206065]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00204: early stopping
[0.755167498218104, 0.755167498218104, 0.755167498218104, 0.726566737550184]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00138: early stopping
[0.7446543121881682, 0.7446543121881682, 0.7446543121881682, 0.7145851936293055]
[50, '0.2']
[0.6797933000712758, 0.6797933000712758, 0.6797933000712758, 0.6415147506179972]
[50, '0.2', 50, '0.2']
Epoch 00194: early stoppi

In [56]:
joblib.dump(results_dropout, 'results_dropout_one_hot')

['results_dropout_one_hot']

#### Datos _one_hot_ _SMOTE dropout_

In [28]:
results_dropout_smote = []
seed = 1

In [29]:
size_config = [50, 100, 150, 200, 250]
dropout_rate = ["0.1", "0.2", "0.3"]

for size in size_config:
    for size_d in (dropout_rate):
        layer_config_dense = [[size], [size]*2, [size]*3, [size]*4, [size]*5, [size]*6]
        layer_config_dropout = [[size_d], [size_d]*2, [size_d]*3, [size_d]*4, [size_d]*5, [size_d]*6]
        for layers_dense, layers_dropout in zip(layer_config_dense, layer_config_dropout):
            final_design = [None]*(len(layers_dense)+len(layers_dropout))
            final_design[::2] = layers_dense
            final_design[1::2] = layers_dropout
            np.random.seed(seed)
            tf.random.set_seed(seed)
            print(final_design)
            model = make_my_model_multi_dropout(final_design, 40, 18, activation_='relu' )
            preds = compile_fit_multiclass(model, X_train_smote, X_test, y_train_smote, 256, 300, verbose=0)
            metrics = compute_metrics_multiclass(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
            confusion = confusion_matrix(np.argmax(preds, axis = 1), np.argmax(y_test, axis = 1))
            aux = { "layer config" : final_design,
                   #"Model": model,
                   "Predictions" : preds,
                   "Metrics" : metrics,
                   "Confusion" : confusion

            }
            print(metrics)
            results_dropout_smote.append(aux)
    

[50, '0.1']
[0.17925873129009265, 0.17925873129009265, 0.17925873129009265, 0.12232832383335368]
[50, '0.1', 50, '0.1']
[0.1261582323592302, 0.1261582323592302, 0.1261582323592302, 0.08089709192596017]
[50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00248: early stopping
[0.0650392017106201, 0.0650392017106201, 0.0650392017106201, 0.03040751666739172]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00189: early stopping
[0.042230933713471135, 0.042230933713471135, 0.042230933713471135, 0.01270048583862049]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00194: early stopping
[0.038132573057733425, 0.038132573057733425, 0.038132573057733425, 0.005452686722108413]
[50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1', 50, '0.1']
Epoch 00232: early stopping
[0.0345687811831789, 0.0345687811831789, 0.0345687811831789, 0.008312049551169154]
[50, '0.2']
[0.1626870990734141, 0.1626870990734141, 0.1626870990734141, 0.10466572690499598]
[50, '0.2', 50, '0.2']
Epoch 00255: early stoppi

In [30]:
import joblib
joblib.dump(results_dropout_smote, 'results_dropout_smote_one_hot')

['results_dropout_smote_one_hot']