In [1]:
pip install scikeras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikeras
  Downloading scikeras-0.8.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.8.0


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import joblib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, make_scorer, r2_score, accuracy_score, log_loss, confusion_matrix
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
#from scikeras.wrappers import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import GlorotUniform, Zeros

Prüfe ob eine CPU für das Training verfügbar ist.

In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


Um die Ergebnisse reproduzierbar zu machen erstellen wir einen random_state und intialisieren die seeds für die RNG.

In [3]:
random_state = 1
tf.random.set_seed(random_state)
np.random.seed(random_state)

Lade die erstellten Trainingsdaten.

In [4]:
data = pd.read_csv('/content/drive/MyDrive/WR2 Brrr/Trainingsdaten_Proj3/training_dataset_ver5.csv', header = None)
print(data.shape)
data.head()

(38163, 4801)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4791,4792,4793,4794,4795,4796,4797,4798,4799,4800
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [5]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1:].values
X[:10,]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]])

Im folgenden wird der Datensatz auf das Intervall [0,1] normalisiert und ein Train-Test split durchgeführt. Die Normalisierung ist streng genommen nicht notwendig, da unsere Trainingssdaten bereits normalisiert ist.

In [6]:
normalizer = MinMaxScaler()
X = normalizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state)

Lege die Parameterverteilungen für den RandomSearch fest und erstelle das zugehörige dictionary. In der Variable metrics werden die gewünschten Metriken eingetragen, welche während des fittings evaluiert werden. Da diese aber nicht direkt im RandomSearch eingesehen werden können, werden sie vorerst nicht benötigt.

In [7]:
batch_size      = [8,16,32,64]
epochs          = [50,100,150]
learning_rate   = [0.002,0.01,0.1]
n_hidden_layers = [2,3,4,5]
layer_size      = [100,200,500,1000]
dropout_rate    = [0, 0.1, 0.2, 0.5]

parameters = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learning_rate' : learning_rate,
    'n_hidden_layers' : n_hidden_layers,
    'layer_size' : layer_size,
    'dropout_rate' : dropout_rate
    }

# metrics = [tf.keras.metrics.BinaryAccuracy(),
#            tf.keras.metrics.BinaryCrossentropy(),
#            tf.keras.metrics.AUC(),
#            tf.keras.metrics.Precision(),
#            tf.keras.metrics.Recall(),
#            tf.keras.metrics.TrueNegatives()
#            ]
metrics = None

Der KerasClassifier Wrapper nimmt eine build_fn als Argument, in welcher das Keras Modell erstellt wird. Erstelle in dieser die Netzwerk Architektur und lege die möglichen Hyperparameter fest.

In unserem Fall ist die Modell Architektur ein dichtes FFN welches folgendermaßen aufgebaut ist:
Input -> DenseLayer -> ActivationFN -> DropoutLayer -> DenseLayer -> ... -> DropoutLayer -> OutputLayer(1 Neuron) -> SigmoidFN

In [8]:
def build_model(learning_rate,n_hidden_layers,layer_size,dropout_rate):
    model = Sequential()
    
    for i in range(n_hidden_layers):
        model.add(Dense(units=layer_size,activation = 'relu'))
        model.add(tf.keras.layers.Dropout(dropout_rate, seed = random_state+i)) 
    # Output Layer
    model.add(Dense(1, activation = 'sigmoid'))
        
    model.compile(loss = 'binary_crossentropy',
                  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics = metrics)
    return model
    
model = KerasClassifier(build_model, verbose = 0)

  from ipykernel import kernelapp as app


Cross-Validation Strategy. Da die generierten Trainingsdaten recht umfangreich sind, eignet sich ein 4er-Split. Der CV wird zudem mit einem random state versehen.

In [9]:
cv = StratifiedKFold(n_splits = 4, shuffle = True, random_state = random_state)

Das Format in welchem der Output in tf.Keras generiert werden macht es notwendig, dass der scikit-learn Cross-Entropy scorer mit einer Toleranz versehen wird.

In [10]:
float32neg_log_loss = make_scorer(log_loss, eps = 1e-7)

## Random Search


---


Erstelle den RandomSearchCV. Interessante Auswertungen sind die Genauigkeit, der Cross-Entropy loss sowie der Recall. Erstelle zudem einen zusätzlichen random state für die Suche, um mehrere kürzere Durchlaufe starten zu können, ohne dabei andere Parameter zu beeinflussen.

In [11]:
randomStateSearch = 4
randomsearch = RandomizedSearchCV(estimator = model,
                                  param_distributions = parameters,
                                  n_iter = 15,
                                  scoring={'accuracy':'accuracy',
                                           'ce':float32neg_log_loss,
                                           #'conf_matrix': confusion_matrix_scorer,
                                           'recall': 'recall'},   
                                  refit = False,
                                  cv=cv,
                                  random_state = randomStateSearch,
                                  return_train_score=True,
                                  verbose = 4)

Führe die RandomSearch durch und speichere die Ergebnisse ab.

In [12]:
randomsearch.fit(X_train, y_train, verbose = 0)
result_df = pd.DataFrame.from_dict(randomsearch.cv_results_)
result_df.to_pickle('/content/drive/MyDrive/WR2 Brrr/Trainingsdaten_Proj3/randomsearch/randsearch_results_rs'+ str(randomStateSearch)+ '.pkl')
result_df.head()

Fitting 4 folds for each of 15 candidates, totalling 60 fits
[CV 1/4] END batch_size=16, dropout_rate=0.5, epochs=150, layer_size=1000, learning_rate=0.01, n_hidden_layers=4; accuracy: (train=0.836, test=0.836) ce: (train=2.636, test=2.637) recall: (train=0.000, test=0.000) total time=12.1min
[CV 2/4] END batch_size=16, dropout_rate=0.5, epochs=150, layer_size=1000, learning_rate=0.01, n_hidden_layers=4; accuracy: (train=0.836, test=0.836) ce: (train=2.636, test=2.637) recall: (train=0.000, test=0.000) total time=11.9min
[CV 3/4] END batch_size=16, dropout_rate=0.5, epochs=150, layer_size=1000, learning_rate=0.01, n_hidden_layers=4; accuracy: (train=0.836, test=0.836) ce: (train=2.637, test=2.636) recall: (train=0.000, test=0.000) total time=11.9min
[CV 4/4] END batch_size=16, dropout_rate=0.5, epochs=150, layer_size=1000, learning_rate=0.01, n_hidden_layers=4; accuracy: (train=0.836, test=0.836) ce: (train=2.637, test=2.636) recall: (train=0.000, test=0.000) total time=11.8min
[CV 1/4

KeyboardInterrupt: ignored

In [15]:
rs = [0,1,2,3,10,11,12]
names = ['randsearch_results_rs'+ str(rs_)+ '.pkl' for rs_ in rs]
dfs = []
for name in names:
  dfs.append(pd.read_pickle('/content/drive/MyDrive/WR2 Brrr/Trainingsdaten_Proj3/randomsearch/' + name))

results = pd.concat(dfs, axis = 'index', ignore_index = True).sort_values(by = ["mean_test_accuracy"])
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_hidden_layers,param_learning_rate,param_layer_size,param_epochs,param_dropout_rate,param_batch_size,...,split3_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,split3_train_recall,mean_train_recall,std_train_recall
33,619.682706,7.535112,0.589411,0.009955,4,0.1,200,150,0.5,16,...,1.0,0.25,0.433013,7,0.0,0.0,0.0,1.0,0.25,0.433013
29,303.706894,0.71604,0.572701,0.015192,3,0.1,500,150,0.5,32,...,1.0,0.25,0.433013,7,0.0,0.0,0.0,1.0,0.25,0.433013
19,490.330584,3.196641,0.522753,0.002357,5,0.1,200,150,0.0,16,...,0.0,0.25,0.433013,4,1.0,0.0,0.0,0.0,0.25,0.433013
12,327.589694,0.749676,0.504929,0.007579,3,0.1,200,100,0.5,16,...,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0
37,47.477273,0.113815,0.515726,0.005492,4,0.1,100,50,0.1,64,...,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0
36,107.003731,0.339967,0.517914,0.004168,5,0.1,500,100,0.5,64,...,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0
35,362.295192,2.014605,0.560357,0.032817,5,0.01,500,50,0.1,8,...,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0
30,358.676418,1.283413,0.540255,0.00757,2,0.1,100,50,0.5,8,...,0.0,0.0,0.0,9,0.0,0.0,0.0,0.0,0.0,0.0
27,59.868248,0.301121,0.584943,0.015693,5,0.01,500,50,0.1,64,...,0.0,0.0,0.0,9,0.0,0.0,0.0,0.0,0.0,0.0
24,638.202751,1.481174,0.502302,0.006337,3,0.01,500,100,0.2,8,...,0.0,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
results.to_csv('/content/drive/MyDrive/WR2 Brrr/Trainingsdaten_Proj3/randomsearch_results.csv')