In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# set the graphs to show in the jupyter notebook
%matplotlib inline

# adds a progressbar
from tqdm.notebook import tqdm

# set seaborn graphs to a better style
sns.set(style="ticks")

import keras
keras.__version__

from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, Adadelta, RMSprop
import tensorflow.keras.backend as K
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

In [2]:
#import data 
#shuffle data
#split into y/x 
#split into train and test 
#encode y labels 
#build a model 
#run a model 
#evaluate 
#save model

In [3]:
#import data 

from sklearn.utils import shuffle

directory = 'datasets/dataset_cleaned.csv'
data = pd.read_csv(directory)
data['Labels'] = data['Labels'].replace(['del'],'.')
del data['Unnamed: 0']
data = shuffle(data)
data

Unnamed: 0,Labels,0,1,2,3,4,5,6,7,8,...,53,54,55,56,57,58,59,60,61,62
53998,Y,0.491100,0.826996,-1.299260e-06,0.366563,0.745827,-0.050008,0.285710,0.628791,-0.080379,...,-0.075436,0.760098,0.547673,-0.131350,0.806474,0.496832,-0.140929,0.846854,0.441856,-0.135173
40403,S,0.562349,0.863381,-6.767563e-07,0.495044,0.791981,-0.026944,0.443729,0.717393,-0.049209,...,-0.041925,0.656484,0.668333,-0.063937,0.629001,0.715880,-0.046804,0.618677,0.749107,-0.026630
18237,I,0.250759,0.574622,-5.879739e-07,0.169337,0.523644,-0.036972,0.106884,0.438267,-0.055409,...,-0.006327,0.257312,0.270694,-0.024642,0.249807,0.221193,-0.024565,0.239658,0.174144,-0.016240
24909,K,0.661325,0.666261,5.183612e-07,0.620992,0.608582,-0.015742,0.616760,0.532263,-0.025623,...,-0.049076,0.743674,0.519036,-0.073979,0.714875,0.551055,-0.060267,0.705684,0.580870,-0.041592
26577,L,0.515438,0.899788,1.135414e-07,0.418374,0.896182,-0.048632,0.313544,0.849570,-0.077231,...,-0.055731,0.542848,0.686816,-0.099190,0.551430,0.745663,-0.085944,0.554496,0.793940,-0.064152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34700,Q,0.029234,0.616080,7.040018e-07,0.043925,0.621266,-0.065828,0.117935,0.629496,-0.097873,...,-0.050653,0.269079,0.676441,-0.078415,0.309944,0.708634,-0.086428,0.340070,0.733694,-0.088558
34915,Q,0.312193,0.544782,1.741863e-06,0.366993,0.611259,-0.163451,0.490807,0.661597,-0.225987,...,0.038014,0.723058,0.413354,-0.018468,0.714523,0.476294,-0.037164,0.687471,0.502014,-0.035660
19949,I,0.212166,0.959157,-4.874714e-07,0.139144,0.916790,-0.020007,0.104880,0.834984,-0.033459,...,-0.038584,0.309114,0.687479,-0.060470,0.316068,0.633678,-0.064158,0.321095,0.584429,-0.061633
6739,D,0.523529,0.652290,-7.667634e-08,0.408075,0.575278,0.046501,0.345407,0.514470,0.037550,...,-0.099222,0.441690,0.306186,-0.114212,0.374689,0.300663,-0.111589,0.328549,0.323968,-0.107837


In [4]:
#split into y/x 

y_data = np.array((data['Labels']))
x_data = data.drop('Labels', axis=1)

In [5]:
y_data

array(['Y', 'S', 'I', ..., 'I', 'D', 'T'], dtype=object)

In [7]:
#split into train and test 

from sklearn import preprocessing

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_data)
encoded_y = encoder.transform(y_data)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_y)

threshold = 10000 # len(y_data)/5

x_test = x_data[:threshold]
x_train = x_data[threshold:]

y_test = dummy_y[:threshold]
y_train = dummy_y[threshold:]

print(x_test.shape, x_train.shape, y_test.shape, y_train.shape)

(10000, 63) (49321, 63) (10000, 28) (49321, 28)


In [8]:
model = Sequential()
model.add(Dense(60, input_shape = (63,), activation = "swish"))
model.add(Dense(15, activation = "swish"))
model.add(Dropout(0.2))
model.add(Dense(28, activation = "softmax"))
model.compile(Adam(lr = 0.01), "categorical_crossentropy", metrics = ["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 60)                3840      
                                                                 
 dense_1 (Dense)             (None, 15)                915       
                                                                 
 dropout (Dropout)           (None, 15)                0         
                                                                 
 dense_2 (Dense)             (None, 28)                448       
                                                                 
Total params: 5,203
Trainable params: 5,203
Non-trainable params: 0
_________________________________________________________________


2022-05-25 21:56:47.610319: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super(Adam, self).__init__(name, **kwargs)


In [8]:
model.fit(x_train, y_train, epochs=30, validation_data = (x_test, y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe9bdf92dc0>

In [9]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(x_test)
y_test_class = np.argmax(y_test, axis=1)
y_pred_class = np.argmax(y_pred, axis=1)
confusion_matrix(y_test_class, y_pred_class)

array([[  0,   0,   0,  13,   0,   0,   0,   1,  39,   0,   0,   0,   0,
          0,   0,  40,   0,   6,   0,  53,   0,  80,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   2,   0, 196,   0, 161,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0, 150,   0, 189,   0,   0,   0,   0,
          0,   0],
       [  6,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0, 107,   0, 165,   0,   0,   0,   0,
          0,   0],
       [ 16,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0, 196,   0, 179,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0, 189,   0, 212,   0,   0,   0,   0,
          0,   0],
       [  

In [10]:
from sklearn.metrics import classification_report
print(classification_report(y_test_class, y_pred_class))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95       212
           1       1.00      0.98      0.99       344
           2       1.00      1.00      1.00       342
           3       0.97      1.00      0.98       303
           4       1.00      0.95      0.97       336
           5       0.88      1.00      0.94       366
           6       1.00      1.00      1.00       427
           7       0.97      1.00      0.98       441
           8       0.98      0.98      0.98       398
           9       0.98      0.97      0.98       393
          10       1.00      0.96      0.98       400
          11       1.00      0.98      0.99       433
          12       1.00      1.00      1.00       434
          13       0.77      1.00      0.87       212
          14       1.00      0.62      0.76       169
          15       0.97      0.99      0.98       383
          16       1.00      0.97      0.98       339
          17       1.00    

In [11]:
model.save('model.tf', include_optimizer = True)

2022-05-23 21:39:53.501359: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: model_A5.tf/assets
