In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
pd.options.display.float_format = '{:,.3f}'.format

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [5]:
from numpy import isnan
from sklearn.impute import KNNImputer

In [6]:
#data WITHOUT near-zero value columns
data = pd.read_csv('data_prep_no_zv.csv')

In [7]:
data.shape

(1848, 567)

In [8]:
#make sure dataset has only numbers
data = data.apply(pd.to_numeric)

In [9]:
#data with near-zero value columns
#data = pd.read_csv('data_prep.csv')

In [10]:
print("Number of null values in dataset: ",data.isnull().values.sum())
print("Percentage: ",(data.isnull().sum().sum()/data.size)*100) #ποσοστο

Number of null values in dataset:  21545
Percentage:  2.056181619673683


In [11]:
data['G1'].value_counts()

0.000    1523
1.000     240
2.000      60
3.000      14
4.000      11
Name: G1, dtype: int64

In [12]:
#data['G1']=data['G1'].fillna(0,inplace=True)
data['G1'].isnull().values.sum()

0

### impute and scale dataframe

In [13]:
#load data and split to dependent and independent variables
X = data.loc[:, data.columns != 'G1'].values #select all but the desired one for y
y = data['G1'].values

#X and y are numpy ndarray type variables
print(X.shape,y.shape)
X = np.array(X)
y = np.array(y)

(1848, 566) (1848,)


In [14]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [15]:
# define imputer
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

In [16]:
# fit on the dataset
imputer.fit(X)

KNNImputer(add_indicator=False, copy=True, metric='nan_euclidean',
           missing_values=nan, n_neighbors=5, weights='uniform')

In [17]:
# transform the dataset
X= imputer.transform(X)

In [18]:
print('Missing: %d' % sum(isnan(X).flatten()))

Missing: 0


### prepare data fo model

In [19]:
#split to train and test set
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

### Building the ANN
Χτιζω με τη Sequential

In [20]:
ann = tf.keras.models.Sequential() #create object instance

#### adding input layer and first hidden layer, using dense class library.

In [21]:
ann.add(tf.keras.layers.Dense(units=6,activation='relu'))

#### Add the second hidden layer

In [22]:

ann.add(tf.keras.layers.Dense(units=6,activation='relu'))

#### Add the output layer <br> again fully connected with the previous. The output layer must have the dimensions of the array we want to predict

In [23]:
ann.add(tf.keras.layers.Dense(units=5,activation='softmax'))


#### compiling ANN

In [24]:
ann.compile(optimizer='adam',loss= 'sparse_categorical_crossentropy',metrics=['sparse_categorical_accuracy'])


#### training to the whole set



In [25]:
#the default hyperparameter for batch_size is 32
#epochs improve the accuracy

ann.fit(X_train,y_train,batch_size=32,epochs=100,verbose=2)

Train on 1478 samples
Epoch 1/100
1478/1478 - 1s - loss: 0.8072 - sparse_categorical_accuracy: 0.8187
Epoch 2/100
1478/1478 - 0s - loss: 0.5466 - sparse_categorical_accuracy: 0.8187
Epoch 3/100
1478/1478 - 0s - loss: 0.4801 - sparse_categorical_accuracy: 0.8187
Epoch 4/100
1478/1478 - 0s - loss: 0.4349 - sparse_categorical_accuracy: 0.8241
Epoch 5/100
1478/1478 - 0s - loss: 0.3957 - sparse_categorical_accuracy: 0.8369
Epoch 6/100
1478/1478 - 0s - loss: 0.3683 - sparse_categorical_accuracy: 0.8525
Epoch 7/100
1478/1478 - 0s - loss: 0.3440 - sparse_categorical_accuracy: 0.8572
Epoch 8/100
1478/1478 - 0s - loss: 0.3276 - sparse_categorical_accuracy: 0.8647
Epoch 9/100
1478/1478 - 0s - loss: 0.3098 - sparse_categorical_accuracy: 0.8681
Epoch 10/100
1478/1478 - 0s - loss: 0.2869 - sparse_categorical_accuracy: 0.8769
Epoch 11/100
1478/1478 - 0s - loss: 0.2734 - sparse_categorical_accuracy: 0.8850
Epoch 12/100
1478/1478 - 0s - loss: 0.2580 - sparse_categorical_accuracy: 0.8917
Epoch 13/100
14

<tensorflow.python.keras.callbacks.History at 0x1d5ee2248c8>

In [26]:
y_pred = ann.predict(X_test,verbose=1) #Returns:Numpy array(s) of probabilities predictions



In [27]:
y_classes =ann.predict_classes(X_test,verbose=1) #Returns the predicted classes (only available in Sequential=>βλ oneNote). 
#Ειναι function του sequential τροπου που χτισαμε το ΝΝ το οποιο επιστρεφει απευθειας τισ κλασεις  



In [28]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [29]:
accuracy = accuracy_score(y_test,y_classes)

In [30]:
mcm = multilabel_confusion_matrix(y_test,y_classes)

In [31]:
print(mcm, accuracy)

[[[ 48   9]
  [ 16 297]]

 [[307  25]
  [  9  29]]

 [[352   3]
  [ 10   5]]

 [[366   0]
  [  3   1]]

 [[369   1]
  [  0   0]]] 0.8972972972972973


### Try with encoder

In [32]:
from tensorflow.keras.models import load_model

In [33]:
# load the model from file
encoder = load_model('encoder_half.h5')



In [34]:
# encode the train data
X_train_encode = encoder.predict(X_train)

In [35]:
# encode the test data
X_test_encode = encoder.predict(X_test)

In [36]:
print(X_train_encode.shape,X_test_encode.shape)

(1478, 283) (370, 283)


In [40]:
ann2 = tf.keras.models.Sequential()
ann2.add(tf.keras.layers.Dense(units=6,activation='relu'))
ann2.add(tf.keras.layers.Dense(units=6,activation='relu'))
ann2.add(tf.keras.layers.Dense(units=5,activation='softmax'))
ann2.compile(optimizer='adam',loss= 'sparse_categorical_crossentropy',metrics=['sparse_categorical_accuracy'])
ann2.fit(X_train_encode,y_train,batch_size=32,epochs=100,verbose=2)

Train on 1478 samples
Epoch 1/100
1478/1478 - 1s - loss: 1.9362 - sparse_categorical_accuracy: 0.0670
Epoch 2/100
1478/1478 - 0s - loss: 1.1474 - sparse_categorical_accuracy: 0.7097
Epoch 3/100
1478/1478 - 0s - loss: 0.7359 - sparse_categorical_accuracy: 0.8200
Epoch 4/100
1478/1478 - 0s - loss: 0.5865 - sparse_categorical_accuracy: 0.8221
Epoch 5/100
1478/1478 - 0s - loss: 0.5342 - sparse_categorical_accuracy: 0.8234
Epoch 6/100
1478/1478 - 0s - loss: 0.5090 - sparse_categorical_accuracy: 0.8214
Epoch 7/100
1478/1478 - 0s - loss: 0.4923 - sparse_categorical_accuracy: 0.8234
Epoch 8/100
1478/1478 - 0s - loss: 0.4754 - sparse_categorical_accuracy: 0.8254
Epoch 9/100
1478/1478 - 0s - loss: 0.4628 - sparse_categorical_accuracy: 0.8268
Epoch 10/100
1478/1478 - 0s - loss: 0.4507 - sparse_categorical_accuracy: 0.8302
Epoch 11/100
1478/1478 - 0s - loss: 0.4378 - sparse_categorical_accuracy: 0.8315
Epoch 12/100
1478/1478 - 0s - loss: 0.4271 - sparse_categorical_accuracy: 0.8309
Epoch 13/100
14

<tensorflow.python.keras.callbacks.History at 0x1d5f2d41dc8>

In [41]:
y_pred = ann2.predict(X_test_encode,verbose=1)



In [42]:
y_classes =ann2.predict_classes(X_test_encode,verbose=1)



In [43]:
accuracy = accuracy_score(y_test,y_classes)

In [44]:
mcm = multilabel_confusion_matrix(y_test,y_classes)

In [45]:
print(mcm, accuracy)

[[[ 38  19]
  [ 25 288]]

 [[304  28]
  [ 20  18]]

 [[351   4]
  [ 11   4]]

 [[363   3]
  [  3   1]]

 [[365   5]
  [  0   0]]] 0.8405405405405405
