In [175]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow import keras
from sklearn.metrics import f1_score

### loading data

In [2]:
# The CSV has no header row, so column names are assigned explicitly:
# fifteen attributes (A1..A15) followed by the class label.
# "?" is the file's marker for a missing entry. Declaring it through
# na_values makes pandas store it as NaN while parsing, so columns that mix
# numbers with "?" are read as numeric dtypes instead of strings.
data = pd.read_csv("data.csv", header=None, na_values="?")
columns = ["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15", "label"]
data.columns = columns
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,label
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


#### handling missing data

In [3]:
# Turn every cell that is exactly the string "?" into NaN so pandas'
# missing-value tools (isna / dropna) can see it.
data = data.replace(to_replace="?", value=np.nan)

In [4]:
# Tally the NaN entries of every column (the label column included).
missing_per_column = data.isna().sum()
missing_per_column

A1       12
A2       12
A3        0
A4        6
A5        6
A6        9
A7        9
A8        0
A9        0
A10       0
A11       0
A12       0
A13       0
A14      13
A15       0
label     0
dtype: int64

In [5]:
# Discard every row that has at least one NaN. The frame is modified in
# place, so `data` keeps referring to the same object.
data.dropna(axis=0, how="any", inplace=True)

In [6]:
# Re-count NaNs per column; all zeros means no missing values are left.
remaining_missing = data.isna().sum()
remaining_missing

A1       0
A2       0
A3       0
A4       0
A5       0
A6       0
A7       0
A8       0
A9       0
A10      0
A11      0
A12      0
A13      0
A14      0
A15      0
label    0
dtype: int64

#### handling categorical variables

In [7]:
# Columns that are replaced by integer codes, in the order they are encoded.
categorical_columns = ["A1", "A4", "A5", "A6", "A7", "A9", "A10", "A12", "A13", "label"]

le = LabelEncoder()
for column_name in categorical_columns:
    # fit_transform refits the shared encoder on each column, so once the
    # loop finishes `le` only remembers the mapping of the last one ("label").
    data[column_name] = le.fit_transform(data[column_name])
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,label
0,1,30.83,0.0,1,0,12,7,1.25,1,1,1,0,0,202,0,0
1,0,58.67,4.46,1,0,10,3,3.04,1,1,6,0,0,43,560,0
2,0,24.5,0.5,1,0,10,3,1.5,1,0,0,0,0,280,824,0
3,1,27.83,1.54,1,0,12,7,3.75,1,1,5,1,0,100,3,0
4,1,20.17,5.625,1,0,12,7,1.71,1,0,0,0,2,120,0,0


In [8]:
# Model inputs are the fifteen attribute columns A1..A15.
features = [f"A{index}" for index in range(1, 16)]
# Selecting with a one-element list keeps the target two-dimensional,
# so Y ends up as an (n_rows, 1) NumPy array.
result = ["label"]
X = data[features]
Y = data[result].values

#### standardize data

In [9]:
# Standardise each feature column to zero mean and unit variance.
# NOTE(review): the statistics are computed from every row of X, before the
# K-fold split further down, so rows that are later held out also contribute.
scaler = StandardScaler()
X = scaler.fit_transform(X)

#### cross validation

In [212]:
# Number of cross-validation folds.
folds = 5
# Seed for the shuffle that assigns rows to folds.
SEED = 42
# One macro-F1 value per fold is collected here.
f_score_per_fold = []
# shuffle=True without random_state draws a different split on every run;
# fixing the seed keeps the fold membership identical across runs.
kfold = KFold(n_splits=folds, shuffle=True, random_state=SEED)

In [213]:
# Training hyper-parameters shared by every fold.
batch_size = 50
no_epochs = 25
verbosity = 2
loss_function = "binary_crossentropy"
activation = 'tanh'
optimizer = 'sgd'
lambdaValue = 0.02  # L2 penalty on the kernels of the two middle Dense layers


def build_model(n_features):
    """Build and compile a fresh binary classifier.

    Args:
        n_features: number of input columns expected by the first layer.

    Returns:
        A compiled Sequential model with newly initialised weights
        (Dense 15 -> 25 -> 15 -> 1, sigmoid output).
    """
    network = Sequential()
    network.add(Dense(15, activation=activation, input_shape=(n_features,)))
    network.add(Dense(25, activation=activation,
                      kernel_regularizer=keras.regularizers.l2(lambdaValue)))
    network.add(Dense(15, activation=activation,
                      kernel_regularizer=keras.regularizers.l2(lambdaValue)))
    network.add(Dense(1, activation='sigmoid'))
    network.compile(loss=loss_function,
                    optimizer=optimizer,
                    metrics=['accuracy'])
    return network


# Start from an empty list so that re-running this cell does not append a
# second batch of scores to the ones left over from a previous run.
f_score_per_fold = []
fold_no = 1
for train, test in kfold.split(X, Y):
    # Fit the scaler on this fold's training rows only and apply it to both
    # splits, so the held-out rows never influence the preprocessing.
    fold_scaler = StandardScaler().fit(X[train])
    X_train = fold_scaler.transform(X[train])
    X_test = fold_scaler.transform(X[test])

    # A new model per fold: weights must not carry over between folds.
    model = build_model(X.shape[1])

    print('------------------------------------------------------------------------')
    print(f'fold {fold_no} ...')

    # Fit data to model
    model.fit(X_train, Y[train],
              batch_size=batch_size,
              epochs=no_epochs,
              verbose=verbosity)

    # The sigmoid output is a single column; values above 0.5 become class 1.
    pred = model.predict(X_test)
    y_pred = (pred[:, 0] > 0.5).astype(int)
    f_val = f1_score(Y[test].ravel(), y_pred, average='macro')
    f_score_per_fold.append(f_val)
    print("F score for fold: "+str(fold_no)+" is: "+str(f_val))

    # Increase fold number
    fold_no = fold_no + 1

------------------------------------------------------------------------
fold 1 ...
Epoch 1/25
522/522 - 2s - loss: 1.5103 - acc: 0.4904
Epoch 2/25
522/522 - 0s - loss: 1.4461 - acc: 0.5785
Epoch 3/25
522/522 - 0s - loss: 1.3962 - acc: 0.6839
Epoch 4/25
522/522 - 0s - loss: 1.3545 - acc: 0.7241
Epoch 5/25
522/522 - 0s - loss: 1.3185 - acc: 0.7586
Epoch 6/25
522/522 - 0s - loss: 1.2879 - acc: 0.7854
Epoch 7/25
522/522 - 0s - loss: 1.2607 - acc: 0.7969
Epoch 8/25
522/522 - 0s - loss: 1.2360 - acc: 0.8084
Epoch 9/25
522/522 - 0s - loss: 1.2141 - acc: 0.8084
Epoch 10/25
522/522 - 0s - loss: 1.1936 - acc: 0.8199
Epoch 11/25
522/522 - 0s - loss: 1.1749 - acc: 0.8238
Epoch 12/25
522/522 - 0s - loss: 1.1581 - acc: 0.8295
Epoch 13/25
522/522 - 0s - loss: 1.1422 - acc: 0.8467
Epoch 14/25
522/522 - 0s - loss: 1.1271 - acc: 0.8448
Epoch 15/25
522/522 - 0s - loss: 1.1129 - acc: 0.8525
Epoch 16/25
522/522 - 0s - loss: 1.0994 - acc: 0.8563
Epoch 17/25
522/522 - 0s - loss: 1.0868 - acc: 0.8621
Epoch 1

In [214]:
# Horizontal rule printed between the sections of the report.
separator = '------------------------------------------------------------------------'

print(separator)
print('F Score per fold')
# One line per fold, numbered from 1, each preceded by a rule.
for fold_index, fold_score in enumerate(f_score_per_fold, start=1):
    print(separator)
    print(f'> Fold {fold_index} - F Score: {fold_score}')
print(separator)
# Unweighted mean of the per-fold scores.
print(f'> Avg F score: {np.mean(f_score_per_fold)}')
print(separator)

------------------------------------------------------------------------
F Score per fold
------------------------------------------------------------------------
> Fold 1 - F Score: 0.868729737695255
------------------------------------------------------------------------
> Fold 2 - F Score: 0.8555147058823529
------------------------------------------------------------------------
> Fold 3 - F Score: 0.7824857019863477
------------------------------------------------------------------------
> Fold 4 - F Score: 0.8686009869790119
------------------------------------------------------------------------
> Fold 5 - F Score: 0.8750300408555636
------------------------------------------------------------------------
> Avg F score: 0.8500722346797062
------------------------------------------------------------------------
