In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.activations import relu, sigmoid
from keras import utils

### Reading data

In [18]:
data = pd.read_csv('data/crx.data')

In [19]:
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


### Null filling

In [20]:
# categorical null fill with most observed value
data['A1'].fillna(data['A1'].value_counts().index[0], inplace=True)
data['A4'].fillna(data['A4'].value_counts().index[0], inplace=True)
data['A5'].fillna(data['A5'].value_counts().index[0], inplace=True)
data['A6'].fillna(data['A6'].value_counts().index[0], inplace=True)
data['A7'].fillna(data['A7'].value_counts().index[0], inplace=True)

# nominal null fill with mean
data['A2'].fillna(data['A2'].mean(), inplace=True)
data['A14'].fillna(round(data['A14'].mean()), inplace=True)

In [21]:
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


### Categorical label encoding

In [22]:
# label encode binary categorical
label_encoder_A1 = LabelEncoder()
data['A1'] = label_encoder_A1.fit_transform(data['A1']) # a-0; b-1
label_encoder_A4 = LabelEncoder()
data['A9'] = label_encoder_A1.fit_transform(data['A9']) # t, f
label_encoder_A10 = LabelEncoder()
data['A10'] = label_encoder_A1.fit_transform(data['A10']) # t, f
label_encoder_A12 = LabelEncoder()
data['A12'] = label_encoder_A1.fit_transform(data['A12']) # t, f

# one-hot non-binary categorical
data = pd.get_dummies(data, columns=['A4','A5','A6','A7','A13'])

# data['A4'] = label_encoder_A1.fit_transform(data['A4']) # u-1; y-2; l-0; t-not found
# label_encoder_A5 = LabelEncoder()
# data['A5'] = label_encoder_A5.fit_transform(data['A5']) # g, p, gg
# label_encoder_A6 = LabelEncoder()
# data['A6'] = label_encoder_A1.fit_transform(data['A6']) # c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff
# label_encoder_A7 = LabelEncoder()
# data['A7'] = label_encoder_A1.fit_transform(data['A7']) # v, h, bb, j, n, z, dd, ff, o
# label_encoder_A9 = LabelEncoder()
# label_encoder_A13 = LabelEncoder()
# data['A13'] = label_encoder_A1.fit_transform(data['A13']) # g, p, s

data['A16'].replace(to_replace='+', value=1, inplace=True)
data['A16'].replace(to_replace='-', value=0, inplace=True)

In [23]:
data

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,...,A7_ff,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s
0,1,30.83,0.000,1.25,1,1,1,0,202.0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,58.67,4.460,3.04,1,1,6,0,43.0,560,...,0,1,0,0,0,0,0,1,0,0
2,0,24.50,0.500,1.50,1,0,0,0,280.0,824,...,0,1,0,0,0,0,0,1,0,0
3,1,27.83,1.540,3.75,1,1,5,1,100.0,3,...,0,0,0,0,0,1,0,1,0,0
4,1,20.17,5.625,1.71,1,0,0,0,120.0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21.08,10.085,1.25,0,0,0,0,260.0,0,...,0,1,0,0,0,0,0,1,0,0
686,0,22.67,0.750,2.00,0,1,2,1,200.0,394,...,0,0,0,0,0,1,0,1,0,0
687,0,25.25,13.500,2.00,0,1,1,1,200.0,1,...,1,0,0,0,0,0,0,1,0,0
688,1,17.92,0.205,0.04,0,0,0,0,280.0,750,...,0,0,0,0,0,1,0,1,0,0


In [24]:
data.shape

(690, 43)

### Independent Dependent variable split

In [35]:
data_A16 = data['A16']
data = data.drop('A16', axis=1)
data['A16'] = data_A16

X = data.iloc[:, 0:-1]
Y = data['A16']

Y = utils.to_categorical(Y, 2)

### Scale all : 0-1 range

In [36]:
# scaler = MinMaxScaler(feature_range=(0, 1))
# X = scaler.fit_transform(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [37]:
X

array([[ 0.66143783, -0.0623209 , -0.95661321, ...,  0.32249031,
        -0.10830607, -0.30007898],
       [-1.51185789,  2.28810134, -0.06005053, ...,  0.32249031,
        -0.10830607, -0.30007898],
       [-1.51185789, -0.59673802, -0.8561017 , ...,  0.32249031,
        -0.10830607, -0.30007898],
       ...,
       [-1.51185789, -0.53341846,  1.7571976 , ...,  0.32249031,
        -0.10830607, -0.30007898],
       [ 0.66143783, -1.15226167, -0.91540349, ...,  0.32249031,
        -0.10830607, -0.30007898],
       [ 0.66143783,  0.28973588, -0.27816051, ...,  0.32249031,
        -0.10830607, -0.30007898]])

In [38]:
X.shape

(690, 42)

## Building neural net

### f1 function

In [39]:
import keras.backend as K

# def get_f1(y_true, y_pred): #taken from old keras source code
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
#     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     precision = true_positives / (predicted_positives + K.epsilon())
#     recall = true_positives / (possible_positives + K.epsilon())
#     f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
#     return f1_val

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def get_f1(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [47]:
from keras.regularizers import l2,l1

model = Sequential()
model.add(Dense(42, input_dim=42, activation='relu'))
model.add(Dense(6, activation='relu', kernel_regularizer=l2(0.1)))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[get_f1])
# model.compile(loss='mean_squared_error', optimizer='adam', metrics=[get_f1])

### Average f1 scores with 5-fold cv

In [48]:
# cross_val_score(model, X, Y, cv=5, scoring= 'accuracy')  ### doesn't work with Keras

In [49]:
# manual method
f1_scores=[]
cv = KFold(n_splits=5, shuffle=True)
for train_index, test_index in cv.split(X):
#     print("Train Index: ", train_index, "\n")
#     print("Test Index: ", test_index)
    
    X_train, X_test, Y_train, Y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]
    
    model.fit(X_train, Y_train, batch_size=10, epochs=50, verbose=0)
    scores = model.evaluate(X_test, Y_test, verbose=0)
    print(scores)
    f1_scores.append(scores[1])

print(np.mean(f1_scores))

[0.6442298651605413, 0.8125]
[0.37513000113160716, 0.8999999761581421]
[0.16352006091155868, 0.949999988079071]
[0.2281848021607468, 0.9312499761581421]
[0.22412489970093188, 0.9312499761581421]
0.9049999833106994
