In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.activations import relu, sigmoid

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Reading data

In [2]:
data = pd.read_csv('data/crx.data')

In [3]:
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
5,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360.0,0,+
6,b,33.17,1.04,u,g,r,h,6.5,t,f,0,t,g,164.0,31285,+
7,a,22.92,11.585,u,g,cc,v,0.04,t,f,0,f,g,80.0,1349,+
8,b,54.42,0.5,y,p,k,h,3.96,t,f,0,f,g,180.0,314,+
9,b,42.5,4.915,y,p,w,v,3.165,t,f,0,t,g,52.0,1442,+


### Null filling

In [4]:
# categorical null fill with most observed value
data['A1'].fillna(data['A1'].value_counts().index[0], inplace=True)
data['A4'].fillna(data['A4'].value_counts().index[0], inplace=True)
data['A5'].fillna(data['A5'].value_counts().index[0], inplace=True)
data['A6'].fillna(data['A6'].value_counts().index[0], inplace=True)
data['A7'].fillna(data['A7'].value_counts().index[0], inplace=True)

# nominal null fill with mean
data['A2'].fillna(data['A2'].mean(), inplace=True)
data['A14'].fillna(round(data['A14'].mean()), inplace=True)

In [5]:
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
5,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360.0,0,+
6,b,33.17,1.04,u,g,r,h,6.5,t,f,0,t,g,164.0,31285,+
7,a,22.92,11.585,u,g,cc,v,0.04,t,f,0,f,g,80.0,1349,+
8,b,54.42,0.5,y,p,k,h,3.96,t,f,0,f,g,180.0,314,+
9,b,42.5,4.915,y,p,w,v,3.165,t,f,0,t,g,52.0,1442,+


### Categorical label encoding

In [6]:
label_encoder_A1 = LabelEncoder()
data['A1'] = label_encoder_A1.fit_transform(data['A1']) # a-0; b-1
label_encoder_A4 = LabelEncoder()
data['A4'] = label_encoder_A1.fit_transform(data['A4']) # u-1; y-2; l-0; t-not found
label_encoder_A5 = LabelEncoder()
data['A5'] = label_encoder_A5.fit_transform(data['A5'])
label_encoder_A6 = LabelEncoder()
data['A6'] = label_encoder_A1.fit_transform(data['A6'])
label_encoder_A7 = LabelEncoder()
data['A7'] = label_encoder_A1.fit_transform(data['A7'])
label_encoder_A9 = LabelEncoder()
data['A9'] = label_encoder_A1.fit_transform(data['A9'])
label_encoder_A10 = LabelEncoder()
data['A10'] = label_encoder_A1.fit_transform(data['A10'])
label_encoder_A12 = LabelEncoder()
data['A12'] = label_encoder_A1.fit_transform(data['A12'])
label_encoder_A13 = LabelEncoder()
data['A13'] = label_encoder_A1.fit_transform(data['A13'])

data['A16'].replace(to_replace='+', value=1, inplace=True)
data['A16'].replace(to_replace='-', value=0, inplace=True)

In [7]:
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,30.83,0.0,1,0,12,7,1.25,1,1,1,0,0,202.0,0,1
1,0,58.67,4.46,1,0,10,3,3.04,1,1,6,0,0,43.0,560,1
2,0,24.5,0.5,1,0,10,3,1.5,1,0,0,0,0,280.0,824,1
3,1,27.83,1.54,1,0,12,7,3.75,1,1,5,1,0,100.0,3,1
4,1,20.17,5.625,1,0,12,7,1.71,1,0,0,0,2,120.0,0,1
5,1,32.08,4.0,1,0,9,7,2.5,1,0,0,1,0,360.0,0,1
6,1,33.17,1.04,1,0,11,3,6.5,1,0,0,1,0,164.0,31285,1
7,0,22.92,11.585,1,0,2,7,0.04,1,0,0,0,0,80.0,1349,1
8,1,54.42,0.5,2,2,8,3,3.96,1,0,0,0,0,180.0,314,1
9,1,42.5,4.915,2,2,12,7,3.165,1,0,0,1,0,52.0,1442,1


### Independent Dependent variable split

In [11]:
X = data.iloc[:, 0:-1]
Y = data['A16']

### Scale all : 0-1 range

In [15]:
# scaler = MinMaxScaler(feature_range=(0, 1))
# X = scaler.fit_transform(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [16]:
X

array([[1.00000000e+00, 2.56842105e-01, 0.00000000e+00, ...,
        0.00000000e+00, 1.01000000e-01, 0.00000000e+00],
       [0.00000000e+00, 6.75488722e-01, 1.59285714e-01, ...,
        0.00000000e+00, 2.15000000e-02, 5.60000000e-03],
       [0.00000000e+00, 1.61654135e-01, 1.78571429e-02, ...,
        0.00000000e+00, 1.40000000e-01, 8.24000000e-03],
       ...,
       [0.00000000e+00, 1.72932331e-01, 4.82142857e-01, ...,
        0.00000000e+00, 1.00000000e-01, 1.00000000e-05],
       [1.00000000e+00, 6.27067669e-02, 7.32142857e-03, ...,
        0.00000000e+00, 1.40000000e-01, 7.50000000e-03],
       [1.00000000e+00, 3.19548872e-01, 1.20535714e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

## Building neural net

### Average f1 scores with 5-fold cv

In [23]:
cross_val_score(best_svr, X, y, cv=5)

NameError: name 'best_svr' is not defined

In [20]:
# manual method
"""
f1_scores=[]
cv = KFold(n_splits=5)
for train_index, test_index in cv.split(X):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)
    
    X_train, X_test, Y_train, Y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]
    
    # fit the model
    # scores.append(model.score(X_test, y_test))

print(np.mean(scores))
"""

'\nf1_scores=[]\ncv = KFold(n_splits=5)\nfor train_index, test_index in cv.split(X):\n    print("Train Index: ", train_index, "\n")\n    print("Test Index: ", test_index)\n    \n    X_train, X_test, Y_train, Y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]\n    \n    # fit the model\n    # scores.append(model.score(X_test, y_test))\n\nprint(np.mean(scores))\n'