In [1]:
from keras.utils import to_categorical
from keras.layers import Dropout
from keras.layers import Dense
from keras import models
from keras import optimizers
from keras import backend as K
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Prepare Dataset

In [2]:
def prepare(file):
    df = pd.read_csv(file).sample(frac = 1)
    y = df.label.tolist()
    X = np.matrix(df.drop(labels = ['label'], axis = 1)).astype(np.float)
    print(X.shape)
    
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    
    return X, y

# Logistic As Benchmark

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [4]:
def myMetrics(tn, fp, fn, tp):
    print('accuracy is: %s'%  float((tp+tn)/(tp+fp+fn+tn)))
    print('precition is: %s'% (tp/(tp+fp)))
    print('recall is: %s' % (tp/(tp+fn)))
    print('false positive ratio is: %s' % (fp/(fp+tn)))
    print('\n')

In [5]:
def log(X,y):
    clf = LogisticRegression().fit(X, y)
    y_pred = clf.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[1, 0]).ravel()
    myMetrics(tp, tn, fp, fn)

# XGB

In [6]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

def myXgb(X, y, X_test):
    
    X_A, X_B, Y_A, Y_B  = train_test_split(X, y, test_size=0.33)
    dtrain = xgb.DMatrix(X_A, label=Y_A)
    param = {'max_depth': 10}
    dtest = xgb.DMatrix(X_B, label = Y_B)
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 5
    bst = xgb.train(param, dtrain, num_round, evallist)
    dtest = xgb.DMatrix(X_test)
    y_pred = bst.predict(dtest)
    
    return y_pred

# ANN

In [13]:
def recall(y_true, y_pred):
   
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
    
def precision(y_true, y_pred):
    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1(y_true, y_pred):
    
    myPrecision = precision(y_true, y_pred)
    myRecall = recall(y_true, y_pred)
    return 2*((myPrecision*myRecall)/(myPrecision+myRecall+K.epsilon()))

In [47]:
model = models.Sequential()
model.add(Dense(15, input_dim=29, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, kernel_initializer='uniform', activation='relu'))
sgd = optimizers.sgd(lr=0.0005, momentum=0.5, decay=0.000002, nesterov=False)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy',f1,recall,precision])

In [61]:
#get the training dataset

files = [('/scratch/by8jj/online training/train0.csv', '/scratch/by8jj/online training/test0.csv'),
         ('/scratch/by8jj/online training/train1.csv', '/scratch/by8jj/online training/test1.csv'),
         ('/scratch/by8jj/online training/train2.csv', '/scratch/by8jj/online training/test2.csv'),
         ('/scratch/by8jj/online training/train3.csv', '/scratch/by8jj/online training/test3.csv'),
         ('/scratch/by8jj/online training/train4.csv', '/scratch/by8jj/online training/test4.csv'),
         ('/scratch/by8jj/online training/train5.csv', '/scratch/by8jj/online training/test5.csv'),
         ('/scratch/by8jj/online training/train6.csv', '/scratch/by8jj/online training/test6.csv'),
        ]

for i in range(len(files)):
    
    print('round %s'%i)
    
    X, y = prepare(files[i][0])
    X_test, y_test = prepare(files[i][1])
    
    xgb_pred = myXgb(X, y, X_test)
    #log(X, y)
    
    if i<2:
        result = model.fit(X, y, epochs=10, batch_size=256, verbose=0, validation_split=0.3) 
        
    nn_pred = model.predict(X_test).tolist()
    
    tn, fp, fn, tp = confusion_matrix(y_test, [1 if x[0] > 0.0 else 0 for x in nn_pred], labels=[1, 0]).ravel()
    myMetrics(tn, fp, fn, tp)
    
    
    tn, fp, fn, tp = confusion_matrix(y_test, [1 if x > 0.1 else 0 for x in xgb_pred], labels=[1, 0]).ravel()
    myMetrics(tn, fp, fn, tp)
    
    tn, fp, fn, tp = confusion_matrix(y_test, [1 if (x[0] + y) > 0.15 else 0 for x, y  in zip(nn_pred, xgb_pred)], labels=[1, 0]).ravel()
    myMetrics(tn, fp, fn, tp)
    

round 0
(3475694, 29)
(2602119, 29)
[0]	eval-rmse:0.400948	train-rmse:0.400178
[1]	eval-rmse:0.341347	train-rmse:0.342036
[2]	eval-rmse:0.307854	train-rmse:0.307526
[3]	eval-rmse:0.289664	train-rmse:0.289245
[4]	eval-rmse:0.28057	train-rmse:0.278745
accuracy is: 0.9466934448424534
precition is: 0.9952039237177432
recall is: 0.8094686085140382
false positive ratio is: 0.0014734932697413948


accuracy is: 0.9459571218687539
precition is: 0.9960079252642043
recall is: 0.8061128944085448
false positive ratio is: 0.0012204103437850934


accuracy is: 0.9448395711341411
precition is: 0.9968231273538848
recall is: 0.8013596669516828
false positive ratio is: 0.0009646801068878266


round 1
(3474812, 29)
(2603637, 29)
[0]	eval-rmse:0.401072	train-rmse:0.400182
[1]	eval-rmse:0.341615	train-rmse:0.340197
[2]	eval-rmse:0.308283	train-rmse:0.307538
[3]	eval-rmse:0.290105	train-rmse:0.289271
[4]	eval-rmse:0.281141	train-rmse:0.278896
accuracy is: 0.9494814369284198
precition is: 0.9888046790226729
re

In [58]:
temp = [x[0]+y for x,y in zip(nn_pred, xgb_pred)]

In [59]:
temp1 = [1 if x > 0.1 else 0 for x in temp]

In [60]:
tn, fp, fn, tp = confusion_matrix(y_test, temp1, labels=[1, 0]).ravel()
myMetrics(tn, fp, fn, tp)

accuracy is: 0.9461112270422682
precition is: 0.9966089007258719
recall is: 0.80618718548941
false positive ratio is: 0.0010361575022938949


