# Load Required Packages

In [37]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV,RepeatedStratifiedKFold,cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import scipy.io
import pickle
import os, sys
from scipy.spatial.distance import pdist
import time 
import xlsxwriter
from sklearn.metrics import accuracy_score, classification_report,make_scorer, confusion_matrix,roc_auc_score
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
#from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from sklearn import ensemble
from sklearn.svm import SVC
import random
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Input, Dropout
from keras.layers import BatchNormalization
from keras.models import Model
from keras import initializers
from keras.optimizers import Adam
from keras.utils import to_categorical

# Loading Data and Generate balanced data set

## 1. Set Directories and Read labels

In [38]:
train_dir = "../data/train_set"

train_image_dir =  os.path.join(train_dir, 'images')
train_pt_dir =  os.path.join(train_dir, 'points' )
train_label_path =  os.path.join(train_dir,"label.csv")

labels = pd.read_csv('../data/train_set/label.csv')
y= labels['label'].to_numpy()

# read points
n = 3000
for i in range(1,n+1): 
    file = "%04d.mat"%(i)
    points_path = os.path.join(train_pt_dir, file)
    mat = scipy.io.loadmat(points_path)
    if 'faceCoordinatesUnwarped' in mat:
        cords = mat['faceCoordinatesUnwarped'] 
    else:
        cords = mat['faceCoordinates2']

    distance = sklearn.metrics.pairwise_distances(cords)
    flatten_distance = distance[np.triu_indices(len(cords[:,0]), k = 1)]
    if i==1:
        distances = np.mat([flatten_distance])
    else:
        distances = np.append(distances, np.mat([flatten_distance]), axis = 0)

## 2. Create balanced data set 

In [39]:
print('The number of class 0 is ' + str(n-sum(y)))
print('The number of class 1 is ' + str(sum(y)))
print('Only %.2f'% (sum(y)/n*100) + '% of total dataset are class 1. ')
print('So, it is an unbalanced dataset, we need to do some data preprocessing.')
print('Here, we are using oversampling to generate more class 1 datasets.')

mat_1 = np.add(np.where(y == 1),1)
n_oversample = (n-sum(y))-sum(y)

for i in range(n_oversample):
    samples_index = random.sample(list(list(mat_1)[0]), 2)

    #file = "%04d.mat"%(str(samples_index[0]))
    p_path = str(samples_index[0]).zfill(4)+'.mat'
    points_path = os.path.join(train_pt_dir, p_path)
    mat = scipy.io.loadmat(points_path)
    if 'faceCoordinatesUnwarped' in mat:
        cords_0 = mat['faceCoordinatesUnwarped'] 
    else:
        cords_0 = mat['faceCoordinates2']
  
    p_path = str(samples_index[1]).zfill(4)+'.mat'
    points_path = os.path.join(train_pt_dir, p_path)
    mat = scipy.io.loadmat(points_path)
    if 'faceCoordinatesUnwarped' in mat:
        cords_1 = mat['faceCoordinatesUnwarped'] 
    else:
        cords_1 = mat['faceCoordinates2']

    cords_new = (cords_0 + cords_1) / 2
    distance = sklearn.metrics.pairwise_distances(cords_new)
    flatten_distance = distance[np.triu_indices(len(cords_new[:,0]), k = 1)]
    distances = np.append(distances, np.mat([flatten_distance]), axis = 0)
    y = np.append(y,np.array(1))


The number of class 0 is 2402
The number of class 1 is 598
Only 19.93% of total dataset are class 1. 
So, it is an unbalanced dataset, we need to do some data preprocessing.
Here, we are using oversampling to generate more class 1 datasets.


In [40]:
# Save as csv files
distances_df = pd.DataFrame(distances)
y_df = pd.DataFrame(y)
distances_df.to_csv("../data/output/X_balanced.csv")
y_df.to_csv("../data/output/y_balanced.csv")

## 3. Create train and test features and labels from Balanced Data set

In [41]:
X_train,X_test,y_train,y_test = train_test_split(distances,y,test_size=0.2,random_state=666)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
print(sum(y_train),sum(y_test))

(3843, 3003) (961, 3003) (3843,) (961,)
1918 484


# Part I Baseline Model: GBM on Original data set

## 1. Reload original data set & Provide directories for training/testing data

In [42]:
#Orig data set: 2402 points with label 0, 598 points with label 1

info = pd.read_csv(train_label_path)
info.head(5)
train_index, test_index = train_test_split(info['Index'], test_size=0.2, random_state=42)

# Read mat files and store coordinates in readmat_matrix list
m = []
for idx in info['Index']: 
    file = "%04d.mat"%(idx)
    m.append(scipy.io.loadmat(os.path.join(train_pt_dir, file)))

readmat= [x[[i for i in x.keys() if not i in ['__header__', '__version__', '__globals__']][0]] for x in m]



## 2. Create train and test features and labels

In [43]:
train_points = np.array([readmat[i-1] for i in train_index])
test_points = [readmat[i-1] for i in test_index]

#print(train_points[0].shape)

train_features = np.array([pdist(train_points[i],metric='euclidean') for i in range(len(train_points))])
test_features = np.array([pdist(test_points[i],metric='euclidean') for i in range(len(test_points))])
#print(train_features.shape, test_features.shape)

#pdist returns condensed matrix, hence 3003 rows instead of 6006
train_labels=info.index[train_index-1]
test_labels=info.index[test_index-1]
#print(train_labels)

In [44]:
colnames = ['feature'+str(i) for i in range(train_features.shape[1])]
train_features = pd.DataFrame(train_features,columns=colnames)
train_labels = pd.DataFrame(list(info['label'].iloc[train_index-1]),columns=['labels'])['labels']
#print(train_labels)
test_features = pd.DataFrame(test_features,columns=colnames)
test_labels = pd.DataFrame(list(info['label'].iloc[test_index-1]),columns=['labels'])['labels']

In [45]:
train_features.head(5)

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature2993,feature2994,feature2995,feature2996,feature2997,feature2998,feature2999,feature3000,feature3001,feature3002
0,38.063196,19.993422,5.752959,21.422734,39.576571,21.181455,10.784682,19.993232,160.545773,122.160819,...,51.117497,101.748654,152.195915,202.637628,51.253897,102.383877,153.441337,51.292388,102.54435,51.304453
1,30.340658,16.144816,8.099614,18.881226,32.982027,18.409906,8.759513,16.857081,135.350948,108.250797,...,44.677349,89.258559,133.606899,177.348525,44.822146,89.512774,133.798749,44.80793,89.405559,44.800222
2,38.314598,23.083361,14.392694,22.848741,31.143079,17.813201,15.241226,25.000822,141.163299,109.971393,...,55.300554,102.842625,145.375558,200.640173,48.308656,91.421165,147.339076,43.178082,99.21592,56.074628
3,43.449462,23.631751,10.243177,22.534345,40.526353,21.141838,11.331717,25.421174,186.226844,138.256224,...,64.249114,127.436392,189.729347,252.121656,64.221913,128.022285,191.761254,64.335005,128.607704,64.381014
4,33.611191,18.531421,8.499133,19.298344,35.429306,19.617849,7.84714,19.282997,165.616053,131.582536,...,65.828678,118.552737,160.219707,207.868098,53.738951,96.107815,145.058824,42.441912,91.778186,49.627279


## 3. Train a GBM model using random parameters on ORIGINAL data set

In [146]:
gbm = GradientBoostingClassifier(learning_rate=0.1,max_depth=2,n_estimators=100) 
start_time=time.time()
gbm.fit(train_features, train_labels)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))


Training  model takes 117.436 seconds


In [148]:
print('Accuracy of the GBM on test set: {:.3f}'.format(gbm.score(test_features,test_labels)))

start = time.time()
prediction = gbm.predict(test_features)
end = time.time()

predprob = gbm.predict_proba(test_features)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(test_labels)!= prediction))
print('Classification report \n', classification_report(test_labels, prediction))

#Since the class distribution is imbalanced/ skewed, we should look at the confusion matrix and AUC
print('Confusion Matrix \n', confusion_matrix(test_labels, prediction))
print('AUC is: {:.4f}'.format(roc_auc_score(test_labels, predprob)))


Accuracy of the GBM on test set: 0.797
Predicting test data takes 0.025 seconds
Classification error rate: 0.20333333333333334
Classification report 
               precision    recall  f1-score   support

           0       0.80      0.98      0.88       461
           1       0.74      0.19      0.30       139

    accuracy                           0.80       600
   macro avg       0.77      0.58      0.59       600
weighted avg       0.79      0.80      0.75       600

Confusion Matrix 
 [[452   9]
 [113  26]]
AUC is: 0.7969


## 4. GBM Cross Validation and Parameter tuning

### 4.1 CV on GBM learning rate and max_depth

In [23]:
param_grid = {'learning_rate':[0.05,0.1], 'max_depth': [1,2,3]}
grid = GridSearchCV(GradientBoostingClassifier(),param_grid,refit=True,verbose=3)
grid.fit(train_features,train_labels)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] learning_rate=0.05, max_depth=1 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... learning_rate=0.05, max_depth=1, score=0.821, total=  45.8s
[CV] learning_rate=0.05, max_depth=1 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   45.7s remaining:    0.0s


[CV] ..... learning_rate=0.05, max_depth=1, score=0.812, total=  47.3s
[CV] learning_rate=0.05, max_depth=1 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s


[CV] ..... learning_rate=0.05, max_depth=1, score=0.815, total=  46.6s
[CV] learning_rate=0.05, max_depth=1 .................................
[CV] ..... learning_rate=0.05, max_depth=1, score=0.817, total=  47.4s
[CV] learning_rate=0.05, max_depth=1 .................................
[CV] ..... learning_rate=0.05, max_depth=1, score=0.817, total=  46.8s
[CV] learning_rate=0.05, max_depth=2 .................................
[CV] ..... learning_rate=0.05, max_depth=2, score=0.825, total= 1.6min
[CV] learning_rate=0.05, max_depth=2 .................................
[CV] ..... learning_rate=0.05, max_depth=2, score=0.823, total= 1.6min
[CV] learning_rate=0.05, max_depth=2 .................................
[CV] ..... learning_rate=0.05, max_depth=2, score=0.817, total= 1.6min
[CV] learning_rate=0.05, max_depth=2 .................................
[CV] ..... learning_rate=0.05, max_depth=2, score=0.842, total= 1.5min
[CV] learning_rate=0.05, max_depth=2 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 47.0min finished


GridSearchCV(estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.05, 0.1], 'max_depth': [1, 2, 3]},
             verbose=3)

In [129]:
print(grid.best_params_)
print(grid.best_estimator_)


{'learning_rate': 0.1, 'max_depth': 2}
GradientBoostingClassifier(max_depth=2)


### 4.2 CV on GBM with n_estimators 

In [29]:
param_grid2 = {'n_estimators':[50,100,250,500]} 
grid2 = GridSearchCV(GradientBoostingClassifier(learning_rate = 0.1, max_depth = 2),param_grid= param_grid2,refit=True,verbose=3)
grid2.fit(train_features,train_labels)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... n_estimators=50, score=0.823, total=  45.4s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   45.3s remaining:    0.0s


[CV] ..................... n_estimators=50, score=0.823, total=  48.9s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s


[CV] ..................... n_estimators=50, score=0.812, total=  47.0s
[CV] n_estimators=50 .................................................
[CV] ..................... n_estimators=50, score=0.838, total=  44.4s
[CV] n_estimators=50 .................................................
[CV] ..................... n_estimators=50, score=0.825, total=  44.2s
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.833, total= 1.5min
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.833, total= 1.5min
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.821, total= 1.5min
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.846, total= 1.5min
[CV] n_estimators=100 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 67.7min finished


GridSearchCV(estimator=GradientBoostingClassifier(max_depth=2),
             param_grid={'n_estimators': [50, 100, 250, 500]}, verbose=3)

In [31]:
print(grid2.best_params_)
print(grid2.best_estimator_)
grid2_predictions = grid2.predict(test_features)
print(confusion_matrix(test_labels,grid2_predictions))
print(classification_report(test_labels,grid2_predictions))

{'n_estimators': 500}
GradientBoostingClassifier(max_depth=2, n_estimators=500)
[[439  22]
 [ 88  51]]
              precision    recall  f1-score   support

           0       0.83      0.95      0.89       461
           1       0.70      0.37      0.48       139

    accuracy                           0.82       600
   macro avg       0.77      0.66      0.68       600
weighted avg       0.80      0.82      0.79       600



## 5. Final Parameter for baseline GBM set at: learning_rate=0.1, n_estimators=500, max_depth=2

In [36]:
#Training baseline: GBM using best parameters found above through CV

gbm_best = GradientBoostingClassifier(learning_rate=0.1,max_depth=2,n_estimators=500) 
start_time=time.time()
gbm_best.fit(train_features, train_labels)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))


Training  model takes 900.249 seconds


In [63]:
print('Accuracy of the GBM on test set: {:.3f}'.format(gbm_best.score(test_features,test_labels)))

start = time.time()
baseline_pred = gbm_best.predict(test_features)
end = time.time()

baseline_predprob = gbm_best.predict_proba(test_features)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(test_labels)!= baseline_pred))
print('Classification report \n', classification_report(test_labels, baseline_pred))

#Since the class distribution is imbalanced/ skewed, we should look at the confusion matrix and AUC
print('Confusion Matrix \n', confusion_matrix(test_labels, baseline_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(test_labels, baseline_predprob)))

Accuracy of the GBM on test set: 0.820
Predicting test data takes 0.024 seconds
Classification error rate: 0.18
Classification report 
               precision    recall  f1-score   support

           0       0.83      0.96      0.89       461
           1       0.72      0.37      0.49       139

    accuracy                           0.82       600
   macro avg       0.78      0.66      0.69       600
weighted avg       0.81      0.82      0.80       600

Confusion Matrix 
 [[441  20]
 [ 88  51]]
AUC is: 0.8100


### Cross validation improved accuracy from 0.797 to 0.82, and AUC from 0.797 to 0.81

In [38]:
#Save best gbm model
pickle.dump(gbm_best, open('../data/output/baseline_gbm.p','wb'))

#Load gbm model
#pickle.load(open('../data/output/baseline_gbm.p,'rb'))

## 6. Run best GBM on BALANCED DATA SET

In [63]:
# GBM using best parameters found above through CV
gbm_best = GradientBoostingClassifier(learning_rate=0.1,max_depth=2,n_estimators=500) 
start_time=time.time()
gbm_best.fit(X_train, y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

Training  model takes 1123.198 seconds


In [67]:
print('Accuracy of the GBM on test set: {:.3f}'.format(gbm_best.score(X_test,y_test)))

start = time.time()
balanced_pred = gbm_best.predict(X_test)
end = time.time()

balanced_predprob = gbm_best.predict_proba(X_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_test)!= balanced_pred))
print('Classification report \n', classification_report(y_test, balanced_pred))

#Since the class distribution is imbalanced/ skewed, we should look at the confusion matrix and AUC
print('Confusion Matrix \n', confusion_matrix(y_test, balanced_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_test, balanced_predprob)))

Accuracy of the GBM on test set: 0.853
Predicting test data takes 0.028 seconds
Classification error rate: 0.14672216441207075
Classification report 
               precision    recall  f1-score   support

           0       0.85      0.86      0.85       477
           1       0.86      0.85      0.85       484

    accuracy                           0.85       961
   macro avg       0.85      0.85      0.85       961
weighted avg       0.85      0.85      0.85       961

Confusion Matrix 
 [[410  67]
 [ 74 410]]
AUC is: 0.9334


### Balancing the data set improved accuracy from 0.82 to 0.85, and AUC from 0.81 to 0.93

# Part II: Other Models 

## 1. Standard SVM model using the BALANCED DATA SET

### 1.1 Cross Validation on standard SVM 

In [46]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
grid_svm = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid_svm.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.501, total=  54.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.9s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.501, total=  52.9s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.501, total= 1.3min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.501, total=  54.3s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.501, total= 1.0min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.501, total= 1.2min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.501, total=  54.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.501, total=  50.8s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.501, total=  54.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.501, total=  43.7s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.502, total=  43.9s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.503, total=  44.7s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.503, total=  44.4s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.507, total=  44.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.501, total=  43.3s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.502, total=  44.2s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.503, total=  45.2s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.507, total=  46.7s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.510, total=  46.5s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.510, total=  50.1s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.508, total=  44.0s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.516, total=  47.6s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.520, total=  45.9s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 99.6min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [48]:
print(grid_svm.best_params_)
print(grid_svm.best_estimator_)


{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=10, gamma=0.0001)


### 1.2 Training Standard SVM using best parameters found through CV on BALANCED DATA SET

In [49]:
svm_best = SVC(C=10,gamma=0.0001,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_train, y_train)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_test,y_test)))

start = time.time()
svm_pred = svm_best.predict(X_test)
end = time.time()

svm_predprob = svm_best.predict_proba(X_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_test)!= svm_pred))
print('Classification report \n', classification_report(y_test, svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_test, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_test, svm_predprob)))

Training  model takes 278.711 seconds
Accuracy of SVM on test set: 0.515
Predicting test data takes 14.369 seconds
Classification error rate: 0.48491155046826223
Classification report 
               precision    recall  f1-score   support

           0       0.51      1.00      0.67       477
           1       0.91      0.04      0.08       484

    accuracy                           0.52       961
   macro avg       0.71      0.52      0.37       961
weighted avg       0.71      0.52      0.37       961

Confusion Matrix 
 [[475   2]
 [464  20]]
AUC is: 0.5185


In [50]:
#Save trained SVM model
pickle.dump(svm_best, open('../data/output/best_svm.p','wb'))

#Load trained SVM model
#pickle.load(open('../data/output/best_svm.p','rb'))

## 2. Weighted SVM on ORIGINAL UNBALANCED DATA SET

In [52]:
#Reference: https://machinelearningmastery.com/cost-sensitive-svm-for-imbalanced-classification

weighted_svm = SVC(gamma = 'scale', class_weight = 'balanced')

#CV Weighted SVM 
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(weighted_svm, train_features, train_labels, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.695


### 2.1 Cross Validation on Weighted SVM with different class_weights

In [53]:
#Grid Search on Weighted SVM
# define grid
balance = [{0:598.0, 1:2402.0},{0:1,1:100}, {0:1,1:10}, {0:1,1:1}, {0:10,1:1}, {0:100,1:1}]
param_grid = dict(class_weight=balance)

grid_weightedsvm = GridSearchCV(estimator=weighted_svm, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
grid_weightedsvm.fit(train_features,train_labels)
grid_weightedsvm.best_params_
grid_weightedsvm.best_estimator_

SVC(class_weight={0: 598.0, 1: 2402.0})

In [54]:
grid_weightedsvm.best_params_
grid_weightedsvm.best_estimator_

SVC(class_weight={0: 598.0, 1: 2402.0})

In [55]:
# Report the best configuration
print("Best: %f using %s" % (grid_weightedsvm.best_score_, grid_weightedsvm.best_params_))
# report all configurations
means = grid_weightedsvm.cv_results_['mean_test_score']
stds = grid_weightedsvm.cv_results_['std_test_score']
params = grid_weightedsvm.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.827847 using {'class_weight': {0: 598.0, 1: 2402.0}}
0.827847 (0.029792) with: {'class_weight': {0: 598.0, 1: 2402.0}}
0.794395 (0.034848) with: {'class_weight': {0: 1, 1: 100}}
0.781602 (0.033008) with: {'class_weight': {0: 1, 1: 10}}
0.797094 (0.025194) with: {'class_weight': {0: 1, 1: 1}}
0.794015 (0.032520) with: {'class_weight': {0: 10, 1: 1}}
0.794015 (0.032520) with: {'class_weight': {0: 100, 1: 1}}


### 2.2 Training weighted SVM using best parameters found above through CV on ORIGINAL DATA

In [56]:
weighted_svm_best = SVC(gamma = 'scale', class_weight ={0: 598.0, 1: 2402.0},probability=True)
start_time=time.time()
weighted_svm_best.fit(train_features, train_labels)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

print('Accuracy of weighted SVM on test set: {:.3f}'.format(weighted_svm_best.score(test_features,test_labels)))

start = time.time()
weighted_svm_pred = weighted_svm_best.predict(test_features)
end = time.time()
print(weighted_svm_pred[0:5,])

weighted_svm_predprob = weighted_svm_best.predict_proba(test_features)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(test_labels)!= weighted_svm_pred))
print('Classification report \n', classification_report(test_labels, weighted_svm_pred))

print('Confusion Matrix \n', confusion_matrix(test_labels, weighted_svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(test_labels, weighted_svm_predprob)))

Training  model takes 59.17 seconds
Accuracy of weighted SVM on test set: 0.775
[0 0 0 1 0]
Predicting test data takes 2.268 seconds
Classification error rate: 0.225
Classification report 
               precision    recall  f1-score   support

           0       0.88      0.82      0.85       461
           1       0.51      0.62      0.56       139

    accuracy                           0.78       600
   macro avg       0.69      0.72      0.70       600
weighted avg       0.79      0.78      0.78       600

Confusion Matrix 
 [[379  82]
 [ 53  86]]
AUC is: 0.7882


In [57]:
#Save trained weighted SVM model
pickle.dump(weighted_svm_best, open('../data/output/best_weighted_svm.p','wb'))

#Load trained weighted SVM model
#pickle.load(open('../data/output/best_weighted_svm.p','rb'))

## 3. KNN Classifier model using the BALANCED DATA SET 

### 3.1 Cross validation on KNN Classifier using different n_neighbors numbers

In [58]:
#Define KNN Classifier
knn = KNeighborsClassifier()

#CV on KNN
param_grid = {'n_neighbors': np.arange(1,44,4)} 
grid_knn = GridSearchCV(knn,param_grid,refit=True,verbose=3)
grid_knn.fit(X_train,y_train)


Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] n_neighbors=1 ...................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....................... n_neighbors=1, score=0.754, total=  10.3s
[CV] n_neighbors=1 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.2s remaining:    0.0s


[CV] ....................... n_neighbors=1, score=0.735, total=   9.6s
[CV] n_neighbors=1 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   19.8s remaining:    0.0s


[CV] ....................... n_neighbors=1, score=0.728, total=  10.1s
[CV] n_neighbors=1 ...................................................
[CV] ....................... n_neighbors=1, score=0.746, total=  11.9s
[CV] n_neighbors=1 ...................................................
[CV] ....................... n_neighbors=1, score=0.732, total=  10.2s
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.697, total=  11.0s
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.709, total=  11.8s
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.666, total=  11.7s
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.691, total=  11.3s
[CV] n_neighbors=5 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed: 10.8min finished


GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41])},
             verbose=3)

In [59]:
print(grid_knn.best_params_)
print(grid_knn.best_estimator_)

{'n_neighbors': 1}
KNeighborsClassifier(n_neighbors=1)


### 3.2 Training KNN classifier using best parameters found above through CV

In [60]:
#Training standard KNN using best parameters found above through CV on BALANCED DATA SET

knn_best = KNeighborsClassifier(n_neighbors=1) 
start_time=time.time()
knn_best.fit(X_train, y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

#predictors=list(train_features)

print('Accuracy of knn on test set: {:.3f}'.format(knn_best.score(X_test,y_test)))

start = time.time()
knn_pred = knn_best.predict(X_test)
end = time.time()

knn_predprob = knn_best.predict_proba(X_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_test)!= knn_pred))
print('Classification report \n', classification_report(y_test, knn_pred))

print('Confusion Matrix \n', confusion_matrix(y_test, knn_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_test, knn_predprob)))

Training  model takes 2.322 seconds
Accuracy of knn on test set: 0.734
Predicting test data takes 15.29 seconds
Classification error rate: 0.2663891779396462
Classification report 
               precision    recall  f1-score   support

           0       0.84      0.57      0.68       477
           1       0.68      0.90      0.77       484

    accuracy                           0.73       961
   macro avg       0.76      0.73      0.73       961
weighted avg       0.76      0.73      0.73       961

Confusion Matrix 
 [[271 206]
 [ 50 434]]
AUC is: 0.7324


In [62]:
#Save trained KNN model
pickle.dump(knn_best, open('../data/output/best_knn.p','wb'))

#Load trained KNN model
#pickle.load(open('../data/output/best_knn.p','rb'))