Load Packages

In [1]:
import numpy as np
import scipy.io
import sklearn.metrics
import sklearn 
import os
import random
import pandas as pd
import time

# set path 

In [None]:
cwd = os.getcwd()
upperpath = os.path.dirname(cwd) 
uppupperpath = os.path.abspath(os.path.join(os.getcwd(), ".."))
path = uppupperpath + '/data/train_set/'
image_dir = path+"images/"
pt_dir = path+"points/"
label_path = path+"label.csv"

# read files

In [3]:
def read_data(path):
  
  # read labels
  labels = pd.read_csv(path+'/label.csv')
  y= labels['label'].to_numpy()

  # read points
  n = 3000
  for i in range(1,n+1):
    p_path = str(i).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords = mat['faceCoordinatesUnwarped'] 
    else:
      cords = mat['faceCoordinates2']

    distance = sklearn.metrics.pairwise_distances(cords)       
          # compute the pairwise distances in each mat
    flatten_distance = distance[np.triu_indices(len(cords[:,0]), k = 1)]    
          # stretch the upper triangle of the symmetric matrix 
          # to a long array with dimension 3003
          # 3003 = (1+77)*78/2
    if i==1:
      distances = np.mat([flatten_distance])
    else:
      distances = np.append(distances, np.mat([flatten_distance]), axis = 0)
  return (distances, y)

In [4]:
read_time_start=time.time()
X, Y = read_data(path)
print("Read the original dataset takes %s seconds" % round((time.time() - read_time_start),3))

Read the original dataset takes 81.951 seconds


In [5]:
X.shape, Y.shape 

((3000, 3003), (3000,))

Data Preprocessing Imbalanced Dataset (SMOTE)-oversamplig

In [6]:
print('majority class: %d' % np.sum(Y == 0))
print('minority class: %d' % np.sum(Y == 1))
#imbalanced dataset

majority class: 2402
minority class: 598


# oversample then split data

oversample

In [7]:
def data_preprocessing(X, Y, path):

  distances = X
  y = Y

  n = y.shape[0]
  mat_1 = np.add(np.where(y == 1),1)
  n_oversample = (n-sum(y))-sum(y) 
    # how many samples do we need to generate

  for i in range(n_oversample):
    samples_index = random.sample(list(list(mat_1)[0]), 2)
      # pick two random index of class 1 samples. 

    p_path = str(samples_index[0]).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords_0 = mat['faceCoordinatesUnwarped'] 
    else:
      cords_0 = mat['faceCoordinates2']
    
    p_path = str(samples_index[1]).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords_1 = mat['faceCoordinatesUnwarped'] 
    else:
      cords_1 = mat['faceCoordinates2']

    cords_new = (cords_0 + cords_1) / 2 
        # averaging two sets of cordinates to generate new set of cordinates
    distance = sklearn.metrics.pairwise_distances(cords_new)
        # compute the pairwise distances in each mat
    flatten_distance = distance[np.triu_indices(len(cords_new[:,0]), k = 1)]
        # stretch the upper triangle of the symmetric matrix 
        # to a long array with dimension 3003
        # 3003 = (1+77)*78/2
    
    distances = np.append(distances, np.mat([flatten_distance]), axis = 0)
    y = np.append(y,np.array(1))
        # Append new data to the original dataset

  return (distances, y)

In [8]:
X_balanced, Y_balanced = data_preprocessing(X, Y, path)
X_balanced.shape, Y_balanced.shape

((4804, 3003), (4804,))

In [9]:
print('majority class: %d' % np.sum(Y_balanced == 0))
print('minority class: %d' % np.sum(Y_balanced == 1))
#balanced dataset

majority class: 2402
minority class: 2402


split data

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import smote_variants as sv
import imbalanced_databases as imbd

In [11]:
from sklearn.model_selection import train_test_split
X_balanced_train,X_balanced_test,y_balanced_train,y_balanced_test= train_test_split(X_balanced, Y_balanced,test_size=0.2,random_state=0)
X_balanced_train.shape,X_balanced_test.shape,y_balanced_train.shape,y_balanced_test.shape

((3843, 3003), (961, 3003), (3843,), (961,))

In [12]:
print('majority train class: %d' % np.sum(y_balanced_train == 0))
print('minority train class: %d' % np.sum(y_balanced_train == 1))
print('majority test class: %d' % np.sum(y_balanced_test == 0))
print('minority test class: %d' % np.sum(y_balanced_test == 1))

majority train class: 1918
minority train class: 1925
majority test class: 484
minority test class: 477


# split then oversample

split

In [13]:
X.shape, Y.shape 

((3000, 3003), (3000,))

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.20,random_state=0)
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((2400, 3003), (2400,), (600, 3003), (600,))

In [15]:
print('majority train class: %d' % np.sum(Y_train == 0))
print('minority train class: %d' % np.sum(Y_train == 1))
print('majority test class: %d' % np.sum(Y_test == 0))
print('minority test class: %d' % np.sum(Y_test == 1))
#imbalanced dataset

majority train class: 1923
minority train class: 477
majority test class: 479
minority test class: 121


SMOTE overdampling

In [16]:
def data_preprocessing(X_train, Y_train, path):

  distances = X_train
  y = Y_train

  n = y.shape[0]
  mat_1 = np.add(np.where(y == 1),1)
  n_oversample = (n-sum(y))-sum(y) 
    # how many samples do we need to generate

  for i in range(n_oversample):
    samples_index = random.sample(list(list(mat_1)[0]), 2)
      # pick two random index of class 1 samples. 

    p_path = str(samples_index[0]).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords_0 = mat['faceCoordinatesUnwarped'] 
    else:
      cords_0 = mat['faceCoordinates2']
    
    p_path = str(samples_index[1]).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords_1 = mat['faceCoordinatesUnwarped'] 
    else:
      cords_1 = mat['faceCoordinates2']

    cords_new = (cords_0 + cords_1) / 2 
        # averaging two sets of cordinates to generate new set of cordinates
    distance = sklearn.metrics.pairwise_distances(cords_new)
        # compute the pairwise distances in each mat
    flatten_distance = distance[np.triu_indices(len(cords_new[:,0]), k = 1)]
        # stretch the upper triangle of the symmetric matrix 
        # to a long array with dimension 3003
        # 3003 = (1+77)*78/2
    
    distances = np.append(distances, np.mat([flatten_distance]), axis = 0)
    y = np.append(y,np.array(1))
        # Append new data to the original dataset

  return (distances, y)

In [17]:
X_train_balanced, y_train_balanced = data_preprocessing(X_train, Y_train, path)
X_train_balanced.shape, y_train_balanced.shape

((3846, 3003), (3846,))

In [18]:
def data_preprocessing(X_test, Y_test, path):

  distances = X_test
  y = Y_test

  n = y.shape[0]
  mat_1 = np.add(np.where(y == 1),1)
  n_oversample = (n-sum(y))-sum(y) 
    # how many samples do we need to generate

  for i in range(n_oversample):
    samples_index = random.sample(list(list(mat_1)[0]), 2)
      # pick two random index of class 1 samples. 

    p_path = str(samples_index[0]).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords_0 = mat['faceCoordinatesUnwarped'] 
    else:
      cords_0 = mat['faceCoordinates2']
    
    p_path = str(samples_index[1]).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords_1 = mat['faceCoordinatesUnwarped'] 
    else:
      cords_1 = mat['faceCoordinates2']

    cords_new = (cords_0 + cords_1) / 2 
        # averaging two sets of cordinates to generate new set of cordinates
    distance = sklearn.metrics.pairwise_distances(cords_new)
        # compute the pairwise distances in each mat
    flatten_distance = distance[np.triu_indices(len(cords_new[:,0]), k = 1)]
        # stretch the upper triangle of the symmetric matrix 
        # to a long array with dimension 3003
        # 3003 = (1+77)*78/2
    
    distances = np.append(distances, np.mat([flatten_distance]), axis = 0)
    y = np.append(y,np.array(1))
        # Append new data to the original dataset

  return (distances, y)

In [19]:
X_test_balanced, y_test_balanced = data_preprocessing(X_test, Y_test, path)
X_test_balanced.shape, y_test_balanced.shape

((958, 3003), (958,))

In [20]:
print('majority train class: %d' % np.sum(y_train_balanced == 0))
print('minority train class: %d' % np.sum(y_train_balanced == 1))
print('majority test class: %d' % np.sum(y_test_balanced == 0))
print('minority test class: %d' % np.sum(y_test_balanced == 1))

majority train class: 1923
minority train class: 1923
majority test class: 479
minority test class: 479


# divide into 2 chunk 3 groups 

        
    • balanced train
    
        1.(train with balanced,test with balanced)
           a.X_balanced_train,y_balanced_train;X_balanced_test,y_balanced_test
           b.X_balanced_train,y_balanced_train;X_test_balanced,y_test_balanced 
        
        2.(train with balanced,test with imbalanced) 
            X_train_balanced.y_train_balanced;X_test,Y_test
      

    • imbalanced train
    
        3. (train with imbalanced,test with imbalanced)
            X_train,Y_train; X_test,Y_test 

# try with SVM

In [21]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV,RepeatedStratifiedKFold,cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import scipy.io
import pickle
import os, sys
from scipy.spatial.distance import pdist
import time 
import xlsxwriter
from sklearn.metrics import accuracy_score, classification_report,make_scorer, confusion_matrix,roc_auc_score
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
#from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from sklearn import ensemble
from sklearn.svm import SVC
import random
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Input, Dropout
from keras.layers import BatchNormalization
from keras.models import Model
from keras import initializers
from keras.optimizers import Adam
from keras.utils import to_categorical

1.a. X_balanced_train, y_balanced_train; X_balanced_test, y_balanced_test

In [22]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
grid_svm = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
#train with balanced
grid_svm.fit(X_balanced_train,y_balanced_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  23.1s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  23.7s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  26.2s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  29.9s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  34.1s
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  30.3s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  29.5s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  30.1s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  30.6s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  30.4s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=  30.5s
[CV 2/5] END ..................C=0.1, gamma=0.0

[CV 2/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  27.0s
[CV 3/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  27.2s
[CV 4/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  27.5s
[CV 5/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  32.5s
[CV 1/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  32.3s
[CV 2/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  31.7s
[CV 3/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  27.5s
[CV 4/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  32.9s
[CV 5/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  30.1s
[CV 1/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=  27.4s
[CV 2/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=  28.1s
[CV 3/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=  26.2s
[CV 4/5] END ...............

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [23]:
print(grid_svm.best_params_)
print(grid_svm.best_estimator_)

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=10, gamma=0.0001)


In [24]:
# X_balanced_train, y_balanced_train; X_balanced_test,y_balanced_test
#Train SVM using best parameters
svm_best = SVC(C=10,gamma=0.0001,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_balanced_train, y_balanced_train)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_balanced_test,y_balanced_test)))

start = time.time()
svm_pred = svm_best.predict(X_balanced_test)
end = time.time()

svm_predprob = svm_best.predict_proba(X_balanced_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_balanced_test)!= svm_pred))
print('Classification report \n', classification_report(y_balanced_test, svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_balanced_test, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_balanced_test, svm_predprob)))

Training  model takes 206.468 seconds
Accuracy of SVM on test set: 0.506
Predicting test data takes 13.029 seconds
Classification error rate: 0.49427679500520294
Classification report 
               precision    recall  f1-score   support

           0       0.85      0.02      0.04       484
           1       0.50      1.00      0.67       477

    accuracy                           0.51       961
   macro avg       0.67      0.51      0.36       961
weighted avg       0.67      0.51      0.35       961

Confusion Matrix 
 [[ 11 473]
 [  2 475]]
AUC is: 0.5062


1.b. X_balanced_train, y_balanced_train; X_test_balanced, y_test_balanced 

In [25]:
#Train SVM using best parameters
svm_best = SVC(C=10,gamma=0.0001,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_balanced_train, y_balanced_train)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_test_balanced,y_test_balanced)))

start = time.time()
svm_pred = svm_best.predict(X_test_balanced)
end = time.time()

svm_predprob = svm_best.predict_proba(X_test_balanced)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_test_balanced)!= svm_pred))
print('Classification report \n', classification_report(y_test_balanced, svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_test_balanced, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_test_balanced, svm_predprob)))

Training  model takes 169.537 seconds
Accuracy of SVM on test set: 0.851
Predicting test data takes 13.573 seconds
Classification error rate: 0.14926931106471816
Classification report 
               precision    recall  f1-score   support

           0       0.99      0.71      0.83       479
           1       0.77      0.99      0.87       479

    accuracy                           0.85       958
   macro avg       0.88      0.85      0.85       958
weighted avg       0.88      0.85      0.85       958

Confusion Matrix 
 [[339 140]
 [  3 476]]
AUC is: 0.8769


2. a. X_train_balanced, y_train_balanced; X_test,Y_test

In [26]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
grid_svm = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
#train with balanced
grid_svm.fit(X_train_balanced,y_train_balanced)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  58.7s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  24.1s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  24.7s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  24.6s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=  24.8s
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  24.6s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  24.5s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  24.6s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  24.6s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  24.4s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=  24.4s
[CV 2/5] END ..................C=0.1, gamma=0.0

[CV 2/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  21.7s
[CV 3/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  21.8s
[CV 4/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  21.7s
[CV 5/5] END ....................C=1000, gamma=1, kernel=rbf; total time=  22.0s
[CV 1/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  23.4s
[CV 2/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  21.7s
[CV 3/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  21.7s
[CV 4/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  21.9s
[CV 5/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=  22.2s
[CV 1/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=  21.9s
[CV 2/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=  21.6s
[CV 3/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=  25.7s
[CV 4/5] END ...............

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [27]:
print(grid_svm.best_params_)
print(grid_svm.best_estimator_)

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=10, gamma=0.0001)


In [28]:
# X_train_balanced, y_train_balanced; X_test,Y_test
#Train SVM using best parameters
svm_best = SVC(C=10,gamma=0.0001,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_train_balanced, y_train_balanced)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_test,Y_test)))

start = time.time()
svm_pred = svm_best.predict(X_test)
end = time.time()

svm_predprob = svm_best.predict_proba(X_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(Y_test)!= svm_pred))
print('Classification report \n', classification_report(Y_test, svm_pred))

print('Confusion Matrix \n', confusion_matrix(Y_test, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(Y_test, svm_predprob)))

Training  model takes 136.552 seconds
Accuracy of SVM on test set: 0.798
Predicting test data takes 6.439 seconds
Classification error rate: 0.20166666666666666
Classification report 
               precision    recall  f1-score   support

           0       0.80      1.00      0.89       479
           1       0.00      0.00      0.00       121

    accuracy                           0.80       600
   macro avg       0.40      0.50      0.44       600
weighted avg       0.64      0.80      0.71       600

Confusion Matrix 
 [[479   0]
 [121   0]]
AUC is: 0.4969


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# X_train_balanced, y_train_balanced; X_balanced_test,y_balanced_test
#Train SVM using best parameters
svm_best = SVC(C=10,gamma=0.0001,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_train_balanced, y_train_balanced)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_balanced_test,y_balanced_test)))

start = time.time()
svm_pred = svm_best.predict(X_balanced_test)
end = time.time()

svm_predprob = svm_best.predict_proba(X_balanced_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_balanced_test)!= svm_pred))
print('Classification report \n', classification_report(y_balanced_test, svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_balanced_test, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_balanced_test, svm_predprob)))

Training  model takes 136.394 seconds
Accuracy of SVM on test set: 0.593
Predicting test data takes 10.105 seconds
Classification error rate: 0.4068678459937565
Classification report 
               precision    recall  f1-score   support

           0       0.55      1.00      0.71       484
           1       1.00      0.18      0.31       477

    accuracy                           0.59       961
   macro avg       0.78      0.59      0.51       961
weighted avg       0.77      0.59      0.51       961

Confusion Matrix 
 [[484   0]
 [391  86]]
AUC is: 0.8797


In [30]:
# X_train_balanced, y_train_balanced; X_test_balanced,y_test_balanced 
#Train SVM using best parameters
svm_best = SVC(C=10,gamma=0.0001,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_train_balanced, y_train_balanced)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_test_balanced,y_test_balanced)))

start = time.time()
svm_pred = svm_best.predict(X_test_balanced)
end = time.time()

svm_predprob = svm_best.predict_proba(X_test_balanced)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_test_balanced)!= svm_pred))
print('Classification report \n', classification_report(y_test_balanced, svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_test_balanced, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_test_balanced, svm_predprob)))

Training  model takes 138.405 seconds
Accuracy of SVM on test set: 0.502
Predicting test data takes 10.709 seconds
Classification error rate: 0.4979123173277662
Classification report 
               precision    recall  f1-score   support

           0       0.50      1.00      0.67       479
           1       1.00      0.00      0.01       479

    accuracy                           0.50       958
   macro avg       0.75      0.50      0.34       958
weighted avg       0.75      0.50      0.34       958

Confusion Matrix 
 [[479   0]
 [477   2]]
AUC is: 0.7187


3.  X_train,Y_train; X_test,Y_test

In [31]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
grid_svm = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
#train with balanced
grid_svm.fit(X_train,Y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   9.0s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   9.1s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   8.8s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   8.9s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   9.3s
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   8.7s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   9.9s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   9.7s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   9.0s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   8.1s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   8.0s
[CV 2/5] END ..................C=0.1, gamma=0.0

[CV 2/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   8.5s
[CV 3/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   8.5s
[CV 4/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   8.5s
[CV 5/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   8.5s
[CV 1/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   8.4s
[CV 2/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   8.5s
[CV 3/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   8.4s
[CV 4/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   8.7s
[CV 5/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   8.4s
[CV 1/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   8.5s
[CV 2/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   8.7s
[CV 3/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=  10.4s
[CV 4/5] END ...............

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [32]:
print(grid_svm.best_params_)
print(grid_svm.best_estimator_)

{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
SVC(C=0.1, gamma=1)


In [33]:
# X_train,Y_train; X_test,Y_test
#Train SVM using best parameters
svm_best = SVC(C=0.1,gamma=1,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_train, Y_train)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_test,Y_test)))

start = time.time()
svm_pred = svm_best.predict(X_test)
end = time.time()

svm_predprob = svm_best.predict_proba(X_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(Y_test)!= svm_pred))
print('Classification report \n', classification_report(Y_test, svm_pred))

print('Confusion Matrix \n', confusion_matrix(Y_test, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(Y_test, svm_predprob)))

Training  model takes 53.695 seconds
Accuracy of SVM on test set: 0.798
Predicting test data takes 4.011 seconds
Classification error rate: 0.20166666666666666
Classification report 
               precision    recall  f1-score   support

           0       0.80      1.00      0.89       479
           1       0.00      0.00      0.00       121

    accuracy                           0.80       600
   macro avg       0.40      0.50      0.44       600
weighted avg       0.64      0.80      0.71       600

Confusion Matrix 
 [[479   0]
 [121   0]]
AUC is: 0.5000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
# X_train,Y_train; X_balanced_test,y_balanced_test
#Train SVM using best parameters
svm_best = SVC(C=0.1,gamma=1,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_train, Y_train)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_balanced_test,y_balanced_test)))

start = time.time()
svm_pred = svm_best.predict(X_balanced_test)
end = time.time()

svm_predprob = svm_best.predict_proba(X_balanced_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_balanced_test)!= svm_pred))
print('Classification report \n', classification_report(y_balanced_test, svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_balanced_test, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_balanced_test, svm_predprob)))

Training  model takes 50.256 seconds
Accuracy of SVM on test set: 0.504
Predicting test data takes 6.741 seconds
Classification error rate: 0.4963579604578564
Classification report 
               precision    recall  f1-score   support

           0       0.50      1.00      0.67       484
           1       0.00      0.00      0.00       477

    accuracy                           0.50       961
   macro avg       0.25      0.50      0.33       961
weighted avg       0.25      0.50      0.34       961

Confusion Matrix 
 [[484   0]
 [477   0]]
AUC is: 0.1240


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
# X_train,Y_train; X_test_balanced,y_test_balanced 
#Train SVM using best parameters
svm_best = SVC(C=0.1,gamma=1,kernel='rbf',probability=True) 
start_time=time.time()
svm_best.fit(X_train, Y_train)

print("Training  model takes %s seconds" % round((time.time() - start_time),3))
print('Accuracy of SVM on test set: {:.3f}'.format(svm_best.score(X_test_balanced,y_test_balanced)))

start = time.time()
svm_pred = svm_best.predict(X_test_balanced)
end = time.time()

svm_predprob = svm_best.predict_proba(X_test_balanced)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_test_balanced)!= svm_pred))
print('Classification report \n', classification_report(y_test_balanced, svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_test_balanced, svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_test_balanced, svm_predprob)))

Training  model takes 50.034 seconds
Accuracy of SVM on test set: 0.500
Predicting test data takes 6.845 seconds
Classification error rate: 0.5
Classification report 
               precision    recall  f1-score   support

           0       0.50      1.00      0.67       479
           1       0.00      0.00      0.00       479

    accuracy                           0.50       958
   macro avg       0.25      0.50      0.33       958
weighted avg       0.25      0.50      0.33       958

Confusion Matrix 
 [[479   0]
 [479   0]]
AUC is: 0.5000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


weighted SVM

In [37]:
weighted_svm = SVC(gamma = 'scale', class_weight = 'balanced')

#CV Weighted SVM 
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(weighted_svm, X, Y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.708


In [38]:
balance = [{0:598.0, 1:2402.0},{0:1,1:100}, {0:1,1:10}, {0:1,1:1}, {0:10,1:1}, {0:100,1:1}]
param_grid = dict(class_weight=balance)

grid_weightedsvm = GridSearchCV(estimator=weighted_svm, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
grid_weightedsvm.fit(X,Y)
grid_weightedsvm.best_params_
grid_weightedsvm.best_estimator_
grid_weightedsvm.best_params_
grid_weightedsvm.best_estimator_

SVC(class_weight={0: 598.0, 1: 2402.0})

In [39]:
print("Best: %f using %s" % (grid_weightedsvm.best_score_, grid_weightedsvm.best_params_))
# report all configurations
means = grid_weightedsvm.cv_results_['mean_test_score']
stds = grid_weightedsvm.cv_results_['std_test_score']
params = grid_weightedsvm.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.827703 using {'class_weight': {0: 598.0, 1: 2402.0}}
0.827703 (0.030973) with: {'class_weight': {0: 598.0, 1: 2402.0}}
0.792564 (0.035455) with: {'class_weight': {0: 1, 1: 100}}
0.782922 (0.031263) with: {'class_weight': {0: 1, 1: 10}}
0.798206 (0.034647) with: {'class_weight': {0: 1, 1: 1}}
0.796063 (0.030676) with: {'class_weight': {0: 10, 1: 1}}
0.796048 (0.030652) with: {'class_weight': {0: 100, 1: 1}}


In [40]:
#X_balanced_test,y_balanced_test

weighted_svm_best = SVC(gamma = 'scale', class_weight ={0: 598.0, 1: 2402.0},probability=True)
start_time=time.time()
weighted_svm_best.fit(X, Y)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

print('Accuracy of weighted SVM on test set: {:.3f}'.format(weighted_svm_best.score(X,Y)))

start = time.time()
weighted_svm_pred = weighted_svm_best.predict(X_balanced_test)
end = time.time()
print(weighted_svm_pred[0:5,])

weighted_svm_predprob = weighted_svm_best.predict_proba(X_balanced_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_balanced_test)!= weighted_svm_pred))
print('Classification report \n', classification_report(y_balanced_test, weighted_svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_balanced_test, weighted_svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_balanced_test, weighted_svm_predprob)))

Training  model takes 54.529 seconds
Accuracy of weighted SVM on test set: 0.863
[0 1 1 0 0]
Predicting test data takes 3.94 seconds
Classification error rate: 0.11550468262226847
Classification report 
               precision    recall  f1-score   support

           0       0.92      0.84      0.88       484
           1       0.85      0.93      0.89       477

    accuracy                           0.88       961
   macro avg       0.89      0.88      0.88       961
weighted avg       0.89      0.88      0.88       961

Confusion Matrix 
 [[408  76]
 [ 35 442]]
AUC is: 0.9400


In [43]:
# X_test_balanced,y_test_balanced

weighted_svm_best = SVC(gamma = 'scale', class_weight ={0: 598.0, 1: 2402.0},probability=True)
start_time=time.time()
weighted_svm_best.fit(X, Y)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

print('Accuracy of weighted SVM on test set: {:.3f}'.format(weighted_svm_best.score(X,Y)))

start = time.time()
weighted_svm_pred = weighted_svm_best.predict(X_test_balanced)
end = time.time()
print(weighted_svm_pred[0:5,])

weighted_svm_predprob = weighted_svm_best.predict_proba(X_test_balanced)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(y_test_balanced)!= weighted_svm_pred))
print('Classification report \n', classification_report(y_test_balanced, weighted_svm_pred))

print('Confusion Matrix \n', confusion_matrix(y_test_balanced, weighted_svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(y_test_balanced, weighted_svm_predprob)))

Training  model takes 53.193 seconds
Accuracy of weighted SVM on test set: 0.863
[1 0 1 0 0]
Predicting test data takes 4.188 seconds
Classification error rate: 0.38204592901878914
Classification report 
               precision    recall  f1-score   support

           0       0.58      0.85      0.69       479
           1       0.72      0.39      0.50       479

    accuracy                           0.62       958
   macro avg       0.65      0.62      0.60       958
weighted avg       0.65      0.62      0.60       958

Confusion Matrix 
 [[406  73]
 [293 186]]
AUC is: 0.6809


In [42]:
# X_test,Y_test
weighted_svm_best = SVC(gamma = 'scale', class_weight ={0: 598.0, 1: 2402.0},probability=True)
start_time=time.time()
weighted_svm_best.fit(X, Y)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

print('Accuracy of weighted SVM on test set: {:.3f}'.format(weighted_svm_best.score(X,Y)))

start = time.time()
weighted_svm_pred = weighted_svm_best.predict(X_test)
end = time.time()
print(weighted_svm_pred[0:5,])

weighted_svm_predprob = weighted_svm_best.predict_proba(X_test)[:,1]

print("Predicting test data takes %s seconds" % round((end - start),3))
print('Classification error rate:', np.mean(np.array(Y_test)!= weighted_svm_pred))
print('Classification report \n', classification_report(Y_test, weighted_svm_pred))

print('Confusion Matrix \n', confusion_matrix(Y_test, weighted_svm_pred))
print('AUC is: {:.4f}'.format(roc_auc_score(Y_test, weighted_svm_predprob)))

Training  model takes 56.864 seconds
Accuracy of weighted SVM on test set: 0.863
[1 0 1 0 0]
Predicting test data takes 2.605 seconds
Classification error rate: 0.14
Classification report 
               precision    recall  f1-score   support

           0       0.97      0.85      0.91       479
           1       0.60      0.91      0.72       121

    accuracy                           0.86       600
   macro avg       0.79      0.88      0.81       600
weighted avg       0.90      0.86      0.87       600

Confusion Matrix 
 [[406  73]
 [ 11 110]]
AUC is: 0.9599
