Import Packages

In [81]:
import numpy as np
import scipy.io
import sklearn.metrics
import sklearn 
import os
import random
import pandas as pd
import time
import matplotlib.pyplot as plt
import smote_variants as sv
import imbalanced_databases as imbd
import random
random.seed(2021)

# set path

In [66]:
path = "C:/Users/liqia/Spring2021-Project3-group-3-master/data//train_set"
image_dir = path+"images/"
pt_dir = path+"points/"
label_path = path+"label.csv"

# read files

In [67]:
def read_data(path):
  
  # read labels
  labels = pd.read_csv(path+'/label.csv')
  y= labels['label'].to_numpy()

  # read points
  n = 3000
  for i in range(1,n+1):
    p_path = str(i).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords = mat['faceCoordinatesUnwarped'] 
    else:
      cords = mat['faceCoordinates2']

    distance = sklearn.metrics.pairwise_distances(cords)       
          # compute the pairwise distances in each mat
    flatten_distance = distance[np.triu_indices(len(cords[:,0]), k = 1)]    
          # stretch the upper triangle of the symmetric matrix 
          # to a long array with dimension 3003
          # 3003 = (1+77)*78/2
    if i==1:
      distances = np.mat([flatten_distance])
    else:
      distances = np.append(distances, np.mat([flatten_distance]), axis = 0)
  return (distances, y)

In [68]:
read_time_start=time.time()
X, Y = read_data(path)
print("Read the original dataset takes %s seconds" % round((time.time() - read_time_start),3))

Read the original dataset takes 55.406 seconds


In [69]:
X.shape, Y.shape 

((3000, 3003), (3000,))

Data Preprocessing Imbalanced Dataset (SMOTE)-oversamplig

In [70]:
print('majority class: %d' % np.sum(Y == 0))
print('minority class: %d' % np.sum(Y == 1))
#imbalanced dataset

majority class: 2402
minority class: 598


In [71]:
def data_preprocessing(X, Y, path):

  distances = X
  y = Y

  n = y.shape[0]
  mat_1 = np.add(np.where(y == 1),1)
  n_oversample = (n-sum(y))-sum(y) 
    # how many samples do we need to generate

  for i in range(n_oversample):
    samples_index = random.sample(list(list(mat_1)[0]), 2)
      # pick two random index of class 1 samples. 

    p_path = str(samples_index[0]).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords_0 = mat['faceCoordinatesUnwarped'] 
    else:
      cords_0 = mat['faceCoordinates2']
    
    p_path = str(samples_index[1]).zfill(4)+'.mat'
    mat = scipy.io.loadmat(path+'/points/'+p_path)
    if 'faceCoordinatesUnwarped' in mat:
      cords_1 = mat['faceCoordinatesUnwarped'] 
    else:
      cords_1 = mat['faceCoordinates2']

    cords_new = (cords_0 + cords_1) / 2 
        # averaging two sets of cordinates to generate new set of cordinates
    distance = sklearn.metrics.pairwise_distances(cords_new)
        # compute the pairwise distances in each mat
    flatten_distance = distance[np.triu_indices(len(cords_new[:,0]), k = 1)]
        # stretch the upper triangle of the symmetric matrix 
        # to a long array with dimension 3003
        # 3003 = (1+77)*78/2
    
    distances = np.append(distances, np.mat([flatten_distance]), axis = 0)
    y = np.append(y,np.array(1))
        # Append new data to the original dataset

  return (distances, y)

In [72]:
Balanced_X, Blanced_Y = data_preprocessing(X, Y, path)
Balanced_X.shape, Blanced_Y.shape

((4804, 3003), (4804,))

# split data

In [82]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(Balanced_X, Blanced_Y,test_size=0.2,random_state=0)

In [83]:
#split train and test with size=0.20.2

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,Y,test_size=0.2,random_state=0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2400, 3003), (2400,), (600, 3003), (600,))

# different models

# SVM

In [85]:
# linear
from sklearn.svm import SVC
from sklearn.metrics import classification_report

svc_linear=SVC(kernel='linear',class_weight='balanced')
start_time=time.time()
svc_linear.fit(X_train,y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

print('Accuracy of SVM on test set: {:.3f}'.format(svc_linear.score(X,Y)))
start = time.time()
svm_linear_pred = svc_linear.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,svm_linear_pred))

Training  model takes 1655.733 seconds
Accuracy of SVM on test set: 0.851
Predicting test data takes 0.63 seconds
              precision    recall  f1-score   support

           0       0.90      0.79      0.84       479
           1       0.44      0.64      0.52       121

    accuracy                           0.76       600
   macro avg       0.67      0.72      0.68       600
weighted avg       0.81      0.76      0.78       600



In [86]:
# rbf
from sklearn.svm import SVC
from sklearn.metrics import classification_report

svc_rbf=SVC(kernel='rbf',class_weight='balanced')
start_time=time.time()
svc_rbf.fit(X_train,y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

print('Accuracy of SVM on test set: {:.3f}'.format(svc_rbf.score(X,Y)))
start = time.time()
svm_rbf_pred = svc_rbf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,svm_rbf_pred))

Training  model takes 9.602 seconds
Accuracy of SVM on test set: 0.604
Predicting test data takes 4.184 seconds
              precision    recall  f1-score   support

           0       0.88      0.54      0.67       479
           1       0.28      0.72      0.41       121

    accuracy                           0.58       600
   macro avg       0.58      0.63      0.54       600
weighted avg       0.76      0.58      0.62       600



In [87]:
# poly
svc_poly=SVC(kernel='poly',class_weight='balanced')
start_time=time.time()
svc_poly.fit(X_train,y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

print('Accuracy of SVM on test set: {:.3f}'.format(svc_poly.score(X,Y)))
start = time.time()
svm_poly_pred = svc_poly.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,svm_poly_pred))

Training  model takes 7.902 seconds
Accuracy of SVM on test set: 0.751
Predicting test data takes 1.646 seconds
              precision    recall  f1-score   support

           0       0.91      0.71      0.80       479
           1       0.39      0.71      0.50       121

    accuracy                           0.71       600
   macro avg       0.65      0.71      0.65       600
weighted avg       0.80      0.71      0.74       600



# knn

In [88]:
from sklearn.neighbors import NearestCentroid
import numpy as np
from sklearn.metrics import classification_report

In [89]:
clf = NearestCentroid()
start_time=time.time()
clf.fit(X_train, y_train)
NearestCentroid()
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

start = time.time()
pre=clf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,pre))

Training  model takes 0.06 seconds
Predicting test data takes 0.029 seconds
              precision    recall  f1-score   support

           0       0.83      0.53      0.65       479
           1       0.24      0.59      0.34       121

    accuracy                           0.54       600
   macro avg       0.54      0.56      0.49       600
weighted avg       0.71      0.54      0.58       600



In [91]:
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline

for k in range(3,10):
    nca = NeighborhoodComponentsAnalysis(random_state=42)
    knn = KNeighborsClassifier(n_neighbors=k)
    nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
    nca_pipe.fit(X_train, y_train)
    pre=nca_pipe.predict(X_test)
    # Pipeline(...)
    print('the score of a model with k = %d is %f' % (k, nca_pipe.score(X_test, y_test)))
    print('the recall of a model with k = %d is %f' % (k, recall_score(y_test, pre)))
    
    

from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
k=3
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=k)
nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
nca_pipe.fit(X_train, y_train)
# Pipeline(...)
pre=nca_pipe.predict(X_test)
# Pipeline(...)
print('the score of a model with k = %d is %f' % (k, nca_pipe.score(X_test, y_test)))
print('the recall of a model with k = %d is %f' % (k, recall_score(y_test, pre)))


from sklearn.metrics import classification_report

pre=nca_pipe.predict(X_test)
print(classification_report(y_test,pre))

the score of a model with k = 3 is 0.776667
the recall of a model with k = 3 is 0.223140
the score of a model with k = 4 is 0.800000
the recall of a model with k = 4 is 0.090909
the score of a model with k = 5 is 0.786667
the recall of a model with k = 5 is 0.148760
the score of a model with k = 6 is 0.800000
the recall of a model with k = 6 is 0.082645
the score of a model with k = 7 is 0.790000
the recall of a model with k = 7 is 0.115702
the score of a model with k = 8 is 0.810000
the recall of a model with k = 8 is 0.082645
the score of a model with k = 9 is 0.803333
the recall of a model with k = 9 is 0.132231
the score of a model with k = 3 is 0.776667
the recall of a model with k = 3 is 0.223140
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       479
           1       0.40      0.22      0.29       121

    accuracy                           0.78       600
   macro avg       0.61      0.57      0.58       600
weighted avg    

# Stochastic Gradient Descent

In [92]:
# SGD with penalty=l1
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss="log", penalty="l1", max_iter=200, shuffle=True, class_weight='balanced')
start_time=time.time()
clf.fit(X_train, y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

from sklearn.metrics import classification_report
start = time.time()
pre=clf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,pre))

Training  model takes 2.677 seconds
Predicting test data takes 0.009 seconds
              precision    recall  f1-score   support

           0       0.87      0.77      0.82       479
           1       0.37      0.53      0.43       121

    accuracy                           0.72       600
   macro avg       0.62      0.65      0.62       600
weighted avg       0.77      0.72      0.74       600



In [93]:
# SGD with penalty=12
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss="log", penalty="l2", max_iter=200, shuffle=True, class_weight='balanced')
start_time=time.time()
clf.fit(X_train, y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

from sklearn.metrics import classification_report
start = time.time()
pre=clf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,pre))

Training  model takes 1.978 seconds
Predicting test data takes 0.003 seconds
              precision    recall  f1-score   support

           0       0.93      0.56      0.70       479
           1       0.32      0.83      0.47       121

    accuracy                           0.62       600
   macro avg       0.63      0.70      0.58       600
weighted avg       0.81      0.62      0.65       600



# Neural Networks

In [94]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False,
              epsilon=1e-08, hidden_layer_sizes=(5,2),
              learning_rate='constant', learning_rate_init=0.001,
              max_iter=200, momentum=0.9, n_iter_no_change=10,
              nesterovs_momentum=True, power_t=0.5, random_state=2,
              shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
start_time=time.time()
clf.fit(X_train, y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

from sklearn.metrics import classification_report
start = time.time()
pre=clf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,pre))

Training  model takes 0.313 seconds
Predicting test data takes 0.007 seconds
              precision    recall  f1-score   support

           0       0.80      1.00      0.89       479
           1       0.00      0.00      0.00       121

    accuracy                           0.80       600
   macro avg       0.40      0.50      0.44       600
weighted avg       0.64      0.80      0.71       600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Decision Tree

In [96]:
from sklearn import tree

clf = tree.DecisionTreeClassifier(max_depth = 30, min_samples_leaf=2, max_leaf_nodes=3, class_weight='balanced')
start_time=time.time()
clf = clf.fit(X_train, y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

from sklearn.metrics import classification_report
start = time.time()
pre=clf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,pre))

Training  model takes 2.343 seconds
Predicting test data takes 0.009 seconds
              precision    recall  f1-score   support

           0       0.86      0.71      0.77       479
           1       0.31      0.53      0.39       121

    accuracy                           0.67       600
   macro avg       0.58      0.62      0.58       600
weighted avg       0.75      0.67      0.70       600



# Random Forest & Adaboost

In [97]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=300, class_weight='balanced')
start_time=time.time()
clf = clf.fit(X_train, y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

from sklearn.metrics import classification_report
start = time.time()
pre=clf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,pre))

Training  model takes 25.457 seconds
Predicting test data takes 0.113 seconds
              precision    recall  f1-score   support

           0       0.81      0.99      0.89       479
           1       0.72      0.11      0.19       121

    accuracy                           0.81       600
   macro avg       0.77      0.55      0.54       600
weighted avg       0.80      0.81      0.75       600



In [107]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=200)
start_time=time.time()
clf = clf.fit(X_train, y_train)
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

from sklearn.metrics import classification_report
start_time=time.time()
pre=clf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,pre))

Training  model takes 186.366 seconds
Predicting test data takes 1463.377 seconds
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       479
           1       0.53      0.42      0.47       121

    accuracy                           0.81       600
   macro avg       0.69      0.66      0.67       600
weighted avg       0.79      0.81      0.80       600



In [108]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100)
start_time=time.time()
clf = clf.fit(X_train, y_train)
#scores = cross_val_score(clf, img_set, label, cv=5)
#print('the 5-fold cross validation score for AdaBoost with 100 estimators is %f' % scores.mean())
print("Training  model takes %s seconds" % round((time.time() - start_time),3))

from sklearn.metrics import classification_report
start_time=time.time()
pre=clf.predict(X_test)
end = time.time()
print("Predicting test data takes %s seconds" % round((end - start),3))
print(classification_report(y_test,pre))

Training  model takes 101.178 seconds
Predicting test data takes 1699.038 seconds
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       479
           1       0.49      0.32      0.39       121

    accuracy                           0.80       600
   macro avg       0.67      0.62      0.63       600
weighted avg       0.77      0.80      0.78       600

