# SVM Model

In [18]:
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import sklearn.svm
from sklearn.svm import SVC
from sklearn import metrics

In [19]:
start_time = time.time()

# loading data and features:

fiducial_pt_full = pd.read_pickle('../output/fiducial_pt_full.pkl')
label_full = pd.read_pickle('../output/label_full.pkl')

# Randomly splitting data into training & test sets:
RSEED = 2020

X_train, X_test, y_train, y_test = train_test_split(fiducial_pt_full, label_full, test_size=0.2, random_state=RSEED)

# Using pairwise distance as feature.
# extracting pairwise distance as features (78*77/2=3303 features)
# nrow=number of records of the dataset; ncol=3303
feature_train = np.stack((metrics.pairwise_distances(x_train[i])[np.triu_indices(78, k = 1)] for i in range(x_train.shape[0])))
feature_test = np.stack((metrics.pairwise_distances(x_test[i])[np.triu_indices(78, k = 1)] for i in range(x_test.shape[0])))

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [23]:
# balanced test dataset
emotion_1 = y_test[y_test == 1]
emotion_0 = y_test[y_test== 0]
feature_1 = feature_test[y_test==1]
feature_0 = feature_test[y_test==0]
bal_feature = np.concatenate((feature_1[0:130],feature_0[0:130]),axis=0)
bal_y = np.concatenate((emotion_1[0:130],emotion_0[0:130]),axis=0)

# Train model:

In [20]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
import time

def clf_metrics(y_true, y_pred, y_score):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_score)
    
    df = pd.DataFrame({'accuracy':[accuracy],'precision': [precision], 'recall': [recall], 'auc':[auc]})
    print(df)

In [22]:
#set weight
weights = np.zeros(len(y_train))
weights[y_train == 0] = 0.5
weights[y_train == 1] = 10
# train svm
import sklearn.svm
from sklearn.svm import SVC
start=time.time()
svc = SVC(kernel= 'linear', random_state = 123, C = 0.0001)
model = svc.fit(feature_train, y_train, sample_weight = weights)
print('Training time cost {:.2f} s'.format(time.time()-start))

Training time cost 27.06 s


In [24]:
pred_train = model.predict(feature_train)
score_train = model.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test = model.predict(feature_test)
score_test = model.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

pred_test = model.predict(bal_feature)
score_test = model.decision_function(bal_feature)
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision    recall       auc
0  0.594167   0.328729  0.995816  0.860555


Test set:
   accuracy  precision    recall       auc
0      0.55   0.299465  0.933333  0.782674


Balanced test set:
   accuracy  precision    recall       auc
0     0.716       0.64  0.933333  0.800641


# Parameter tuning

## 1. Choose the best parameter C

In [26]:
from sklearn.model_selection import GridSearchCV
start=time.time()
param= {'C': [0.0000001,0.000001,0.00001,0.0001,0.001,0.01,1]}
gscv = GridSearchCV(SVC(kernel='linear',random_state = 123), param, cv=5)
gscv.fit(feature_train, y_train)
print('Tuning time cost {:.2f} s'.format(time.time()-start))
gscv.best_params_

Tuning time cost 4219.08 s


{'C': 0.001}

Change C to 0.001

In [27]:
#set weight
weights = np.zeros(len(y_train))
weights[y_train == 0] = 0.5
weights[y_train == 1] = 10
# train svm
import sklearn.svm
from sklearn.svm import SVC
start=time.time()
svc1 = SVC(kernel= 'linear', random_state = 123, C = 0.001)
model1 = svc1.fit(feature_train, y_train, sample_weight = weights)
print('Training time cost {:.2f} s'.format(time.time()-start))

Training time cost 37.47 s


In [28]:
pred_train = model1.predict(feature_train)
score_train = model1.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test = model1.predict(feature_test)
score_test = model1.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

pred_test = model1.predict(bal_feature)
score_test = model1.decision_function(bal_feature)
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall       auc
0  0.717917   0.413853     1.0  0.908177


Test set:
   accuracy  precision    recall       auc
0  0.616667   0.327044  0.866667  0.782847


Balanced test set:
   accuracy  precision    recall       auc
0     0.732   0.670968  0.866667  0.809231


Accuracy is promoted significantly.

## 2. Tune weights

In [29]:
weights1 = np.zeros(len(y_train))
weights1[y_train == 0] = 1
weights1[y_train == 1] = 20

weights2 = np.zeros(len(y_train))
weights2[y_train == 0] = 1
weights2[y_train == 1] = 10

weights3 = np.zeros(len(y_train))
weights3[y_train == 0] = 1
weights3[y_train == 1] = 30

weights4 = np.zeros(len(y_train))
weights4[y_train == 0] = 1
weights4[y_train == 1] = 40

In [30]:
tuning1 = SVC(kernel= 'linear', random_state = 123, C = 0.001)
start = time.time()
tuning1.fit(feature_train, y_train, sample_weight = weights1)
print('Training time cost {:.2f} s'.format(time.time()-start))

Training time cost 43.49 s


In [31]:
pred_train = tuning1.predict(feature_train)
score_train = tuning1.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test = tuning1.predict(feature_test)
score_test = tuning1.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

pred_test = tuning1.predict(bal_feature)
score_test = tuning1.decision_function(bal_feature)
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall       auc
0  0.764583   0.458293     1.0  0.926886


Test set:
   accuracy  precision  recall       auc
0  0.638333   0.335593   0.825  0.783576


Balanced test set:
   accuracy  precision  recall       auc
0     0.716    0.66443   0.825  0.805128


In [34]:
tuning2 = SVC(kernel= 'linear', random_state = 123, C = 0.001)
start = time.time()
tuning2.fit(feature_train, y_train, sample_weight = weights2)
print('Training time cost {:.2f} s'.format(time.time()-start))

Training time cost 52.80 s


In [35]:
pred_train = tuning2.predict(feature_train)
score_train = tuning2.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test = tuning2.predict(feature_test)
score_test = tuning2.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

pred_test = tuning2.predict(bal_feature)
score_test = tuning2.decision_function(bal_feature)
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision    recall       auc
0  0.784583   0.480323  0.995816  0.933136


Test set:
   accuracy  precision    recall      auc
0  0.658333    0.34767  0.808333  0.79125


Balanced test set:
   accuracy  precision    recall       auc
0      0.72   0.673611  0.808333  0.815128


In [36]:
tuning3 = SVC(kernel= 'linear', random_state = 123, C = 0.001)
start = time.time()
tuning3.fit(feature_train, y_train, sample_weight = weights3)
print('Training time cost {:.2f} s'.format(time.time()-start))

Training time cost 51.82 s


In [37]:
pred_train = tuning3.predict(feature_train)
score_train = tuning3.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test = tuning3.predict(feature_test)
score_test = tuning3.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

pred_test = tuning3.predict(bal_feature)
score_test = tuning3.decision_function(bal_feature)
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall       auc
0   0.75875   0.452223     1.0  0.923377


Test set:
   accuracy  precision    recall     auc
0  0.646667   0.342466  0.833333  0.7825


Balanced test set:
   accuracy  precision    recall       auc
0     0.724   0.671141  0.833333  0.804231


In [40]:
tuning4 = SVC(kernel= 'linear', random_state = 123, C = 0.001)
start = time.time()
tuning4.fit(feature_train, y_train, sample_weight = weights4)
print('Training time cost {:.2f} s'.format(time.time()-start))

Training time cost 49.80 s


In [41]:
pred_train = tuning4.predict(feature_train)
score_train = tuning4.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test = tuning4.predict(feature_test)
score_test = tuning4.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

pred_test = tuning4.predict(bal_feature)
score_test = tuning4.decision_function(bal_feature)
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall       auc
0  0.757917   0.451369     1.0  0.922503


Test set:
   accuracy  precision    recall       auc
0  0.643333   0.340136  0.833333  0.782083


Balanced test set:
   accuracy  precision    recall       auc
0     0.724   0.671141  0.833333  0.803013


tunning2 has the best outcome.

### After tuning, we would choose C=0.001, random_state=123, and fit the model with sample_weights ratio of 10.

## Save the model to pickle object:

In [42]:
import scipy.io
import os
import numpy as np
import pandas as pd
import pickle

def get_points(file):
    '''load matlab style file'''
    mat = scipy.io.loadmat(file)
    return mat[list(mat.keys())[3]]

def pickle_save(filename, content):
    '''save the file into python pickle object under output folder'''
    with open('../output/%s.pkl'%filename, 'wb') as f:
        pickle.dump(content, f)
        
def pickle_open(filename):
    '''load the pickle file'''
    with open('../output/%s.pkl'%filename, 'rb') as f:
        content = pickle.load(f)
    return content

In [43]:
# store the trained model
pickle_save('SVM', tuning2)

In [44]:
# load the trained model from file
svm = pickle_open('SVM')

In [45]:
# SVM model performance
pred_train = svm.predict(feature_train)
score_train = svm.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test = svm.predict(feature_test)
score_test = svm.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

pred_test = svm.predict(bal_feature)
score_test = svm.decision_function(bal_feature)
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision    recall       auc
0  0.784583   0.480323  0.995816  0.933136


Test set:
   accuracy  precision    recall      auc
0  0.658333    0.34767  0.808333  0.79125


Balanced test set:
   accuracy  precision    recall       auc
0      0.72   0.673611  0.808333  0.815128


In [46]:
#label_prediction on test set
pred_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0])

In [None]:
#Store it to csv file
