## Step 1: set up controls for evaluation experiments

* (T/F) cross-validation on the training set
* (T/F) reweighting the samples for training set
* $K$, the number of CV folds
* (T/F) process features for training set
* (T/F) run evaluation on an independent test set
* (T/F) process features for test set

## Step 2: load data and train-test split

In [1]:
import scipy.io
import os
import numpy as np
import pandas as pd
import pickle

def get_points(file):
    '''Load the first data variable stored in a MATLAB-style .mat file.

    Parameters
    ----------
    file : str
        Path of the .mat file; passed unchanged to scipy.io.loadmat.

    Returns
    -------
    The value loadmat stored under the first key that is not one of its
    double-underscore metadata entries (such as '__header__').

    Raises
    ------
    IndexError
        If the file holds no data variable at all.
    '''
    mat = scipy.io.loadmat(file)
    # Pick the data by name rather than by position: the number of metadata
    # keys that loadmat puts in front of the data is not the same for every
    # .mat format version, so a fixed index of 3 is fragile.
    data_keys = [key for key in mat if not key.startswith('__')]
    if not data_keys:
        raise IndexError('no data variable found in %s' % file)
    return mat[data_keys[0]]

def pickle_save(filename, content):
    '''Pickle `content` to ../output/<filename>.pkl.

    `filename` is the bare name without extension. The path is relative to
    the current working directory and the output folder must already exist.
    '''
    target = '../output/%s.pkl' % filename
    with open(target, 'wb') as handle:
        pickle.dump(content, handle)
        
def pickle_open(filename):
    '''Return the object pickled in ../output/<filename>.pkl.

    `filename` is the bare name without extension. Unpickling can execute
    arbitrary code, so only open files this project wrote itself.
    '''
    source = '../output/%s.pkl' % filename
    with open(source, 'rb') as handle:
        return pickle.load(handle)
    

In [18]:
# Read every fiducial-point file of the training set in sorted file-name order.
# NOTE(review): this presumes the sorted file names line up row by row with
# label.csv -- confirm against the data set.
points_dir = '../../train_set/points'
dir_list = sorted(os.listdir(points_dir))

# np.stack requires a real sequence (list/tuple). Feeding it a generator is
# deprecated and is rejected with a TypeError by newer NumPy releases.
fiducial_pt_full = np.stack([get_points(os.path.join(points_dir, filename)) for filename in dir_list])
label_full = pd.read_csv('../../train_set/label.csv')['label']

# save the data into pickle files, so that we don't need to read raw data every time
pickle_save('fiducial_pt_full', fiducial_pt_full)
pickle_save('label_full', label_full)


### Train-test split

In [2]:
from sklearn.model_selection import train_test_split

# load data from pickle object
# (both files are written with pickle_save by the data-loading cell)
fiducial_pt_full = pickle_open('fiducial_pt_full')
label_full = pickle_open('label_full')

## Note: randomly split into training & test set
# 20% of the samples are held out; random_state is fixed so the partition is
# the same on every run.
# NOTE(review): the split is not stratified, so the class ratio of the two
# parts can differ -- consider stratify=label_full if that matters.
X_train, X_test, y_train, y_test = train_test_split(fiducial_pt_full, label_full, test_size=0.2, random_state=42)


## Step 3: construct features and responses

Here, use pairwise distances as features.

In [6]:
from sklearn.metrics import pairwise_distances

def _pairwise_distance_features(points):
    '''Turn a stack of point sets into one row of pairwise distances each.

    `points` is indexed as points[i] -> the point matrix of sample i.
    For every sample the strict upper triangle of its distance matrix is
    flattened, so p points give p*(p-1)/2 features (78 points -> 3003).
    '''
    n_points = points.shape[1]
    # The index pairs are identical for every sample: compute them once.
    upper = np.triu_indices(n_points, k=1)
    # np.stack needs a list/tuple; a generator triggers a deprecation warning
    # on older NumPy and a TypeError on newer releases.
    return np.stack([pairwise_distances(sample)[upper] for sample in points])


# extract pairwise distance as features (78*77/2=3003 features)
# nrow=number of records of the dataset; ncol=3003
feature_train = _pairwise_distance_features(X_train)
feature_test = _pairwise_distance_features(X_test)

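### Note on the warning below: it is most likely NumPy's FutureWarning about passing a generator to `np.stack`; wrapping the generator in a list (`np.stack([...])`) removes it. The distances themselves are calculated correctly.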

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [15]:
# Balanced test subset: the first 130 test samples of each class,
# class 1 first and class 0 second.
# Slicing does not fail when a class has fewer than 130 samples; the subset
# would then simply be smaller and no longer balanced.
feature_1, emotion_1 = feature_test[y_test == 1], y_test[y_test == 1]
feature_0, emotion_0 = feature_test[y_test == 0], y_test[y_test == 0]

bal_feature = np.concatenate((feature_1[0:130], feature_0[0:130]), axis=0)
bal_y = np.concatenate((emotion_1[0:130], emotion_0[0:130]), axis=0)

## Step 4: train classification model with training features and responses

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
import time

def clf_metrics(y_true, y_pred, y_score):
    '''Print accuracy, precision, recall and ROC AUC as a one-row table.

    y_true  : true labels
    y_pred  : predicted labels (used for accuracy, precision and recall)
    y_score : continuous scores (used for the AUC only)

    Nothing is returned; the table is only printed.
    '''
    summary = pd.DataFrame({
        'accuracy': [accuracy_score(y_true, y_pred)],
        'precision': [precision_score(y_true, y_pred)],
        'recall': [recall_score(y_true, y_pred)],
        'auc': [roc_auc_score(y_true, y_score)],
    })
    print(summary)

In [16]:
# Per-sample weights: class 1 counts 10, class 0 counts 0.5 (ratio 20:1).
# Any sample with another label keeps weight 0.
weights = np.zeros(len(y_train))
weights[y_train == 1] = 10
weights[y_train == 0] = 0.5

baseline = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=299,
)

# Time only the fit itself.
start = time.time()
baseline.fit(feature_train, y_train, sample_weight=weights)
elapsed = time.time() - start
print('Training time cost {:.2f} s'.format(elapsed))

Training time cost 6.70 s


In [17]:
# Report the baseline model on the training split, the full test split and
# the balanced test subset, in that order.
pred_train, score_train = (baseline.predict(feature_train),
                           baseline.decision_function(feature_train))
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test, score_test = (baseline.predict(feature_test),
                         baseline.decision_function(feature_test))
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

# pred_test / score_test are rebound here and afterwards refer to the
# balanced subset, not to the full test split.
pred_test, score_test = (baseline.predict(bal_feature),
                         baseline.decision_function(bal_feature))
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall      auc
0   0.89625   0.648305     1.0  0.99968


Test set:
   accuracy  precision    recall       auc
0  0.708333   0.417431  0.654676  0.777821


Balanced test set:
   accuracy  precision    recall       auc
0  0.723077       0.75  0.669231  0.814438


### Parameter tuning

### 1. max_depth (min_samples_split is kept at its default)

In [20]:
from sklearn.model_selection import GridSearchCV

# Grid over tree depth only: 3, 5 and 7.
param_1 = {'max_depth': range(3, 8, 2)}

# Estimator whose remaining hyperparameters stay fixed during the search.
depth_search_estimator = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_features='sqrt',
    subsample=1,
    random_state=299,
)
tuning1 = GridSearchCV(
    estimator=depth_search_estimator,
    param_grid=param_1,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
)

# The sample weights are forwarded to every fit performed by the search.
time_start_gs1 = time.time()
tuning1.fit(feature_train, y_train, sample_weight=weights)
time_end_gs1 = time.time()
print('Tuning time cost {:.2f} s'.format(time_end_gs1 - time_start_gs1))

Tuning time cost 34.90 s


In [21]:
# Best cross-validated score and the parameters that produced it.
print("Best: %f using %s" % (tuning1.best_score_, tuning1.best_params_))

# One line per grid point: mean score (standard deviation) and parameters.
cv_results = tuning1.cv_results_
means, stds, params = (cv_results[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.786853 using {'max_depth': 7}
0.763632 (0.013567) with: {'max_depth': 3}
0.762907 (0.022038) with: {'max_depth': 5}
0.786853 (0.010769) with: {'max_depth': 7}


### 2. learning_rate and n_estimators

In [23]:
# Joint grid over learning rate and number of trees (2 x 4 combinations).
param_2 = {
    'learning_rate': [0.1, 0.05],
    'n_estimators': [100, 250, 400, 500],
}

# max_depth=7 is fixed here; learning_rate and n_estimators given to the
# estimator are overridden by the grid values.
rate_search_estimator = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=7,
    max_features='sqrt',
    subsample=1,
    random_state=299,
)
tuning2 = GridSearchCV(
    estimator=rate_search_estimator,
    param_grid=param_2,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
)

time_start_gs2 = time.time()
tuning2.fit(feature_train, y_train, sample_weight=weights)
time_end_gs2 = time.time()
print('Tuning time cost {:.2f} s'.format(time_end_gs2 - time_start_gs2))

Tuning time cost 205.26 s


In [24]:
# Best cross-validated score and the parameters that produced it.
print("Best: %f using %s" % (tuning2.best_score_, tuning2.best_params_))

# One line per grid point: mean score (standard deviation) and parameters.
cv_results = tuning2.cv_results_
means, stds, params = (cv_results[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.800166 using {'learning_rate': 0.1, 'n_estimators': 500}
0.777488 (0.011325) with: {'learning_rate': 0.1, 'n_estimators': 100}
0.792833 (0.009837) with: {'learning_rate': 0.1, 'n_estimators': 250}
0.798334 (0.008780) with: {'learning_rate': 0.1, 'n_estimators': 400}
0.800166 (0.009314) with: {'learning_rate': 0.1, 'n_estimators': 500}
0.762391 (0.010515) with: {'learning_rate': 0.05, 'n_estimators': 100}
0.785160 (0.008889) with: {'learning_rate': 0.05, 'n_estimators': 250}
0.792693 (0.014411) with: {'learning_rate': 0.05, 'n_estimators': 400}
0.793447 (0.015030) with: {'learning_rate': 0.05, 'n_estimators': 500}


In [25]:
# Keep the estimator selected by the second grid search (learning_rate /
# n_estimators) for the evaluation in the next cell.
best = tuning2.best_estimator_

In [26]:
# Report the grid-searched model on the training split, the full test split
# and the balanced test subset, in that order.
pred_train, score_train = (best.predict(feature_train),
                           best.decision_function(feature_train))
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test, score_test = (best.predict(feature_test),
                         best.decision_function(feature_test))
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

# pred_test / score_test are rebound here and afterwards refer to the
# balanced subset, not to the full test split.
pred_test, score_test = (best.predict(bal_feature),
                         best.decision_function(bal_feature))
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall  auc
0       1.0        1.0     1.0  1.0


Test set:
   accuracy  precision    recall       auc
0  0.826667   0.716049  0.417266  0.792943


Balanced test set:
   accuracy  precision    recall       auc
0  0.707692   0.965517  0.430769  0.824852


### Tune weights

In [76]:
def _class_weights(labels, positive_weight, negative_weight=1):
    '''Build a per-sample weight vector from binary labels.

    Samples labelled 1 get `positive_weight`, samples labelled 0 get
    `negative_weight`; any other label keeps weight 0.
    '''
    sample_weights = np.zeros(len(labels))
    sample_weights[labels == 0] = negative_weight
    sample_weights[labels == 1] = positive_weight
    return sample_weights


# Candidate class-1 : class-0 weight ratios to compare.
weights1 = _class_weights(y_train, 20)
weights2 = _class_weights(y_train, 10)
weights3 = _class_weights(y_train, 30)
weights4 = _class_weights(y_train, 40)

In [74]:
# Baseline hyperparameters, fitted with weights2 (class 1 weighted 10:1).
tuning3 = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=299,
)

# Time only the fit itself.
start = time.time()
tuning3.fit(feature_train, y_train, sample_weight=weights2)
elapsed = time.time() - start
print('Training time cost {:.2f} s'.format(elapsed))

Training time cost 6.61 s


In [75]:
# Report the 10:1-weighted model on the training split, the full test split
# and the balanced test subset, in that order.
pred_train, score_train = (tuning3.predict(feature_train),
                           tuning3.decision_function(feature_train))
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test, score_test = (tuning3.predict(feature_test),
                         tuning3.decision_function(feature_test))
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

# pred_test / score_test are rebound here and afterwards refer to the
# balanced subset, not to the full test split.
pred_test, score_test = (tuning3.predict(bal_feature),
                         tuning3.decision_function(bal_feature))
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall       auc
0  0.974167   0.880998     1.0  0.999893


Test set:
   accuracy  precision    recall       auc
0  0.761667   0.487952  0.582734  0.803337


Balanced test set:
   accuracy  precision  recall       auc
0  0.734615   0.821053     0.6  0.844438


In [69]:
# Baseline hyperparameters, fitted with weights3 (class 1 weighted 30:1).
tuning4 = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=299,
)

# Time only the fit itself.
start = time.time()
tuning4.fit(feature_train, y_train, sample_weight=weights3)
elapsed = time.time() - start
print('Training time cost {:.2f} s'.format(elapsed))

Training time cost 6.71 s


In [70]:
# Report the 30:1-weighted model on the training split, the full test split
# and the balanced test subset, in that order.
pred_train, score_train = (tuning4.predict(feature_train),
                           tuning4.decision_function(feature_train))
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test, score_test = (tuning4.predict(feature_test),
                         tuning4.decision_function(feature_test))
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

# pred_test / score_test are rebound here and afterwards refer to the
# balanced subset, not to the full test split.
pred_test, score_test = (tuning4.predict(bal_feature),
                         tuning4.decision_function(bal_feature))
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall       auc
0  0.831667   0.531866     1.0  0.999062


Test set:
   accuracy  precision    recall       auc
0  0.668333   0.382812  0.705036  0.762606


Balanced test set:
   accuracy  precision    recall       auc
0  0.715385   0.712121  0.723077  0.809053


In [67]:
# Baseline hyperparameters, fitted with weights4 (class 1 weighted 40:1).
tuning5 = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=299,
)

# Time only the fit itself.
start = time.time()
tuning5.fit(feature_train, y_train, sample_weight=weights4)
elapsed = time.time() - start
print('Training time cost {:.2f} s'.format(elapsed))

Training time cost 6.45 s


In [68]:
# Report the 40:1-weighted model on the training split, the full test split
# and the balanced test subset, in that order.
pred_train, score_train = (tuning5.predict(feature_train),
                           tuning5.decision_function(feature_train))
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test, score_test = (tuning5.predict(feature_test),
                         tuning5.decision_function(feature_test))
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

# pred_test / score_test are rebound here and afterwards refer to the
# balanced subset, not to the full test split.
pred_test, score_test = (tuning5.predict(bal_feature),
                         tuning5.decision_function(bal_feature))
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall      auc
0  0.784167   0.469806     1.0  0.99769


Test set:
   accuracy  precision   recall       auc
0  0.616667   0.349835  0.76259  0.731706


Balanced test set:
   accuracy  precision    recall       auc
0  0.719231   0.693878  0.784615  0.786036


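### After tuning, we choose `learning_rate=0.1`, `n_estimators=150`, `max_depth=4`, `min_samples_split=2`, `min_samples_leaf=1`, `subsample=1`, `max_features='sqrt'`, `random_state=299`, and fit the model with a class-1 to class-0 sample-weight ratio of 10. The grid-searched configuration (`max_depth=7`, `n_estimators=500`) is not used: it scored 1.0 on every training metric while its test recall dropped to about 0.42.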

### Save the model to pickle object:

In [77]:
# Final sample weights: class 1 counts 10, class 0 counts 1.
# Any sample with another label keeps weight 0.
weights2 = np.zeros(len(y_train))
weights2[y_train == 1] = 10
weights2[y_train == 0] = 1

gbt_baseline = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=299,
)

# Time only the fit itself.
start = time.time()
gbt_baseline.fit(feature_train, y_train, sample_weight=weights2)
elapsed = time.time() - start
print('Training time cost {:.2f} s'.format(elapsed))

Training time cost 6.36 s


In [78]:
# store the trained model
# (pickle_save writes it to ../output/gbt_baseline.pkl)
pickle_save('gbt_baseline', gbt_baseline)

In [79]:
# load the trained model from file
# (reads ../output/gbt_baseline.pkl back and rebinds gbt_baseline to the
# unpickled object; only unpickle files this project wrote itself)
gbt_baseline = pickle_open('gbt_baseline')

In [80]:
# Gradient boosting baseline model performance on the training split, the
# full test split and the balanced test subset, in that order.
pred_train, score_train = (gbt_baseline.predict(feature_train),
                           gbt_baseline.decision_function(feature_train))
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

pred_test, score_test = (gbt_baseline.predict(feature_test),
                         gbt_baseline.decision_function(feature_test))
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

# pred_test / score_test are rebound here and afterwards refer to the
# balanced subset, not to the full test split.
pred_test, score_test = (gbt_baseline.predict(bal_feature),
                         gbt_baseline.decision_function(bal_feature))
print('Balanced test set:')
clf_metrics(bal_y, pred_test, score_test)

Training set:
   accuracy  precision  recall       auc
0  0.974167   0.880998     1.0  0.999893


Test set:
   accuracy  precision    recall       auc
0  0.761667   0.487952  0.582734  0.803337


Balanced test set:
   accuracy  precision  recall       auc
0  0.734615   0.821053     0.6  0.844438
