In [5]:
import os,sys
import pandas as pd
from sklearn.model_selection import train_test_split
import scipy.io
import numpy as np
from scipy.spatial.distance import pdist
import time
import math
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score,pairwise_distances, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
import keras
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Input, Dropout
from keras.layers import BatchNormalization
from keras.models import Model
from keras import initializers
from keras.optimizers import Adam
import seaborn as sns
import matplotlib.image as img
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA

If the following code doesn't run, then do 'pip install ipynb' in the command line. This code lets us import functions from notebooks in the lib folder. Lib is supposed to have all the model training/predicting functions and the doc folder is only supposed to have report/presentation files like main.ipynb.

In [6]:
import ipynb
sys.path.append('../lib/')

### Step 0: set work directories

In [7]:
np.random.seed(2020)

Provide directories for training images. Training images and Training fiducial points will be in different subfolders.

In [8]:
#Change train_dir to your own path

root = sys.path[0]
train_dir =  os.path.join(root,  '../data/train_set/')  
train_image_dir = train_dir+"images/"
train_pt_dir = train_dir+"points/"
train_label_path = train_dir+"label.csv"

### Step 1: set up controls for evaluation experiments

In this chunk, we have a set of controls for the evaluation experiments. 

+ (T/F) cross-validation on the training set
+ (T/F) reweighting the samples for training set 
+ (number) K, the number of CV folds
+ (T/F) process features for training set
+ (T/F) run evaluation on an independent test set
+ (T/F) process features for test set

In [9]:
run_cv = True # run cross-validation on the training set
sample_reweight = True # run sample reweighting in model training
K = 5  # number of CV folds
run_feature_train = True # process features for training set
run_test = True # run evaluation on an independent test set
run_feature_test = True # process features for test set

### Step 2: import data and train-test split

In [10]:
info = pd.read_csv(train_label_path)
n = info.shape[0]

#Data is imbalanced 
print('Number of records with label 0 (basic emotion): {:4d} '.format(info.loc[info['label']==0].shape[0]))
print('Number of records with label 1 (complex emotion): {:2d} '.format(info.loc[info['label']==1].shape[0]))

Number of records with label 0 (basic emotion): 2402 
Number of records with label 1 (complex emotion): 598 


In [11]:
#we could use sklearn train_test_split here instead of doing it
#manually like in the starter code
n_train = int(round(n*(4/5),0))
train_idx = np.random.choice(list(info.index),size=n_train,replace=False)
test_idx = list(set(list(info.index))-set(train_idx)) #set difference

If you choose to extract features from images, such as using Gabor filter, R memory will exhaust all images are read together. The solution is to repeat reading a smaller batch(e.g 100) and process them. 

In [12]:
n_files = len(os.listdir(train_image_dir))

# The following codes may be irrelevant to our analysis and when running them 'No module names PIL' error
# popped up. So I temporarily commented them.
# image_list = []
# for i in range(1,101): # 1 to 100
#     image = img.imread(train_image_dir+'{:04d}'.format(i)+'.jpg')
#     image_list.append(image)

Fiducial points are stored in matlab format. In this step, we read them and store them in a list.

In [58]:
#function to read fiducial points
#input: index
#output: matrix of fiducial points corresponding to the index

def readMat_matrix(index):
    try:
        mat_data = scipy.io.loadmat(train_pt_dir+'{:04d}'.format(index)+'.mat')['faceCoordinatesUnwarped']
    except KeyError:
        mat_data = scipy.io.loadmat(train_pt_dir+'{:04d}'.format(index)+'.mat')['faceCoordinates2']
    return np.matrix.round(mat_data,0)

#load fiducial points
#pickle is the closest equivalent to .RData that I could find in Python
fiducial_pt_list = list(map(readMat_matrix,list(range(1,n_files+1))))
print(fiducial_pt_list[0].shape[1])
pickle.dump(fiducial_pt_list, open( "../output/fiducial_pt_list.p", "wb" ) )

2


### Step 3: construct features and responses

`feature.R` should be the wrapper for all your feature engineering functions and options. The function `feature( )` should have options that correspond to different scenarios for your project and produces an R object that contains features and responses that are required by all the models you are going to evaluate later. 
  
  + `feature.R`
  + Input: list of images or fiducial point
  + Output: an RData file that contains extracted features and corresponding responses

In [14]:
# Use feature.ipynb's feature function to generate features for the train and test points
# Please use one of the feature extraction methods

from ipynb.fs.full.feature import feature

tm_feature_train = np.nan
if run_feature_train == True:
    start = time.time()
    dat_train = feature(fiducial_pt_list, train_idx, info)
    end = time.time()
    tm_feature_train = end-start
    pickle.dump(dat_train, open( "../output/feature_train.p", "wb" ) )
else:
    pickle.load(open("../output/feature_train.p", "rb"))
    
    
tm_feature_test = np.nan
if run_feature_test == True:
    start = time.time()
    dat_test = feature(fiducial_pt_list, test_idx, info)
    end = time.time()
    tm_feature_test = end-start
    pickle.dump(dat_test, open( "../output/feature_test.p", "wb" ) )
else:
    pickle.load(open("../output/feature_test.p", "rb"))
    

In [59]:
# Use feature_dis_between_points.ipynb's feature function to generate features for the train and test points
# This is a feature extraction method calculating distance between two 2-D points
# However, it seems that this method is worse than that in starter code
# The test accuracy rates of baseline model and XGBoost model are lower by using this feature extraction method

# from ipynb.fs.full.feature_dis_between_points import feature_dis_between_points

# tm_feature_train = np.nan
# if run_feature_train == True:
#     start = time.time()
#     dat_train = feature_dis_between_points(fiducial_pt_list, train_idx, info)
#     end = time.time()
#     tm_feature_train = end-start
#     pickle.dump(dat_train, open( "../output/feature_train.p", "wb" ) )
# else:
#     pickle.load(open("../output/feature_train.p", "rb"))
    
    
# tm_feature_test = np.nan
# if run_feature_test == True:
#     start = time.time()
#     dat_test = feature_dis_between_points(fiducial_pt_list, test_idx, info)
#     end = time.time()
#     tm_feature_test = end-start
#     pickle.dump(dat_test, open( "../output/feature_test.p", "wb" ) )
# else:
#     pickle.load(open("../output/feature_test.p", "rb"))

In [60]:
#Get the traning/test features and labels

feature_train = dat_train.loc[:, dat_train.columns != 'labels']
label_train = dat_train['labels']

feature_test = dat_test.loc[:, dat_test.columns != 'labels']
label_test = dat_test['labels']

In [61]:
#PCA (not sure if i did it correctly)

pca = PCA().fit(feature_train)
feature_train = pca.transform(feature_train)
feature_test = pca.transform(feature_test)

### Step 4: Train a classification model with training features and responses

Call the train model and test model from library. 

`train.R` and `test.R` should be wrappers for all your model training steps and your classification/prediction steps. 

+ `train.R`
  + Input: a data frame containing features and labels and a parameter list.
  + Output:a trained model
+ `test.R`
  + Input: the fitted classification model using training data and processed features from testing images 
  + Input: an R object that contains a trained classifier.
  + Output: training model specification

+ In this Starter Code, we use logistic regression with LASSO penalty to do classification.

#### Model selection with cross-validation
* Do model selection by choosing among different values of training model parameters.

### Baseline Model

In [62]:
lmbd = [1e-3, 5e-3, 1e-2, 5e-2, 1e-1]
model_labels = ["Gradient Boosting with learning rate = "+str(x) for x in lmbd]

In [63]:
#need to do a grid search for optimal parameters
#takes a really long time and we don't know the features yet
#so i commented the grid search out for now

if (run_cv):
    params = {'learning_rate':lmbd, 'max_depth': [1,2,3,4], 'n_estimators':[100,200,300,400,500]}
    #gscv = GridSearchCV(GradientBoostingClassifier(),params,cv=K).fit(feature_train,label_train)

In [64]:
#Baseline model
#need to do grid search to get optimal parameters though
# import time
start = time.time()
gbm=GradientBoostingClassifier(learning_rate=0.1,max_depth=2,n_estimators=100)
gbm.fit(feature_train,label_train)
end = time.time()

print('Training time {:4f} seconds'.format(end-start))

Training time 85.377215 seconds


In [65]:
test_preds=gbm.predict(feature_test)
print('Classification Error: ',np.mean(np.array(test_preds)!=np.array(label_test))) #Classification Error
print('Test Accuracy: ',np.mean(np.array(test_preds)==np.array(label_test))) # Test Accuracy

Classification Error:  0.4583333333333333
Test Accuracy:  0.5416666666666666


In [66]:
#score is accuracy = 1-classification error i.e.
#1-(np.mean(np.array(test_preds)!=np.array(label_test)))

#problem with accuracy is that it'll be very high for our model
#because of imbalanced classes

#when we get the real test data in class, the accuracy will definitely
#not be as high because the real test data is balanced

gbm.score(feature_test,label_test)

0.5416666666666666

In [67]:
print(classification_report(label_test,test_preds))

              precision    recall  f1-score   support

           0       0.84      0.51      0.64       473
           1       0.26      0.65      0.37       127

    accuracy                           0.54       600
   macro avg       0.55      0.58      0.51       600
weighted avg       0.72      0.54      0.58       600



In [68]:
confusion_matrix(label_test,test_preds)

array([[243, 230],
       [ 45,  82]])

In [69]:
#ROC curve is FPR vs TPR so it's probably better than accuracy
roc_auc_score(label_test,test_preds)

0.5797056816100947

### XGBoost Model

In [70]:
start_time=time.time()

xgb = XGBClassifier(
 learning_rate =0.1,
 num_class=2,
 n_estimators= 200,
 max_depth=2,
 eta=1,
 min_child_weight=1,
 objective= 'multi:softmax',  # for multi-labels classification 
 scale_pos_weight=4,
 seed=123)

xgb.fit(feature_train, label_train, eval_metric='auc')
print("training  model takes %s seconds" % round((time.time() - start_time),3))

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


training  model takes 12.006 seconds


In [71]:
start_time = time.time()
pred_xgb = xgb.predict(feature_test)
print("testing model takes %s seconds" % round((time.time() - start_time),3))

testing model takes 0.026 seconds


In [72]:
acc_xgb = accuracy_score(pred_xgb,label_test )
print("Test accuracy is %s percent" %(acc_xgb*100))

Test accuracy is 76.83333333333333 percent


In [73]:
print(classification_report(label_test,pred_xgb))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       473
           1       0.42      0.24      0.30       127

    accuracy                           0.77       600
   macro avg       0.62      0.57      0.58       600
weighted avg       0.73      0.77      0.74       600



In [74]:
confusion_matrix(label_test,pred_xgb)

array([[431,  42],
       [ 97,  30]])

In [75]:
#ROC curve is FPR vs TPR so it's probably better than accuracy
roc_auc_score(label_test,pred_xgb)

0.5737127732183582