In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import os
import sys
import matplotlib.image as img
import scipy.io
import pickle
from sklearn.metrics import pairwise_distances, classification_report, confusion_matrix, roc_auc_score
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

If the following code doesn't run, then do 'pip install ipynb' in the command line. This code lets us import functions from notebooks in the lib folder. Lib is supposed to have all the model training/predicting functions and the doc folder is only supposed to have report/presentation files like main.ipynb.

In [2]:
import ipynb
sys.path.append('../lib/')

### Step 0: set work directories

In [3]:
np.random.seed(2020)

Provide directories for training images. Training images and Training fiducial points will be in different subfolders.

In [4]:
#Change train_dir to where you have the train set because we can't
#upload the train set to github

train_dir = "/Users/rohan/Desktop/train_set/"
train_image_dir = train_dir+"images/"
train_pt_dir = train_dir+"points/"
train_label_path = train_dir+"label.csv"

### Step 1: set up controls for evaluation experiments

In this chunk, we have a set of controls for the evaluation experiments. 

+ (T/F) cross-validation on the training set
+ (T/F) reweighting the samples for training set 
+ (number) K, the number of CV folds
+ (T/F) process features for training set
+ (T/F) run evaluation on an independent test set
+ (T/F) process features for test set

In [5]:
run_cv = True # run cross-validation on the training set
sample_reweight = True # run sample reweighting in model training
K = 5  # number of CV folds
run_feature_train = True # process features for training set
run_test = True # run evaluation on an independent test set
run_feature_test = True # process features for test set

### Step 2: import data and train-test split

In [6]:
info = pd.read_csv(train_label_path)
n = info.shape[0]

#Data is imbalanced 
print('Number of records with label 0 (basic emotion): {:4d} '.format(info.loc[info['label']==0].shape[0]))
print('Number of records with label 1 (complex emotion): {:2d} '.format(info.loc[info['label']==1].shape[0]))

Number of records with label 0 (basic emotion): 2402 
Number of records with label 1 (complex emotion): 598 


In [7]:
#we could use sklearn train_test_split here instead of doing it
#manually like in the starter code
n_train = int(round(n*(4/5),0))
train_idx = np.random.choice(list(info.index),size=n_train,replace=False)
test_idx = list(set(list(info.index))-set(train_idx)) #set difference

If you choose to extract features from images, such as using Gabor filter, R memory will exhaust all images are read together. The solution is to repeat reading a smaller batch(e.g 100) and process them. 

In [8]:
n_files = len(os.listdir(train_image_dir))

image_list = []
for i in range(1,101): # 1 to 100
    image = img.imread(train_image_dir+'{:04d}'.format(i)+'.jpg')
    image_list.append(image)

Fiducial points are stored in matlab format. In this step, we read them and store them in a list.

In [9]:
#function to read fiducial points
#input: index
#output: matrix of fiducial points corresponding to the index

def readMat_matrix(index):
    try:
        mat_data = scipy.io.loadmat(train_pt_dir+'{:04d}'.format(index)+'.mat')['faceCoordinatesUnwarped']
    except KeyError:
        mat_data = scipy.io.loadmat(train_pt_dir+'{:04d}'.format(index)+'.mat')['faceCoordinates2']
    return np.matrix.round(mat_data,0)

#load fiducial points
#pickle is the closest equivalent to .RData that I could find in Python
fiducial_pt_list = list(map(readMat_matrix,list(range(1,n_files+1))))
pickle.dump(fiducial_pt_list, open( "../output/fiducial_pt_list.p", "wb" ) )

### Step 3: construct features and responses

`feature.R` should be the wrapper for all your feature engineering functions and options. The function `feature( )` should have options that correspond to different scenarios for your project and produces an R object that contains features and responses that are required by all the models you are going to evaluate later. 
  
  + `feature.R`
  + Input: list of images or fiducial point
  + Output: an RData file that contains extracted features and corresponding responses

In [10]:
# Use feature.ipynb's feature function to generate features for the train and test points

from ipynb.fs.full.feature import feature

tm_feature_train = np.nan
if run_feature_train == True:
    start = time.time()
    dat_train = feature(fiducial_pt_list, train_idx, info)
    end = time.time()
    tm_feature_train = end-start
    pickle.dump(dat_train, open( "../output/feature_train.p", "wb" ) )
else:
    pickle.load(open("../output/feature_train.p", "rb"))
    
    
tm_feature_test = np.nan
if run_feature_test == True:
    start = time.time()
    dat_test = feature(fiducial_pt_list, test_idx, info)
    end = time.time()
    tm_feature_test = end-start
    pickle.dump(dat_test, open( "../output/feature_test.p", "wb" ) )
else:
    pickle.load(open("../output/feature_test.p", "rb"))
    

In [11]:
#Get the traning/test features and labels

feature_train = dat_train.loc[:, dat_train.columns != 'labels']
label_train = dat_train['labels']

feature_test = dat_test.loc[:, dat_test.columns != 'labels']
label_test = dat_test['labels']

In [12]:
#PCA (not sure if i did it correctly)

pca = PCA().fit(feature_train)
feature_train = pca.transform(feature_train)
feature_test = pca.transform(feature_test)

### Step 4: Train a classification model with training features and responses

Call the train model and test model from library. 

`train.R` and `test.R` should be wrappers for all your model training steps and your classification/prediction steps. 

+ `train.R`
  + Input: a data frame containing features and labels and a parameter list.
  + Output:a trained model
+ `test.R`
  + Input: the fitted classification model using training data and processed features from testing images 
  + Input: an R object that contains a trained classifier.
  + Output: training model specification

+ In this Starter Code, we use logistic regression with LASSO penalty to do classification.

#### Model selection with cross-validation
* Do model selection by choosing among different values of training model parameters.

### Baseline Model

In [13]:
lmbd = [1e-3, 5e-3, 1e-2, 5e-2, 1e-1]
model_labels = ["Gradient Boosting with learning rate = "+str(x) for x in lmbd]

In [14]:
#need to do a grid search for optimal parameters
#takes a really long time and we don't know the features yet
#so i commented the grid search out for now

if (run_cv):
    params = {'learning_rate':lmbd, 'max_depth': [1,2,3,4], 'n_estimators':[100,200,300,400,500]}
    #gscv = GridSearchCV(GradientBoostingClassifier(),params,cv=K).fit(feature_train,label_train)

In [15]:
#Baseline model
#need to do grid search to get optimal parameters though

start = time.time()
gbm=GradientBoostingClassifier(learning_rate=0.1,max_depth=2,n_estimators=100)
gbm.fit(feature_train,label_train)
end = time.time()

print('Training time {:4f} seconds'.format(end-start))

Training time 67.552145 seconds


In [16]:
test_preds=gbm.predict(feature_test)
np.mean(np.array(test_preds)!=np.array(label_test)) #Classification Error

0.31

In [17]:
#score is accuracy = 1-classification error i.e.
#1-(np.mean(np.array(test_preds)!=np.array(label_test)))

#problem with accuracy is that it'll be very high for our model
#because of imbalanced classes

#when we get the real test data in class, the accuracy will definitely
#not be as high because the real test data is balanced

gbm.score(feature_test,label_test)

0.69

In [18]:
print(classification_report(label_test,test_preds))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80       473
           1       0.28      0.29      0.28       127

    accuracy                           0.69       600
   macro avg       0.54      0.54      0.54       600
weighted avg       0.70      0.69      0.69       600



In [19]:
confusion_matrix(label_test,test_preds)

array([[377,  96],
       [ 90,  37]])

In [20]:
#ROC curve is FPR vs TPR so it's probably better than accuracy
roc_auc_score(label_test,test_preds)

0.5441893759051789