# ADS Project 3 Group 4

## Libraries and Settings

In [1]:
from scipy.io import loadmat
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
from PIL import Image

In [2]:
"""
Path
"""
DATA_PATH = "../data/train_set"
IMAGE_FOLDER = os.path.join(DATA_PATH, "images")
POINTS_FOLDER = os.path.join(DATA_PATH, "points")
LABELS_FOLDER = DATA_PATH

## Read Training Data

In [3]:
def read_all_images():
    """
    Read 2500 training images from the IMAGE_FOLDER
    :return a 4d numpy array in form of (index, height, width, channels), channels is RGB 
    """
    files = [file for file in os.listdir(IMAGE_FOLDER) if file.endswith('.jpg')]
    files.sort()
    
    face_images = np.zeros((len(files), 750, 1000, 3))
    
    for index, filename in enumerate(files):
        face_img_arr = plt.imread(os.path.join(IMAGE_FOLDER, filename))
        if face_img_arr.shape != (750,1000,3):
            # resize the image
            face_img = Image.fromarray(face_img_arr)
            face_img = face_img.resize((1000,750))
            face_img_arr = np.array(face_img)
        face_images[index] = face_img_arr
    return face_images

def read_labels():
    """
    Read the image labels from the label.csv file
    :return a pandas.DataFrame with 3 columns: 'emotion_idx','emotion_cat','type'
    """
    labels_df = pd.read_csv(os.path.join(LABELS_FOLDER, 'label.csv'))
    labels_df = labels_df.loc[:,['emotion_idx','emotion_cat','type']]
    return labels_df
    

def read_all_points():
    """
    Read all face coordinates points
    :return a tuple of shape (2500, 78, 2). Because for each of 2500 images there are 78 points associated with it
    """
    files = [file for file in os.listdir(POINTS_FOLDER) if file.endswith('.mat')]
    files.sort()
    
    face_points = np.zeros((len(files), 78, 2))
    for index, filename in enumerate(files):
        face_points_dict = loadmat(os.path.join(POINTS_FOLDER, filename))
    
        face_points[index] = face_points_dict.get('faceCoordinatesUnwarped',  face_points_dict.get('faceCoordinates2'))
    return face_points

def load_data(loadImage = False):
    """
    Load training data from local files
    
    :loadImage if it's False, this function will not load original images
    :return a tuple (images, points, labels)
        if loadImage is False, the 'images' will None. Otherwise its a numpy array with shape (2500,750,1000,3)
        points is a numpy array with shape (2500, 78, 2)
        labels is a pandas.DataFrame
    """
    #face_images_narr =  read_all_images() if loadImage else None
    face_images_points = read_all_points()
    labels = read_labels()
    #return face_images_narr, face_images_points, labels
    return face_images_points, labels

In [4]:
# pass True if you want to read original images, it might take some time to do it
#images, points, labels = load_data(True)
points, labels = load_data(True)

In [5]:
#if images:
#    print(images.shape)

print(points.shape)
labels

(2500, 78, 2)


Unnamed: 0,emotion_idx,emotion_cat,type
0,1,Neutral,simple
1,1,Neutral,simple
2,1,Neutral,simple
3,1,Neutral,simple
4,1,Neutral,simple
...,...,...,...
2495,22,Sadly disgusted,compound
2496,22,Sadly disgusted,compound
2497,22,Sadly disgusted,compound
2498,22,Sadly disgusted,compound


## Feature Selection

In [6]:
from sklearn.metrics import pairwise_distances
# function to calculate the distance
def feature(input_points):
    n = input_points.shape[0]
    pairwise_dist_data = []
    # return a vector
    def pairwise_dist(vec):
        vec = np.reshape(vec, (len(vec),1))
        dist_matrix = pairwise_distances(vec)
        dist_matrix = dist_matrix[np.triu_indices(dist_matrix.shape[0], k=1)]
        return dist_matrix
    
    # dist is an 2 column array
    def pairwise_dist_result(mat):
        dist = np.apply_along_axis(func1d=pairwise_dist, axis=0, arr=mat)
        dist_result = np.ndarray.flatten(dist,order='F').tolist()
        return dist_result
    
    for i in range(n):
        pairwise_dist_feature = pairwise_dist_result(points[i,:,:])
        pairwise_dist_data.append(pairwise_dist_feature)
        
    pairwise_dist_data = pd.DataFrame(pairwise_dist_data)
    return pairwise_dist_data
        


In [7]:
X = feature(points)
y = labels['emotion_idx']

## Model Traning and Selection

In [8]:
from sklearn.model_selection import train_test_split
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
# model selection with cross-validation 
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
para1 = {
     'n_estimators':[100,200,300,400,500,600] 
}
xgb_model = xgb.XGBClassifier(learning_rate=0.1, max_depth=1)
gsearch1 = GridSearchCV(estimator = xgb_model, 
                        param_grid = para1, 
                        scoring ='accuracy',
                        cv = 5,
                        n_jobs = 4
                       )
gsearch1.fit(X_train,y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5,verbose=False)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=1, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=4,
             param_grid={'n_estimators': [100, 200, 300, 400, 500, 600]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring

In [29]:
import pickle 
filename = 'xgboostpara1.sav'
pickle.dump(gsearch1, open(filename, 'wb'))
print("best number of stimators: {}".format(gsearch1.best_params_['n_estimators']))
print("accuracy: {}".format(gsearch1.best_score_))

best number of stimators: 100
accuracy: 0.363


In [16]:
para2 = {
    'learning_rate': [0.01, 0.05, 0.1]
}
xgb_model2 = xgb.XGBClassifier(max_depth=1, n_estimators=100)
gsearch2 = GridSearchCV(estimator = xgb_model2, 
                        param_grid = para2, 
                        scoring ='accuracy',
                        cv = 5,
                        n_jobs = 4
                       )
gsearch2.fit(X_train,y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=1, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=4,
             param_grid={'learning_rate': [0.01, 0.05, 0.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy',

In [28]:
filename = 'xgboostpara2.sav'
pickle.dump(gsearch2, open(filename, 'wb'))
print("best learning rate: {}".format(gsearch2.best_params_['learning_rate']))
print("accuracy: {}".format(gsearch2.best_score_))

best learning rate: 0.1
accuracy: 0.363


In [27]:
pd.DataFrame(gsearch2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,136.720679,38.891297,1.295959,0.642082,0.01,{'learning_rate': 0.01},0.223039,0.22963,0.228288,0.207071,0.21134,0.22,0.009064,3
1,460.045001,281.154844,0.805139,0.220934,0.05,{'learning_rate': 0.05},0.313725,0.355556,0.322581,0.252525,0.21134,0.292,0.051705,2
2,453.757779,210.32112,0.553683,0.124445,0.1,{'learning_rate': 0.1},0.409314,0.417284,0.349876,0.275253,0.360825,0.363,0.050936,1


- The best parameters: depth=1, learning_rate=0.1, number of estimators=100

## Testing

In [31]:
filename = 'xgboost_best.sav'
pickle.dump(xgb_best, open(filename, 'wb'))
print("accuracy: {}".format(accuracy_score(y_test, pred)))

accuracy: 0.474
