# ADS Project 3 Group 4

## Libraries and Settings

In [1]:
from scipy.io import loadmat
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from scipy.spatial.distance import pdist
from sklearn import svm

  from numpy.core.umath_tests import inner1d


In [2]:
"""
Path
"""
DATA_PATH = "../data/train_set"
IMAGE_FOLDER = os.path.join(DATA_PATH, "images")
POINTS_FOLDER = os.path.join(DATA_PATH, "points")
LABELS_FOLDER = DATA_PATH

## Read Training Data

In [3]:
def read_all_images():
    """
    Read 2500 training images from the IMAGE_FOLDER
    :return a 4d numpy array in form of (index, height, width, channels), channels is RGB 
    """
    files = [file for file in os.listdir(IMAGE_FOLDER) if file.endswith('.jpg')]
    files.sort()
    
    face_images = np.zeros((len(files), 750, 1000, 3))
    
    for index, filename in enumerate(files):
        face_img_arr = plt.imread(os.path.join(IMAGE_FOLDER, filename))
        if face_img_arr.shape != (750,1000,3):
            # resize the image
            face_img = Image.fromarray(face_img_arr)
            face_img = face_img.resize((1000,750))
            face_img_arr = np.array(face_img)
        face_images[index] = face_img_arr
    return face_images

def read_labels():
    """
    Read the image labels from the label.csv file
    :return a pandas.DataFrame with 3 columns: 'emotion_idx','emotion_cat','type'
    """
    labels_df = pd.read_csv(os.path.join(LABELS_FOLDER, 'label.csv'))
    labels_df = labels_df.loc[:,['emotion_idx','emotion_cat','type']]
    return labels_df
    

def read_all_points():
    """
    Read all face coordinates points
    :return a tuple of shape (2500, 78, 2). Because for each of 2500 images there are 78 points associated with it
    """
    files = [file for file in os.listdir(POINTS_FOLDER) if file.endswith('.mat')]
    files.sort()
    
    face_points = np.zeros((len(files), 78, 2))
    for index, filename in enumerate(files):
        face_points_dict = loadmat(os.path.join(POINTS_FOLDER, filename))
    
        face_points[index] = face_points_dict.get('faceCoordinatesUnwarped',  face_points_dict.get('faceCoordinates2'))
    return face_points

def load_data(loadImage = False):
    """
    Load training data from local files
    
    :loadImage if it's False, this function will not load original images
    :return a tuple (images, points, labels)
        if loadImage is False, the 'images' will None. Otherwise its a numpy array with shape (2500,750,1000,3)
        points is a numpy array with shape (2500, 78, 2)
        labels is a pandas.DataFrame
    """
    face_images_narr =  read_all_images() if loadImage else None
    face_images_points = read_all_points()
    labels = read_labels()
    return face_images_narr, face_images_points, labels

In [4]:
# pass True if you want to read original images, it might take some time to do it
images, points, labels = load_data(False)

In [5]:
if images:
    print(images.shape)

print(points.shape)

(2500, 78, 2)


## Random Forest

In [9]:
distances = np.zeros((2500, 3003))
for i in range(2500):
    current = points[i]
    distances[i,] = pdist(current)

In [10]:
new_feature = pd.DataFrame(distances.reshape(2500,-1))
j = 0
for i in range(3003):
    j += 1
    new_feature = new_feature.rename(columns = {i:('distance_'+ str(j))})
label = np.array(labels)

In [17]:
new_feature = np.array(new_feature)
train_feature, test_feature, train_label, test_label = train_test_split(new_feature, label, test_size = 0.25, random_state = 42)
train_feature = preprocessing.scale(train_feature)
test_feature = preprocessing.scale(test_feature)

### Set up random grid search for random forest

In [20]:
%%time
## Set up random grid 

n_estimators = [int(x) for x in np.linspace(start=300, stop = 1300, num = 20)]
max_features = ['auto','sqrt','log2']
max_depth = [int(x) for x in np.linspace(10, 100, 11, endpoint=True)]

random_grid = {'n_estimators' : n_estimators,
              'max_features' : max_features,
              'max_depth' : max_depth
              }

clf = RandomForestClassifier()
clf_random = RandomizedSearchCV(estimator = clf,
                                param_distributions=random_grid,
                               n_iter = 100,
                               cv = 5,
                                verbose = 2,
                                random_state = 42,
                                n_jobs = -1
                               )
clf_random.fit(train_feature,train_label)
result = clf_random.predict(test_feature)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] n_estimators=773, max_features=sqrt, max_depth=100 ..............
[CV] n_estimators=773, max_features=sqrt, max_depth=100 ..............
[CV] n_estimators=773, max_features=sqrt, max_depth=100 ..............
[CV] n_estimators=773, max_features=sqrt, max_depth=100 ..............
[CV]  n_estimators=773, max_features=sqrt, max_depth=100, total= 1.2min
[CV] n_estimators=773, max_features=sqrt, max_depth=100 ..............
[CV]  n_estimators=773, max_features=sqrt, max_depth=100, total= 1.2min
[CV] n_estimators=1300, max_features=auto, max_depth=82 ..............
[CV]  n_estimators=773, max_features=sqrt, max_depth=100, total= 1.2min
[CV] n_estimators=1300, max_features=auto, max_depth=82 ..............
[CV]  n_estimators=773, max_features=sqrt, max_depth=100, total= 1.2min
[CV] n_estimators=1300, max_features=auto, max_depth=82 ..............
[CV]  n_estimators=773, max_features=sqrt, max_depth=100, total= 1.4min
[CV] n_es

KeyboardInterrupt: 

In [19]:
## Find the best parameters
clf_random.best_params_

NameError: name 'clf_random' is not defined

In [13]:
%%time
clf = RandomForestClassifier()
cv_param = {'n_estimators' : [800,820,840,860,880],
              'max_features' : ['sqrt'],
              'max_depth' : [62,64,66,68,70]}
clf_cv = GridSearchCV(estimator = clf, param_grid = cv_param, cv = 5, n_jobs = -1, verbose =2)
clf_cv.fit(train_feature,train_label)
cv_result = clf_cv.predict(test_feature)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] max_depth=62, max_features=sqrt, n_estimators=800 ...............
[CV] max_depth=62, max_features=sqrt, n_estimators=800 ...............
[CV] max_depth=62, max_features=sqrt, n_estimators=800 ...............
[CV] max_depth=62, max_features=sqrt, n_estimators=800 ...............
[CV]  max_depth=62, max_features=sqrt, n_estimators=800, total= 1.2min
[CV] max_depth=62, max_features=sqrt, n_estimators=800 ...............
[CV]  max_depth=62, max_features=sqrt, n_estimators=800, total= 1.2min
[CV] max_depth=62, max_features=sqrt, n_estimators=820 ...............
[CV]  max_depth=62, max_features=sqrt, n_estimators=800, total= 1.2min
[CV] max_depth=62, max_features=sqrt, n_estimators=820 ...............
[CV]  max_depth=62, max_features=sqrt, n_estimators=800, total= 1.2min
[CV] max_depth=62, max_features=sqrt, n_estimators=820 ...............
[CV]  max_depth=62, max_features=sqrt, n_estimators=800, total= 1.4min
[CV] max_depth=

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 11.9min


[CV]  max_depth=64, max_features=sqrt, n_estimators=820, total= 1.3min
[CV] max_depth=64, max_features=sqrt, n_estimators=840 ...............
[CV]  max_depth=64, max_features=sqrt, n_estimators=820, total= 1.4min
[CV] max_depth=64, max_features=sqrt, n_estimators=840 ...............
[CV]  max_depth=64, max_features=sqrt, n_estimators=840, total= 1.4min
[CV] max_depth=64, max_features=sqrt, n_estimators=840 ...............
[CV]  max_depth=64, max_features=sqrt, n_estimators=840, total= 1.4min
[CV] max_depth=64, max_features=sqrt, n_estimators=860 ...............
[CV]  max_depth=64, max_features=sqrt, n_estimators=840, total= 1.4min
[CV] max_depth=64, max_features=sqrt, n_estimators=860 ...............
[CV]  max_depth=64, max_features=sqrt, n_estimators=840, total= 1.5min
[CV] max_depth=64, max_features=sqrt, n_estimators=860 ...............
[CV]  max_depth=64, max_features=sqrt, n_estimators=840, total= 1.5min
[CV] max_depth=64, max_features=sqrt, n_estimators=860 ...............
[CV]  

[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 39.8min finished


CPU times: user 1min 9s, sys: 1.38 s, total: 1min 10s
Wall time: 40min 49s


In [14]:
clf_cv.best_params_

{'max_depth': 62, 'max_features': 'sqrt', 'n_estimators': 840}

In [15]:
clf_tunned = RandomForestClassifier(max_depth = 62, 
                                    max_features = 'sqrt',
                                    n_estimators = 840,
                                    random_state=0)
clf_tunned.fit(train_feature, train_label)
result = clf_tunned.predict(test_feature)
print('Random Forest Accuracy:', accuracy_score(test_label, clf_tunned.predict(test_feature)))

Accuracy: 0.424


## SVM

In [12]:
%%time
Cs = [0.1,1,10,100,1000]
param_grid = {'C':Cs}
grid_search = GridSearchCV(svm.SVC(kernel='linear'), param_grid, cv=5)
grid_search.fit(train_feature, train_label)
score = cross_val_score(grid_search, train_feature, train_label, scoring='accuracy',cv=5).tolist()  
best_params = grid_search.best_params_

CPU times: user 26min 41s, sys: 3.74 s, total: 26min 44s
Wall time: 26min 53s


In [13]:
best_params

{'C': 0.1}

In [14]:
from sklearn.svm import SVC 
svm_model_linear = SVC(kernel = 'linear', C = 0.1).fit(train_feature, train_label) 
svm_predictions = svm_model_linear.predict(test_feature) 
  
# model accuracy for X_test   
accuracy = svm_model_linear.score(test_feature, test_label)
print('SVM Accuracy:',svm_model_linear.score(test_feature, test_label))

0.4848