# Step 0: set work directories, extract paths, summarize

In [0]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import scipy.io as scio
from collections import OrderedDict 
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from time import time

In [3]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print("Auth Success")

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Auth Success


First upload train_set.zip to google drive, and please replace the *id* of the zip file.

In [0]:
download = drive.CreateFile({'id': '1VnzsmUSgP_IqXvlgWMCVPpbPW665fjaI'}) #please replace the id of your file
download.GetContentFile('train_set.zip')
!unzip train_set.zip

# Step 1: set up controls for evaluation experiments.


In [0]:
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV #Perforing grid search
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

# Step 2: import data and train-test split 

In [6]:
########## Importing the fidusial points ##########
import scipy.io as scio
from collections import OrderedDict 
points_path = 'train_set/points'
points = [p for p in sorted(os.listdir(points_path))]
all_points = []
for p in points:
  poiFile = os.path.join(points_path, p)
  poi = scio.loadmat(poiFile)
  poi = OrderedDict(poi)
  all_points.append(poi.popitem()[1])
y = pd.read_csv('train_set/label.csv')['emotion_idx']

print('success')

success


# Step 3: construct features and responses

In [7]:
########## Calculating pairwise distance ##########
pair_dist = []
for i in range(len(all_points)):
  pair_dist.append(metrics.pairwise_distances(all_points[i])[np.triu_indices(78)])

########## Split train_set & test_set ##########
points_train, points_test, y_train, y_test = train_test_split(pair_dist, y, random_state=42, test_size=0.2)
print('success')

success


# Step 4: Train a classification model with training features and responses

## GBM & CV (Baseline Model)

In [0]:
start = time.time()
gbm0 = GradientBoostingClassifier(random_state=42)
gbm0.fit(points_train, y_train)
finish = time.time()
print("Time：%f s" %(finish-start))
pred = gbm0.predict(points_test)
print("Baseline GBM Accuracy : %.4g" % metrics.accuracy_score(y_test, pred))

The default setting of GBM model gives accuracy 42.2%, next I'll tune the model in the following order: n_estimators, max_depth and min_samples_split, min_samples_leaf, max_features, subsample, learning_rate. 

#### Tuning Process

In [0]:
param_test1 = {'n_estimators':range(20,101,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500, min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=42), 
                        param_grid = param_test1, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch1.fit(np.array(points_train), np.array(y_train))
gsearch1.best_estimator_, gsearch1.best_params_, gsearch1.best_score_



(GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=8,
                            max_features='sqrt', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=50, min_samples_split=500,
                            min_weight_fraction_leaf=0.0, n_estimators=90,
                            n_iter_no_change=None, presort='auto',
                            random_state=42, subsample=0.8, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 {'n_estimators': 90},
 0.44895409587322294)

In [0]:
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,100)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=90, max_features='sqrt', subsample=0.8, random_state=42), 
                        param_grid = param_test2, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch2.fit(points_train, y_train)
gsearch2.best_estimator_, gsearch2.best_params_, gsearch2.best_score_

(GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=9,
                            max_features='sqrt', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=300,
                            min_weight_fraction_leaf=0.0, n_estimators=90,
                            n_iter_no_change=None, presort='auto',
                            random_state=42, subsample=0.8, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 {'max_depth': 9, 'min_samples_split': 300},
 0.4242135993612245)

In [0]:
param_test3 = {'min_samples_leaf':range(30,101,10)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=90, max_depth=9, min_samples_split=300,max_features='sqrt', subsample=0.8, random_state=42), 
                        param_grid = param_test3, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch3.fit(points_train, y_train)
gsearch3.best_estimator_, gsearch3.best_params_, gsearch3.best_score_

(GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=9,
                            max_features='sqrt', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=70, min_samples_split=300,
                            min_weight_fraction_leaf=0.0, n_estimators=90,
                            n_iter_no_change=None, presort='auto',
                            random_state=42, subsample=0.8, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 {'min_samples_leaf': 70},
 0.4462476273943968)

In [0]:
param_test4 = {'max_features':range(7,20,2)}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=90, max_depth=9, min_samples_split=300, min_samples_leaf=70, subsample=0.8, random_state=42),
                        param_grid = param_test4, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch4.fit(points_train, y_train)
gsearch4.best_estimator_, gsearch4.best_params_, gsearch4.best_score_



(GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=9,
                            max_features=19, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=70, min_samples_split=300,
                            min_weight_fraction_leaf=0.0, n_estimators=90,
                            n_iter_no_change=None, presort='auto',
                            random_state=42, subsample=0.8, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 {'max_features': 19},
 0.44071402186934916)

In [0]:
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=90, max_depth=9, min_samples_split=300, min_samples_leaf=70, subsample=0.8, max_features=19, random_state=42),
                        param_grid = param_test5, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch5.fit(points_train, y_train)
gsearch5.best_estimator_, gsearch5.best_params_, gsearch5.best_score_



(GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=9,
                            max_features=19, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=70, min_samples_split=300,
                            min_weight_fraction_leaf=0.0, n_estimators=90,
                            n_iter_no_change=None, presort='auto',
                            random_state=42, subsample=0.85, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 {'subsample': 0.85},
 0.4410745645105936)

#### Training Model

In [0]:
start = time.time()
gbm_trained1 = GradientBoostingClassifier(
    random_state=42, 
    learning_rate=0.05, 
    n_estimators=180, 
    max_depth=9, 
    min_samples_split=300, 
    min_samples_leaf=70, 
    max_features=19, 
    subsample=0.85)
gbm_trained1.fit(points_train, y_train)
finish = time.time()
print("Time：%f s" %(finish-start))
pred = gbm_trained1.predict(points_test)
print("GBM Accuracy : %.4g" % metrics.accuracy_score(y_test, pred))

Time：43.609636 s
GBM Accuracy : 0.424


In [0]:
start = time.time()
gbm_trained2 = GradientBoostingClassifier(
    random_state=42, 
    learning_rate=0.01, 
    n_estimators=900, 
    max_depth=9, 
    min_samples_split=300, 
    min_samples_leaf=70, 
    max_features=19, 
    subsample=0.85)
gbm_trained2.fit(points_train, y_train)
finish = time.time()
print("Time：%f s" %(finish-start))
pred = gbm_trained2.predict(points_test)
print("GBM Accuracy : %.4g" % metrics.accuracy_score(y_test, pred))

Time：209.384207 s
GBM Accuracy : 0.438


In [0]:
start = time.time()
gbm_trained3 = GradientBoostingClassifier(
    random_state=42, 
    learning_rate=0.005, 
    n_estimators=1800, 
    max_depth=9, 
    min_samples_split=300, 
    min_samples_leaf=70, 
    max_features=19, 
    subsample=0.85)
gbm_trained3.fit(points_train, y_train)
finish = time.time()
print("Time：%f s" %(finish-start))
pred = gbm_trained3.predict(points_test)
print("GBM Accuracy : %.4g" % metrics.accuracy_score(y_test, pred))

Time：403.559488 s
GBM Accuracy : 0.436


In [0]:
start = time.time()
gbm_trained4 = GradientBoostingClassifier(
    random_state=42, 
    learning_rate=0.005, 
    n_estimators=2000, 
    max_depth=9, 
    min_samples_split=300, 
    min_samples_leaf=70, 
    max_features=19, 
    subsample=0.85)
gbm_trained4.fit(points_train, y_train)
finish = time.time()
print("Time：%f s" %(finish-start))
pred = gbm_trained4.predict(points_test)
print("GBM Accuracy : %.4g" % metrics.accuracy_score(y_test, pred))

Time：453.803439 s
GBM Accuracy : 0.436


After cross validating, the GBM model we choose will be 
learning_rate=0.01, n_estimators=900, max_depth=9 min_samples_split=300, min_samples_leaf=70, max_features=19, subsample=0.85

#### Prediction

In [0]:
start = time.time()
pred = gbm_trained2.predict(points_train)
finish = time.time()
print("Time：%f s" %(finish-start))
print("GBM Accuracy on training data: %.4g" % metrics.accuracy_score(y_train, pred))

start = time.time()
pred = gbm_trained2.predict(points_test)
finish = time.time()
print("Time：%f s" %(finish-start))
print("GBM Accuracy on test data: %.4g" % metrics.accuracy_score(y_test, pred))

Time：3.800666 s
GBM Accuracy on training data: 1
Time：0.522831 s
GBM Accuracy on test data: 0.438


In [0]:
start = time.time()
gbm_trained = GradientBoostingClassifier(
    random_state=42, 
    learning_rate=0.01, 
    n_estimators=900, 
    max_depth=9, 
    min_samples_split=300, 
    min_samples_leaf=70, 
    max_features=19, 
    subsample=0.85)
gbm_trained.fit(pair_dist, y)
finish = time.time()
print("Time：%f s" %(finish-start))

Time：275.379290 s


In [0]:
from sklearn.externals import joblib
joblib.dump(gbm_trained, "gbm_trained.m")

## SVM & CV

In [0]:
########## Scaling datasets ##########
points_train_np = np.array(points_train)
points_test_np = np.array(points_test)
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)
points_train_scale = scale(points_train_np)
points_test_scale = scale(points_test_np)

In [0]:
########## PCA Modules ###########

#n_comp = 1000
#pca = PCA(n_components=n_comp, svd_solver='randomized',
#          whiten=True).fit(points_train_np)
#points_train_pca = pca.transform(points_train_np)
#points_test_pca = pca.transform(points_test_np)

#### Tuning Process

In [0]:
###### Tunning Modules #########
tunning = False
t0 = time.time()

if tunning:
  param_grid = {'C': [1,5,10],
              'gamma': [0.00001,0.0001,0.01], }
  clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                   param_grid, cv=5, iid=False)
  clf = clf.fit(points_train_scale, y_train)
  print("done in %0.3fs" % (time.time() - t0))
  print("Best estimator found by grid search:")
  print(clf.best_estimator_)

#### Training Model



In [13]:
########## Training Modules ##########
###### if tunning is true, there is no need to train the model again ########
t0 = time.time()
clf = SVC(kernel="rbf", class_weight="balanced",C=10,gamma=0.0001)
clf = clf.fit(points_train_scale, y_train)
print("Training done in %0.3fs" %(time.time() -t0))

Training done in 16.100s


#### Prediction

In [14]:
######## Prediction on test_set #########
t0 = time.time()
y_pred = clf.predict(points_test_scale)
acc_pred = np.sum(y_pred == y_test)/y_test.shape[0]
print("Prediction on test_set done in %0.3fs" % (time.time() - t0))
print("Test_set accurarcy is %0.3f" %acc_pred)

Prediction on test_set done in 5.152s
Test_set accurarcy is 0.512


In [15]:
######## Prediction on train_set #########
t0 = time.time()
y_pred_train = clf.predict(points_train_scale)
acc_pred_train = np.sum(y_pred_train == y_train)/y_train.shape[0]
print("Prediction on train_set done in %0.3fs" % (time.time() - t0))
print("Train_set accurarcy is %0.3f" %acc_pred_train)

Prediction on train_set done in 20.572s
Train_set accurarcy is 0.821


In [16]:
y_pred_train = clf.predict(points_train_scale)
np.sum(y_pred_train == y_train)/y_train.shape[0]

0.821

In [0]:
C=10
C1=8
gamma=0.0001

In [0]:
1/2000

0.0005

In [23]:
joblib.dump(clf, "svm_final.m")

['svm_final.m']

## Xgboost & CV

In [0]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
import time
import numpy as np

In [0]:
def modelfit(alg, dtrain, predictors, cv_folds=10):
  #Fit the algorithm on the data
  alg.fit(dtrain, predictors)

  #Predict training set:
  dtrain_predictions = alg.predict(dtrain)
  dtrain_predprob = alg.predict_proba(dtrain)[:,1]

  #Print model report:
  print("\nModel Report")
  print("Accuracy : %.4g" % metrics.accuracy_score(predictors, dtrain_predictions))

In [0]:
start = time.time()
xgb1 = XGBClassifier(
 objective= 'multi:softmax',
 num_class= 23,
 seed=27)

modelfit(xgb1, np.array(points_train), np.array(y_train))
finish = time.time()
print("Prediction on train_set done in %0.3fs" % (finish-start))


Model Report
Accuracy : 1
Prediction on train_set done in 782.057s


In [0]:
start = time.time()
preds = xgb1.predict(points_test)
acc_pred = metrics.accuracy_score(preds, y_test)
finish = time.time()
print("Prediction on test_set done in %0.3fs" % (finish - start))
print("Test_set accurarcy is %0.3f" %acc_pred)

Prediction on test_set done in 0.492s
Test_set accurarcy is 0.482


We can see that before we change any paramters, the accuracy rate of xgboost is 48.2%,which is better than our baseline. The speed is better as well. Hence, we think we can consider this method.

#### Tuning Process

As it takes a lot of time to fit the model, we only tune the most important parameters.

As I tried to change many parameters  in the model, I found 'n_estimators' and 'min_child_weight' influenced the performance  a lot.

Step1: n_estimators

In [0]:
start = time.time()
xgb2 = XGBClassifier(
 n_estimators = 1000, 
 objective= 'multi:softmax',
 num_class= 23,
 max_depth =5,
 min_child_weight =1,
 nthread =4,
 subsample = 0.8,
 colsample_bytree  = 0.8,
 scale_pos_weight = 1,
 seed=27)

modelfit(xgb2, np.array(points_train), np.array(y_train))
finish = time.time()
print("Time：%f s" %(finish-start))
print("Test_set accurarcy is %0.3f" %acc_pred)


Model Report
Accuracy : 1
Time：800.118329 s


In [0]:
start = time.time()
preds = xgb2.predict(points_test)
acc_pred = metrics.accuracy_score(preds, y_test)
finish = time.time()
print("Prediction on test_set done in %0.3fs" % (finish - start))
print("Test_set accurarcy is %0.3f" %acc_pred)

Prediction on test_set done in 0.634s
Test_set accurarcy is 0.528


We changed n_estimators to 1000. Then the accuracy rate of the model increased a lot and the running time did not increase much. Hence, we tuned n_estimators to 1000. And we tuned 'subsample' and 'colsample_bytree' to a more reasonable value.

Step2: Tune min_child_weight

In [0]:
param_test1 = {
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier(n_estimators = 1000, 
 objective= 'multi:softmax',
 num_class= 23,
 max_depth =5,
 min_child_weight =1,
 nthread =4,
 subsample = 0.8,
 colsample_bytree  = 0.8,
 scale_pos_weight = 1,                                               
 seed=27), 
param_grid = param_test1, scoring = 'accuracy',iid=False, cv=2)
gsearch1.fit(np.array(points_train), np.array(y_train))
gsearch1.best_params_, gsearch1.best_score_, gsearch1.cv_results_

({'min_child_weight': 1},
 0.45233928421423175,
 {'mean_fit_time': array([302.53986263, 267.62051725, 259.8642416 ]),
  'mean_score_time': array([0.34943461, 0.34187734, 0.33939075]),
  'mean_test_score': array([0.45233928, 0.44191391, 0.444447  ]),
  'param_min_child_weight': masked_array(data=[1, 3, 5],
               mask=[False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'min_child_weight': 1},
   {'min_child_weight': 3},
   {'min_child_weight': 5}],
  'rank_test_score': array([1, 3, 2], dtype=int32),
  'split0_test_score': array([0.47912525, 0.45626243, 0.45328032]),
  'split1_test_score': array([0.42555332, 0.42756539, 0.43561368]),
  'std_fit_time': array([2.66214442, 4.87970066, 2.17809272]),
  'std_score_time': array([0.00725555, 0.01867068, 0.00683665]),
  'std_test_score': array([0.02678596, 0.01434852, 0.00883332])})

We found that 'min_child_weight' = 1 is the best. And we get our final xgboost model.

#### Training Model

In [0]:
start = time.time()
xgb_final = XGBClassifier(
 n_estimators = 1000, 
 objective= 'multi:softmax',
 num_class= 23,
 max_depth =5,
 min_child_weight =1,
 nthread =4,
 subsample = 0.8,
 colsample_bytree  = 0.8,
 scale_pos_weight = 1,
 seed=27)

modelfit(xgb_final, np.array(points_train), np.array(y_train))
finish = time.time()
print("Time：%f s" %(finish-start))


Model Report
Accuracy : 1
Time：807.873069 s


#### Prediction

In [0]:
start = time.time()
preds = xgb_final.predict(points_test)
acc_pred = metrics.accuracy_score(preds, y_test)
finish = time.time()
print("Prediction on test_set done in %0.3fs" % (finish - start))
print("Test_set accurarcy is %0.3f" %acc_pred)

Prediction on test_set done in 0.629s
Test_set accurarcy is 0.528


In [0]:
start = time.time()
modelfit(xgb_final, np.array(pair_dist), np.array(y))
finish = time.time()
print("Time：%f s" %(finish-start))


Model Report
Accuracy : 1
Time：1133.331869 s


In [0]:
joblib.dump(xgb_final, "xgb_final.m")

# Step 5: Run test on test images

#### GBM

In [0]:
gbm_final=joblib.load("gbm_trained.m")

start_gbm = time.time()
pred_gbm_final = gbm_final.predict(points_test)
finish_gbm = time.time()
print("GBM Accuracy on test data: %.4g" %metrics.accuracy_score(preds_gbm_final, y_test))

#### XGBOOST

In [0]:
xgb_final=joblib.load("xgb_final.m")

start_xgb = time.time()
preds_xgb_final = xgb_final.predict(points_test)
acc_pred_xgb_final = metrics.accuracy_score(preds_xgb_final, y_test)
finish_xgb = time.time()
print("XGBOOST Accuracy on test data: %0.3f" %acc_pred_xgb_final)

#### SVM

In [0]:
svm_final=joblib.load("svm_final.m")

start_svm = time.time()
y_pred_svm_final = svm_final.predict(points_test_scale)
acc_pred_svm_final = np.sum(y_pred == y_test)/y_test.shape[0]
finish_svm = time.time()
print("Test_set accurarcy is %0.3f" %acc_pred_svm_final)



# Summarize Running Time

In [0]:
print("Time：%f s" %(finish_gbm-start_gbm)) # GBM
print("Time：%f s" %(finish_xgb-start_xgb)) # XGBOOST
print("Time：%f s" %(finish_svm-start_svm)) # SVM