In [4]:
import os
import pickle
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io
import sklearn
from scipy.spatial.distance import pdist
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Train Dataset

In [6]:
# Load every training-set points file, in sorted filename order.
# NOTE(review): presumably this order lines up with the rows of label.csv -- confirm.
dirname = "../data/train_set/points"
# Only pick up MAT files so stray directory entries (e.g. .DS_Store or
# .ipynb_checkpoints) cannot make scipy.io.loadmat fail.
mat = [
    scipy.io.loadmat(os.path.join(dirname, fname))
    for fname in sorted(os.listdir(dirname))
    if fname.lower().endswith(".mat")
]

label_file = pd.read_csv("../data/train_set/label.csv")

In [7]:
# Pull the array stored under the first key that starts with "face" out of
# each loaded MAT dict. The previous pattern r'face*' was a glob-style mistake:
# as a regex it means "fac" followed by zero or more "e". re.match already
# anchors at the start of the key, so the plain prefix is what is needed.
pattern = re.compile(r'face')
coords = [
    sample[[key for key in sample if pattern.match(key)][0]]
    for sample in mat
]

In [8]:
# Feature vector of a sample = condensed pairwise distances between the rows
# of its coordinate array (pdist with its default metric).
feature_mat = np.array([pdist(x) for x in coords])

# Keep the fitted PCA object (instead of discarding it) so the exact same
# projection can be applied to other data later; seed it so the reduced
# features are reproducible across runs.
pca = PCA(n_components=500, random_state=0)
reduced_feature_mat = pca.fit_transform(feature_mat)

label = np.asarray(label_file.loc[:, 'emotion_idx'])

# Cross-Validation to Tune Parameter

# Compare several n_estimators values with 3-fold cross-validated accuracy.
# x holds the candidate values, y the mean test accuracy for each of them.
x = list(range(40, 110, 10))
y = []
for k in x:
    # n_iter_no_change enables early stopping on a held-out validation_fraction
    # of the training data, so k is an upper bound on the number of boosting
    # stages. random_state fixes that internal split so reruns give the same
    # scores.
    model = GradientBoostingClassifier(n_estimators=k, validation_fraction=0.2,
                                       n_iter_no_change=5, tol=0.01,
                                       random_state=0)
    cvs = cross_validate(model, feature_mat, label, cv=3, scoring='accuracy')
    print('run for n_estimators={}, '.format(k),'with accuracy {}'.format(cvs['test_score']))
    y.append(np.mean(cvs['test_score']))

# Label the figure so it can be read on its own.
fig, ax = plt.subplots()
ax.plot(x, y, marker='o')
ax.set(xlabel='n_estimators', ylabel='mean 3-fold CV accuracy',
       title='GradientBoostingClassifier: accuracy vs. n_estimators')
plt.show()

## Finally, we decided to use GBM with n_estimators=60 as our baseline model

In [None]:
# Baseline model: gradient boosting with 60 stages, fitted on the raw
# pairwise-distance features. random_state is fixed so that refitting
# reproduces the same model.
gbm0 = GradientBoostingClassifier(n_estimators=60, random_state=0)
gbm0.fit(feature_mat, label)

### If training takes too long, the fitted model can be loaded from our saved file instead

In [11]:
# Load the previously saved baseline model instead of retraining.
# NOTE: this rebinds gbm0, replacing any model fitted earlier in the session.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load this file if it comes from a trusted source.
with open('../doc/baseline_model.sav', 'rb') as model_file:
    gbm0 = pickle.load(model_file)

# Test Dataset

In [12]:
# Load every test-set points file, in sorted filename order.
# NOTE: this rebinds `mat` and `dirname`, which previously referred to the
# training set.
dirname = "../data/test_set/points"
# Only pick up MAT files so stray directory entries (e.g. .DS_Store or
# .ipynb_checkpoints) cannot make scipy.io.loadmat fail.
mat = [
    scipy.io.loadmat(os.path.join(dirname, fname))
    for fname in sorted(os.listdir(dirname))
    if fname.lower().endswith(".mat")
]

In [13]:
# Pull the array stored under the first key that starts with "face" out of
# each loaded MAT dict. The previous pattern r'face*' was a glob-style mistake:
# as a regex it means "fac" followed by zero or more "e". re.match already
# anchors at the start of the key, so the plain prefix is what is needed.
pattern = re.compile(r'face')
coords = [
    sample[[key for key in sample if pattern.match(key)][0]]
    for sample in mat
]

In [14]:
# Same feature construction as for training: condensed pairwise distances
# between the rows of each coordinate array.
feature_mat_test = np.array([pdist(x) for x in coords])

# The projection must be learned from the training features and only applied
# to the test features. Calling fit_transform on the test set (as before)
# learns a different basis, so the reduced test features would not be
# comparable with the reduced training features.
reduced_feature_mat_test = (
    PCA(n_components=500, random_state=0)
    .fit(feature_mat)
    .transform(feature_mat_test)
)
# The original cell rebound `reduced_feature_mat` to the test projection;
# keep that binding so any later use of the name still sees test data.
reduced_feature_mat = reduced_feature_mat_test

In [15]:
# Predict a label for every test sample from the raw pairwise-distance
# features (not the PCA projection), then display the resulting array.
pred = gbm0.predict(feature_mat_test)
pred

array([22,  7,  9, ...,  4, 14, 19])

In [19]:
# Add the baseline predictions as a "Baseline" column of the shared
# predictions file and write it back in place.
data = pd.read_csv("../data/test_set/label_predictions.csv")
data["Baseline"] = pred
# index=False: without it pandas writes the row index as an extra unnamed
# column, and because this file is read back and rewritten, every run of this
# cell would add one more "Unnamed: 0"-style column.
data.to_csv("../data/test_set/label_predictions.csv", index=False)