In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import numpy as np
import statistics

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import neighbors
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import max_error
from sklearn.metrics import explained_variance_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from operator import itemgetter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import torchvision
from torchvision import datasets, models, transforms

import warnings

cudnn.benchmark = True
warnings.filterwarnings("ignore")

<div class="alert alert-block alert-success">
Data Preprocessing and feature extraction 
</div>

In [2]:
# Installing opensmile via pip
!pip install opensmile
# update required packages if needed
!pip install --upgrade pyyaml
# install audiofile 
!pip install audiofile
# install XtremeGradientBoost classifier
!pip install xgboost



In [3]:
import opensmile
import audiofile
from xgboost import XGBClassifier

In [4]:
# Scale x feastures
def scale_features(X_train, X_test):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

# Train SVM classifier
def train_svm(X_train, Y_train, Z_train, svm_kernel='rbf'):
    ###########################################################################################
    #  OPTIMIZATION THROUGH THE LEAVE-ONE-SPEAKER-OUT METHOD, i.e. (6-FOLD) CROSS-VALIDATION  #
    ###########################################################################################

    # Define 5 levels of complexity
    C = [0.0001, 0.001, 0.01, 0.1, 1.0]
    # Create list to collect mean results (across the folds) for each complexity
    results = []
    # Run experiments for each complexity
    for elem in C:
        # Define classifier with one-vs-the-rest multi-class strategy
        clf = svm.SVC(C=elem, kernel=svm_kernel)
        # Create empty list to collect scores for each cv for the given complexity
        scores = []
        users = np.unique(Z_train)
        # Create splitting considering each time a different speaker in test
        for speaker_out in users:
            data_train   = np.empty((0, X_train.shape[1]))
            target_train = np.empty((0, 1))
            data_val     = np.empty((0, X_train.shape[1]))
            target_val   = np.empty((0, 1))
            for index, utterance in enumerate(Z_train):
                if utterance == speaker_out:
                    data_val   = np.append(data_val, [X_train[index]], axis=0)
                    target_val = np.append(target_val, [Y_train[index]], axis=0)
                else:
                    data_train   = np.append(data_train, [X_train[index]], axis=0)
                    target_train = np.append(target_train, [Y_train[index]], axis=0)
            # Run svm for each of the 6-fold cross validations
            clf.fit(data_train, target_train.ravel())  # add .ravel() to avoid dataConversion warning
            predictions = clf.predict(data_val)
            UAR = recall_score(target_val, predictions, average='macro')
            scores.append(UAR)
        # Compute mean of accuracies for the given complexity and append it to the results list
        results.append((statistics.mean(scores), elem))
    return results

#Tune SVM for best C gotten from cross-validation earlier
def tune_svm(results, X_train, Y_train, X_test, Y_test, svm_kernel='rbf'):
    #############################################
    #  PERFORM FINAL TRAINING/TEST WITH BEST C  #
    #############################################

    # Get best complexity from the cross-validation
    best_C = max(results, key=itemgetter(0))[1]

    print(f'C: {best_C}')
    
    # Make training again with the optimal hyper-parameters
    clf = svm.SVC(C=best_C, kernel=svm_kernel)
    clf.fit(X_train, Y_train.ravel())
    # Make final test
    predictions = clf.predict(X_test)
    UAR = recall_score(Y_test, predictions, average='macro')
    print(predictions)
    print(UAR)
    
    return predictions

# print the statistics of the predictions for a given test set (of labels)
def print_statistics(Y_test, predictions):
    # Compute evaluation metrics
    UAR = recall_score(Y_test, predictions, average='macro')
    WAR = recall_score(Y_test, predictions, average='weighted')
    print('UAR =', UAR)
    print('WAR =', WAR)
    # Confusion matrix

    cm = confusion_matrix(Y_test, predictions, labels=[1,2,3,4,5,6,7,8])
    print(cm)

    # Compute recall, precision, and F1 score for each class
    rec_result = recall_score(Y_test, predictions, average=None, labels=[1,2,3,4,5,6,7,8])
    print('Recall for neutral =', rec_result[0]*100, '%')
    print('Recall for calm =',  rec_result[1]*100, '%')
    print('Recall for happy =', rec_result[2]*100, '%')
    print('Recall for sad =', rec_result[3]*100, '%')
    print('Recall for angry =', rec_result[4]*100, '%')
    print('Recall for fearful =', rec_result[5]*100, '%')
    print('Recall for disgust =', rec_result[6]*100, '%')
    print('Recall for surprised =', rec_result[7]*100, '%')

    prec_result = precision_score(Y_test, predictions, average=None, labels=[1,2,3,4,5,6,7,8])
    print('Precision for neutral =', prec_result[0]*100, '%')
    print('Precision for calm =',  prec_result[1]*100, '%')
    print('Precision for happy =', prec_result[2]*100, '%')
    print('Precision for sad =', prec_result[3]*100, '%')
    print('Precision for angry =', prec_result[4]*100, '%')
    print('Precision for fearful =', prec_result[5]*100, '%')
    print('Precision for disgust =', prec_result[6]*100, '%')
    print('Precision for surprised =', prec_result[7]*100, '%')

<div class="alert alert-block alert-danger">
Task 0: Extract the baseline eGeMAPSv02 feature set 
</div>

In [5]:
# Define feature extractor to get functionals from the eGeMAPS (v02) feature set
smile_func = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

In [6]:
my_dir = os.getcwd()

# Collect data
for phase in ['test', 'train']:
    print(phase)
    
    functionals = []
    
    csv_name = my_dir + '/RESULTS/' + phase + '.csv'
    labels_ID = {'label': [], 'intensity' : [], 'ID': [], 'name': []}
    for file in os.listdir(f'resources/{phase}/'):  
        file_name  = os.path.basename(file[0:-4])
        speaker_ID = file_name[-2:]
        label      = file_name[6:8]
        intensity  = file_name[9:11]

        labels_ID['label'].append(label)
        labels_ID['intensity'].append(intensity)
        labels_ID['ID'].append(speaker_ID)
        labels_ID['name'].append(file_name)
        
        functional = smile_func.process_file(f'resources/{phase}/{file}')
        functional['file'] = file
        functionals.append(functional)
    
    pd.concat(functionals).to_csv(csv_name, index=False)
    df = pd.DataFrame.from_dict(labels_ID)
    df.to_csv(my_dir + '/RESULTS/' + phase + '_labels_ID.csv', sep='\t', index = False)
    
print("Finished data extraction phase")

test
train
Finished data extraction phase


<div class="alert alert-block alert-danger">
Task 1: Baseline for SER
</div>

In [7]:
#################
#  DATA IMPORT  #
#################

my_dir = os.getcwd()

features = pd.read_csv(my_dir + '/RESULTS/train.csv', sep=',', index_col=-1)
ID_labels = pd.read_csv(my_dir + '/RESULTS/train_labels_ID.csv', sep='\t')
X_train = features.values  # this extracts the values as a numpy array
Y_train = ID_labels.loc[:, ['label']].to_numpy()
Z_train = ID_labels.loc[:, ['ID']].to_numpy()

features = pd.read_csv(my_dir + '/RESULTS/test.csv', sep=',', index_col=-1)
ID_labels = pd.read_csv(my_dir + '/RESULTS/test_labels_ID.csv', sep='\t')
X_test = features.values  # this extracts the values as a numpy array
Y_test = ID_labels.loc[:, ['label']].to_numpy()

# exploring training set
print('TRAINING SET')
print(X_train.shape[0], ' instances')
print(len(np.unique(Y_train)), ' classes: ', np.unique(Y_train))
print(X_train.shape[1], ' features')

TRAINING SET
1200  instances
8  classes:  [1 2 3 4 5 6 7 8]
88  features


## Feature Scaling ##

In [8]:
####################
# Feature scaling #
####################
# Normalize features in the training set
X_train, X_test = scale_features(X_train, X_test)

## Tuning the classifier & making predictions ##

In [9]:
results = train_svm(X_train, Y_train, Z_train, svm_kernel='linear')
print(results)

[(0.33359375, 0.0001), (0.42890625, 0.001), (0.49453125, 0.01), (0.509375, 0.1), (0.475, 1.0)]


In [10]:
predictions = tune_svm(results, X_train, Y_train, X_test, Y_test, svm_kernel='linear')

C: 0.1
[2 2 1 4 2 1 3 1 5 2 3 1 2 2 1 1 2 2 2 4 2 2 1 4 2 2 2 2 2 4 1 4 2 2 2 4 2
 2 1 2 2 2 1 4 2 2 1 2 5 2 3 1 5 3 3 1 5 1 3 1 5 2 3 1 5 3 3 7 8 3 3 5 5 6
 8 3 8 3 5 3 4 2 1 4 7 2 2 4 5 2 2 1 5 2 1 1 5 1 2 4 5 2 2 7 4 4 4 7 4 4 2
 4 5 2 5 5 5 1 8 3 5 7 8 5 5 7 8 5 5 5 5 5 5 3 5 5 5 5 5 5 5 5 5 5 5 3 3 4
 7 3 1 6 4 6 1 6 7 3 1 6 6 6 6 3 6 6 3 6 5 6 5 6 5 6 5 6 5 1 8 7 3 7 4 7 5
 6 8 7 5 2 8 7 8 2 5 8 5 7 5 7 3 6 5 7 3 3 8 7 8 8 8 4 8 8 3 8 8 8 8 8 8 8
 8 7 8 8 8 8 8 8 8 8 8 8 8 6 8 3 8 4]
0.49609375


**Evaluation metrics**

We will evalaute our model's results in terms of:

- UAR
- WAR
- Recall per class
- Precision per class

In [11]:
print_statistics(Y_test, predictions)

UAR = 0.49609375
WAR = 0.5041666666666667
[[ 6  6  2  1  1  0  0  0]
 [ 5 21  0  6  0  0  0  0]
 [ 5  2 12  0  8  1  1  3]
 [ 5 10  0 10  4  0  3  0]
 [ 1  1  2  0 23  0  2  3]
 [ 3  0  6  2  5 14  2  0]
 [ 1  2  4  1  7  2  9  6]
 [ 0  0  2  2  0  1  1 26]]
Recall for neutral = 37.5 %
Recall for calm = 65.625 %
Recall for happy = 37.5 %
Recall for sad = 31.25 %
Recall for angry = 71.875 %
Recall for fearful = 43.75 %
Recall for disgust = 28.125 %
Recall for surprised = 81.25 %
Precision for neutral = 23.076923076923077 %
Precision for calm = 50.0 %
Precision for happy = 42.857142857142854 %
Precision for sad = 45.45454545454545 %
Precision for angry = 47.91666666666667 %
Precision for fearful = 77.77777777777779 %
Precision for disgust = 50.0 %
Precision for surprised = 68.42105263157895 %


<div class="alert alert-block alert-danger">
Task 2: Improvement over basemodel
</div>

<div class="alert alert-block alert-warning">
eGeMAPSv02 feature + RBF kernel
</div>

As we have now our baseline model, we would first of like to evaluate the eGeMAPSv02 feature set with a RBF-kernel for our SVM instead of a linear kernel

In [12]:
results = train_svm(X_train, Y_train, Z_train, svm_kernel='rbf')
print(results)

[(0.37890625, 0.0001), (0.37890625, 0.001), (0.37890625, 0.01), (0.396875, 0.1), (0.48828125, 1.0)]


In [13]:
predictions = tune_svm(results, X_train, Y_train, X_test, Y_test, svm_kernel='rbf')

C: 1.0
[2 2 1 4 2 1 3 1 3 2 3 4 2 2 1 4 2 2 1 4 2 2 1 4 2 2 1 2 2 4 2 2 2 2 2 2 2
 2 4 2 2 2 1 2 2 2 1 2 1 2 8 1 3 3 3 2 3 1 3 1 7 2 3 4 5 3 3 3 8 3 6 3 3 3
 8 3 5 3 8 6 4 2 2 2 7 2 2 4 7 2 2 4 7 2 1 1 4 4 1 4 5 4 4 7 4 4 4 4 5 4 4
 4 5 7 5 3 5 1 8 3 5 7 8 3 5 5 8 3 5 3 5 5 5 3 5 3 5 5 5 3 5 5 5 5 7 3 1 4
 7 3 1 6 3 2 4 7 3 6 1 6 3 6 3 6 6 6 3 6 5 6 3 6 5 6 3 6 6 3 8 7 6 7 6 7 4
 4 1 3 5 3 1 7 5 3 8 4 5 7 8 7 5 6 8 7 5 3 8 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 7 8 7 8 8]
0.50390625


In [14]:
# Compute evaluation metrics
print_statistics(Y_test, predictions)

UAR = 0.50390625
WAR = 0.5208333333333334
[[ 4  6  3  3  0  0  0  0]
 [ 5 23  0  4  0  0  0  0]
 [ 4  3 15  1  2  2  1  4]
 [ 3  8  0 15  2  0  4  0]
 [ 1  0  8  0 18  0  2  3]
 [ 3  1  9  2  2 12  3  0]
 [ 2  0  5  3  5  4  8  5]
 [ 0  0  0  0  0  0  2 30]]
Recall for neutral = 25.0 %
Recall for calm = 71.875 %
Recall for happy = 46.875 %
Recall for sad = 46.875 %
Recall for angry = 56.25 %
Recall for fearful = 37.5 %
Recall for disgust = 25.0 %
Recall for surprised = 93.75 %
Precision for neutral = 18.181818181818183 %
Precision for calm = 56.09756097560976 %
Precision for happy = 37.5 %
Precision for sad = 53.57142857142857 %
Precision for angry = 62.06896551724138 %
Precision for fearful = 66.66666666666666 %
Precision for disgust = 40.0 %
Precision for surprised = 71.42857142857143 %


While quite simple and fast to both implement and compute, it does not bring any real advantage, apart from a slightly raised WAR. For recall and precision, some areas are better, yet a lot are worse. Therefore eGeMAPSv02 feature + RBF kernel do not bring any advantage.

<div class="alert alert-block alert-warning">
ComParE_2016 feature set + SVMs
</div>

In [15]:
 # New opensmile feature extraction in here
    # Define feature extractor to get functionals from the eGeMAPS (v02) feature set
smile_func = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
)

In [16]:
my_dir = os.getcwd()

# Collect data
for phase in ['test', 'train']:
    print(phase)
    
    functionals = []
    
    csv_name = my_dir + '/RESULTS/' + phase + '2.csv'
    labels_ID = {'label': [], 'intensity' : [], 'ID': [], 'name': []}
    for file in os.listdir(f'resources/{phase}/'):  
        file_name  = os.path.basename(file[0:-4])
        speaker_ID = file_name[-2:]
        label      = file_name[6:8]
        intensity  = file_name[9:11]

        labels_ID['label'].append(label)
        labels_ID['intensity'].append(intensity)
        labels_ID['ID'].append(speaker_ID)
        labels_ID['name'].append(file_name)
        
        functional = smile_func.process_file(f'resources/{phase}/{file}')
        functional['file'] = file
        functionals.append(functional)
    
    pd.concat(functionals).to_csv(csv_name, index=False)
    df = pd.DataFrame.from_dict(labels_ID)
    df.to_csv(my_dir + '/RESULTS/' + phase + '2_labels_ID.csv', sep='\t', index = False)
    
print("Finished data extraction phase 2")

test
train
Finished data extraction phase 2


In [17]:
#################
#  DATA IMPORT  #
#################

my_dir = os.getcwd()

features = pd.read_csv(my_dir + '/RESULTS/train2.csv', sep=',', index_col=-1)
ID_labels = pd.read_csv(my_dir + '/RESULTS/train2_labels_ID.csv', sep='\t')
X_train = features.values  # this extracts the values as a numpy array
Y_train = ID_labels.loc[:, ['label']].to_numpy()
Z_train = ID_labels.loc[:, ['ID']].to_numpy()

features = pd.read_csv(my_dir + '/RESULTS/test2.csv', sep=',', index_col=-1)
ID_labels = pd.read_csv(my_dir + '/RESULTS/test2_labels_ID.csv', sep='\t')
X_test = features.values  # this extracts the values as a numpy array
Y_test = ID_labels.loc[:, ['label']].to_numpy()

# exploring training set
print('TRAINING SET')
print(X_train.shape[0], ' instances')
print(len(np.unique(Y_train)), ' classes: ', np.unique(Y_train))
print(X_train.shape[1], ' features')

TRAINING SET
1200  instances
8  classes:  [1 2 3 4 5 6 7 8]
6373  features


In [18]:
####################
# Feature scaling #
####################
X_train, X_test = scale_features(X_train, X_test)

<div class="alert alert-block alert-warning">
Idea: Use grid search over the SVM over ComParE_2016 feature set with GridSearchCV
</div>

Another idea to find the best possible SVM classifier, apart from simply trying out several combinations by hand, is doing exhaustive search (for hyperparameter training) over out Leave-One-Out SVM. More specifically, we would like to find the most favourable combinations of C and Gamma with respect to the used kernel. As the last controlled parameter, we choose both balanced and non-optimized class weights for each combintation of C, gamma and kernel.

- C -> Is a regularization parameter
    - Higher C -> Avoid as much missclassification as possible, but smaller-margin hyperplane.
    - Smaller C -> Aim for wider-margin hyperplance, but allow also more missclassifications
- Gamma -> Only affects the outcome for RBF kernels, for other kernels this parameter will be ignored mostly
    - Small Gamma -> The influence of a training sample "reaches far", therefore has more influence on the decision boundary even when that is far away. This might lead to SVMs that cannot capture the complexity of data and might only separate rather poorly
    - High Gamma -> The influence of a training does only reach rather close to the sample, but it has a strong influence.This might lead to overfitting, no matter how we choose our C value
   
For more about RFB SVM parameters, look here: <https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html>

In [19]:
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'class_weight': ['balanced', None],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'sigmoid'],
}

def train_SVM_grid(x_train,y_train,param_grid):
    tree = svm.SVC(random_state=55)
    model = GridSearchCV(tree,param_grid=param_grid, cv=6, n_jobs=-1, verbose=3)
    model.fit(x_train,y_train)
    print(f'{model.best_params_}\n{model.best_estimator_}')
    return model.best_params_,model.best_estimator_

def predict_SVM_grid(model, x_train, y_train, x_test, y_test):
    # Make training again with the optimal hyper-parameters
    clf = svm.SVC(C=model.C, kernel=model.kernel, gamma=model.gamma, class_weight=model.class_weight)
    clf.fit(X_train, y_train.ravel())
    # Make final test
    predictions = clf.predict(x_test)
    WAR = recall_score(y_test, predictions, average='weighted')
    print(predictions)
    print(WAR)
    
    return predictions

In [20]:
svm_param, svm_model = train_SVM_grid(X_train, Y_train.ravel(), param_grid)

Fitting 6 folds for each of 120 candidates, totalling 720 fits
{'C': 0.001, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'linear'}
SVC(C=0.001, class_weight='balanced', gamma=1, kernel='linear', random_state=55)


In [21]:
predictions = predict_SVM_grid(svm_model, X_train, Y_train.ravel(), X_test, Y_test.ravel())

[1 1 1 1 7 1 5 4 3 2 1 4 4 2 5 1 2 4 2 4 4 2 7 2 2 2 7 2 2 4 7 4 2 2 2 2 2
 2 2 2 2 2 7 2 2 2 2 2 1 3 8 7 3 3 6 2 3 3 8 1 3 4 8 3 5 6 8 4 5 3 8 3 3 3
 8 3 3 3 8 3 4 2 2 7 7 2 2 2 4 7 7 4 4 5 7 4 3 4 7 3 3 4 7 7 4 4 7 4 4 4 2
 7 5 5 8 5 5 5 8 5 5 5 8 5 5 5 8 5 5 5 6 5 5 5 3 7 5 5 5 5 5 5 5 5 5 6 5 6
 7 6 6 6 7 3 1 6 7 3 1 5 6 6 5 6 6 6 3 6 3 6 8 6 5 6 6 3 5 3 7 7 7 7 7 7 5
 5 7 7 5 7 7 7 7 7 7 7 5 7 7 7 7 7 7 7 5 7 8 7 8 8 8 8 8 8 8 8 8 8 8 6 5 8
 8 8 5 8 8 8 3 3 8 8 8 6 8 6 5 8 8 8]
0.6083333333333333


In [22]:
print_statistics(Y_test.ravel(), predictions)

UAR = 0.59765625
WAR = 0.6083333333333333
[[ 7  2  1  3  2  0  1  0]
 [ 0 23  0  5  0  0  4  0]
 [ 2  1 15  2  2  2  1  7]
 [ 0  6  3 12  1  0 10  0]
 [ 0  0  1  0 25  1  1  4]
 [ 2  0  5  0  5 16  3  1]
 [ 0  0  1  0  6  0 24  1]
 [ 0  0  2  0  3  3  0 24]]
Recall for neutral = 43.75 %
Recall for calm = 71.875 %
Recall for happy = 46.875 %
Recall for sad = 37.5 %
Recall for angry = 78.125 %
Recall for fearful = 50.0 %
Recall for disgust = 75.0 %
Recall for surprised = 75.0 %
Precision for neutral = 63.63636363636363 %
Precision for calm = 71.875 %
Precision for happy = 53.57142857142857 %
Precision for sad = 54.54545454545454 %
Precision for angry = 56.81818181818182 %
Precision for fearful = 72.72727272727273 %
Precision for disgust = 54.54545454545454 %
Precision for surprised = 64.86486486486487 %


ComParE_2016 feature + linear kernel seems to be a very good combination at it outperforms the baseline in all recall values and all but two precision values. The UAR went up by a solid 10 percent and WAR bei 8 percent. Overall a decent result. 

<div class="alert alert-block alert-warning">
Gradient boosted classification via grid search
</div>

In [23]:
param_grid = {
    'criterion': ['friedman_mse', 'squared_error'],
    'loss': ['deviance', 'exponential'],
    'max_depth': range(1,10),
    'max_features': ['sqrt', 'log2'],
}

def train_dec_tree(x_train,y_train,param_grid):
    tree = GradientBoostingClassifier(random_state=55)
    model = GridSearchCV(tree,param_grid=param_grid, n_jobs=-1, verbose=3)
    model.fit(x_train,y_train)
    print(f'{model.best_params_}\n{model.best_estimator_}')
    return model.best_params_,model.best_estimator_

def predict_dec_tree(model, x_train, y_train, x_test, y_test):
    classifier = GradientBoostingClassifier(criterion=model.criterion, max_depth=model.max_depth, max_features=model.max_features)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    WAR = recall_score(y_test, predictions, average='weighted')
    print(predictions)
    print(WAR)
    
    return predictions

In [24]:
tree_params, tree_model = train_dec_tree(X_train, Y_train.ravel(), param_grid)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'criterion': 'squared_error', 'loss': 'deviance', 'max_depth': 3, 'max_features': 'sqrt'}
GradientBoostingClassifier(criterion='squared_error', max_features='sqrt',
                           random_state=55)


In [25]:
predictions = predict_dec_tree(tree_model, X_train, Y_train.ravel(), X_test, Y_test.ravel())

[7 2 2 2 4 1 3 4 4 2 3 4 4 4 7 4 2 2 2 2 2 2 2 2 2 2 7 2 2 2 7 2 2 2 2 2 2
 2 7 2 7 2 7 2 2 2 7 2 4 4 8 7 4 3 3 4 5 3 3 4 3 4 8 4 5 6 8 3 8 3 8 3 5 3
 8 3 5 3 8 3 7 4 2 2 7 4 2 2 7 2 1 2 7 2 2 2 5 4 7 3 7 4 7 7 7 4 7 6 8 4 7
 7 5 7 8 7 5 7 8 3 5 7 8 5 5 5 8 5 5 5 5 3 5 5 5 7 5 5 5 5 5 5 5 7 7 4 7 6
 7 6 6 7 7 3 1 4 7 3 1 7 5 6 3 6 6 3 3 3 5 6 3 6 5 6 3 3 5 3 7 7 5 7 7 7 5
 7 7 3 5 7 7 5 7 7 7 7 5 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 5 8 8 8 3 8 8 8 8 8 8]
0.5333333333333333


In [26]:
print_statistics(Y_test.ravel(), predictions)

UAR = 0.50390625
WAR = 0.5333333333333333
[[ 1  4  2  7  0  0  2  0]
 [ 0 26  0  0  0  0  6  0]
 [ 0  0 12  7  4  1  1  7]
 [ 1  9  1  6  1  1 12  1]
 [ 0  0  2  0 20  0  6  4]
 [ 2  0  9  2  3  9  7  0]
 [ 0  0  2  0  6  0 24  0]
 [ 0  0  1  0  1  0  0 30]]
Recall for neutral = 6.25 %
Recall for calm = 81.25 %
Recall for happy = 37.5 %
Recall for sad = 18.75 %
Recall for angry = 62.5 %
Recall for fearful = 28.125 %
Recall for disgust = 75.0 %
Recall for surprised = 93.75 %
Precision for neutral = 25.0 %
Precision for calm = 66.66666666666666 %
Precision for happy = 41.37931034482759 %
Precision for sad = 27.27272727272727 %
Precision for angry = 57.14285714285714 %
Precision for fearful = 81.81818181818183 %
Precision for disgust = 41.37931034482759 %
Precision for surprised = 71.42857142857143 %


Sadly, the outcome for the best model is not as good as for SVMs. It was an interesting idea to try out gradient boosted trees as the classifier, but it could not increase the accuracy much over the baseline.

<div class="alert alert-block alert-warning">
Try some Random Forrests with GridSeach
</div>

In [27]:
param_grid_forest = {
    'class_weight': [None, "balanced"],
    'criterion': ['gini','entropy'],
    'max_depth': range(5,20),
    'max_features': [None, "sqrt"],
}

def train_random_forest(x_train,y_train,param_grid):
    random_forrest = RandomForestClassifier(random_state=55)
    model = GridSearchCV(random_forrest,param_grid=param_grid, n_jobs = -1, verbose=3)
    model.fit(x_train,y_train)
    print(f'{model.best_params_}\n{model.best_estimator_}')
    return model.best_params_,model.best_estimator_

def predict_random_forest(model, x_train, y_train, x_test, y_test):
    classifier = RandomForestClassifier(criterion=model.criterion, max_depth=model.max_depth, max_features=model.max_features, class_weight=model.class_weight)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    WAR = recall_score(y_test, predictions, average='weighted')
    print(predictions)
    print(WAR)
    
    return predictions
    

In [28]:
forest_params, forest_model = train_random_forest(X_train, Y_train.ravel(), param_grid_forest)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 11, 'max_features': None}
RandomForestClassifier(class_weight='balanced', max_depth=11, max_features=None,
                       random_state=55)


In [29]:
predictions = predict_random_forest(forest_model, X_train, Y_train.ravel(), X_test, Y_test.ravel())

[7 2 7 2 7 1 3 1 3 2 3 2 3 2 5 4 2 2 2 2 2 2 2 2 2 2 7 2 2 2 7 2 2 2 2 2 2
 2 2 2 7 2 7 2 2 2 2 2 7 4 8 1 3 3 8 7 3 3 3 3 5 3 8 3 3 8 8 3 8 6 8 3 3 3
 8 3 8 3 8 3 7 2 2 2 7 2 2 2 7 2 5 2 7 2 7 2 5 4 7 7 7 6 7 7 7 4 7 7 7 6 7
 2 5 5 8 5 5 7 8 3 5 5 8 5 5 7 8 7 5 3 5 5 5 3 5 7 5 5 5 7 5 5 5 7 5 6 8 7
 7 6 1 8 7 6 3 5 7 6 3 7 7 6 3 6 5 6 3 6 5 6 8 6 3 6 3 3 5 3 7 7 7 7 7 7 5
 7 7 7 5 3 7 7 7 7 7 8 7 7 7 7 7 7 7 7 7 7 7 7 8 6 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8]
0.5625


In [30]:
print_statistics(Y_test.ravel(), predictions)

UAR = 0.53515625
WAR = 0.5625
[[ 2  5  4  1  1  0  3  0]
 [ 0 28  0  0  0  0  4  0]
 [ 1  0 16  1  1  1  2 10]
 [ 0 11  0  2  2  2 15  0]
 [ 0  0  3  0 19  0  6  4]
 [ 1  0  7  0  4 11  6  3]
 [ 0  0  2  0  3  0 26  1]
 [ 0  0  0  0  0  1  0 31]]
Recall for neutral = 12.5 %
Recall for calm = 87.5 %
Recall for happy = 50.0 %
Recall for sad = 6.25 %
Recall for angry = 59.375 %
Recall for fearful = 34.375 %
Recall for disgust = 81.25 %
Recall for surprised = 96.875 %
Precision for neutral = 50.0 %
Precision for calm = 63.63636363636363 %
Precision for happy = 50.0 %
Precision for sad = 50.0 %
Precision for angry = 63.33333333333333 %
Precision for fearful = 73.33333333333333 %
Precision for disgust = 41.935483870967744 %
Precision for surprised = 63.26530612244898 %


Random forest do generate a favourable improvement over the baseline and achieve an 5% improvement for UAR and nearly 8% for WAR. However, they were computationally more expensive that our baseline and cannot reach the equally expensive SVM gridsear

<div class="alert alert-block alert-warning">
Extreme Gradient Boosting
</div>

In [31]:
def predict_xgb_classifier(x_train, y_train, x_test, y_test):
    xgb = XGBClassifier()
    xgb.fit(x_train, y_train)
    predictions = xgb.predict(x_test)
    WAR = recall_score(y_test, predictions, average='weighted')
    print(predictions)
    print(WAR)
    
    return predictions

In [32]:
le = LabelEncoder()
trans_Y_train = le.fit_transform(Y_train)
trans_Y_test = le.fit_transform(Y_test)
predictions = predict_xgb_classifier(X_train, trans_Y_train, X_test, trans_Y_test)

[4 1 0 3 3 0 4 3 2 3 4 3 3 1 6 3 1 1 6 1 1 1 6 1 1 1 6 1 1 1 6 1 1 1 1 1 1
 1 6 1 1 3 6 1 1 1 6 1 1 3 7 6 2 2 7 3 2 2 7 2 4 1 7 3 4 2 7 2 4 2 7 2 2 3
 7 2 4 2 7 5 6 3 1 1 6 3 1 1 6 1 0 1 6 6 1 1 4 3 6 6 6 3 6 3 6 6 6 6 3 3 6
 6 4 6 7 4 4 6 7 2 4 4 7 4 4 4 4 6 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4 6 4 5 6 5
 6 5 0 7 6 2 2 5 6 5 0 5 6 5 2 2 5 5 4 5 4 5 7 5 4 5 2 2 4 6 6 6 4 6 6 6 4
 6 6 6 6 6 6 6 6 6 6 6 4 6 6 6 6 6 6 4 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
0.5833333333333334


In [33]:
print_statistics(trans_Y_test, predictions)

UAR = 0.5546875
WAR = 0.5833333333333334
[[24  0  1  0  0  7  0  0]
 [ 2 12  4  4  1  1  8  0]
 [ 8  0  7  1  0 15  0  0]
 [ 0  2  0 23  0  4  3  0]
 [ 0  6  0  4 13  5  2  0]
 [ 0  0  0  5  0 27  0  0]
 [ 0  0  0  0  0  0 32  0]
 [ 0  0  0  0  0  0  0  0]]
Recall for neutral = 75.0 %
Recall for calm = 37.5 %
Recall for happy = 21.875 %
Recall for sad = 71.875 %
Recall for angry = 40.625 %
Recall for fearful = 84.375 %
Recall for disgust = 100.0 %
Recall for surprised = 0.0 %
Precision for neutral = 66.66666666666666 %
Precision for calm = 57.14285714285714 %
Precision for happy = 36.84210526315789 %
Precision for sad = 57.49999999999999 %
Precision for angry = 92.85714285714286 %
Precision for fearful = 45.0 %
Precision for disgust = 71.11111111111111 %
Precision for surprised = 0.0 %


This rather simple to use method performs considerably better than our baseline, but does not yet reach the performance level of the best SVM methods. Therefore we consider it to be a viable idea, yet with room for improvement and definitly not the best. A positive aspect is their relativly low computational time, measureable in mere seconds instead of minutes and hours for our GridSearches.

<div class="alert alert-block alert-warning">
Try a CNN 
</div>

In [34]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data_transforms = {
    'test': transforms.Compose([
        transforms.Grayscale(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.repeat(3,1,1)),
    ]),
}

data_dir = './resources/librosa/data/librosa/' # path to iamge data
image_datasets = {'test': torchvision.datasets.ImageFolder(os.path.join(data_dir, 'test'), data_transforms['test'])}
dataloaders = torch.utils.data.DataLoader(image_datasets['test'], batch_size=1, shuffle=False, num_workers=0)
dataset_sizes = {'test': len(image_datasets['test'])}
class_names = image_datasets['test'].classes
print(dataset_sizes)
print(class_names)

#setup path where model is stored (pre-trained on kaggle)
PATH = './resources/emotion_rec_out/emotionRecognition_restnet.pt'

model = torchvision.models.resnet18(pretrained=False)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(class_names))
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()
model = model.to(device)

{'test': 240}
['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']


In [35]:
Y_test = []
predictions = []
for sample, label in dataloaders:
    sample = sample.to(device)
    label = label.cpu().numpy()[0]
    Y_test.append(label)
    #predict with CNN
    outputs = model(sample)
    _, predicted = torch.max(outputs, 1)
    predicted = predicted.cpu().numpy()[0]
    predictions.append(predicted)

In [36]:
# Compute evaluation metrics
UAR = recall_score(Y_test, predictions, average='macro')
WAR = recall_score(Y_test, predictions, average='weighted')
print('UAR =', UAR)
print('WAR =', WAR)
# Confusion matrix
# Angst, Ekel, Freude, Langeweile, Neutral, Traurigkeit, Wut
cm = confusion_matrix(Y_test, predictions, labels=[0,1,2,3,4,5,6,7])
print(cm)

# Compute recall, precision, and F1 score for each class
rec_result = recall_score(Y_test, predictions, average=None, labels=[0,1,2,3,4,5,6,7])
for i, class_ in enumerate(class_names):
    print(f'Recall for {class_} =', rec_result[i]*100, '%')


prec_result = precision_score(Y_test, predictions, average=None, labels=[0,1,2,3,4,5,6,7])
for i, class_ in enumerate(class_names):
    print(f'Precision for {class_} =', prec_result[i]*100, '%')

UAR = 0.5390625
WAR = 0.5583333333333333
[[20  1  3  3  1  0  0  4]
 [ 0 16  3  4  0  2  7  0]
 [ 6  0 25  1  0  0  0  0]
 [ 3  0  4 20  2  1  2  0]
 [ 5  6  1  1  8  1  2  8]
 [ 0  5  2  3  0  4  2  0]
 [ 2  7  5  8  0  0 10  0]
 [ 1  0  0  0  0  0  0 31]]
Recall for angry = 62.5 %
Recall for calm = 50.0 %
Recall for disgust = 78.125 %
Recall for fearful = 62.5 %
Recall for happy = 25.0 %
Recall for neutral = 25.0 %
Recall for sad = 31.25 %
Recall for surprised = 96.875 %
Precision for angry = 54.054054054054056 %
Precision for calm = 45.714285714285715 %
Precision for disgust = 58.139534883720934 %
Precision for fearful = 50.0 %
Precision for happy = 72.72727272727273 %
Precision for neutral = 50.0 %
Precision for sad = 43.47826086956522 %
Precision for surprised = 72.09302325581395 %


Our trained CNN performs decently, with a WAR of 55% it is a solid 5 percentage points better than the baseline. The WAR is up by around 4 percents. The training was done via Kaggle on grayscale images of the spectograms, we use the trained model for evaluation. Due to the rather limited size of our dataset, our assumption is that the model cannot generalize well and therefore clearly lacks behind our found SVM.

<div class="alert alert-block alert-danger">
Task 3: Interpretation of results
</div>

The interpretations can both be found directly under seach method + in our presentation.