In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib
from util import runGridSearchClassifiers

## Loading data

In [2]:
directory_path = '../../04_-_Dev/videos'
features = 'emobase'

In [3]:
df_total = pd.read_pickle(directory_path + '/audio_' + features + '_data.p')

In [4]:
df_total.shape

(2573701, 60)

In [5]:
df_total.video_name.nunique()

30

In [6]:
df_total[df_total.isna().any(axis=1)]

Unnamed: 0,frameIndex,frameTime,pcm_intensity_sma,pcm_loudness_sma,mfcc_sma[1],mfcc_sma[2],mfcc_sma[3],mfcc_sma[4],mfcc_sma[5],mfcc_sma[6],...,pcm_zcr_sma_de,voiceProb_sma_de,F0_sma_de,F0env_sma_de,video_name,stress_global,type_candidat,sexe,stress,diapo


## Data processing

In [7]:
time_window = 5
df_total['frameTimeWindow'] = df_total.frameTime.apply(lambda x : np.floor(x / time_window) * time_window).astype(int)

In [5]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

In [11]:
# 5 seconds windows
X = df_total.iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]

In [16]:
y = df_total.iloc[:,3:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'min'}).iloc[:,-1]

In [18]:
X.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

In [21]:
from scipy.stats import kurtosis, skew
X_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]
y_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'mean'}).iloc[:,-1]

In [22]:
X_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

## Modèles

In [6]:
X = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

X_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

In [7]:
X = X.fillna(0)
X_audio = X_audio.fillna(0)

## Stress par diapos
### All diapos

In [8]:
diapo_selection = '_all'

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [150, 200, 250], 'max_depth':[10, 15, 20, 25], 'class_weight':[None,'balanced']}
                ]
parameters_list = [
{'n_estimators': [150, 200, 250], 'max_depth':[10, 15, 20, 25], 'class_weight':['balanced']}
]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [15]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 12 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 21.4min finished
Best estimator RandomForestClassifier(class_weight='balanced', max_depth=15, n_estimators=200,
                       n_jobs=-1, random_state=42)
Best results 0.4529982783484758
Best params {'class_weight': 'balanced', 'max_depth': 15, 'n_estimators': 200}
accuracy (mean, std) 0.4603521332304298 0.2114706580772435
f1 (mean, std) 0.4529982783484758 0.20974192525992463
balanced accuracy (mean, std) 0.43354841847414627 0.17316512582141624
precision (mean, std) 0.5723149667598205 0.23354294213480234
recall (mean, std) 0.4603521332304298 0.2114706580772435

f1_score (weighted) 0.4281715798174184
accuracy 0.4567298667167261


In [16]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')

In [17]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[ 654, 1239,  171,    0],
       [ 814, 1749,   27,    0],
       [ 316,  307,   30,    0],
       [   7,   13,    0,    0]])

In [18]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


In [19]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pcm_intensity_sma,pcm_intensity_sma,pcm_intensity_sma,pcm_intensity_sma,pcm_intensity_sma,pcm_intensity_sma,pcm_intensity_sma,pcm_intensity_sma,pcm_intensity_sma,pcm_loudness_sma,...,F0_sma_de,F0env_sma_de,F0env_sma_de,F0env_sma_de,F0env_sma_de,F0env_sma_de,F0env_sma_de,F0env_sma_de,F0env_sma_de,F0env_sma_de
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,percentil75,median,skew,std,percentil25,mean,kurtosis,min,max,percentil75,...,max,percentil75,median,skew,std,percentil25,mean,kurtosis,min,max
video_name,diapo,frameTimeWindow,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Test_pour_AFPA,1,0,8.774428e-11,5.652821e-11,2.724692,5.158104e-11,3.385610e-11,6.831969e-11,13.638802,4.089930e-12,4.401225e-10,0.055112,...,0.0000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,-3.000000,0.00000,0.00000
Test_pour_AFPA,1,5,1.021841e-10,6.024235e-11,1.669479,6.252104e-11,3.375239e-11,7.729215e-11,3.261232,4.340127e-12,3.715988e-10,0.057011,...,0.0000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,-3.000000,0.00000,0.00000
Test_pour_AFPA,1,10,1.003699e-10,6.097148e-11,1.231535,5.513744e-11,2.914076e-11,7.262539e-11,1.487425,1.711006e-12,2.891272e-10,0.057567,...,0.0000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,-3.000000,0.00000,0.00000
Test_pour_AFPA,1,15,1.096644e-10,6.800392e-11,1.635156,6.265870e-11,3.708061e-11,8.263100e-11,3.776347,1.168483e-12,3.714314e-10,0.057631,...,0.0000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,-3.000000,0.00000,0.00000
Test_pour_AFPA,1,20,1.056265e-10,6.197396e-11,1.170682,5.422649e-11,3.360517e-11,7.523362e-11,1.322073,3.922888e-12,3.111625e-10,0.058139,...,0.0000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,-3.000000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,18,600,1.176876e-11,6.051347e-13,6.714008,4.009258e-11,0.000000e+00,1.408690e-11,58.442977,0.000000e+00,4.133286e-10,0.026832,...,116.1643,0.00000,0.0,0.429715,2.522699,0.000000,-0.060956,11.035127,-11.59529,14.82329
WIN_20210417_14_53_12_Pro,18,605,6.166123e-12,1.043340e-12,4.475901,1.810855e-11,4.173127e-14,7.418843e-12,23.430566,0.000000e+00,1.428942e-10,0.024051,...,117.4899,0.00000,0.0,-1.647909,3.576282,0.000000,-0.029058,8.227114,-19.88145,11.37295
WIN_20210417_14_53_12_Pro,18,610,4.465477e-12,8.346684e-13,4.223178,8.823801e-12,0.000000e+00,4.190518e-12,21.232590,0.000000e+00,6.840137e-11,0.021834,...,114.2939,0.00000,0.0,1.881708,2.252460,0.000000,0.090406,18.710329,-11.04165,16.36507
WIN_20210417_14_53_12_Pro,18,615,1.247821e-11,5.592215e-12,2.798153,1.261866e-11,2.170155e-12,9.820167e-12,9.882993,0.000000e+00,8.663809e-11,0.029513,...,125.1482,0.00000,0.0,-0.042469,3.112394,0.000000,0.029907,9.872000,-16.25249,16.44305


#### En utilisant la proportion des prédictions 0, 1 et 2

In [20]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.176471,0.823529,0.00
Test_pour_AFPA,8,0.200000,0.800000,0.00
Test_pour_AFPA,9,0.000000,0.950000,0.05
Test_pour_AFPA,10,0.000000,1.000000,0.00
Test_pour_AFPA,11,0.050000,0.950000,0.00
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.428571,0.571429,0.00
WIN_20210417_14_53_12_Pro,11,0.600000,0.400000,0.00
WIN_20210417_14_53_12_Pro,12,0.151515,0.848485,0.00
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.00


In [21]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,0.0
238,WIN_20210417_14_53_12_Pro,17,1.0


In [22]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [23]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.176471,0.823529,0.00
Test_pour_AFPA,8,0.200000,0.800000,0.00
Test_pour_AFPA,9,0.000000,0.950000,0.05
Test_pour_AFPA,10,0.000000,1.000000,0.00
Test_pour_AFPA,11,0.050000,0.950000,0.00
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.428571,0.571429,0.00
WIN_20210417_14_53_12_Pro,11,0.600000,0.400000,0.00
WIN_20210417_14_53_12_Pro,12,0.151515,0.848485,0.00
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.00


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [25]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    3.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.342090687090687
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4666666666666667 0.24986107250941583
f1 (mean, std) 0.342090687090687 0.26396459108072284
balanced accuracy (mean, std) 0.45595238095238094 0.10688460940608407
precision (mean, std) 0.2864583333333333 0.25224684779442186
recall (mean, std) 0.4666666666666667 0.24986107250941583

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks    

In [26]:
best_result

{'best_estimator': KNeighborsClassifier(),
 'best_score': 0.5049107142857142,
 'best_params': {'n_neighbors': 5, 'p': 2, 'weights': 'uniform'},
 'mean_test_f1_score': 0.5049107142857142,
 'std_test_f1_score': 0.22926693101143997,
 'mean_test_accuracy_score': 0.5041666666666667,
 'std_test_accuracy_score': 0.20021689627889938,
 'mean_test_balanced_accuracy_score': 0.5370899470899472,
 'std_test_balanced_accuracy_score': 0.20378238534037207,
 'mean_test_precision': 0.6314136904761904,
 'std_test_precision': 0.28049332460208276,
 'mean_test_recall': 0.5041666666666667,
 'std_test_recall': 0.20021689627889938}

#### En aggrégeant les prédicitions des diapos

In [27]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


In [28]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [29]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,diapo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Test_pour_AFPA,1,0.823529,0,1,1.0,0.386953,1.0,1.0,0.880952,-1.697337
Test_pour_AFPA,8,0.800000,0,1,1.0,0.421637,1.0,1.0,0.250000,-1.500000
Test_pour_AFPA,9,1.050000,1,2,1.0,0.223607,1.0,1.0,15.052632,4.129483
Test_pour_AFPA,10,1.000000,1,1,1.0,0.000000,1.0,1.0,-3.000000,0.000000
Test_pour_AFPA,11,0.950000,0,1,1.0,0.223607,1.0,1.0,15.052632,-4.129483
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.571429,0,1,1.0,0.513553,0.0,1.0,-1.916667,-0.288675
WIN_20210417_14_53_12_Pro,11,0.400000,0,1,0.0,0.502625,0.0,1.0,-1.833333,0.408248
WIN_20210417_14_53_12_Pro,12,0.848485,0,1,1.0,0.364110,1.0,1.0,1.778571,-1.943855
WIN_20210417_14_53_12_Pro,17,0.428571,0,1,0.0,0.534522,0.0,1.0,-1.916667,0.288675


In [30]:
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [31]:
best_result, y_predict,y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)
best_result

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    6.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    6.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=4, multi_class='multinomial', random_state=42)
Best results 0.4583969271469271
Best params {'C': 4, 'class_weight': None}
accuracy (mean, std) 0.4708333333333333 0.2033145483125975
f1 (mean, std) 0.4583969271469271 0.22590056284071497
balanced accuracy (mean, std) 0.49503968253968256 0.18076355105014674
precision (mean, std) 0.568531746031746 0.3019076277002293
recall (mean, std) 0.4708333333333333 0.2033145483125975

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elap

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=4)),
                 ('knn',
                  KNeighborsClassifier(n_neighbors=9, weights='distance'))]),
 'best_score': 0.5105066923816923,
 'best_params': {'knn__n_neighbors': 9,
  'knn__p': 2,
  'knn__weights': 'distance',
  'pca__n_components': 4},
 'mean_test_f1_score': 0.5105066923816923,
 'std_test_f1_score': 0.20498075270089106,
 'mean_test_accuracy_score': 0.5,
 'std_test_accuracy_score': 0.18257418583505536,
 'mean_test_balanced_accuracy_score': 0.49427248677248675,
 'std_test_balanced_accuracy_score': 0.20013975032987208,
 'mean_test_precision': 0.6111855158730158,
 'std_test_precision': 0.2507258574113345,
 'mean_test_recall': 0.5,
 'std_test_recall': 0.18257418583505536}

In [32]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=4)),
                 ('knn',
                  KNeighborsClassifier(n_neighbors=9, weights='distance'))]),
 'best_score': 0.5105066923816923,
 'best_params': {'knn__n_neighbors': 9,
  'knn__p': 2,
  'knn__weights': 'distance',
  'pca__n_components': 4},
 'mean_test_f1_score': 0.5105066923816923,
 'std_test_f1_score': 0.20498075270089106,
 'mean_test_accuracy_score': 0.5,
 'std_test_accuracy_score': 0.18257418583505536,
 'mean_test_balanced_accuracy_score': 0.49427248677248675,
 'std_test_balanced_accuracy_score': 0.20013975032987208,
 'mean_test_precision': 0.6111855158730158,
 'std_test_precision': 0.2507258574113345,
 'mean_test_recall': 0.5,
 'std_test_recall': 0.18257418583505536}

In [33]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[47, 50,  4,  0],
       [42, 63,  7,  1],
       [ 6, 19,  0,  0],
       [ 0,  1,  0,  0]])

In [34]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')

In [35]:
df_ypredict

Unnamed: 0,"(video_name, )","(diapo, )",ypredict
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,1.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,1.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,1.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,1.0
238,WIN_20210417_14_53_12_Pro,17,0.0


## Stress global

### En utilisant le stress prédit des diapos

In [36]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [37]:
ypredict_stress_diapo

diapo,1,8,9,10,11,12,17,18
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Test_pour_AFPA,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
Video_1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
WIN_20210329_10_16_02_Pro,1.0,0.0,1.0,0.0,1.0,2.0,1.0,0.0
WIN_20210330_13_10_29_Pro,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
WIN_20210331_21_22_52_Pro,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
WIN_20210402_19_04_53_Pro,1.0,0.0,1.0,1.0,1.0,1.0,2.0,1.0
WIN_20210403_18_49_15_Pro,0.0,0.0,0.0,0.0,3.0,1.0,1.0,1.0
WIN_20210404_10_58_27_Pro,0.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0


In [38]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
df_annotations_stress

Unnamed: 0_level_0,1,8,9,10,11,12,17,18,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
Xy = ypredict_stress_diapo.merge(df_annotations_stress.iloc[:,-1],how='inner',on='video_name')
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [41]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.1, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.33333333333333337
Best params {'C': 0.1, 'class_weight': 'balanced'}
accuracy (mean, std) 0.36666666666666664 0.19436506316151
f1 (mean, std) 0.33333333333333337 0.1709811916593008
balanced accuracy (mean, std) 0.34444444444444444 0.18392161508052055
precision (mean, std) 0.34444444444444444 0.18053418676968802
recall (mean, std) 0.36666666666666664 0.19436506316151

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_job

In [42]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=4)),
                 ('knn',
                  KNeighborsClassifier(n_neighbors=20, weights='distance'))]),
 'best_score': 0.4461904761904762,
 'best_params': {'knn__n_neighbors': 20,
  'knn__p': 2,
  'knn__weights': 'distance',
  'pca__n_components': 4},
 'mean_test_f1_score': 0.4461904761904762,
 'std_test_f1_score': 0.11844333405136959,
 'mean_test_accuracy_score': 0.5,
 'std_test_accuracy_score': 0.10540925533894598,
 'mean_test_balanced_accuracy_score': 0.47777777777777775,
 'std_test_balanced_accuracy_score': 0.12472191289246473,
 'mean_test_precision': 0.52,
 'std_test_precision': 0.19160143817599887,
 'mean_test_recall': 0.5,
 'std_test_recall': 0.10540925533894598}

In [43]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global' + diapo_selection + '.csv')

#### Autre méthode

### En utilisant le stress prédit des time windows 5s

In [44]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.185185,0.798942,0.015873
Video_1,0.357616,0.642384,0.0
WIN_20210323_19_17_40_Pro,0.303571,0.690476,0.005952
WIN_20210329_10_16_02_Pro,0.190476,0.809524,0.0
WIN_20210330_13_10_29_Pro,0.271523,0.728477,0.0
WIN_20210331_21_22_52_Pro,0.329412,0.670588,0.0
WIN_20210402_14_27_50_Pro,0.005376,0.994624,0.0
WIN_20210402_19_04_53_Pro,0.897143,0.08,0.022857
WIN_20210403_18_49_15_Pro,0.287293,0.712707,0.0
WIN_20210404_10_58_27_Pro,0.81,0.115,0.075


In [45]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [46]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [48]:
best_result, y_predict,y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=10, multi_class='multinomial', random_state=42)
Best results 0.4666666666666666
Best params {'C': 10, 'class_weight': None}
accuracy (mean, std) 0.4999999999999999 0.14907119849998596
f1 (mean, std) 0.4666666666666666 0.14622830112321866
balanced accuracy (mean, std) 0.45555555555555555 0.13333333333333333
precision (mean, std) 0.4611111111111111 0.16629588385661961
recall (mean, std) 0.4999999999999999 0.14907119849998596

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | e

In [49]:
best_result

{'best_estimator': KNeighborsClassifier(n_neighbors=8, weights='distance'),
 'best_score': 0.5033333333333333,
 'best_params': {'n_neighbors': 8, 'p': 2, 'weights': 'distance'},
 'mean_test_f1_score': 0.5033333333333333,
 'std_test_f1_score': 0.13506285688020217,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.1247219128924647,
 'mean_test_balanced_accuracy_score': 0.5333333333333333,
 'std_test_balanced_accuracy_score': 0.16703662642636563,
 'mean_test_precision': 0.5777777777777777,
 'std_test_precision': 0.2211083193570267,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.1247219128924647}

#### Autre méthode

In [50]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict = df_ypredict.groupby(['video_name']).agg({'ypredict': ['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,0.830688,0.0,2.0,1.0,0.416303,1.0,1.0,0.987649,-1.07429
Video_1,0.642384,0.0,1.0,1.0,0.480893,0.0,1.0,-1.647003,-0.594136
WIN_20210323_19_17_40_Pro,0.702381,0.0,2.0,1.0,0.471455,0.0,1.0,-1.029843,-0.712361
WIN_20210329_10_16_02_Pro,0.809524,0.0,1.0,1.0,0.394019,1.0,1.0,0.485294,-1.576482
WIN_20210330_13_10_29_Pro,0.728477,0.0,1.0,1.0,0.446225,0.0,1.0,-0.944346,-1.02745
WIN_20210331_21_22_52_Pro,0.670588,0.0,1.0,1.0,0.471388,0.0,1.0,-1.473058,-0.725908
WIN_20210402_14_27_50_Pro,0.994624,0.0,1.0,1.0,0.073324,1.0,1.0,181.005405,-13.527949
WIN_20210402_19_04_53_Pro,0.125714,0.0,2.0,0.0,0.395625,0.0,0.0,10.58608,3.293186
WIN_20210403_18_49_15_Pro,0.712707,0.0,1.0,1.0,0.453755,0.0,1.0,-1.11613,-0.940144
WIN_20210404_10_58_27_Pro,0.265,0.0,2.0,0.0,0.588649,0.0,0.0,3.033203,2.085987


In [51]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [53]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='accuracy_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4333333333333334
Best params {'C': 0.1, 'class_weight': 'balanced'}
accuracy (mean, std) 0.4333333333333334 0.27080128015453203
f1 (mean, std) 0.3682539682539682 0.2641128659230408
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.7s finished


In [54]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=4, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.3726984126984127
Best params {'C': 4, 'class_weight': 'balanced'}
accuracy (mean, std) 0.4333333333333334 0.30912061651652345
f1 (mean, std) 0.3726984126984127 0.28096116926439624
balanced accuracy (mean, std) 0.37777777777777777 0.26620330112690976
precision (mean, std) 0.3333333333333333 0.2623493100692799
recall (mean, std) 0.4333333333333334 0.30912061651652345

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-

In [55]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, class_weight='balanced',
                                     multi_class='multinomial'))]),
 'best_score': 0.4282539682539682,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': 'balanced',
  'pca__n_components': 2},
 'mean_test_f1_score': 0.4282539682539682,
 'std_test_f1_score': 0.1999128053817612,
 'mean_test_accuracy_score': 0.5,
 'std_test_accuracy_score': 0.18257418583505539,
 'mean_test_balanced_accuracy_score': 0.4666666666666666,
 'std_test_balanced_accuracy_score': 0.19436506316151006,
 'mean_test_precision': 0.42388888888888887,
 'std_test_precision': 0.23964737469359196,
 'mean_test_recall': 0.5,
 'std_test_recall': 0.18257418583505539}

In [60]:
df_ypredict_stress_global = pd.concat([ypredict_stress_diapo.reset_index(), pd.DataFrame(y_predict,columns=['predicted_stress_global'])], axis=1) 
df_ypredict_stress_global = df_ypredict_stress_global.set_index('video_name').sort_index()
df_ypredict_stress_global = df_ypredict_stress_global.iloc[:,-1]

In [61]:
df_ypredict_stress_global

video_name
Test_pour_AFPA                     1.0
Video_1                            0.0
WIN_20210323_19_17_40_Pro          0.0
WIN_20210329_10_16_02_Pro          0.0
WIN_20210330_13_10_29_Pro          0.0
WIN_20210331_21_22_52_Pro          0.0
WIN_20210402_14_27_50_Pro          1.0
WIN_20210402_19_04_53_Pro          2.0
WIN_20210403_18_49_15_Pro          0.0
WIN_20210404_10_58_27_Pro          2.0
WIN_20210404_21_41_12_Pro          0.0
WIN_20210405_15_09_16_Pro          0.0
WIN_20210406_15_06_15_Pro          1.0
WIN_20210406_18_35_52_Pro          0.0
WIN_20210406_18_49_10_Pro          1.0
WIN_20210406_21_05_52_Pro          0.0
WIN_20210407_09_04_05_Pro          0.0
WIN_20210407_14_54_56_Pro_edit2    0.0
WIN_20210408_11_48_58_Pro          0.0
WIN_20210408_14_00_44_Pro          0.0
WIN_20210408_14_02_19_Pro          0.0
WIN_20210408_14_11_32_Pro          0.0
WIN_20210408_15_20_51_Pro          1.0
WIN_20210408_16_04_32_Pro          1.0
WIN_20210409_10_26_11_Pro          0.0
WIN_20210413_1

### Audios diapos only

In [62]:
diapo_selection = '_audio_only'
diapo_audio_list = [8,9,10,11,17]
X = X_audio
y = y_audio

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [100, 150, 200], 'max_depth':[10, 15, 20, 25], 'class_weight':[None,'balanced']}
                ]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [64]:
best_result, y_predict, y_predict_proba,result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 24 candidates, totalling 720 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 12.2min finished
Best estimator RandomForestClassifier(max_depth=25, n_jobs=-1, random_state=42)
Best results 0.4895567766570025
Best params {'class_weight': None, 'max_depth': 25, 'n_estimators': 100}
accuracy (mean, std) 0.4627812792533997 0.2374004254394037
f1 (mean, std) 0.4895567766570025 0.2643965616504656
balanced accuracy (mean, std) 0.47060091995098524 0.1975398259321073
precision (mean, std) 0.6905439806468997 0.29522573381181333
recall (mean, std) 0.4627812792533997 0.2374004254394037

f1_score (weighted) 0.4456935088383956
accuracy 0.4628518689432395


In [65]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')

In [66]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[463, 455,  20,   0],
       [396, 528,  51,   0],
       [117, 105,  12,   0],
       [  5,  15,   0,   0]])

In [67]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


#### En utilisant la proportion des prédictions 0, 1 et 2

In [68]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,8,0.000000,1.000000,0.0
Test_pour_AFPA,9,0.150000,0.850000,0.0
Test_pour_AFPA,10,0.066667,0.933333,0.0
Test_pour_AFPA,11,0.000000,1.000000,0.0
Test_pour_AFPA,17,0.000000,1.000000,0.0
...,...,...,...,...
WIN_20210417_14_53_12_Pro,8,0.909091,0.090909,0.0
WIN_20210417_14_53_12_Pro,9,0.952381,0.047619,0.0
WIN_20210417_14_53_12_Pro,10,0.714286,0.285714,0.0
WIN_20210417_14_53_12_Pro,11,0.700000,0.300000,0.0


In [69]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress = df_annotations_stress[df_annotations_stress.diapo.isin(diapo_audio_list)]
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
6,Test_pour_AFPA,17,0.0
...,...,...,...
233,WIN_20210417_14_53_12_Pro,8,0.0
234,WIN_20210417_14_53_12_Pro,9,0.0
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0


In [70]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [71]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,8,0.000000,1.000000,0.0
Test_pour_AFPA,9,0.150000,0.850000,0.0
Test_pour_AFPA,10,0.066667,0.933333,0.0
Test_pour_AFPA,11,0.000000,1.000000,0.0
Test_pour_AFPA,17,0.000000,1.000000,0.0
...,...,...,...,...
WIN_20210417_14_53_12_Pro,8,0.909091,0.090909,0.0
WIN_20210417_14_53_12_Pro,9,0.952381,0.047619,0.0
WIN_20210417_14_53_12_Pro,10,0.714286,0.285714,0.0
WIN_20210417_14_53_12_Pro,11,0.700000,0.300000,0.0


In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [73]:
best_result, y_predict, y_predict_proba,result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    3.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=10, multi_class='multinomial', random_state=42)
Best results 0.31909523809523804
Best params {'C': 10, 'class_weight': None}
accuracy (mean, std) 0.3533333333333334 0.29522119767312704
f1 (mean, std) 0.31909523809523804 0.31268205621551665
balanced accuracy (mean, std) 0.3951851851851852 0.26297985078792435
precision (mean, std) 0.3837777777777778 0.39844673730517083
recall (mean, std) 0.3533333333333334 0.29522119767312704

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks     

In [74]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn',
                  KNeighborsClassifier(n_neighbors=15, p=1,
                                       weights='distance'))]),
 'best_score': 0.471074074074074,
 'best_params': {'knn__n_neighbors': 15,
  'knn__p': 1,
  'knn__weights': 'distance',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.471074074074074,
 'std_test_f1_score': 0.28791490758404,
 'mean_test_accuracy_score': 0.4333333333333334,
 'std_test_accuracy_score': 0.2637338725980331,
 'mean_test_balanced_accuracy_score': 0.4078703703703704,
 'std_test_balanced_accuracy_score': 0.2744945629500942,
 'mean_test_precision': 0.5954444444444444,
 'std_test_precision': 0.36430398716144785,
 'mean_test_recall': 0.4333333333333334,
 'std_test_recall': 0.2637338725980331}

#### En aggrégeant les prédictions des diapos

In [75]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)

In [76]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [77]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [78]:
y

0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
145    0.0
146    0.0
147    0.0
148    0.0
149    1.0
Name: stress, Length: 150, dtype: float64

In [82]:
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [83]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)
best_result

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    5.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    5.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.05, multi_class='multinomial', random_state=42)
Best results 0.455021164021164
Best params {'C': 0.05, 'class_weight': None}
accuracy (mean, std) 0.4666666666666667 0.3112698007981
f1 (mean, std) 0.455021164021164 0.3250524089852367
balanced accuracy (mean, std) 0.4896296296296296 0.2990338810281261
precision (mean, std) 0.5756666666666668 0.3857816980087554
recall (mean, std) 0.4666666666666667 0.3112698007981

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn', KNeighborsClassifier(n_neighbors=4, p=1))]),
 'best_score': 0.5307142857142857,
 'best_params': {'knn__n_neighbors': 4,
  'knn__p': 1,
  'knn__weights': 'uniform',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.5307142857142857,
 'std_test_f1_score': 0.27125440357186104,
 'mean_test_accuracy_score': 0.5066666666666666,
 'std_test_accuracy_score': 0.2619584360585134,
 'mean_test_balanced_accuracy_score': 0.46898148148148144,
 'std_test_balanced_accuracy_score': 0.27220434874556015,
 'mean_test_precision': 0.6684444444444445,
 'std_test_precision': 0.327399103296735,
 'mean_test_recall': 0.5066666666666666,
 'std_test_recall': 0.2619584360585134}

In [85]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn', KNeighborsClassifier(n_neighbors=4, p=1))]),
 'best_score': 0.5307142857142857,
 'best_params': {'knn__n_neighbors': 4,
  'knn__p': 1,
  'knn__weights': 'uniform',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.5307142857142857,
 'std_test_f1_score': 0.27125440357186104,
 'mean_test_accuracy_score': 0.5066666666666666,
 'std_test_accuracy_score': 0.2619584360585134,
 'mean_test_balanced_accuracy_score': 0.46898148148148144,
 'std_test_balanced_accuracy_score': 0.27220434874556015,
 'mean_test_precision': 0.6684444444444445,
 'std_test_precision': 0.327399103296735,
 'mean_test_recall': 0.5066666666666666,
 'std_test_recall': 0.2619584360585134}

In [86]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[48, 18,  2,  0],
       [37, 29,  0,  0],
       [10,  5,  0,  0],
       [ 1,  0,  0,  0]])

In [87]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')

In [88]:
df_ypredict

Unnamed: 0,video_name,diapo,ypredict
0,Test_pour_AFPA,8,0.0
1,Test_pour_AFPA,9,0.0
2,Test_pour_AFPA,10,1.0
3,Test_pour_AFPA,11,0.0
4,Test_pour_AFPA,17,0.0
...,...,...,...
145,WIN_20210417_14_53_12_Pro,8,0.0
146,WIN_20210417_14_53_12_Pro,9,1.0
147,WIN_20210417_14_53_12_Pro,10,0.0
148,WIN_20210417_14_53_12_Pro,11,0.0


## Stress global

### En utilisant le stress prédit des diapos

In [89]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [90]:
ypredict_stress_diapo

diapo,8,9,10,11,17
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test_pour_AFPA,0.0,0.0,1.0,0.0,0.0
Video_1,0.0,0.0,1.0,0.0,1.0
WIN_20210323_19_17_40_Pro,0.0,0.0,1.0,0.0,0.0
WIN_20210329_10_16_02_Pro,1.0,1.0,1.0,1.0,0.0
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,1.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,1.0,0.0
WIN_20210402_19_04_53_Pro,0.0,0.0,1.0,1.0,0.0
WIN_20210403_18_49_15_Pro,0.0,0.0,1.0,1.0,0.0
WIN_20210404_10_58_27_Pro,0.0,1.0,0.0,0.0,0.0


In [91]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress

Unnamed: 0,video_name,1,8,9,10,11,12,17,18,stress_global
0,Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
6,WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
7,WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
8,WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
9,WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [92]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
diapo_audio_list.append('stress_global')
df_annotations_stress = df_annotations_stress[[str(diapo) for diapo in diapo_audio_list]]
diapo_audio_list.pop()
df_annotations_stress


Unnamed: 0_level_0,8,9,10,11,17,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test_pour_AFPA,1.0,0.0,0.0,0.0,0.0,1.0
Video_1,0.0,0.0,0.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,0.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,1.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,1.0,1.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,2.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,1.0,1.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,2.0,2.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0,1.0


In [93]:
Xy = ypredict_stress_diapo.merge(df_annotations_stress.iloc[:,-1],how='inner',on='video_name')
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]

In [94]:
Xy

Unnamed: 0_level_0,8,9,10,11,17,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test_pour_AFPA,0.0,0.0,1.0,0.0,0.0,1.0
Video_1,0.0,0.0,1.0,0.0,1.0,0.0
WIN_20210323_19_17_40_Pro,0.0,0.0,1.0,0.0,0.0,1.0
WIN_20210329_10_16_02_Pro,1.0,1.0,1.0,1.0,0.0,1.0
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,1.0,0.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,0.0,0.0,1.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,1.0,0.0,1.0
WIN_20210402_19_04_53_Pro,0.0,0.0,1.0,1.0,0.0,2.0
WIN_20210403_18_49_15_Pro,0.0,0.0,1.0,1.0,0.0,2.0
WIN_20210404_10_58_27_Pro,0.0,1.0,0.0,0.0,0.0,1.0


In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [96]:
best_result, y_predict,y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.32825396825396824
Best params {'C': 0.01, 'class_weight': 'balanced'}
accuracy (mean, std) 0.39999999999999997 0.08164965809277262
f1 (mean, std) 0.32825396825396824 0.08206456663199935
balanced accuracy (mean, std) 0.4111111111111111 0.09686442096757052
precision (mean, std) 0.28888888888888886 0.08351831321318283
recall (mean, std) 0.39999999999999997 0.08164965809277262

Fitting 5 folds for each of 80 candidates, totalli

In [97]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=4)),
                 ('knn',
                  KNeighborsClassifier(n_neighbors=11, p=1,
                                       weights='distance'))]),
 'best_score': 0.5793650793650794,
 'best_params': {'knn__n_neighbors': 11,
  'knn__p': 1,
  'knn__weights': 'distance',
  'pca__n_components': 4},
 'mean_test_f1_score': 0.5793650793650794,
 'std_test_f1_score': 0.21938552267316777,
 'mean_test_accuracy_score': 0.6000000000000001,
 'std_test_accuracy_score': 0.20000000000000004,
 'mean_test_balanced_accuracy_score': 0.6222222222222222,
 'std_test_balanced_accuracy_score': 0.20905430802474198,
 'mean_test_precision': 0.6055555555555555,
 'std_test_precision': 0.2619961360567021,
 'mean_test_recall': 0.6000000000000001,
 'std_test_recall': 0.20000000000000004}

In [111]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global' + diapo_selection + '.csv')

In [112]:
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict_proba, columns=['stress_global_predict_0','stress_global_predict_1','stress_global_predict_2'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global_proba' + diapo_selection + '.csv')

#### Autre méthode

### En utilisant le stress prédit des time windows 5s

In [113]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.056338,0.943662,0.0
Video_1,0.763889,0.236111,0.0
WIN_20210323_19_17_40_Pro,0.527778,0.472222,0.0
WIN_20210329_10_16_02_Pro,0.069444,0.930556,0.0
WIN_20210330_13_10_29_Pro,0.041096,0.958904,0.0
WIN_20210331_21_22_52_Pro,0.098592,0.901408,0.0
WIN_20210402_14_27_50_Pro,0.0,0.152778,0.847222
WIN_20210402_19_04_53_Pro,0.319444,0.680556,0.0
WIN_20210403_18_49_15_Pro,0.277778,0.722222,0.0
WIN_20210404_10_58_27_Pro,0.30137,0.69863,0.0


In [114]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [115]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [116]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [117]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=1, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.37666666666666665
Best params {'C': 1, 'class_weight': 'balanced'}
accuracy (mean, std) 0.4333333333333333 0.1699673171197595
f1 (mean, std) 0.37666666666666665 0.1706342561817997
balanced accuracy (mean, std) 0.45555555555555555 0.18392161508052052
precision (mean, std) 0.4111111111111111 0.23465235646603197
recall (mean, std) 0.4333333333333333 0.1699673171197595

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-

In [118]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('logistic',
                  LogisticRegression(C=2, class_weight='balanced',
                                     multi_class='multinomial'))]),
 'best_score': 0.3822222222222222,
 'best_params': {'logistic__C': 2,
  'logistic__class_weight': 'balanced',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.3822222222222222,
 'std_test_f1_score': 0.1461185042489507,
 'mean_test_accuracy_score': 0.4333333333333334,
 'std_test_accuracy_score': 0.1699673171197595,
 'mean_test_balanced_accuracy_score': 0.4222222222222222,
 'std_test_balanced_accuracy_score': 0.17777777777777776,
 'mean_test_precision': 0.4166666666666667,
 'std_test_precision': 0.17568209223157663,
 'mean_test_recall': 0.4333333333333334,
 'std_test_recall': 0.1699673171197595}

#### Autre méthode

In [119]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict = df_ypredict.groupby(['video_name']).agg({'ypredict': ['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,0.943662,0.0,1.0,1.0,0.232214,1.0,1.0,12.809701,-3.848337
Video_1,0.236111,0.0,1.0,0.0,0.427672,0.0,0.0,-0.455615,1.242733
WIN_20210323_19_17_40_Pro,0.472222,0.0,1.0,0.0,0.502731,0.0,1.0,-1.987616,0.111283
WIN_20210329_10_16_02_Pro,0.930556,0.0,1.0,1.0,0.255992,1.0,1.0,9.474627,-3.387422
WIN_20210330_13_10_29_Pro,0.958904,0.0,1.0,1.0,0.199886,1.0,1.0,19.37619,-4.623439
WIN_20210331_21_22_52_Pro,0.901408,0.0,1.0,1.0,0.300235,1.0,1.0,5.252232,-2.692997
WIN_20210402_14_27_50_Pro,1.847222,1.0,2.0,2.0,0.362298,2.0,2.0,1.725782,-1.930229
WIN_20210402_19_04_53_Pro,0.680556,0.0,1.0,1.0,0.469533,0.0,1.0,-1.400177,-0.774482
WIN_20210403_18_49_15_Pro,0.722222,0.0,1.0,1.0,0.451046,0.0,1.0,-1.015385,-0.992278
WIN_20210404_10_58_27_Pro,0.69863,0.0,1.0,1.0,0.462028,0.0,1.0,-1.250446,-0.865768


In [120]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [121]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [122]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.05, multi_class='multinomial', random_state=42)
Best results 0.39619047619047615
Best params {'C': 0.05, 'class_weight': None}
accuracy (mean, std) 0.4666666666666666 0.22110831935702666
f1 (mean, std) 0.39619047619047615 0.21011604611373003
balanced accuracy (mean, std) 0.42222222222222217 0.187905939169864
precision (mean, std) 0.39777777777777773 0.21051377480515626
recall (mean, std) 0.4666666666666666 0.22110831935702666

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 385 out of 400 | elapsed:    1.9

In [123]:
best_result

{'best_estimator': LogisticRegression(C=0.05, multi_class='multinomial', random_state=42),
 'best_score': 0.39619047619047615,
 'best_params': {'C': 0.05, 'class_weight': None},
 'mean_test_f1_score': 0.39619047619047615,
 'std_test_f1_score': 0.21011604611373003,
 'mean_test_accuracy_score': 0.4666666666666666,
 'std_test_accuracy_score': 0.22110831935702666,
 'mean_test_balanced_accuracy_score': 0.42222222222222217,
 'std_test_balanced_accuracy_score': 0.187905939169864,
 'mean_test_precision': 0.39777777777777773,
 'std_test_precision': 0.21051377480515626,
 'mean_test_recall': 0.4666666666666666,
 'std_test_recall': 0.22110831935702666}