In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib
from util import runGridSearchClassifiers

## Loading data

In [2]:
directory_path = '../../04_-_Dev/videos'
features = 'eGeMAPS'#''

In [3]:
currentDirectory = pathlib.Path(directory_path)
currentPattern = "*." + features + ".annotated.csv"
file_list = [str(currentFile) for currentFile in currentDirectory.glob(currentPattern)]

df_total = pd.DataFrame()
for filename in file_list:
    df = pd.read_csv(filename, delimiter=';')
    df_total = pd.concat([df_total, df], axis=0)

print('Number of videos', df_total['video_name'].nunique())
print('Number of annotations', df_total[['video_name','diapo']].drop_duplicates().shape[0])


Number of videos 30
Number of annotations 240


In [4]:
df_total = df_total.dropna(axis=0)

In [5]:
df_total.to_pickle(directory_path + '/audio_' + features + '_data.p')

In [3]:
df_total = pd.read_pickle(directory_path + '/audio_' + features + '_data.p')

## Data pre-processing 1 - Sans utilisation de la temporalité

In [4]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

In [13]:
# Sequence as a whole with 9 agregators
X = df_total.iloc[:,2:].groupby(['video_name','diapo']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]

In [14]:
# Sequence as a whole with 4 agregators
#X = df_total.iloc[:,2:].groupby(['video_name','diapo']).agg({'mean','std', kurtosis, skew}).iloc[:,:-8]

In [15]:
X.shape

(240, 468)

In [17]:
X_temp = df_total[['video_name','diapo','type_candidat','sexe']].groupby(['video_name','diapo']).agg({'type_candidat':'first','sexe':'first'})
X_temp.loc[X_temp['type_candidat'] == 'Stagiaire','Stagiaire'] = 1
X_temp.loc[X_temp['type_candidat'] != 'Stagiaire','Stagiaire'] = 0
X_temp.loc[X_temp['sexe'] == 'F','Femme'] = 1
X_temp.loc[X_temp['sexe'] != 'F','Femme'] = 0
X_temp = X_temp.drop(['type_candidat','sexe'], axis=1)

In [19]:
X = pd.concat([X,X_temp],axis=1)

In [20]:
y = df_total.iloc[:,2:].groupby(['video_name','diapo']).agg({'stress':'mean'}).iloc[:,-1]

In [21]:
X.shape, y.shape

((240, 470), (240,))

In [22]:
X.to_pickle(directory_path + '/audio_' + features + '_data_X.p')
y.to_pickle(directory_path + '/audio_' + features + '_data_y.p')

In [7]:
from scipy.stats import kurtosis, skew
X_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]
y_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo']).agg({'stress':'mean'}).iloc[:,-1]

X_audio.to_pickle(directory_path + '/audio_' + features + '_data_X_audio.p')
y_audio.to_pickle(directory_path + '/audio_' + features + '_data_y_audio.p')

In [8]:
X_audio.shape

(150, 630)

## Modèles

In [9]:
X = pd.read_pickle(directory_path + '/audio_' + features + '_data_X.p')
y = pd.read_pickle(directory_path + '/audio_' + features + '_data_y.p')

X_audio = pd.read_pickle(directory_path + '/audio_' + features + '_data_X_audio.p')
y_audio = pd.read_pickle(directory_path + '/audio_' + features + '_data_y_audio.p')

In [10]:
X.shape, X_audio.shape

((240, 623), (150, 630))

In [11]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,"(Loudness_sma3, kurtosis)","(Loudness_sma3, max)","(Loudness_sma3, percentil25)","(Loudness_sma3, median)","(Loudness_sma3, std)","(Loudness_sma3, mean)","(Loudness_sma3, percentil75)","(Loudness_sma3, skew)","(Loudness_sma3, min)","(alphaRatio_sma3, kurtosis)",...,"(F3amplitudeLogRelF0_sma3nz_de_de, max)","(F3amplitudeLogRelF0_sma3nz_de_de, percentil25)","(F3amplitudeLogRelF0_sma3nz_de_de, median)","(F3amplitudeLogRelF0_sma3nz_de_de, std)","(F3amplitudeLogRelF0_sma3nz_de_de, mean)","(F3amplitudeLogRelF0_sma3nz_de_de, percentil75)","(F3amplitudeLogRelF0_sma3nz_de_de, skew)","(F3amplitudeLogRelF0_sma3nz_de_de, min)",Stagiaire,Femme
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Test_pour_AFPA,1,195.631236,3.545050,0.114812,0.119039,0.130805,0.136717,0.124505,12.240252,0.083114,35.764880,...,131.8622,0.000000,0.0,8.210914,-6.270232e-18,0.000000,-0.448768,-133.887694,1.0,0.0
Test_pour_AFPA,8,1.922854,2.867280,0.117015,0.282706,0.424739,0.460516,0.711492,1.370343,0.100229,0.110346,...,143.5802,-0.575755,0.0,21.556528,2.415845e-17,0.485107,0.023731,-158.937280,1.0,0.0
Test_pour_AFPA,9,2.268561,3.393920,0.112834,0.209026,0.383514,0.411576,0.639661,1.420768,0.099458,0.410082,...,151.9159,-0.272000,0.0,19.785368,-3.301512e-17,0.193978,0.180785,-161.495940,1.0,0.0
Test_pour_AFPA,10,10.863511,5.028640,0.111221,0.148684,0.391520,0.382675,0.590052,2.289813,0.099492,0.166996,...,141.5019,-0.215863,0.0,18.762260,8.702810e-03,0.000000,0.191598,-160.932650,1.0,0.0
Test_pour_AFPA,11,9.862057,5.151010,0.113637,0.252318,0.381967,0.408419,0.605844,2.060058,0.098327,-0.160992,...,154.1847,-0.406947,0.0,19.736843,-6.073836e-03,0.449439,0.074909,-158.581520,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,2.940143,3.264950,0.001104,0.013893,0.464786,0.301682,0.547356,1.707042,0.001034,0.025470,...,201.0000,0.000000,0.0,16.854526,-4.242046e-18,0.000000,-0.110385,-295.940550,1.0,1.0
WIN_20210417_14_53_12_Pro,11,1.568983,2.660928,0.001126,0.081050,0.431815,0.330021,0.586716,1.397632,0.001034,-0.130052,...,143.9756,0.000000,0.0,19.730021,9.621933e-18,0.000000,-0.007796,-160.083260,1.0,1.0
WIN_20210417_14_53_12_Pro,12,4.232695,0.384004,0.014089,0.029460,0.038861,0.041404,0.058949,1.654525,0.001034,0.938287,...,116.6289,0.000000,0.0,8.419112,2.647985e-18,0.000000,-1.313023,-144.071920,1.0,1.0
WIN_20210417_14_53_12_Pro,17,0.669043,3.344785,0.006968,0.488653,0.708703,0.651603,1.049473,1.121390,0.001034,0.245458,...,135.7129,-0.731520,0.0,18.815206,-6.615398e-17,0.753908,0.199953,-146.911680,1.0,1.0


In [12]:
X = X.fillna(0)

### Leave one interview out
#### All diapos

In [11]:
diapo_selection = '_all'

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [50, 100, 150], 'max_depth':[5, 10, 15, 20, 25], 'class_weight':[None,'balanced']}
                ]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [46]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   11.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.3
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.3 0.45825756949558394
f1 (mean, std) 0.3 0.45825756949558394
balanced accuracy (mean, std) 0.3 0.45825756949558394
precision (mean, std) 0.3 0.45825756949558394
recall (mean, std) 0.3 0.45825756949558394

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Do

In [47]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_diapo' + diapo_selection + '.csv')

### Audio diapos only

In [13]:
diapo_selection = '_audio_only'

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [50, 100, 150], 'max_depth':[ 5, 10, 15, 20, 25], 'class_weight':[None,'balanced']}
                ]
groups = X_audio.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X_audio, y_audio, groups)

In [15]:
best_result, y_predict,y_predict_proba, result_list = runGridSearchClassifiers(X_audio, y_audio, cv_loo, models_list, parameters_list, output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 30 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  1.3min finished
Best estimator RandomForestClassifier(max_depth=5, n_estimators=50, n_jobs=-1, random_state=42)
Best results 0.4660158730158729
Best params {'class_weight': None, 'max_depth': 5, 'n_estimators': 50}
accuracy (mean, std) 0.46 0.248461935381123
f1 (mean, std) 0.4660158730158729 0.2799987142887613
balanced accuracy (mean, std) 0.4677777777777778 0.2460966888174515
precision (mean, std) 0.5954444444444446 0.36782794219815385
recall (mean, std) 0.46 0.248461935381123

f1_score (weighted) 0.4485365136980382
accuracy 0.46


In [16]:
df_ypredict = pd.concat([X_audio.reset_index()[['video_name','diapo']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_diapo' + diapo_selection + '.csv')

## Stress global
### All diapos

In [13]:
diapo_selection = '_all' 

In [14]:
df_ypredict = pd.read_csv('ypredict_' + features + '_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.drop(df_ypredict.columns[0],axis=1)
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')

In [15]:
#df_annotations_stress_diapo = df_total.pivot_table(values='stress', index='video_name',columns='diapo', aggfunc='mean')
#df_annotations_stress_global = df_total.pivot_table(values='stress_global', index='video_name', aggfunc='mean')
#df_annotations_stress = df_annotations_stress_diapo.merge(df_annotations_stress_global, on='video_name')
#df_annotations_stress

In [16]:
df_annotations_stress = pd.read_csv('annotations.csv')

#### En utilisant les annotations comme X

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                KNeighborsClassifier(),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]
X = df_annotations_stress.iloc[:,:-1].set_index('video_name')
y = df_annotations_stress.iloc[:,-1]
cv = 5

In [22]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.1, multi_class='multinomial', random_state=42)
Best results 0.7651587301587301
Best params {'C': 0.1, 'class_weight': None}
accuracy (mean, std) 0.8 0.0666666666666667
f1 (mean, std) 0.7651587301587301 0.11176124986563704
balanced accuracy (mean, std) 0.8 0.0666666666666667
precision (mean, std) 0.8016666666666665 0.16758635142870432
recall (mean, std) 0.8 0.0666666666666667

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 220 out of 220 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator KNeighborsCla

In [23]:
ypredict_stress_global = best_result['best_estimator'].predict(ypredict_stress_diapo)
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, precision_score, recall_score
print('Accuracy',accuracy_score(y.values,ypredict_stress_global))
print('F1',f1_score(y.values,ypredict_stress_global, average='weighted'))
print('Balanced accuracy',balanced_accuracy_score(y.values,ypredict_stress_global))
print('Precision',precision_score(y.values,ypredict_stress_global, average='weighted'))
print('Recall',recall_score(y.values,ypredict_stress_global, average='weighted'))

Accuracy 0.36666666666666664
F1 0.3095238095238095
Balanced accuracy 0.33888888888888885
Precision 0.27777777777777773
Recall 0.36666666666666664


In [24]:
pd.concat([X.reset_index().iloc[:,0],y, pd.DataFrame(ypredict_stress_global,columns=['predicted_stress_global'])],axis=1)

Unnamed: 0,video_name,stress_global,predicted_stress_global
0,Test_pour_AFPA,1.0,1.0
1,Video_1,0.0,1.0
2,WIN_20210323_19_17_40_Pro,1.0,0.0
3,WIN_20210329_10_16_02_Pro,1.0,1.0
4,WIN_20210330_13_10_29_Pro,0.0,0.0
5,WIN_20210331_21_22_52_Pro,1.0,1.0
6,WIN_20210402_14_27_50_Pro,1.0,1.0
7,WIN_20210402_19_04_53_Pro,2.0,1.0
8,WIN_20210403_18_49_15_Pro,2.0,1.0
9,WIN_20210404_10_58_27_Pro,1.0,1.0


#### En utilisant les prédictions comme annotations

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                KNeighborsClassifier(),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]
X = ypredict_stress_diapo
y = df_annotations_stress.iloc[:,-1]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [18]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    2.7s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=10, multi_class='multinomial', random_state=42)
Best results 0.5098412698412699
Best params {'C': 10, 'class_weight': None}
accuracy (mean, std) 0.5666666666666667 0.16996731711975951
f1 (mean, std) 0.5098412698412699 0.20862095036746608
balanced accuracy (mean, std) 0.5888888888888889 0.16329931618554522
precision (mean, std) 0.5055555555555555 0.2631715396072669
recall (mean, std) 0.5666666666666667 0.16996731711975951

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | ela

In [19]:
best_result

{'best_estimator': LogisticRegression(C=10, multi_class='multinomial', random_state=42),
 'best_score': 0.5098412698412699,
 'best_params': {'C': 10, 'class_weight': None},
 'mean_test_f1_score': 0.5098412698412699,
 'std_test_f1_score': 0.20862095036746608,
 'mean_test_accuracy_score': 0.5666666666666667,
 'std_test_accuracy_score': 0.16996731711975951,
 'mean_test_balanced_accuracy_score': 0.5888888888888889,
 'std_test_balanced_accuracy_score': 0.16329931618554522,
 'mean_test_precision': 0.5055555555555555,
 'std_test_precision': 0.2631715396072669,
 'mean_test_recall': 0.5666666666666667,
 'std_test_recall': 0.16996731711975951}

In [20]:
df_ypredict = pd.concat([X.reset_index()[['video_name']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_global' + diapo_selection + '.csv')

In [21]:
df_ypredict = pd.concat([X.reset_index()[['video_name']],pd.DataFrame(y_predict_proba, columns=['predict_0','predict_1','predict_2'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_global_proba' + diapo_selection + '.csv')

### Audio diapos only

In [28]:
diapo_selection = '_audio_only'

In [29]:
df_ypredict = pd.read_csv('ypredict_' + features + '_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.drop(df_ypredict.columns[0],axis=1)
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')

#### En utilisant les annotations comme X

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                KNeighborsClassifier(),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]
X = df_annotations_stress[['video_name','8','9','10','11','17','stress_global']].iloc[:,:-1].set_index('video_name')
y = df_annotations_stress.iloc[:,-1]
cv = 5

In [31]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.1, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.7092063492063492
Best params {'C': 0.1, 'class_weight': 'balanced'}
accuracy (mean, std) 0.7666666666666667 0.08164965809277264
f1 (mean, std) 0.7092063492063492 0.134722169623853
balanced accuracy (mean, std) 0.7666666666666667 0.08164965809277264
precision (mean, std) 0.7166666666666666 0.20042393341719386
recall (mean, std) 0.7666666666666667 0.08164965809277264

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2

ValueError: too many values to unpack (expected 3)

In [None]:
ypredict_stress_global = best_result['best_estimator'].predict(ypredict_stress_diapo)
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, precision_score, recall_score
print('Accuracy',accuracy_score(y.values,ypredict_stress_global))
print('F1',f1_score(y.values,ypredict_stress_global, average='weighted'))
print('Balanced accuracy',balanced_accuracy_score(y.values,ypredict_stress_global))
print('Precision',precision_score(y.values,ypredict_stress_global, average='weighted'))
print('Recall',recall_score(y.values,ypredict_stress_global, average='weighted'))

#### En utilisant les prédictions des diapos

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                KNeighborsClassifier(),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]
X = ypredict_stress_diapo
y = df_annotations_stress.iloc[:,-1]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [37]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.05, multi_class='multinomial', random_state=42)
Best results 0.3567460317460318
Best params {'C': 0.05, 'class_weight': None}
accuracy (mean, std) 0.4666666666666666 0.1247219128924647
f1 (mean, std) 0.3567460317460318 0.16746972064173757
balanced accuracy (mean, std) 0.4222222222222222 0.07535922203472521
precision (mean, std) 0.3911111111111111 0.2365805790219695
recall (mean, std) 0.4666666666666666 0.1247219128924647

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 220 out of 220 | el

In [38]:
best_result

{'best_estimator': LogisticRegression(C=0.05, multi_class='multinomial', random_state=42),
 'best_score': 0.3567460317460318,
 'best_params': {'C': 0.05, 'class_weight': None},
 'mean_test_f1_score': 0.3567460317460318,
 'std_test_f1_score': 0.16746972064173757,
 'mean_test_accuracy_score': 0.4666666666666666,
 'std_test_accuracy_score': 0.1247219128924647,
 'mean_test_balanced_accuracy_score': 0.4222222222222222,
 'std_test_balanced_accuracy_score': 0.07535922203472521,
 'mean_test_precision': 0.3911111111111111,
 'std_test_precision': 0.2365805790219695,
 'mean_test_recall': 0.4666666666666666,
 'std_test_recall': 0.1247219128924647}

## Aggregation all frames within the video to predict the global stress

#### All diapo

In [39]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

X = df_total.iloc[:,2:].groupby(['video_name']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-27]

In [40]:
y = df_total.iloc[:,2:].groupby(['video_name']).agg({'stress_global':'mean'}).iloc[:,-1]

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                pipe,
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [42]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=1, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.1422222222222222
Best params {'C': 1, 'class_weight': 'balanced'}
accuracy (mean, std) 0.16666666666666666 0.10540925533894598
f1 (mean, std) 0.1422222222222222 0.1075886083280907
balanced accuracy (mean, std) 0.14444444444444443 0.0753592220347252
precision (mean, std) 0.1277777777777778 0.11055415967851333
recall (mean, std) 0.16666666666666666 0.10540925533894598

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.6s
[Parallel(n_jobs=

In [43]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=3)),
                 ('logistic',
                  LogisticRegression(C=0.01, multi_class='multinomial',
                                     random_state=42))]),
 'best_score': 0.3514285714285714,
 'best_params': {'logistic__C': 0.01,
  'logistic__class_weight': None,
  'pca__n_components': 3},
 'mean_test_f1_score': 0.3514285714285714,
 'std_test_f1_score': 0.25420148021245287,
 'mean_test_accuracy_score': 0.4,
 'std_test_accuracy_score': 0.2260776661041756,
 'mean_test_balanced_accuracy_score': 0.4111111111111111,
 'std_test_balanced_accuracy_score': 0.2398559238324767,
 'mean_test_precision': 0.36,
 'std_test_precision': 0.3076273198225973,
 'mean_test_recall': 0.4,
 'std_test_recall': 0.2260776661041756}

#### Audio diapo only

In [44]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

X_audio = df_total[df_total.diapo.isin([8,9,10,11,17])].iloc[:,2:].groupby(['video_name']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-27]

In [45]:
y = df_total.iloc[:,2:].groupby(['video_name']).agg({'stress_global':'mean'}).iloc[:,-1]

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                pipe,
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]
X = X_audio
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [47]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.2633333333333333
Best params {'C': 0.01, 'class_weight': 'balanced'}
accuracy (mean, std) 0.26666666666666666 0.0816496580927726
f1 (mean, std) 0.2633333333333333 0.07333333333333333
balanced accuracy (mean, std) 0.2333333333333333 0.08888888888888889
precision (mean, std) 0.3166666666666667 0.13788526273323173
recall (mean, std) 0.26666666666666666 0.0816496580927726

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_

In [48]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn', KNeighborsClassifier(n_neighbors=11, p=1))]),
 'best_score': 0.38126984126984126,
 'best_params': {'knn__n_neighbors': 11,
  'knn__p': 1,
  'knn__weights': 'uniform',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.38126984126984126,
 'std_test_f1_score': 0.203287171197036,
 'mean_test_accuracy_score': 0.4666666666666666,
 'std_test_accuracy_score': 0.19436506316151006,
 'mean_test_balanced_accuracy_score': 0.42222222222222217,
 'std_test_balanced_accuracy_score': 0.14315665251916806,
 'mean_test_precision': 0.3472222222222222,
 'std_test_precision': 0.1979836631245051,
 'mean_test_recall': 0.4666666666666666,
 'std_test_recall': 0.19436506316151006}