In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib
from util import runGridSearchClassifiers

## Loading data

In [2]:
directory_path = '../../04_-_Dev/videos'
features = 'emobase_eGeMAPS'

In [3]:
df_total = pd.read_pickle(directory_path + '/audio_' + features + '_data.p')

In [4]:
df_total.shape

(2573581, 96)

In [5]:
df_total.video_name.nunique()

30

In [6]:
df_total[df_total.isna().any(axis=1)]

Unnamed: 0,frameIndex,frameTime,pcm_intensity_sma,pcm_loudness_sma,mfcc_sma[1],mfcc_sma[2],mfcc_sma[3],mfcc_sma[4],mfcc_sma[5],mfcc_sma[6],...,F2frequency_sma3nz_de,F2amplitudeLogRelF0_sma3nz_de,F3frequency_sma3nz_de,F3amplitudeLogRelF0_sma3nz_de,video_name,stress_global,type_candidat,sexe,stress,diapo


## Data processing

In [7]:
time_window = 5
df_total['frameTimeWindow'] = df_total.frameTime.apply(lambda x : np.floor(x / time_window) * time_window).astype(int)

In [5]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

In [9]:
# 5 seconds windows
X = df_total.iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]

In [10]:
y = df_total.iloc[:,3:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'min'}).iloc[:,-1]

In [11]:
X.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

In [12]:
from scipy.stats import kurtosis, skew
X_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]
y_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'mean'}).iloc[:,-1]

In [13]:
X_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

## Modèles

In [6]:
X = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

X_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

In [7]:
X = X.fillna(0)
X_audio = X_audio.fillna(0)

## Stress par diapos
### All diapos

On prédit le stress par time window de 5s avec un Random Forest

In [8]:
diapo_selection = '_all'

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [100, 150, 200], 'max_depth':[10, 15, 20, 25, 30], 'class_weight':[None,'balanced']}
                ]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [14]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 30 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 42.3min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 49.4min finished
Best estimator RandomForestClassifier(max_depth=30, n_estimators=200, n_jobs=-1,
                       random_state=42)
Best results 0.4382887921646796
Best params {'class_weight': None, 'max_depth': 30, 'n_estimators': 200}
accuracy (mean, std) 0.4466189356356167 0.21189250831745096
f1 (mean, std) 0.4382887921646796 0.21207369712943566
balanced accuracy (mean, std) 0.4322245755030847 0.1761420931580816
precision (mean, std) 0.56448780267131 0.22281610407514715
recall (mean, std) 0.4466189356356167 0.21189250831745096

f1_score (weight

In [15]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')

In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[ 660, 1243,  161,    0],
       [ 802, 1692,   96,    0],
       [ 324,  316,   13,    0],
       [   4,   16,    0,    0]])

In [138]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict


Unnamed: 0,video_name,diapo,frameTimeWindow,ypredict
0,Test_pour_AFPA,8,170,1
1,Test_pour_AFPA,8,175,1
2,Test_pour_AFPA,8,180,1
3,Test_pour_AFPA,8,185,1
4,Test_pour_AFPA,8,190,1
...,...,...,...,...
2162,WIN_20210417_14_53_12_Pro,17,490,1
2163,WIN_20210417_14_53_12_Pro,17,495,0
2164,WIN_20210417_14_53_12_Pro,17,500,1
2165,WIN_20210417_14_53_12_Pro,17,505,0


#### En utilisant la proportion des prédictions 0, 1 et 2

In [139]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,8,0.000000,1.000000,0.0
Test_pour_AFPA,9,0.050000,0.950000,0.0
Test_pour_AFPA,10,0.000000,1.000000,0.0
Test_pour_AFPA,11,0.000000,1.000000,0.0
Test_pour_AFPA,17,0.000000,1.000000,0.0
...,...,...,...,...
WIN_20210417_14_53_12_Pro,8,0.909091,0.090909,0.0
WIN_20210417_14_53_12_Pro,9,0.857143,0.142857,0.0
WIN_20210417_14_53_12_Pro,10,0.714286,0.285714,0.0
WIN_20210417_14_53_12_Pro,11,0.800000,0.200000,0.0


In [141]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress['stress'] = df_annotations_stress['stress'].astype(int)
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
0,Test_pour_AFPA,1,1
1,Test_pour_AFPA,8,1
2,Test_pour_AFPA,9,0
3,Test_pour_AFPA,10,0
4,Test_pour_AFPA,11,0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0
236,WIN_20210417_14_53_12_Pro,11,0
237,WIN_20210417_14_53_12_Pro,12,0
238,WIN_20210417_14_53_12_Pro,17,1


In [142]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [143]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,8,0.000000,1.000000,0.0
Test_pour_AFPA,9,0.050000,0.950000,0.0
Test_pour_AFPA,10,0.000000,1.000000,0.0
Test_pour_AFPA,11,0.000000,1.000000,0.0
Test_pour_AFPA,17,0.000000,1.000000,0.0
...,...,...,...,...
WIN_20210417_14_53_12_Pro,8,0.909091,0.090909,0.0
WIN_20210417_14_53_12_Pro,9,0.857143,0.142857,0.0
WIN_20210417_14_53_12_Pro,10,0.714286,0.285714,0.0
WIN_20210417_14_53_12_Pro,11,0.800000,0.200000,0.0


In [144]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [145]:
best_result, y_predict,y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=4, multi_class='multinomial', random_state=42)
Best results 0.4264285714285713
Best params {'C': 4, 'class_weight': None}
accuracy (mean, std) 0.46 0.31474857690967667
f1 (mean, std) 0.4264285714285713 0.3418098440854673
balanced accuracy (mean, std) 0.4869444444444444 0.2804484118842449
precision (mean, std) 0.4814444444444445 0.4037295421564745
recall (mean, std) 0.46 0.31474857690967667

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 2400 

In [146]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=3)),
                 ('knn', KNeighborsClassifier(n_neighbors=15, p=1))]),
 'best_score': 0.5453703703703704,
 'best_params': {'knn__n_neighbors': 15,
  'knn__p': 1,
  'knn__weights': 'uniform',
  'pca__n_components': 3},
 'mean_test_f1_score': 0.5453703703703704,
 'std_test_f1_score': 0.31679454548429153,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.30258148581093913,
 'mean_test_balanced_accuracy_score': 0.5214814814814814,
 'std_test_balanced_accuracy_score': 0.31570185488795943,
 'mean_test_precision': 0.6301111111111111,
 'std_test_precision': 0.3579618056734372,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.30258148581093913}

In [136]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')

In [137]:
# Saving proba
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict_proba, columns=['stress_diapo_predict_0', 'stress_diapo_predict_1','stress_diapo_predict_2', 'stress_diapo_predict_3' ])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo_proba' + diapo_selection + '.csv') 

#### En aggrégeant les prédicitions des diapos

In [101]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [102]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,diapo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Test_pour_AFPA,8,1.000000,1,1,1.0,0.000000,1.0,1.00,-3.000000,0.000000
Test_pour_AFPA,9,0.950000,0,1,1.0,0.223607,1.0,1.00,15.052632,-4.129483
Test_pour_AFPA,10,1.000000,1,1,1.0,0.000000,1.0,1.00,-3.000000,0.000000
Test_pour_AFPA,11,1.000000,1,1,1.0,0.000000,1.0,1.00,-3.000000,0.000000
Test_pour_AFPA,17,1.000000,1,1,1.0,0.000000,1.0,1.00,-3.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,8,0.090909,0,1,0.0,0.301511,0.0,0.00,6.100000,2.846050
WIN_20210417_14_53_12_Pro,9,0.142857,0,1,0.0,0.358569,0.0,0.00,2.166667,2.041241
WIN_20210417_14_53_12_Pro,10,0.285714,0,1,0.0,0.468807,0.0,0.75,-1.100000,0.948683
WIN_20210417_14_53_12_Pro,11,0.200000,0,1,0.0,0.410391,0.0,0.00,0.250000,1.500000


In [107]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [111]:
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [112]:
best_result, y_predict, y_predict_proba,result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)
best_result

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 560 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    7.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.5155820105820106
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.5133333333333333 0.28604583937233247
f1 (mean, std) 0.5155820105820106 0.29546255643146463
balanced accuracy (mean, std) 0.502962962962963 0.2901919803501358
precision (mean, std) 0.6372222222222224 0.3567228235472272
recall (mean, std) 0.5133333333333333 0.28604583937233247

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s


{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.5328306878306878,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 2},
 'mean_test_f1_score': 0.5328306878306878,
 'std_test_f1_score': 0.31099281598143175,
 'mean_test_accuracy_score': 0.5399999999999999,
 'std_test_accuracy_score': 0.2973213749463701,
 'mean_test_balanced_accuracy_score': 0.5296296296296297,
 'std_test_balanced_accuracy_score': 0.28950729158224436,
 'mean_test_precision': 0.6267777777777778,
 'std_test_precision': 0.3520453288715061,
 'mean_test_recall': 0.5399999999999999,
 'std_test_recall': 0.2973213749463701}

In [113]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.5328306878306878,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 2},
 'mean_test_f1_score': 0.5328306878306878,
 'std_test_f1_score': 0.31099281598143175,
 'mean_test_accuracy_score': 0.5399999999999999,
 'std_test_accuracy_score': 0.2973213749463701,
 'mean_test_balanced_accuracy_score': 0.5296296296296297,
 'std_test_balanced_accuracy_score': 0.28950729158224436,
 'mean_test_precision': 0.6267777777777778,
 'std_test_precision': 0.3520453288715061,
 'mean_test_recall': 0.5399999999999999,
 'std_test_recall': 0.2973213749463701}

In [114]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[56, 12,  0,  0],
       [41, 25,  0,  0],
       [11,  4,  0,  0],
       [ 0,  1,  0,  0]])

In [115]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv') 

In [118]:
# Saving proba
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict_proba, columns=['stress_diapo_predict_0', 'stress_diapo_predict_1','stress_diapo_predict_2', 'stress_diapo_predict_3' ])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo_proba' + diapo_selection + '.csv') 

## Stress global

### En utilisant le stress prédit des diapos

In [147]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [148]:
ypredict_stress_diapo

diapo,8,9,10,11,17
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test_pour_AFPA,0.0,1.0,0.0,0.0,0.0
Video_1,0.0,0.0,0.0,0.0,1.0
WIN_20210323_19_17_40_Pro,0.0,0.0,1.0,1.0,0.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,0.0,0.0,0.0,0.0,0.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,0.0,0.0
WIN_20210403_18_49_15_Pro,1.0,0.0,0.0,0.0,1.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0


In [149]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
df_annotations_stress

Unnamed: 0_level_0,1,8,9,10,11,12,17,18,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [150]:
Xy = ypredict_stress_diapo.merge(df_annotations_stress.iloc[:,-1],how='inner',on='video_name')
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]

In [151]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

#groups = X.reset_index()['video_name']
#loo = LeaveOneGroupOut()
#cv_loo = loo.split(X, y, groups)

In [152]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.5, multi_class='multinomial', random_state=42)
Best results 0.4530158730158731
Best params {'C': 0.5, 'class_weight': None}
accuracy (mean, std) 0.5000000000000001 0.18257418583505539
f1 (mean, std) 0.4530158730158731 0.16259299720850415
balanced accuracy (mean, std) 0.4444444444444445 0.1531560972454469
precision (mean, std) 0.47833333333333333 0.14544949486180953
recall (mean, std) 0.5000000000000001 0.18257418583505539

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | 

In [153]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=4)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.47968253968253965,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 4},
 'mean_test_f1_score': 0.47968253968253965,
 'std_test_f1_score': 0.16264505512980054,
 'mean_test_accuracy_score': 0.5333333333333334,
 'std_test_accuracy_score': 0.16329931618554522,
 'mean_test_balanced_accuracy_score': 0.47777777777777775,
 'std_test_balanced_accuracy_score': 0.11439589045541111,
 'mean_test_precision': 0.5005555555555555,
 'std_test_precision': 0.15522186431200288,
 'mean_test_recall': 0.5333333333333334,
 'std_test_recall': 0.16329931618554522}

In [154]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global' + diapo_selection + '.csv')

In [155]:
# Saving predict proba
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict_proba, columns=['stress_global_proba_0','stress_global_proba_1','stress_global_proba_2'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global_proba' + diapo_selection + '.csv')

#### Autre méthode

### En utilisant le stress prédit des time windows 5s

In [156]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.014085,0.985915,0.0
Video_1,0.694444,0.305556,0.0
WIN_20210323_19_17_40_Pro,0.472222,0.527778,0.0
WIN_20210329_10_16_02_Pro,0.027778,0.972222,0.0
WIN_20210330_13_10_29_Pro,0.0,1.0,0.0
WIN_20210331_21_22_52_Pro,0.084507,0.915493,0.0
WIN_20210402_14_27_50_Pro,0.013889,0.027778,0.958333
WIN_20210402_19_04_53_Pro,0.333333,0.666667,0.0
WIN_20210403_18_49_15_Pro,0.25,0.75,0.0
WIN_20210404_10_58_27_Pro,0.246575,0.753425,0.0


In [157]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [158]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [159]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [160]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=1, multi_class='multinomial', random_state=42)
Best results 0.39714285714285713
Best params {'C': 1, 'class_weight': None}
accuracy (mean, std) 0.4666666666666667 0.22110831935702668
f1 (mean, std) 0.39714285714285713 0.21098671594645593
balanced accuracy (mean, std) 0.4222222222222222 0.17777777777777776
precision (mean, std) 0.38499999999999995 0.20374275719914836
recall (mean, std) 0.4666666666666667 0.22110831935702668

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | e

In [161]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn',
                  KNeighborsClassifier(n_neighbors=6, p=1, weights='distance'))]),
 'best_score': 0.4933333333333333,
 'best_params': {'knn__n_neighbors': 6,
  'knn__p': 1,
  'knn__weights': 'distance',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.4933333333333333,
 'std_test_f1_score': 0.1768866554856213,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.1632993161855452,
 'mean_test_balanced_accuracy_score': 0.4777777777777777,
 'std_test_balanced_accuracy_score': 0.12957670877434,
 'mean_test_precision': 0.48888888888888893,
 'std_test_precision': 0.18392161508052055,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.1632993161855452}

#### Autre méthode

In [162]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict = df_ypredict.groupby(['video_name']).agg({'ypredict': ['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,0.985915,0.0,1.0,1.0,0.118678,1.0,1.0,66.014286,-8.247077
Video_1,0.305556,0.0,1.0,0.0,0.463875,0.0,1.0,-1.287273,0.844232
WIN_20210323_19_17_40_Pro,0.527778,0.0,1.0,1.0,0.502731,0.0,1.0,-1.987616,-0.111283
WIN_20210329_10_16_02_Pro,0.972222,0.0,1.0,1.0,0.165489,1.0,1.0,31.028571,-5.747049
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,1.0,0.0,1.0,1.0,-3.0,0.0
WIN_20210331_21_22_52_Pro,0.915493,0.0,1.0,1.0,0.280126,1.0,1.0,6.925641,-2.987581
WIN_20210402_14_27_50_Pro,1.944444,0.0,2.0,2.0,0.285267,2.0,2.0,31.264793,-5.513881
WIN_20210402_19_04_53_Pro,0.666667,0.0,1.0,1.0,0.474713,0.0,1.0,-1.5,-0.707107
WIN_20210403_18_49_15_Pro,0.75,0.0,1.0,1.0,0.436051,0.75,1.0,-0.666667,-1.154701
WIN_20210404_10_58_27_Pro,0.753425,0.0,1.0,1.0,0.434,1.0,1.0,-0.617172,-1.175937


In [163]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [164]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [165]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=10, multi_class='multinomial', random_state=42)
Best results 0.4344444444444445
Best params {'C': 10, 'class_weight': None}
accuracy (mean, std) 0.4333333333333333 0.13333333333333333
f1 (mean, std) 0.4344444444444445 0.11223872265932792
balanced accuracy (mean, std) 0.4333333333333333 0.12372809695177825
precision (mean, std) 0.5055555555555556 0.12717247935843998
recall (mean, std) 0.4333333333333333 0.13333333333333333

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | el

In [166]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.45087301587301587,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 1},
 'mean_test_f1_score': 0.45087301587301587,
 'std_test_f1_score': 0.18294923000902993,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.19436506316151,
 'mean_test_balanced_accuracy_score': 0.4777777777777777,
 'std_test_balanced_accuracy_score': 0.1670366264263656,
 'mean_test_precision': 0.43666666666666665,
 'std_test_precision': 0.1890064993330215,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.19436506316151}

In [None]:
df_ypredict_stress_global = pd.concat([ypredict_stress_diapo.reset_index(), pd.DataFrame(y_predict,columns=['predicted_stress_global'])], axis=1) 
df_ypredict_stress_global = df_ypredict_stress_global.set_index('video_name').sort_index()
df_ypredict_stress_global = df_ypredict_stress_global.iloc[:,-1]

In [None]:
df_ypredict_stress_global

### Audios diapos only

In [167]:
diapo_selection = '_audio_only'
diapo_audio_list = [8,9,10,11,17]
X = X_audio
y = y_audio

In [168]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [100, 150, 200], 'max_depth':[10, 15, 20, 25], 'class_weight':[None,'balanced']}
                ]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [169]:
best_result, y_predict, y_predict_proba,result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 24 candidates, totalling 720 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 15.2min finished
Best estimator RandomForestClassifier(max_depth=20, n_estimators=150, n_jobs=-1,
                       random_state=42)
Best results 0.485113011090835
Best params {'class_weight': None, 'max_depth': 20, 'n_estimators': 150}
accuracy (mean, std) 0.47611243309680507 0.2485920047305685
f1 (mean, std) 0.485113011090835 0.2896991582646307
balanced accuracy (mean, std) 0.4988648209525646 0.18131233940080954
precision (mean, std) 0.6586459986990544 0.32319878584073297
recall (mean, std) 0.47611243309680507 0.2485920047305685

f1_score (weighted) 0.45865607572630346
accuracy 0.47623442547300415


In [170]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')

In [171]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[490, 440,   8,   0],
       [388, 529,  58,   0],
       [ 98, 123,  13,   0],
       [  6,  14,   0,   0]])

In [191]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict


Unnamed: 0,video_name,diapo,frameTimeWindow,ypredict
0,Test_pour_AFPA,8,170,1
1,Test_pour_AFPA,8,175,1
2,Test_pour_AFPA,8,180,1
3,Test_pour_AFPA,8,185,1
4,Test_pour_AFPA,8,190,1
...,...,...,...,...
2162,WIN_20210417_14_53_12_Pro,17,490,1
2163,WIN_20210417_14_53_12_Pro,17,495,0
2164,WIN_20210417_14_53_12_Pro,17,500,1
2165,WIN_20210417_14_53_12_Pro,17,505,0


#### En utilisant la proportion des prédictions 0, 1 et 2

In [192]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,8,0.000000,1.000000,0.0
Test_pour_AFPA,9,0.050000,0.950000,0.0
Test_pour_AFPA,10,0.000000,1.000000,0.0
Test_pour_AFPA,11,0.000000,1.000000,0.0
Test_pour_AFPA,17,0.000000,1.000000,0.0
...,...,...,...,...
WIN_20210417_14_53_12_Pro,8,0.909091,0.090909,0.0
WIN_20210417_14_53_12_Pro,9,0.857143,0.142857,0.0
WIN_20210417_14_53_12_Pro,10,0.714286,0.285714,0.0
WIN_20210417_14_53_12_Pro,11,0.800000,0.200000,0.0


In [174]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress = df_annotations_stress[df_annotations_stress.diapo.isin(diapo_audio_list)]
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
6,Test_pour_AFPA,17,0.0
...,...,...,...
233,WIN_20210417_14_53_12_Pro,8,0.0
234,WIN_20210417_14_53_12_Pro,9,0.0
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0


In [175]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [176]:
y

0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
145    0.0
146    0.0
147    0.0
148    0.0
149    1.0
Name: stress, Length: 150, dtype: float64

In [177]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [178]:
best_result, y_predict, y_predict_proba,result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    3.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=4, multi_class='multinomial', random_state=42)
Best results 0.4264285714285713
Best params {'C': 4, 'class_weight': None}
accuracy (mean, std) 0.46 0.31474857690967667
f1 (mean, std) 0.4264285714285713 0.3418098440854673
balanced accuracy (mean, std) 0.4869444444444444 0.2804484118842449
precision (mean, std) 0.4814444444444445 0.4037295421564745
recall (mean, std) 0.46 0.31474857690967667

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_job

In [179]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=3)),
                 ('knn', KNeighborsClassifier(n_neighbors=15, p=1))]),
 'best_score': 0.5453703703703704,
 'best_params': {'knn__n_neighbors': 15,
  'knn__p': 1,
  'knn__weights': 'uniform',
  'pca__n_components': 3},
 'mean_test_f1_score': 0.5453703703703704,
 'std_test_f1_score': 0.31679454548429153,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.30258148581093913,
 'mean_test_balanced_accuracy_score': 0.5214814814814814,
 'std_test_balanced_accuracy_score': 0.31570185488795943,
 'mean_test_precision': 0.6301111111111111,
 'std_test_precision': 0.3579618056734372,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.30258148581093913}

In [180]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')

#### En aggrégeant les prédictions des diapos

In [193]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict

Unnamed: 0,video_name,diapo,frameTimeWindow,ypredict
0,Test_pour_AFPA,8,170,1
1,Test_pour_AFPA,8,175,1
2,Test_pour_AFPA,8,180,1
3,Test_pour_AFPA,8,185,1
4,Test_pour_AFPA,8,190,1
...,...,...,...,...
2162,WIN_20210417_14_53_12_Pro,17,490,1
2163,WIN_20210417_14_53_12_Pro,17,495,0
2164,WIN_20210417_14_53_12_Pro,17,500,1
2165,WIN_20210417_14_53_12_Pro,17,505,0


In [194]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [195]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [196]:
Xy[Xy.isna().any(axis=1)]

Unnamed: 0,video_name,diapo,"(ypredict, mean)","(ypredict, min)","(ypredict, max)","(ypredict, median)","(ypredict, std)","(ypredict, percentil25)","(ypredict, percentil75)","(ypredict, kurtosis)","(ypredict, skew)",stress


In [197]:
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [198]:
best_result, y_predict,y_predict_proba, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)
best_result

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    5.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.5155820105820106
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.5133333333333333 0.28604583937233247
f1 (mean, std) 0.5155820105820106 0.29546255643146463
balanced accuracy (mean, std) 0.502962962962963 0.2901919803501358
precision (mean, std) 0.6372222222222224 0.3567228235472272
recall (mean, std) 0.5133333333333333 0.28604583937233247

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    6.6s

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.5328306878306878,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 2},
 'mean_test_f1_score': 0.5328306878306878,
 'std_test_f1_score': 0.31099281598143175,
 'mean_test_accuracy_score': 0.5399999999999999,
 'std_test_accuracy_score': 0.2973213749463701,
 'mean_test_balanced_accuracy_score': 0.5296296296296297,
 'std_test_balanced_accuracy_score': 0.28950729158224436,
 'mean_test_precision': 0.6267777777777778,
 'std_test_precision': 0.3520453288715061,
 'mean_test_recall': 0.5399999999999999,
 'std_test_recall': 0.2973213749463701}

In [199]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.5328306878306878,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 2},
 'mean_test_f1_score': 0.5328306878306878,
 'std_test_f1_score': 0.31099281598143175,
 'mean_test_accuracy_score': 0.5399999999999999,
 'std_test_accuracy_score': 0.2973213749463701,
 'mean_test_balanced_accuracy_score': 0.5296296296296297,
 'std_test_balanced_accuracy_score': 0.28950729158224436,
 'mean_test_precision': 0.6267777777777778,
 'std_test_precision': 0.3520453288715061,
 'mean_test_recall': 0.5399999999999999,
 'std_test_recall': 0.2973213749463701}

In [200]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[56, 12,  0,  0],
       [41, 25,  0,  0],
       [11,  4,  0,  0],
       [ 0,  1,  0,  0]])

In [None]:
df_ypredict

## Stress global

### En utilisant le stress prédit des diapos

In [201]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [202]:
ypredict_stress_diapo

diapo,8,9,10,11,17
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test_pour_AFPA,0.0,1.0,0.0,0.0,0.0
Video_1,0.0,0.0,0.0,0.0,1.0
WIN_20210323_19_17_40_Pro,0.0,0.0,1.0,1.0,0.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,0.0,0.0,0.0,0.0,0.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,0.0,0.0
WIN_20210403_18_49_15_Pro,1.0,0.0,0.0,0.0,1.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0


In [203]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress

Unnamed: 0,video_name,1,8,9,10,11,12,17,18,stress_global
0,Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
6,WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
7,WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
8,WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
9,WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [204]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
diapo_audio_list.append('stress_global')
df_annotations_stress = df_annotations_stress[[str(diapo) for diapo in diapo_audio_list]]
diapo_audio_list.pop()
df_annotations_stress


Unnamed: 0_level_0,8,9,10,11,17,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test_pour_AFPA,1.0,0.0,0.0,0.0,0.0,1.0
Video_1,0.0,0.0,0.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,0.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,1.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,1.0,1.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,2.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,1.0,1.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,2.0,2.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0,1.0


In [205]:
Xy = ypredict_stress_diapo.merge(df_annotations_stress.iloc[:,-1],how='inner',on='video_name')
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]

In [206]:
Xy

Unnamed: 0_level_0,8,9,10,11,17,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test_pour_AFPA,0.0,1.0,0.0,0.0,0.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,0.0
WIN_20210323_19_17_40_Pro,0.0,0.0,1.0,1.0,0.0,1.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,0.0,0.0,1.0,1.0
WIN_20210402_14_27_50_Pro,0.0,0.0,0.0,0.0,0.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,0.0,0.0,2.0
WIN_20210403_18_49_15_Pro,1.0,0.0,0.0,0.0,1.0,2.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0,1.0


In [207]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

#groups = X.reset_index()['video_name']
#loo = LeaveOneGroupOut()
#cv_loo = loo.split(X, y, groups)

In [208]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.5, multi_class='multinomial', random_state=42)
Best results 0.4530158730158731
Best params {'C': 0.5, 'class_weight': None}
accuracy (mean, std) 0.5000000000000001 0.18257418583505539
f1 (mean, std) 0.4530158730158731 0.16259299720850415
balanced accuracy (mean, std) 0.4444444444444445 0.1531560972454469
precision (mean, std) 0.47833333333333333 0.14544949486180953
recall (mean, std) 0.5000000000000001 0.18257418583505539

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | 

In [209]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=4)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.47968253968253965,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 4},
 'mean_test_f1_score': 0.47968253968253965,
 'std_test_f1_score': 0.16264505512980054,
 'mean_test_accuracy_score': 0.5333333333333334,
 'std_test_accuracy_score': 0.16329931618554522,
 'mean_test_balanced_accuracy_score': 0.47777777777777775,
 'std_test_balanced_accuracy_score': 0.11439589045541111,
 'mean_test_precision': 0.5005555555555555,
 'std_test_precision': 0.15522186431200288,
 'mean_test_recall': 0.5333333333333334,
 'std_test_recall': 0.16329931618554522}

In [210]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global' + diapo_selection + '.csv')

In [211]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict_proba, columns=['stress_global_predict_0', 'stress_global_predict_1', 'stress_global_predict_2'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global_proba' + diapo_selection + '.csv')

#### Autre méthode

### En utilisant le stress prédit des time windows 5s

In [212]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.014085,0.985915,0.0
Video_1,0.694444,0.305556,0.0
WIN_20210323_19_17_40_Pro,0.472222,0.527778,0.0
WIN_20210329_10_16_02_Pro,0.027778,0.972222,0.0
WIN_20210330_13_10_29_Pro,0.0,1.0,0.0
WIN_20210331_21_22_52_Pro,0.084507,0.915493,0.0
WIN_20210402_14_27_50_Pro,0.013889,0.027778,0.958333
WIN_20210402_19_04_53_Pro,0.333333,0.666667,0.0
WIN_20210403_18_49_15_Pro,0.25,0.75,0.0
WIN_20210404_10_58_27_Pro,0.246575,0.753425,0.0


In [213]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [214]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [215]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [216]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=1, multi_class='multinomial', random_state=42)
Best results 0.39714285714285713
Best params {'C': 1, 'class_weight': None}
accuracy (mean, std) 0.4666666666666667 0.22110831935702668
f1 (mean, std) 0.39714285714285713 0.21098671594645593
balanced accuracy (mean, std) 0.4222222222222222 0.17777777777777776
precision (mean, std) 0.38499999999999995 0.20374275719914836
recall (mean, std) 0.4666666666666667 0.22110831935702668

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | e

In [217]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn',
                  KNeighborsClassifier(n_neighbors=6, p=1, weights='distance'))]),
 'best_score': 0.4933333333333333,
 'best_params': {'knn__n_neighbors': 6,
  'knn__p': 1,
  'knn__weights': 'distance',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.4933333333333333,
 'std_test_f1_score': 0.1768866554856213,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.1632993161855452,
 'mean_test_balanced_accuracy_score': 0.4777777777777777,
 'std_test_balanced_accuracy_score': 0.12957670877434,
 'mean_test_precision': 0.48888888888888893,
 'std_test_precision': 0.18392161508052055,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.1632993161855452}

#### Autre méthode

In [218]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict = df_ypredict.groupby(['video_name']).agg({'ypredict': ['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,0.985915,0.0,1.0,1.0,0.118678,1.0,1.0,66.014286,-8.247077
Video_1,0.305556,0.0,1.0,0.0,0.463875,0.0,1.0,-1.287273,0.844232
WIN_20210323_19_17_40_Pro,0.527778,0.0,1.0,1.0,0.502731,0.0,1.0,-1.987616,-0.111283
WIN_20210329_10_16_02_Pro,0.972222,0.0,1.0,1.0,0.165489,1.0,1.0,31.028571,-5.747049
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,1.0,0.0,1.0,1.0,-3.0,0.0
WIN_20210331_21_22_52_Pro,0.915493,0.0,1.0,1.0,0.280126,1.0,1.0,6.925641,-2.987581
WIN_20210402_14_27_50_Pro,1.944444,0.0,2.0,2.0,0.285267,2.0,2.0,31.264793,-5.513881
WIN_20210402_19_04_53_Pro,0.666667,0.0,1.0,1.0,0.474713,0.0,1.0,-1.5,-0.707107
WIN_20210403_18_49_15_Pro,0.75,0.0,1.0,1.0,0.436051,0.75,1.0,-0.666667,-1.154701
WIN_20210404_10_58_27_Pro,0.753425,0.0,1.0,1.0,0.434,1.0,1.0,-0.617172,-1.175937


In [219]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [220]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [221]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=10, multi_class='multinomial', random_state=42)
Best results 0.4344444444444445
Best params {'C': 10, 'class_weight': None}
accuracy (mean, std) 0.4333333333333333 0.13333333333333333
f1 (mean, std) 0.4344444444444445 0.11223872265932792
balanced accuracy (mean, std) 0.4333333333333333 0.12372809695177825
precision (mean, std) 0.5055555555555556 0.12717247935843998
recall (mean, std) 0.4333333333333333 0.13333333333333333

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | el

In [222]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.45087301587301587,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 1},
 'mean_test_f1_score': 0.45087301587301587,
 'std_test_f1_score': 0.18294923000902993,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.19436506316151,
 'mean_test_balanced_accuracy_score': 0.4777777777777777,
 'std_test_balanced_accuracy_score': 0.1670366264263656,
 'mean_test_precision': 0.43666666666666665,
 'std_test_precision': 0.1890064993330215,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.19436506316151}