In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib
from util import runGridSearchClassifiers

## Loading data

In [2]:
directory_path = '../../04_-_Dev/videos'
features = 'emobase_eGeMAPS'

In [3]:
df_total = pd.read_pickle(directory_path + '/audio_' + features + '_data.p')

In [4]:
df_total.shape

(2573581, 96)

In [5]:
df_total.video_name.nunique()

30

In [6]:
df_total[df_total.isna().any(axis=1)]

Unnamed: 0,frameIndex,frameTime,pcm_intensity_sma,pcm_loudness_sma,mfcc_sma[1],mfcc_sma[2],mfcc_sma[3],mfcc_sma[4],mfcc_sma[5],mfcc_sma[6],...,F2frequency_sma3nz_de,F2amplitudeLogRelF0_sma3nz_de,F3frequency_sma3nz_de,F3amplitudeLogRelF0_sma3nz_de,video_name,stress_global,type_candidat,sexe,stress,diapo


## Data processing

In [7]:
time_window = 5
df_total['frameTimeWindow'] = df_total.frameTime.apply(lambda x : np.floor(x / time_window) * time_window).astype(int)

In [4]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

In [9]:
# 5 seconds windows
X = df_total.iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]

In [10]:
y = df_total.iloc[:,3:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'min'}).iloc[:,-1]

In [11]:
X.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

In [12]:
from scipy.stats import kurtosis, skew
X_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]
y_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'mean'}).iloc[:,-1]

In [13]:
X_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

## Modèles

In [5]:
X = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

X_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

In [6]:
X = X.fillna(0)
X_audio = X_audio.fillna(0)

## Stress par diapos
### All diapos

On prédit le stress par time window de 5s avec un Random Forest

In [7]:
diapo_selection = '_all'

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [100, 150, 200, 250, 300], 'max_depth':[10, 15, 20, 25,30], 'class_weight':[None,'balanced']}
                ]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [29]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   42.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min


KeyboardInterrupt: 

In [19]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[ 660, 1243,  161,    0],
       [ 802, 1692,   96,    0],
       [ 324,  316,   13,    0],
       [   4,   16,    0,    0]])

In [8]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict


Unnamed: 0,video_name,diapo,frameTimeWindow,ypredict
0,Test_pour_AFPA,1,0,2
1,Test_pour_AFPA,1,5,2
2,Test_pour_AFPA,1,10,0
3,Test_pour_AFPA,1,15,2
4,Test_pour_AFPA,1,20,2
...,...,...,...,...
5322,WIN_20210417_14_53_12_Pro,18,600,1
5323,WIN_20210417_14_53_12_Pro,18,605,1
5324,WIN_20210417_14_53_12_Pro,18,610,1
5325,WIN_20210417_14_53_12_Pro,18,615,0


#### En utilisant la proportion des prédictions 0, 1 et 2

In [9]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.147059,0.000000,0.852941
Test_pour_AFPA,8,0.100000,0.700000,0.200000
Test_pour_AFPA,9,0.000000,0.600000,0.400000
Test_pour_AFPA,10,0.133333,0.666667,0.200000
Test_pour_AFPA,11,0.000000,0.700000,0.300000
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.428571,0.571429,0.000000
WIN_20210417_14_53_12_Pro,11,0.600000,0.400000,0.000000
WIN_20210417_14_53_12_Pro,12,0.181818,0.818182,0.000000
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.000000


In [10]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,0.0
238,WIN_20210417_14_53_12_Pro,17,1.0


In [11]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [12]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.147059,0.000000,0.852941
Test_pour_AFPA,8,0.100000,0.700000,0.200000
Test_pour_AFPA,9,0.000000,0.600000,0.400000
Test_pour_AFPA,10,0.133333,0.666667,0.200000
Test_pour_AFPA,11,0.000000,0.700000,0.300000
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.428571,0.571429,0.000000
WIN_20210417_14_53_12_Pro,11,0.600000,0.400000,0.000000
WIN_20210417_14_53_12_Pro,12,0.181818,0.818182,0.000000
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.000000


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [22]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.3443129093129092
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4708333333333333 0.25552913510769937
f1 (mean, std) 0.3443129093129092 0.26764425176071577
balanced accuracy (mean, std) 0.4583333333333333 0.1070436048222094
precision (mean, std) 0.2869791666666667 0.25321768279184653
recall (mean, std) 0.4708333333333333 0.25552913510769937

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:    8.

In [23]:
best_result

{'best_estimator': RandomForestClassifier(max_depth=15, n_estimators=50, n_jobs=-1,
                        random_state=42),
 'best_score': 0.5178600566100565,
 'best_params': {'class_weight': None, 'max_depth': 15, 'n_estimators': 50},
 'mean_test_f1_score': 0.5178600566100565,
 'std_test_f1_score': 0.18568798313383672,
 'mean_test_accuracy_score': 0.5125,
 'std_test_accuracy_score': 0.18638557705287537,
 'mean_test_balanced_accuracy_score': 0.4998544973544974,
 'std_test_balanced_accuracy_score': 0.2178320231885538,
 'mean_test_precision': 0.625203373015873,
 'std_test_precision': 0.23529179309121134,
 'mean_test_recall': 0.5125,
 'std_test_recall': 0.18638557705287537}

In [24]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')

#### En aggrégeant les prédicitions des diapos

In [30]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [31]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,diapo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Test_pour_AFPA,1,1.705882,0,2,2.0,0.718981,2.0,2.0,1.972414,-1.993092
Test_pour_AFPA,8,1.100000,0,2,1.0,0.567646,1.0,1.0,0.302021,0.076839
Test_pour_AFPA,9,1.400000,1,2,1.0,0.502625,1.0,2.0,-1.833333,0.408248
Test_pour_AFPA,10,1.066667,0,2,1.0,0.593617,1.0,1.0,-0.001096,0.003142
Test_pour_AFPA,11,1.300000,1,2,1.0,0.470162,1.0,2.0,-1.238095,0.872872
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.571429,0,1,1.0,0.513553,0.0,1.0,-1.916667,-0.288675
WIN_20210417_14_53_12_Pro,11,0.400000,0,1,0.0,0.502625,0.0,1.0,-1.833333,0.408248
WIN_20210417_14_53_12_Pro,12,0.818182,0,1,1.0,0.391675,1.0,1.0,0.722222,-1.649916
WIN_20210417_14_53_12_Pro,17,0.428571,0,1,0.0,0.534522,0.0,1.0,-1.916667,0.288675


In [32]:
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [33]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)
best_result

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    6.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    6.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=10, multi_class='multinomial', random_state=42)
Best results 0.44811167998667994
Best params {'C': 10, 'class_weight': None}
accuracy (mean, std) 0.475 0.21984843263788198
f1 (mean, std) 0.44811167998667994 0.2421748133536334
balanced accuracy (mean, std) 0.5155952380952381 0.16941256995469542
precision (mean, std) 0.5531498015873015 0.3002456518663868
recall (mean, std) 0.475 0.21984843263788198

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Paralle

{'best_estimator': RandomForestClassifier(max_depth=4, n_estimators=50, n_jobs=-1, random_state=42),
 'best_score': 0.5041521441521442,
 'best_params': {'class_weight': None, 'max_depth': 4, 'n_estimators': 50},
 'mean_test_f1_score': 0.5041521441521442,
 'std_test_f1_score': 0.2329587737151845,
 'mean_test_accuracy_score': 0.525,
 'std_test_accuracy_score': 0.19737865470545018,
 'mean_test_balanced_accuracy_score': 0.5563888888888889,
 'std_test_balanced_accuracy_score': 0.18285070552629498,
 'mean_test_precision': 0.6120982142857143,
 'std_test_precision': 0.2999199750510037,
 'mean_test_recall': 0.525,
 'std_test_recall': 0.19737865470545018}

In [34]:
best_result

{'best_estimator': RandomForestClassifier(max_depth=4, n_estimators=50, n_jobs=-1, random_state=42),
 'best_score': 0.5041521441521442,
 'best_params': {'class_weight': None, 'max_depth': 4, 'n_estimators': 50},
 'mean_test_f1_score': 0.5041521441521442,
 'std_test_f1_score': 0.2329587737151845,
 'mean_test_accuracy_score': 0.525,
 'std_test_accuracy_score': 0.19737865470545018,
 'mean_test_balanced_accuracy_score': 0.5563888888888889,
 'std_test_balanced_accuracy_score': 0.18285070552629498,
 'mean_test_precision': 0.6120982142857143,
 'std_test_precision': 0.2999199750510037,
 'mean_test_recall': 0.525,
 'std_test_recall': 0.19737865470545018}

In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[44, 57,  0,  0],
       [31, 80,  2,  0],
       [ 5, 18,  2,  0],
       [ 0,  1,  0,  0]])

In [37]:
df_ypredict

Unnamed: 0,"(video_name, )","(diapo, )",ypredict
0,Test_pour_AFPA,1,0.0
1,Test_pour_AFPA,8,0.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,1.0
238,WIN_20210417_14_53_12_Pro,17,0.0


## Stress global

### En utilisant le stress prédit des diapos

In [8]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [9]:
ypredict_stress_diapo

diapo,1,8,9,10,11,12,17,18
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Test_pour_AFPA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Video_1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
WIN_20210323_19_17_40_Pro,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
WIN_20210329_10_16_02_Pro,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
WIN_20210331_21_22_52_Pro,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
WIN_20210402_14_27_50_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0
WIN_20210403_18_49_15_Pro,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
WIN_20210404_10_58_27_Pro,0.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0


In [10]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
df_annotations_stress

Unnamed: 0_level_0,1,8,9,10,11,12,17,18,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
Xy = ypredict_stress_diapo.merge(df_annotations_stress.iloc[:,-1],how='inner',on='video_name')
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

#groups = X.reset_index()['video_name']
#loo = LeaveOneGroupOut()
#cv_loo = loo.split(X, y, groups)

In [13]:
best_result, y_predict, y_predict_proba, result_list = runGridSearchClassifiers(X, y, 5, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=10, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.45
Best params {'C': 10, 'class_weight': 'balanced'}
accuracy (mean, std) 0.4333333333333333 0.13333333333333333
f1 (mean, std) 0.45 0.12202003478482085
balanced accuracy (mean, std) 0.4 0.1625415426480866
precision (mean, std) 0.5444444444444445 0.13788526273323173
recall (mean, std) 0.4333333333333333 0.13333333333333333

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 385 out of 400 | elapsed:    1.5

In [14]:
best_result

{'best_estimator': RandomForestClassifier(max_depth=4, n_estimators=200, n_jobs=-1,
                        random_state=42),
 'best_score': 0.5174603174603175,
 'best_params': {'class_weight': None, 'max_depth': 4, 'n_estimators': 200},
 'mean_test_f1_score': 0.5174603174603175,
 'std_test_f1_score': 0.11860871473413895,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.12472191289246472,
 'mean_test_balanced_accuracy_score': 0.4888888888888888,
 'std_test_balanced_accuracy_score': 0.16996731711975951,
 'mean_test_precision': 0.5877777777777778,
 'std_test_precision': 0.16126046716912656,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.12472191289246472}

In [32]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global' + diapo_selection + '.csv')

In [26]:
# Saving predict proba
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(proba, columns=['stress_global_proba_0','stress_global_proba_1','stress_global_proba_2'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global_proba' + diapo_selection + '.csv')

#### Autre méthode

### En utilisant le stress prédit des time windows 5s

In [46]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.15873,0.296296,0.544974
Video_1,0.357616,0.642384,0.0
WIN_20210323_19_17_40_Pro,0.392857,0.607143,0.0
WIN_20210329_10_16_02_Pro,0.034014,0.965986,0.0
WIN_20210330_13_10_29_Pro,0.059603,0.933775,0.006623
WIN_20210331_21_22_52_Pro,0.258824,0.741176,0.0
WIN_20210402_14_27_50_Pro,0.016129,0.983871,0.0
WIN_20210402_19_04_53_Pro,0.92,0.08,0.0
WIN_20210403_18_49_15_Pro,0.248619,0.751381,0.0
WIN_20210404_10_58_27_Pro,0.705,0.19,0.105


In [47]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [48]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [50]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.4
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4 0.4898979485566357
f1 (mean, std) 0.4 0.4898979485566357
balanced accuracy (mean, std) 0.4 0.4898979485566357
precision (mean, std) 0.4 0.4898979485566357
recall (mean, std) 0.4 0.4898979485566357

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:    6.5s finished
[Parallel(n_jobs=-

In [51]:
best_result

{'best_estimator': KNeighborsClassifier(n_neighbors=11),
 'best_score': 0.43333333333333335,
 'best_params': {'n_neighbors': 11, 'p': 2, 'weights': 'uniform'},
 'mean_test_f1_score': 0.43333333333333335,
 'std_test_f1_score': 0.49553562491061676,
 'mean_test_accuracy_score': 0.43333333333333335,
 'std_test_accuracy_score': 0.49553562491061676,
 'mean_test_balanced_accuracy_score': 0.43333333333333335,
 'std_test_balanced_accuracy_score': 0.49553562491061676,
 'mean_test_precision': 0.43333333333333335,
 'std_test_precision': 0.49553562491061676,
 'mean_test_recall': 0.43333333333333335,
 'std_test_recall': 0.49553562491061676}

#### Autre méthode

In [52]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict = df_ypredict.groupby(['video_name']).agg({'ypredict': ['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,1.386243,0.0,2.0,2.0,0.746639,1.0,2.0,-0.820796,-0.760219
Video_1,0.642384,0.0,1.0,1.0,0.480893,0.0,1.0,-1.647003,-0.594136
WIN_20210323_19_17_40_Pro,0.607143,0.0,1.0,1.0,0.489846,0.0,1.0,-1.807487,-0.438763
WIN_20210329_10_16_02_Pro,0.965986,0.0,1.0,1.0,0.181884,1.0,1.0,24.435211,-5.141518
WIN_20210330_13_10_29_Pro,0.94702,0.0,2.0,1.0,0.252668,1.0,1.0,10.946044,-2.676898
WIN_20210331_21_22_52_Pro,0.741176,0.0,1.0,1.0,0.439282,0.0,1.0,-0.787157,-1.101291
WIN_20210402_14_27_50_Pro,0.983871,0.0,1.0,1.0,0.126312,1.0,1.0,57.016393,-7.682213
WIN_20210402_19_04_53_Pro,0.08,0.0,1.0,0.0,0.272072,0.0,0.0,7.586957,3.096281
WIN_20210403_18_49_15_Pro,0.751381,0.0,1.0,1.0,0.433411,1.0,1.0,-0.646895,-1.16323
WIN_20210404_10_58_27_Pro,0.4,0.0,2.0,0.0,0.672504,0.0,1.0,0.608889,1.411207


In [53]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [55]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='accuracy_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4666666666666666
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4666666666666666 0.1247219128924647
f1 (mean, std) 0.36388888888888893 0.1634785785777974
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.5s finished


In [56]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    4.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.05, multi_class='multinomial', random_state=42)
Best results 0.4666666666666667
Best params {'C': 0.05, 'class_weight': None}
accuracy (mean, std) 0.4666666666666667 0.49888765156985887
f1 (mean, std) 0.4666666666666667 0.49888765156985887
balanced accuracy (mean, std) 0.4666666666666667 0.49888765156985887
precision (mean, std) 0.4666666666666667 0.49888765156985887
recall (mean, std) 0.4666666666666667 0.49888765156985887

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks   

In [57]:
best_result

{'best_estimator': LogisticRegression(C=0.05, multi_class='multinomial', random_state=42),
 'best_score': 0.4666666666666667,
 'best_params': {'C': 0.05, 'class_weight': None},
 'mean_test_f1_score': 0.4666666666666667,
 'std_test_f1_score': 0.49888765156985887,
 'mean_test_accuracy_score': 0.4666666666666667,
 'std_test_accuracy_score': 0.49888765156985887,
 'mean_test_balanced_accuracy_score': 0.4666666666666667,
 'std_test_balanced_accuracy_score': 0.49888765156985887,
 'mean_test_precision': 0.4666666666666667,
 'std_test_precision': 0.49888765156985887,
 'mean_test_recall': 0.4666666666666667,
 'std_test_recall': 0.49888765156985887}

In [58]:
df_ypredict_stress_global = pd.concat([ypredict_stress_diapo.reset_index(), pd.DataFrame(y_predict,columns=['predicted_stress_global'])], axis=1) 
df_ypredict_stress_global = df_ypredict_stress_global.set_index('video_name').sort_index()
df_ypredict_stress_global = df_ypredict_stress_global.iloc[:,-1]

In [59]:
df_ypredict_stress_global

video_name
Test_pour_AFPA                     0.0
Video_1                            0.0
WIN_20210323_19_17_40_Pro          0.0
WIN_20210329_10_16_02_Pro          1.0
WIN_20210330_13_10_29_Pro          1.0
WIN_20210331_21_22_52_Pro          0.0
WIN_20210402_14_27_50_Pro          1.0
WIN_20210402_19_04_53_Pro          2.0
WIN_20210403_18_49_15_Pro          0.0
WIN_20210404_10_58_27_Pro          0.0
WIN_20210404_21_41_12_Pro          0.0
WIN_20210405_15_09_16_Pro          0.0
WIN_20210406_15_06_15_Pro          0.0
WIN_20210406_18_35_52_Pro          0.0
WIN_20210406_18_49_10_Pro          0.0
WIN_20210406_21_05_52_Pro          0.0
WIN_20210407_09_04_05_Pro          0.0
WIN_20210407_14_54_56_Pro_edit2    0.0
WIN_20210408_11_48_58_Pro          0.0
WIN_20210408_14_00_44_Pro          0.0
WIN_20210408_14_02_19_Pro          0.0
WIN_20210408_14_11_32_Pro          0.0
WIN_20210408_15_20_51_Pro          0.0
WIN_20210408_16_04_32_Pro          0.0
WIN_20210409_10_26_11_Pro          0.0
WIN_20210413_1

### Audios diapos only

In [33]:
diapo_selection = '_audio_only'
diapo_audio_list = [8,9,10,11,17]
X = X_audio
y = y_audio

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [100, 150, 200], 'max_depth':[10, 15, 20, 25], 'class_weight':[None,'balanced']}
                ]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [62]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 24 candidates, totalling 720 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 14.7min finished
Best estimator RandomForestClassifier(max_depth=20, n_estimators=150, n_jobs=-1,
                       random_state=42)
Best results 0.485113011090835
Best params {'class_weight': None, 'max_depth': 20, 'n_estimators': 150}
accuracy (mean, std) 0.47611243309680507 0.2485920047305685
f1 (mean, std) 0.485113011090835 0.2896991582646307
balanced accuracy (mean, std) 0.4988648209525646 0.18131233940080954
precision (mean, std) 0.6586459986990544 0.32319878584073297
recall (mean, std) 0.47611243309680507 0.2485920047305685



In [63]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')

In [64]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[490, 440,   8,   0],
       [388, 529,  58,   0],
       [ 98, 123,  13,   0],
       [  6,  14,   0,   0]])

In [35]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict


Unnamed: 0,video_name,diapo,frameTimeWindow,ypredict
0,Test_pour_AFPA,8,170,1
1,Test_pour_AFPA,8,175,1
2,Test_pour_AFPA,8,180,1
3,Test_pour_AFPA,8,185,1
4,Test_pour_AFPA,8,190,1
...,...,...,...,...
2162,WIN_20210417_14_53_12_Pro,17,490,1
2163,WIN_20210417_14_53_12_Pro,17,495,0
2164,WIN_20210417_14_53_12_Pro,17,500,1
2165,WIN_20210417_14_53_12_Pro,17,505,0


#### En utilisant la proportion des prédictions 0, 1 et 2

In [36]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,8,0.000000,1.000000,0.0
Test_pour_AFPA,9,0.050000,0.950000,0.0
Test_pour_AFPA,10,0.000000,1.000000,0.0
Test_pour_AFPA,11,0.000000,1.000000,0.0
Test_pour_AFPA,17,0.000000,1.000000,0.0
...,...,...,...,...
WIN_20210417_14_53_12_Pro,8,0.909091,0.090909,0.0
WIN_20210417_14_53_12_Pro,9,0.857143,0.142857,0.0
WIN_20210417_14_53_12_Pro,10,0.714286,0.285714,0.0
WIN_20210417_14_53_12_Pro,11,0.800000,0.200000,0.0


In [37]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress = df_annotations_stress[df_annotations_stress.diapo.isin(diapo_audio_list)]
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
6,Test_pour_AFPA,17,0.0
...,...,...,...
233,WIN_20210417_14_53_12_Pro,8,0.0
234,WIN_20210417_14_53_12_Pro,9,0.0
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0


In [38]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [39]:
y

0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
145    0.0
146    0.0
147    0.0
148    0.0
149    1.0
Name: stress, Length: 150, dtype: float64

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [41]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    5.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=4, multi_class='multinomial', random_state=42)
Best results 0.4264285714285713
Best params {'C': 4, 'class_weight': None}
accuracy (mean, std) 0.46 0.31474857690967667
f1 (mean, std) 0.4264285714285713 0.3418098440854673
balanced accuracy (mean, std) 0.4869444444444444 0.2804484118842449
precision (mean, std) 0.4814444444444445 0.4037295421564745
recall (mean, std) 0.46 0.31474857690967667

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 2274 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 2400 

In [42]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=3)),
                 ('knn', KNeighborsClassifier(n_neighbors=15, p=1))]),
 'best_score': 0.5453703703703704,
 'best_params': {'knn__n_neighbors': 15,
  'knn__p': 1,
  'knn__weights': 'uniform',
  'pca__n_components': 3},
 'mean_test_f1_score': 0.5453703703703704,
 'std_test_f1_score': 0.31679454548429153,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.30258148581093913,
 'mean_test_balanced_accuracy_score': 0.5214814814814814,
 'std_test_balanced_accuracy_score': 0.31570185488795943,
 'mean_test_precision': 0.6301111111111111,
 'std_test_precision': 0.3579618056734372,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.30258148581093913}

In [43]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')

#### En aggrégeant les prédictions des diapos

In [73]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [74]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [75]:
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [76]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)
best_result

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    5.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    5.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.5155820105820106
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.5133333333333333 0.28604583937233247
f1 (mean, std) 0.5155820105820106 0.29546255643146463
balanced accuracy (mean, std) 0.502962962962963 0.2901919803501358
precision (mean, std) 0.6372222222222224 0.3567228235472272
recall (mean, std) 0.5133333333333333 0.28604583937233247

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.5328306878306878,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 2},
 'mean_test_f1_score': 0.5328306878306878,
 'std_test_f1_score': 0.31099281598143175,
 'mean_test_accuracy_score': 0.5399999999999999,
 'std_test_accuracy_score': 0.2973213749463701,
 'mean_test_balanced_accuracy_score': 0.5296296296296297,
 'std_test_balanced_accuracy_score': 0.28950729158224436,
 'mean_test_precision': 0.6267777777777778,
 'std_test_precision': 0.3520453288715061,
 'mean_test_recall': 0.5399999999999999,
 'std_test_recall': 0.2973213749463701}

In [77]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.5328306878306878,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 2},
 'mean_test_f1_score': 0.5328306878306878,
 'std_test_f1_score': 0.31099281598143175,
 'mean_test_accuracy_score': 0.5399999999999999,
 'std_test_accuracy_score': 0.2973213749463701,
 'mean_test_balanced_accuracy_score': 0.5296296296296297,
 'std_test_balanced_accuracy_score': 0.28950729158224436,
 'mean_test_precision': 0.6267777777777778,
 'std_test_precision': 0.3520453288715061,
 'mean_test_recall': 0.5399999999999999,
 'std_test_recall': 0.2973213749463701}

In [78]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[56, 12,  0,  0],
       [41, 25,  0,  0],
       [11,  4,  0,  0],
       [ 0,  1,  0,  0]])

In [80]:
df_ypredict

Unnamed: 0,video_name,diapo,ypredict
0,Test_pour_AFPA,8,0.0
1,Test_pour_AFPA,9,1.0
2,Test_pour_AFPA,10,0.0
3,Test_pour_AFPA,11,0.0
4,Test_pour_AFPA,17,0.0
...,...,...,...
145,WIN_20210417_14_53_12_Pro,8,0.0
146,WIN_20210417_14_53_12_Pro,9,0.0
147,WIN_20210417_14_53_12_Pro,10,0.0
148,WIN_20210417_14_53_12_Pro,11,0.0


## Stress global

### En utilisant le stress prédit des diapos

In [44]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [45]:
ypredict_stress_diapo

diapo,8,9,10,11,17
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test_pour_AFPA,0.0,1.0,0.0,0.0,0.0
Video_1,0.0,0.0,0.0,0.0,1.0
WIN_20210323_19_17_40_Pro,0.0,0.0,1.0,1.0,0.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,0.0,0.0,0.0,0.0,0.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,0.0,0.0
WIN_20210403_18_49_15_Pro,1.0,0.0,0.0,0.0,1.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0


In [46]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress

Unnamed: 0,video_name,1,8,9,10,11,12,17,18,stress_global
0,Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
6,WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
7,WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
8,WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
9,WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [47]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
diapo_audio_list.append('stress_global')
df_annotations_stress = df_annotations_stress[[str(diapo) for diapo in diapo_audio_list]]
diapo_audio_list.pop()
df_annotations_stress


Unnamed: 0_level_0,8,9,10,11,17,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test_pour_AFPA,1.0,0.0,0.0,0.0,0.0,1.0
Video_1,0.0,0.0,0.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,0.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,1.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,1.0,1.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,2.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,1.0,1.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,2.0,2.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0,1.0


In [48]:
Xy = ypredict_stress_diapo.merge(df_annotations_stress.iloc[:,-1],how='inner',on='video_name')
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]

In [49]:
Xy

Unnamed: 0_level_0,8,9,10,11,17,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test_pour_AFPA,0.0,1.0,0.0,0.0,0.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,0.0
WIN_20210323_19_17_40_Pro,0.0,0.0,1.0,1.0,0.0,1.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,0.0,0.0,1.0,1.0
WIN_20210402_14_27_50_Pro,0.0,0.0,0.0,0.0,0.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,0.0,0.0,2.0
WIN_20210403_18_49_15_Pro,1.0,0.0,0.0,0.0,1.0,2.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0,1.0


In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [51]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=3, multi_class='multinomial', random_state=42)
Best results 0.4666666666666667
Best params {'C': 3, 'class_weight': None}
accuracy (mean, std) 0.4666666666666667 0.49888765156985887
f1 (mean, std) 0.4666666666666667 0.49888765156985887
balanced accuracy (mean, std) 0.4666666666666667 0.49888765156985887
precision (mean, std) 0.4666666666666667 0.49888765156985887
recall (mean, std) 0.4666666666666667 0.49888765156985887

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    4.1s
[P

In [52]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('logistic',
                  LogisticRegression(C=0.01, class_weight='balanced',
                                     multi_class='multinomial'))]),
 'best_score': 0.5333333333333333,
 'best_params': {'logistic__C': 0.01,
  'logistic__class_weight': 'balanced',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.5333333333333333,
 'std_test_f1_score': 0.49888765156985887,
 'mean_test_accuracy_score': 0.5333333333333333,
 'std_test_accuracy_score': 0.49888765156985887,
 'mean_test_balanced_accuracy_score': 0.5333333333333333,
 'std_test_balanced_accuracy_score': 0.49888765156985887,
 'mean_test_precision': 0.5333333333333333,
 'std_test_precision': 0.49888765156985887,
 'mean_test_recall': 0.5333333333333333,
 'std_test_recall': 0.49888765156985887}

In [90]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global' + diapo_selection + '.csv')

#### Autre méthode

### En utilisant le stress prédit des time windows 5s

In [91]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.014085,0.985915,0.0
Video_1,0.694444,0.305556,0.0
WIN_20210323_19_17_40_Pro,0.472222,0.527778,0.0
WIN_20210329_10_16_02_Pro,0.027778,0.972222,0.0
WIN_20210330_13_10_29_Pro,0.0,1.0,0.0
WIN_20210331_21_22_52_Pro,0.084507,0.915493,0.0
WIN_20210402_14_27_50_Pro,0.013889,0.027778,0.958333
WIN_20210402_19_04_53_Pro,0.333333,0.666667,0.0
WIN_20210403_18_49_15_Pro,0.25,0.75,0.0
WIN_20210404_10_58_27_Pro,0.246575,0.753425,0.0


In [92]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [93]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [95]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=2, multi_class='multinomial', random_state=42)
Best results 0.43333333333333335
Best params {'C': 2, 'class_weight': None}
accuracy (mean, std) 0.43333333333333335 0.49553562491061676
f1 (mean, std) 0.43333333333333335 0.49553562491061676
balanced accuracy (mean, std) 0.43333333333333335 0.49553562491061676
precision (mean, std) 0.43333333333333335 0.49553562491061676
recall (mean, std) 0.43333333333333335 0.49553562491061676

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:    5

In [96]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn', KNeighborsClassifier(p=1, weights='distance'))]),
 'best_score': 0.5666666666666667,
 'best_params': {'knn__n_neighbors': 5,
  'knn__p': 1,
  'knn__weights': 'distance',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.5666666666666667,
 'std_test_f1_score': 0.4955356249106168,
 'mean_test_accuracy_score': 0.5666666666666667,
 'std_test_accuracy_score': 0.4955356249106168,
 'mean_test_balanced_accuracy_score': 0.5666666666666667,
 'std_test_balanced_accuracy_score': 0.4955356249106168,
 'mean_test_precision': 0.5666666666666667,
 'std_test_precision': 0.4955356249106168,
 'mean_test_recall': 0.5666666666666667,
 'std_test_recall': 0.4955356249106168}

#### Autre méthode

In [97]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict = df_ypredict.groupby(['video_name']).agg({'ypredict': ['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,0.985915,0.0,1.0,1.0,0.118678,1.0,1.0,66.014286,-8.247077
Video_1,0.305556,0.0,1.0,0.0,0.463875,0.0,1.0,-1.287273,0.844232
WIN_20210323_19_17_40_Pro,0.527778,0.0,1.0,1.0,0.502731,0.0,1.0,-1.987616,-0.111283
WIN_20210329_10_16_02_Pro,0.972222,0.0,1.0,1.0,0.165489,1.0,1.0,31.028571,-5.747049
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,1.0,0.0,1.0,1.0,-3.0,0.0
WIN_20210331_21_22_52_Pro,0.915493,0.0,1.0,1.0,0.280126,1.0,1.0,6.925641,-2.987581
WIN_20210402_14_27_50_Pro,1.944444,0.0,2.0,2.0,0.285267,2.0,2.0,31.264793,-5.513881
WIN_20210402_19_04_53_Pro,0.666667,0.0,1.0,1.0,0.474713,0.0,1.0,-1.5,-0.707107
WIN_20210403_18_49_15_Pro,0.75,0.0,1.0,1.0,0.436051,0.75,1.0,-0.666667,-1.154701
WIN_20210404_10_58_27_Pro,0.753425,0.0,1.0,1.0,0.434,1.0,1.0,-0.617172,-1.175937


In [98]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [99]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [100]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    3.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.4666666666666667
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4666666666666667 0.49888765156985887
f1 (mean, std) 0.4666666666666667 0.49888765156985887
balanced accuracy (mean, std) 0.4666666666666667 0.49888765156985887
precision (mean, std) 0.4666666666666667 0.49888765156985887
recall (mean, std) 0.4666666666666667 0.49888765156985887

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks   

In [101]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn',
                  KNeighborsClassifier(n_neighbors=7, p=1, weights='distance'))]),
 'best_score': 0.5666666666666667,
 'best_params': {'knn__n_neighbors': 7,
  'knn__p': 1,
  'knn__weights': 'distance',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.5666666666666667,
 'std_test_f1_score': 0.4955356249106168,
 'mean_test_accuracy_score': 0.5666666666666667,
 'std_test_accuracy_score': 0.4955356249106168,
 'mean_test_balanced_accuracy_score': 0.5666666666666667,
 'std_test_balanced_accuracy_score': 0.4955356249106168,
 'mean_test_precision': 0.5666666666666667,
 'std_test_precision': 0.4955356249106168,
 'mean_test_recall': 0.5666666666666667,
 'std_test_recall': 0.4955356249106168}