In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib
from util import runGridSearchClassifiers

## Loading data

In [2]:
directory_path = '../../04_-_Dev/videos'
features = 'eGeMAPS'

In [3]:
df_total = pd.read_pickle(directory_path + '/audio_' + features + '_data.p')

In [4]:
df_total.shape

(2573581, 77)

In [5]:
df_total.video_name.nunique()

30

In [6]:
df_total[df_total.isna().any(axis=1)]

Unnamed: 0,frameIndex,frameTime,Loudness_sma3,alphaRatio_sma3,hammarbergIndex_sma3,slope0-500_sma3,slope500-1500_sma3,spectralFlux_sma3,mfcc1_sma3,mfcc2_sma3,...,F3frequency_sma3nz_de,F3frequency_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de,F3amplitudeLogRelF0_sma3nz_de_de,video_name,stress_global,type_candidat,sexe,stress,diapo


## Data processing

In [7]:
time_window = 5
df_total['frameTimeWindow'] = df_total.frameTime.apply(lambda x : np.floor(x / time_window) * time_window).astype(int)

In [22]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

In [9]:
# 5 seconds windows
X = df_total.iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]

In [10]:
y = df_total.iloc[:,3:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'min'}).iloc[:,-1]

In [11]:
X.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

In [12]:
from scipy.stats import kurtosis, skew
X_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]
y_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'mean'}).iloc[:,-1]

In [13]:
X_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

## Modèles

In [4]:
X = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

X_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

In [5]:
X = X.fillna(0)
X_audio = X_audio.fillna(0)

## Stress par diapos
### All diapos

In [16]:
diapo_selection = '_all'

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [100, 150, 200, 250, 300], 'max_depth':[10, 15, 20, 25,30], 'class_weight':[None,'balanced']}
                ]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [18]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 45.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 71.5min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 89.8min finished
Best estimator RandomForestClassifier(class_weight='balanced', max_depth=25, n_estimators=200,
                       n_jobs=-1, random_state=42)
Best results 0.4554559730379107
Best params {'class_weight': 'balanced', 'max_depth': 25, 'n_estimators': 200}
accuracy (mean, std) 0.4675823371167479 0.18079681975508957
f1 (mean, std) 0.4554559730379107 0.19441588225359313
balanced accuracy (mean, std) 0.4378091650738796 0.1378966434725613
precision (mean, std) 0.5619061329889

In [19]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[ 706, 1261,   97,    0],
       [ 757, 1763,   70,    0],
       [ 175,  469,    9,    0],
       [  13,    7,    0,    0]])

In [21]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


In [22]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Loudness_sma3,Loudness_sma3,Loudness_sma3,Loudness_sma3,Loudness_sma3,Loudness_sma3,Loudness_sma3,Loudness_sma3,Loudness_sma3,alphaRatio_sma3,...,F3amplitudeLogRelF0_sma3nz_de,F3amplitudeLogRelF0_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de_de
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,std,min,skew,percentil75,mean,median,kurtosis,max,percentil25,std,...,percentil25,std,min,skew,percentil75,mean,median,kurtosis,max,percentil25
video_name,diapo,frameTimeWindow,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Test_pour_AFPA,1,0,0.002580,0.108240,0.012275,0.116547,0.114774,0.114900,0.022023,0.123974,0.112972,0.644744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
Test_pour_AFPA,1,5,0.002815,0.109675,0.713457,0.117393,0.115736,0.115328,1.101698,0.127661,0.113882,0.576271,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
Test_pour_AFPA,1,10,0.003511,0.105259,0.917789,0.115056,0.113579,0.113245,1.637765,0.125830,0.111469,0.664997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
Test_pour_AFPA,1,15,0.002994,0.110818,0.307721,0.119927,0.118081,0.117924,0.320136,0.128375,0.116204,0.604470,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
Test_pour_AFPA,1,20,0.003337,0.108863,-0.395910,0.121091,0.118637,0.119101,-0.354434,0.126731,0.116397,0.621389,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,18,600,0.036212,0.001034,1.222678,0.051879,0.032633,0.017399,0.763651,0.183413,0.001687,6.452303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
WIN_20210417_14_53_12_Pro,18,605,0.018413,0.001034,0.391991,0.037869,0.024742,0.024232,-0.688585,0.078965,0.008823,6.727966,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
WIN_20210417_14_53_12_Pro,18,610,0.020800,0.001034,0.554724,0.040993,0.025769,0.025452,-0.403783,0.094043,0.006183,6.678959,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0
WIN_20210417_14_53_12_Pro,18,615,0.012051,0.001102,-0.616833,0.052893,0.045493,0.047412,0.655261,0.078696,0.038895,3.681385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0


#### En utilisant la proportion des prédictions 0, 1 et 2

In [23]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.382353,0.588235,0.029412
Test_pour_AFPA,8,0.000000,1.000000,0.000000
Test_pour_AFPA,9,0.050000,0.850000,0.100000
Test_pour_AFPA,10,0.000000,0.866667,0.133333
Test_pour_AFPA,11,0.050000,0.750000,0.200000
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.500000,0.500000,0.000000
WIN_20210417_14_53_12_Pro,11,0.500000,0.500000,0.000000
WIN_20210417_14_53_12_Pro,12,0.121212,0.878788,0.000000
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.000000


In [24]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,0.0
238,WIN_20210417_14_53_12_Pro,17,1.0


In [25]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [26]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.382353,0.588235,0.029412
Test_pour_AFPA,8,0.000000,1.000000,0.000000
Test_pour_AFPA,9,0.050000,0.850000,0.100000
Test_pour_AFPA,10,0.000000,0.866667,0.133333
Test_pour_AFPA,11,0.050000,0.750000,0.200000
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.500000,0.500000,0.000000
WIN_20210417_14_53_12_Pro,11,0.500000,0.500000,0.000000
WIN_20210417_14_53_12_Pro,12,0.121212,0.878788,0.000000
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.000000


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [28]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=3, multi_class='multinomial', random_state=42)
Best results 0.427848309098309
Best params {'C': 3, 'class_weight': None}
accuracy (mean, std) 0.4708333333333333 0.20837499583416644
f1 (mean, std) 0.427848309098309 0.22548839374589055
balanced accuracy (mean, std) 0.47157407407407415 0.18919696885075035
precision (mean, std) 0.5127480158730159 0.2806579004408552
recall (mean, std) 0.4708333333333333 0.20837499583416644

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:    7.6s
[Par

In [29]:
best_result

{'best_estimator': KNeighborsClassifier(n_neighbors=10, p=1),
 'best_score': 0.4718124468124468,
 'best_params': {'n_neighbors': 10, 'p': 1, 'weights': 'uniform'},
 'mean_test_f1_score': 0.4718124468124468,
 'std_test_f1_score': 0.24715423480439516,
 'mean_test_accuracy_score': 0.48333333333333334,
 'std_test_accuracy_score': 0.22530843057659627,
 'mean_test_balanced_accuracy_score': 0.4911640211640212,
 'std_test_balanced_accuracy_score': 0.2145744413207612,
 'mean_test_precision': 0.5790128968253968,
 'std_test_precision': 0.3078548168203973,
 'mean_test_recall': 0.48333333333333334,
 'std_test_recall': 0.22530843057659627}

#### En aggrégeant les prédicitions des diapos

In [30]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [31]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,diapo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Test_pour_AFPA,1,0.647059,0,2,1.0,0.543967,0.0,1.0,-0.881986,-0.031739
Test_pour_AFPA,8,1.000000,1,1,1.0,0.000000,1.0,1.0,-3.000000,0.000000
Test_pour_AFPA,9,1.050000,0,2,1.0,0.394034,1.0,1.0,3.537489,0.489863
Test_pour_AFPA,10,1.133333,1,2,1.0,0.351866,1.0,1.0,2.653846,2.157277
Test_pour_AFPA,11,1.150000,0,2,1.0,0.489360,1.0,1.0,0.714165,0.407794
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.500000,0,1,0.5,0.518875,0.0,1.0,-2.000000,0.000000
WIN_20210417_14_53_12_Pro,11,0.500000,0,1,0.5,0.512989,0.0,1.0,-2.000000,0.000000
WIN_20210417_14_53_12_Pro,12,0.878788,0,1,1.0,0.331434,1.0,1.0,3.387931,-2.321192
WIN_20210417_14_53_12_Pro,17,0.428571,0,1,0.0,0.534522,0.0,1.0,-1.916667,0.288675


In [32]:
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [33]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)
best_result

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    6.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    6.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.05, multi_class='multinomial', random_state=42)
Best results 0.4454970029970031
Best params {'C': 0.05, 'class_weight': None}
accuracy (mean, std) 0.4791666666666667 0.17702204821873335
f1 (mean, std) 0.4454970029970031 0.21477469725452009
balanced accuracy (mean, std) 0.501468253968254 0.15950713112686374
precision (mean, std) 0.5755902777777778 0.303154225948788
recall (mean, std) 0.4791666666666667 0.17702204821873335

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.47633468383468386,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 2},
 'mean_test_f1_score': 0.47633468383468386,
 'std_test_f1_score': 0.22897622547261173,
 'mean_test_accuracy_score': 0.5083333333333333,
 'std_test_accuracy_score': 0.1880085695446413,
 'mean_test_balanced_accuracy_score': 0.5117857142857143,
 'std_test_balanced_accuracy_score': 0.1599958515854876,
 'mean_test_precision': 0.5733978174603176,
 'std_test_precision': 0.30443782149676923,
 'mean_test_recall': 0.5083333333333333,
 'std_test_recall': 0.1880085695446413}

In [34]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=2)),
                 ('logistic',
                  LogisticRegression(C=0.5, multi_class='multinomial'))]),
 'best_score': 0.47633468383468386,
 'best_params': {'logistic__C': 0.5,
  'logistic__class_weight': None,
  'pca__n_components': 2},
 'mean_test_f1_score': 0.47633468383468386,
 'std_test_f1_score': 0.22897622547261173,
 'mean_test_accuracy_score': 0.5083333333333333,
 'std_test_accuracy_score': 0.1880085695446413,
 'mean_test_balanced_accuracy_score': 0.5117857142857143,
 'std_test_balanced_accuracy_score': 0.1599958515854876,
 'mean_test_precision': 0.5733978174603176,
 'std_test_precision': 0.30443782149676923,
 'mean_test_recall': 0.5083333333333333,
 'std_test_recall': 0.1880085695446413}

In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[33, 67,  1,  0],
       [24, 89,  0,  0],
       [ 5, 20,  0,  0],
       [ 1,  0,  0,  0]])

In [36]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')

In [37]:
df_ypredict

Unnamed: 0,"(video_name, )","(diapo, )",ypredict
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,1.0
236,WIN_20210417_14_53_12_Pro,11,1.0
237,WIN_20210417_14_53_12_Pro,12,1.0
238,WIN_20210417_14_53_12_Pro,17,1.0


## Stress global

### En utilisant le stress prédit des diapos

In [38]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [39]:
ypredict_stress_diapo

diapo,1,8,9,10,11,12,17,18
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
WIN_20210323_19_17_40_Pro,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0
WIN_20210331_21_22_52_Pro,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210404_10_58_27_Pro,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0


In [40]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
df_annotations_stress

Unnamed: 0_level_0,1,8,9,10,11,12,17,18,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [41]:
Xy = ypredict_stress_diapo.merge(df_annotations_stress.iloc[:,-1],how='inner',on='video_name')
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [43]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.4
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4 0.4898979485566357
f1 (mean, std) 0.4 0.4898979485566357
balanced accuracy (mean, std) 0.4 0.4898979485566357
precision (mean, std) 0.4 0.4898979485566357
recall (mean, std) 0.4 0.4898979485566357

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:    7.9s finished
[Parallel(n_jobs=-

In [44]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('logistic',
                  LogisticRegression(C=0.01, class_weight='balanced',
                                     multi_class='multinomial'))]),
 'best_score': 0.43333333333333335,
 'best_params': {'logistic__C': 0.01,
  'logistic__class_weight': 'balanced',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.43333333333333335,
 'std_test_f1_score': 0.49553562491061687,
 'mean_test_accuracy_score': 0.43333333333333335,
 'std_test_accuracy_score': 0.49553562491061687,
 'mean_test_balanced_accuracy_score': 0.43333333333333335,
 'std_test_balanced_accuracy_score': 0.49553562491061687,
 'mean_test_precision': 0.43333333333333335,
 'std_test_precision': 0.49553562491061687,
 'mean_test_recall': 0.43333333333333335,
 'std_test_recall': 0.49553562491061687}

In [45]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global' + diapo_selection + '.csv')

#### Autre méthode

### En utilisant le stress prédit des time windows 5s

In [46]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.206349,0.703704,0.089947
Video_1,0.668874,0.331126,0.0
WIN_20210323_19_17_40_Pro,0.369048,0.630952,0.0
WIN_20210329_10_16_02_Pro,0.020408,0.979592,0.0
WIN_20210330_13_10_29_Pro,0.231788,0.761589,0.006623
WIN_20210331_21_22_52_Pro,0.405882,0.594118,0.0
WIN_20210402_14_27_50_Pro,0.048387,0.94086,0.010753
WIN_20210402_19_04_53_Pro,0.468571,0.531429,0.0
WIN_20210403_18_49_15_Pro,0.370166,0.629834,0.0
WIN_20210404_10_58_27_Pro,0.415,0.2,0.385


In [47]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [48]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [50]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.4
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4 0.4898979485566357
f1 (mean, std) 0.4 0.4898979485566357
balanced accuracy (mean, std) 0.4 0.4898979485566357
precision (mean, std) 0.4 0.4898979485566357
recall (mean, std) 0.4 0.4898979485566357

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:    6.5s finished
[Parallel(n_jobs=-

In [51]:
best_result

{'best_estimator': LogisticRegression(C=0.01, multi_class='multinomial', random_state=42),
 'best_score': 0.4,
 'best_params': {'C': 0.01, 'class_weight': None},
 'mean_test_f1_score': 0.4,
 'std_test_f1_score': 0.4898979485566357,
 'mean_test_accuracy_score': 0.4,
 'std_test_accuracy_score': 0.4898979485566357,
 'mean_test_balanced_accuracy_score': 0.4,
 'std_test_balanced_accuracy_score': 0.4898979485566357,
 'mean_test_precision': 0.4,
 'std_test_precision': 0.4898979485566357,
 'mean_test_recall': 0.4,
 'std_test_recall': 0.4898979485566357}

#### Autre méthode

In [52]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict = df_ypredict.groupby(['video_name']).agg({'ypredict': ['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,0.883598,0.0,2.0,1.0,0.533152,1.0,1.0,0.322698,-0.107005
Video_1,0.331126,0.0,1.0,0.0,0.472184,0.0,1.0,-1.48495,0.717669
WIN_20210323_19_17_40_Pro,0.630952,0.0,1.0,1.0,0.483989,0.0,1.0,-1.705417,-0.542755
WIN_20210329_10_16_02_Pro,0.979592,0.0,1.0,1.0,0.141875,1.0,1.0,44.020833,-6.783866
WIN_20210330_13_10_29_Pro,0.774834,0.0,2.0,1.0,0.434698,1.0,1.0,-0.149888,-1.06916
WIN_20210331_21_22_52_Pro,0.594118,0.0,1.0,1.0,0.492513,0.0,1.0,-1.853064,-0.383323
WIN_20210402_14_27_50_Pro,0.962366,0.0,2.0,1.0,0.240905,1.0,1.0,13.197787,-2.239898
WIN_20210402_19_04_53_Pro,0.531429,0.0,1.0,1.0,0.500443,0.0,1.0,-1.984133,-0.125963
WIN_20210403_18_49_15_Pro,0.629834,0.0,1.0,1.0,0.484188,0.0,1.0,-1.710788,-0.537784
WIN_20210404_10_58_27_Pro,0.97,0.0,2.0,1.0,0.896167,0.0,2.0,-1.746059,0.05872


In [53]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [55]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='accuracy_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4 0.13333333333333333
f1 (mean, std) 0.28134920634920635 0.11968348700732404
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.5s finished


In [56]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    4.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.43333333333333335
Best params {'C': 0.01, 'class_weight': 'balanced'}
accuracy (mean, std) 0.43333333333333335 0.49553562491061676
f1 (mean, std) 0.43333333333333335 0.49553562491061676
balanced accuracy (mean, std) 0.43333333333333335 0.49553562491061676
precision (mean, std) 0.43333333333333335 0.49553562491061676
recall (mean, std) 0.43333333333333335 0.49553562491061676

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Pa

In [57]:
best_result

{'best_estimator': LogisticRegression(C=0.01, class_weight='balanced', multi_class='multinomial',
                    random_state=42),
 'best_score': 0.43333333333333335,
 'best_params': {'C': 0.01, 'class_weight': 'balanced'},
 'mean_test_f1_score': 0.43333333333333335,
 'std_test_f1_score': 0.49553562491061676,
 'mean_test_accuracy_score': 0.43333333333333335,
 'std_test_accuracy_score': 0.49553562491061676,
 'mean_test_balanced_accuracy_score': 0.43333333333333335,
 'std_test_balanced_accuracy_score': 0.49553562491061676,
 'mean_test_precision': 0.43333333333333335,
 'std_test_precision': 0.49553562491061676,
 'mean_test_recall': 0.43333333333333335,
 'std_test_recall': 0.49553562491061676}

In [59]:
df_ypredict_stress_global = pd.concat([ypredict_stress_diapo.reset_index(), pd.DataFrame(y_predict,columns=['predicted_stress_global'])], axis=1) 
df_ypredict_stress_global = df_ypredict_stress_global.set_index('video_name').sort_index()
df_ypredict_stress_global = df_ypredict_stress_global.iloc[:,-1]

In [60]:
df_ypredict_stress_global

video_name
Test_pour_AFPA                     0.0
Video_1                            0.0
WIN_20210323_19_17_40_Pro          0.0
WIN_20210329_10_16_02_Pro          2.0
WIN_20210330_13_10_29_Pro          0.0
WIN_20210331_21_22_52_Pro          0.0
WIN_20210402_14_27_50_Pro          2.0
WIN_20210402_19_04_53_Pro          0.0
WIN_20210403_18_49_15_Pro          0.0
WIN_20210404_10_58_27_Pro          0.0
WIN_20210404_21_41_12_Pro          0.0
WIN_20210405_15_09_16_Pro          0.0
WIN_20210406_15_06_15_Pro          1.0
WIN_20210406_18_35_52_Pro          0.0
WIN_20210406_18_49_10_Pro          0.0
WIN_20210406_21_05_52_Pro          0.0
WIN_20210407_09_04_05_Pro          2.0
WIN_20210407_14_54_56_Pro_edit2    0.0
WIN_20210408_11_48_58_Pro          2.0
WIN_20210408_14_00_44_Pro          0.0
WIN_20210408_14_02_19_Pro          0.0
WIN_20210408_14_11_32_Pro          1.0
WIN_20210408_15_20_51_Pro          0.0
WIN_20210408_16_04_32_Pro          1.0
WIN_20210409_10_26_11_Pro          1.0
WIN_20210413_1

### Audios diapos only

In [6]:
diapo_selection = '_audio_only'
diapo_audio_list = [8,9,10,11,17]
X = X_audio
y = y_audio

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut

models_list = [RandomForestClassifier(random_state = 42, n_jobs=-1)]
parameters_list = [
                {'n_estimators': [100, 150, 200], 'max_depth':[10, 15, 20, 25], 'class_weight':[None,'balanced']}
                ]
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [8]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 24 candidates, totalling 720 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 12.8min finished
Best estimator RandomForestClassifier(class_weight='balanced', max_depth=15, n_estimators=200,
                       n_jobs=-1, random_state=42)
Best results 0.473820896113761
Best params {'class_weight': 'balanced', 'max_depth': 15, 'n_estimators': 200}
accuracy (mean, std) 0.4532739690298054 0.2125814366324185
f1 (mean, std) 0.473820896113761 0.2570275449510871
balanced accuracy (mean, std) 0.4890630325639611 0.1544732352276775
precision (mean, std) 0.6719007906886402 0.307504318895477
recall (mean, std) 0.4532739690298054 0.2125814366324185



In [9]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[462, 476,   0,   0],
       [427, 507,  41,   0],
       [ 53, 168,  13,   0],
       [ 12,   8,   0,   0]])

In [11]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


#### En utilisant la proportion des prédictions 0, 1 et 2

In [13]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,8,0.100000,0.900000,0.0
Test_pour_AFPA,9,0.150000,0.850000,0.0
Test_pour_AFPA,10,0.066667,0.933333,0.0
Test_pour_AFPA,11,0.000000,1.000000,0.0
Test_pour_AFPA,17,0.000000,1.000000,0.0
...,...,...,...,...
WIN_20210417_14_53_12_Pro,8,1.000000,0.000000,0.0
WIN_20210417_14_53_12_Pro,9,0.952381,0.047619,0.0
WIN_20210417_14_53_12_Pro,10,0.857143,0.142857,0.0
WIN_20210417_14_53_12_Pro,11,0.850000,0.150000,0.0


In [14]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress = df_annotations_stress[df_annotations_stress.diapo.isin(diapo_audio_list)]
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
6,Test_pour_AFPA,17,0.0
...,...,...,...
233,WIN_20210417_14_53_12_Pro,8,0.0
234,WIN_20210417_14_53_12_Pro,9,0.0
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0


In [15]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [17]:
y

0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
145    0.0
146    0.0
147    0.0
148    0.0
149    1.0
Name: stress, Length: 150, dtype: float64

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42))])

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [19]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=4, multi_class='multinomial', random_state=42)
Best results 0.3952592592592593
Best params {'C': 4, 'class_weight': None}
accuracy (mean, std) 0.4400000000000001 0.3282275633357645
f1 (mean, std) 0.3952592592592593 0.33794627151028545
balanced accuracy (mean, std) 0.4506481481481482 0.29032733347450124
precision (mean, std) 0.4493333333333335 0.3834312918314774
recall (mean, std) 0.4400000000000001 0.3282275633357645

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:    8.1s
[Para

In [20]:
best_result

{'best_estimator': RandomForestClassifier(max_depth=4, n_estimators=200, n_jobs=-1,
                        random_state=42),
 'best_score': 0.5154285714285715,
 'best_params': {'class_weight': None, 'max_depth': 4, 'n_estimators': 200},
 'mean_test_f1_score': 0.5154285714285715,
 'std_test_f1_score': 0.28360383216536345,
 'mean_test_accuracy_score': 0.5266666666666667,
 'std_test_accuracy_score': 0.26574841903993,
 'mean_test_balanced_accuracy_score': 0.5175,
 'std_test_balanced_accuracy_score': 0.26970791814685585,
 'mean_test_precision': 0.607111111111111,
 'std_test_precision': 0.33736861182823985,
 'mean_test_recall': 0.5266666666666667,
 'std_test_recall': 0.26574841903993}

#### En aggrégeant les prédictions des diapos

In [23]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [24]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [27]:
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [28]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)
best_result

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    6.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    6.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.05, multi_class='multinomial', random_state=42)
Best results 0.4976402116402116
Best params {'C': 0.05, 'class_weight': None}
accuracy (mean, std) 0.5266666666666667 0.29431653406192154
f1 (mean, std) 0.4976402116402116 0.3176051064880893
balanced accuracy (mean, std) 0.5512962962962963 0.25825440185371007
precision (mean, std) 0.5644444444444445 0.3640739987189553
recall (mean, std) 0.5266666666666667 0.29431653406192154

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks     

{'best_estimator': RandomForestClassifier(max_depth=6, n_estimators=150, n_jobs=-1,
                        random_state=42),
 'best_score': 0.5377830687830688,
 'best_params': {'class_weight': None, 'max_depth': 6, 'n_estimators': 150},
 'mean_test_f1_score': 0.5377830687830688,
 'std_test_f1_score': 0.29325873293616445,
 'mean_test_accuracy_score': 0.5399999999999999,
 'std_test_accuracy_score': 0.2739829678404602,
 'mean_test_balanced_accuracy_score': 0.5210185185185184,
 'std_test_balanced_accuracy_score': 0.2818723072588922,
 'mean_test_precision': 0.6157777777777778,
 'std_test_precision': 0.3401186430615421,
 'mean_test_recall': 0.5399999999999999,
 'std_test_recall': 0.2739829678404602}

In [29]:
best_result

{'best_estimator': RandomForestClassifier(max_depth=6, n_estimators=150, n_jobs=-1,
                        random_state=42),
 'best_score': 0.5377830687830688,
 'best_params': {'class_weight': None, 'max_depth': 6, 'n_estimators': 150},
 'mean_test_f1_score': 0.5377830687830688,
 'std_test_f1_score': 0.29325873293616445,
 'mean_test_accuracy_score': 0.5399999999999999,
 'std_test_accuracy_score': 0.2739829678404602,
 'mean_test_balanced_accuracy_score': 0.5210185185185184,
 'std_test_balanced_accuracy_score': 0.2818723072588922,
 'mean_test_precision': 0.6157777777777778,
 'std_test_precision': 0.3401186430615421,
 'mean_test_recall': 0.5399999999999999,
 'std_test_recall': 0.2739829678404602}

In [30]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[43, 23,  2,  0],
       [27, 38,  1,  0],
       [10,  5,  0,  0],
       [ 1,  0,  0,  0]])

In [31]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')

In [32]:
df_ypredict

Unnamed: 0,video_name,diapo,ypredict
0,Test_pour_AFPA,8,0.0
1,Test_pour_AFPA,9,1.0
2,Test_pour_AFPA,10,2.0
3,Test_pour_AFPA,11,0.0
4,Test_pour_AFPA,17,0.0
...,...,...,...
145,WIN_20210417_14_53_12_Pro,8,1.0
146,WIN_20210417_14_53_12_Pro,9,0.0
147,WIN_20210417_14_53_12_Pro,10,0.0
148,WIN_20210417_14_53_12_Pro,11,0.0


## Stress global

### En utilisant le stress prédit des diapos

In [33]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [34]:
ypredict_stress_diapo

diapo,8,9,10,11,17
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test_pour_AFPA,0.0,1.0,2.0,0.0,0.0
Video_1,0.0,0.0,0.0,1.0,1.0
WIN_20210323_19_17_40_Pro,0.0,1.0,0.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,0.0,0.0,0.0
WIN_20210330_13_10_29_Pro,0.0,2.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,1.0,1.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,0.0,1.0,1.0,1.0,1.0
WIN_20210403_18_49_15_Pro,0.0,1.0,0.0,1.0,1.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,0.0,1.0


In [35]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress

Unnamed: 0,video_name,1,8,9,10,11,12,17,18,stress_global
0,Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
6,WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
7,WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
8,WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
9,WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [36]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
diapo_audio_list.append('stress_global')
df_annotations_stress = df_annotations_stress[[str(diapo) for diapo in diapo_audio_list]]
diapo_audio_list.pop()
df_annotations_stress


Unnamed: 0_level_0,8,9,10,11,17,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test_pour_AFPA,1.0,0.0,0.0,0.0,0.0,1.0
Video_1,0.0,0.0,0.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,0.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,1.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,1.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,1.0,1.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,2.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,1.0,1.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,2.0,2.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,1.0,1.0,1.0


In [37]:
Xy = ypredict_stress_diapo.merge(df_annotations_stress.iloc[:,-1],how='inner',on='video_name')
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]

In [38]:
Xy

Unnamed: 0_level_0,8,9,10,11,17,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test_pour_AFPA,0.0,1.0,2.0,0.0,0.0,1.0
Video_1,0.0,0.0,0.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,0.0,1.0,0.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,0.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,2.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,0.0,1.0,1.0,1.0,1.0,2.0
WIN_20210403_18_49_15_Pro,0.0,1.0,0.0,1.0,1.0,2.0
WIN_20210404_10_58_27_Pro,1.0,1.0,1.0,0.0,1.0,1.0


In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [40]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 585 out of 600 | elapsed:    2.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.4
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4 0.4898979485566357
f1 (mean, std) 0.4 0.4898979485566357
balanced accuracy (mean, std) 0.4 0.4898979485566357
precision (mean, std) 0.4 0.4898979485566357
recall (mean, std) 0.4 0.4898979485566357

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    4.1s
[Parallel(

In [41]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn', KNeighborsClassifier(n_neighbors=9, p=1))]),
 'best_score': 0.4666666666666667,
 'best_params': {'knn__n_neighbors': 9,
  'knn__p': 1,
  'knn__weights': 'uniform',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.4666666666666667,
 'std_test_f1_score': 0.49888765156985887,
 'mean_test_accuracy_score': 0.4666666666666667,
 'std_test_accuracy_score': 0.49888765156985887,
 'mean_test_balanced_accuracy_score': 0.4666666666666667,
 'std_test_balanced_accuracy_score': 0.49888765156985887,
 'mean_test_precision': 0.4666666666666667,
 'std_test_precision': 0.49888765156985887,
 'mean_test_recall': 0.4666666666666667,
 'std_test_recall': 0.49888765156985887}

In [42]:
# Saving predictions
df_ypredict = pd.concat([X.reset_index()[['video_name']],
                        pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.to_csv('ypredict_' + features + '_tw5_global' + diapo_selection + '.csv')

#### Autre méthode

### En utilisant le stress prédit des time windows 5s

In [43]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.070423,0.929577,0.0
Video_1,0.625,0.375,0.0
WIN_20210323_19_17_40_Pro,0.513889,0.486111,0.0
WIN_20210329_10_16_02_Pro,0.0,1.0,0.0
WIN_20210330_13_10_29_Pro,0.09589,0.90411,0.0
WIN_20210331_21_22_52_Pro,0.380282,0.619718,0.0
WIN_20210402_14_27_50_Pro,0.0,0.388889,0.611111
WIN_20210402_19_04_53_Pro,0.375,0.611111,0.013889
WIN_20210403_18_49_15_Pro,0.263889,0.736111,0.0
WIN_20210404_10_58_27_Pro,0.493151,0.506849,0.0


In [44]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [45]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [47]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, class_weight='balanced', multi_class='multinomial',
                   random_state=42)
Best results 0.43333333333333335
Best params {'C': 0.01, 'class_weight': 'balanced'}
accuracy (mean, std) 0.43333333333333335 0.49553562491061676
f1 (mean, std) 0.43333333333333335 0.49553562491061676
balanced accuracy (mean, std) 0.43333333333333335 0.49553562491061676
precision (mean, std) 0.43333333333333335 0.49553562491061676
recall (mean, std) 0.43333333333333335 0.49553562491061676

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Pa

In [48]:
best_result

{'best_estimator': KNeighborsClassifier(n_neighbors=7, p=1),
 'best_score': 0.4666666666666667,
 'best_params': {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'},
 'mean_test_f1_score': 0.4666666666666667,
 'std_test_f1_score': 0.49888765156985887,
 'mean_test_accuracy_score': 0.4666666666666667,
 'std_test_accuracy_score': 0.49888765156985887,
 'mean_test_balanced_accuracy_score': 0.4666666666666667,
 'std_test_balanced_accuracy_score': 0.49888765156985887,
 'mean_test_precision': 0.4666666666666667,
 'std_test_precision': 0.49888765156985887,
 'mean_test_recall': 0.4666666666666667,
 'std_test_recall': 0.49888765156985887}

#### Autre méthode

In [49]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5' + diapo_selection + '.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict = df_ypredict.groupby(['video_name']).agg({'ypredict': ['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,0.929577,0.0,1.0,1.0,0.257679,1.0,1.0,9.275758,-3.357939
Video_1,0.375,0.0,1.0,0.0,0.48752,0.0,1.0,-1.733333,0.516398
WIN_20210323_19_17_40_Pro,0.486111,0.0,1.0,0.0,0.503315,0.0,1.0,-1.996911,0.055577
WIN_20210329_10_16_02_Pro,1.0,1.0,1.0,1.0,0.0,1.0,1.0,-3.0,0.0
WIN_20210330_13_10_29_Pro,0.90411,0.0,1.0,1.0,0.296479,1.0,1.0,5.534632,-2.744928
WIN_20210331_21_22_52_Pro,0.619718,0.0,1.0,1.0,0.488911,0.0,1.0,-1.756734,-0.49322
WIN_20210402_14_27_50_Pro,1.611111,1.0,2.0,2.0,0.490919,1.0,2.0,-1.792208,-0.455842
WIN_20210402_19_04_53_Pro,0.638889,0.0,2.0,1.0,0.511985,0.0,1.0,-1.195937,-0.258664
WIN_20210403_18_49_15_Pro,0.736111,0.0,1.0,1.0,0.443833,0.0,1.0,-0.852036,-1.071431
WIN_20210404_10_58_27_Pro,0.506849,0.0,1.0,1.0,0.503413,0.0,1.0,-1.999249,-0.0274


In [50]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

models_list = [
                LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42),
                Pipeline(steps=[('pca', pca), ('logistic', LogisticRegression(multi_class='multinomial', fit_intercept=True))]),
                KNeighborsClassifier(),
                Pipeline(steps=[('pca', pca), ('knn', KNeighborsClassifier())]),
                RandomForestClassifier(random_state = 42, n_jobs=-1)
                ]

parameters_list = [
                    {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'class_weight' : [None, 'balanced']},
                    {'pca__n_components': [1, 2, 3, 4],
                        'logistic__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4 , 5, 10], 'logistic__class_weight' : [None, 'balanced']},
                    {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]},
                    {'pca__n_components': [1, 2, 3, 4],
                        'knn__n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'knn__weights' : ['uniform', 'distance'],                              'knn__p': [1, 2]},
                    {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}
                    ]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

In [52]:
best_result, y_predict, result_list = runGridSearchClassifiers(X, y, cv_loo, models_list, parameters_list, 
                                                                output_predict=True, n_jobs=-1, verbose=True)

Fitting 30 folds for each of 20 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    4.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Best estimator LogisticRegression(C=0.01, multi_class='multinomial', random_state=42)
Best results 0.36666666666666664
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.36666666666666664 0.48189440982669857
f1 (mean, std) 0.36666666666666664 0.48189440982669857
balanced accuracy (mean, std) 0.36666666666666664 0.48189440982669857
precision (mean, std) 0.36666666666666664 0.48189440982669857
recall (mean, std) 0.36666666666666664 0.48189440982669857

Fitting 30 folds for each of 80 candidates, totalling 2400 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed

In [53]:
best_result

{'best_estimator': Pipeline(steps=[('pca', PCA(n_components=1)),
                 ('knn', KNeighborsClassifier(n_neighbors=12, p=1))]),
 'best_score': 0.5,
 'best_params': {'knn__n_neighbors': 12,
  'knn__p': 1,
  'knn__weights': 'uniform',
  'pca__n_components': 1},
 'mean_test_f1_score': 0.5,
 'std_test_f1_score': 0.5,
 'mean_test_accuracy_score': 0.5,
 'std_test_accuracy_score': 0.5,
 'mean_test_balanced_accuracy_score': 0.5,
 'std_test_balanced_accuracy_score': 0.5,
 'mean_test_precision': 0.5,
 'std_test_precision': 0.5,
 'mean_test_recall': 0.5,
 'std_test_recall': 0.5}