In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib

## Loading data

In [5]:
directory_path = '../../04_-_Dev/videos'
features = 'emobase_eGeMAPS'

In [6]:
df_total = pd.read_pickle(directory_path + '/audio_' + features + '_data.p')

In [7]:
df_total.shape

(2573581, 96)

In [8]:
df_total.video_name.nunique()

30

In [9]:
df_total[df_total.isna().any(axis=1)]

Unnamed: 0,frameIndex,frameTime,pcm_intensity_sma,pcm_loudness_sma,mfcc_sma[1],mfcc_sma[2],mfcc_sma[3],mfcc_sma[4],mfcc_sma[5],mfcc_sma[6],...,F2frequency_sma3nz_de,F2amplitudeLogRelF0_sma3nz_de,F3frequency_sma3nz_de,F3amplitudeLogRelF0_sma3nz_de,video_name,stress_global,type_candidat,sexe,stress,diapo


## Data processing

In [11]:
time_window = 5
df_total['frameTimeWindow'] = df_total.frameTime.apply(lambda x : np.floor(x / time_window) * time_window).astype(int)

In [12]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

In [13]:
# 5 seconds windows
X = df_total.iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]

In [14]:
y = df_total.iloc[:,3:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'min'}).iloc[:,-1]

In [15]:
X.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

In [16]:
from scipy.stats import kurtosis, skew
X_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]
y_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'mean'}).iloc[:,-1]

In [17]:
X_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

## Modèles

In [18]:
X = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

X_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

In [19]:
X = X.fillna(0)
X_audio = X_audio.fillna(0)

## Stress par diapos
### All diapos

In [21]:
parameters = {'n_estimators': [100, 150, 200, 250, 300], 'max_depth':[10, 15, 20, 25, 30], 'class_weight':[None,'balanced']}

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = RandomForestClassifier(random_state = 42, n_jobs=-1)

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' , 
                             'balanced_accuracy_score' : 'balanced_accuracy', 
                             'precision' : 'precision_weighted', 'recall' : 'recall_weighted'}, 
                    refit= 'f1_score' ,#'accuracy_score',
                    cv=cv_loo, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])
print('balanced accuracy (mean, std)', clf.cv_results_['mean_test_balanced_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_balanced_accuracy_score'][clf.best_index_])
print('precision (mean, std)', clf.cv_results_['mean_test_precision'][clf.best_index_], clf.cv_results_['std_test_precision'][clf.best_index_])
print('recall (mean, std)', clf.cv_results_['mean_test_recall'][clf.best_index_], clf.cv_results_['std_test_recall'][clf.best_index_])

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1500 out of 1500 | elapsed: 118.0min finished
Best results 0.4410820121016199
Best params {'class_weight': None, 'max_depth': 25, 'n_estimators': 100}
accuracy (mean, std) 0.44753414895219035 0.20909936801440648
f1 (mean, std) 0.4410820121016199 0.21421490442691712
balanced accuracy (mean, std) 0.4359010719510516 0.18146746822295337
precision (mean, std) 0.5851920904882791 0.23676273007526016
recall (mean, std) 0.44753414895219035 0.20909936801440648


In [22]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

#model = RandomForestClassifier(random_state = 42, n_jobs=-1, max_depth= 10, n_estimators= 50)
model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_2.csv')

In [23]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[ 652, 1269,  143,    0],
       [ 810, 1704,   76,    0],
       [ 324,  317,   12,    0],
       [   7,   13,    0,    0]])

In [24]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


In [25]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.147059,0.088235,0.764706
Test_pour_AFPA,8,0.200000,0.700000,0.100000
Test_pour_AFPA,9,0.150000,0.700000,0.150000
Test_pour_AFPA,10,0.000000,0.666667,0.333333
Test_pour_AFPA,11,0.150000,0.550000,0.300000
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.428571,0.571429,0.000000
WIN_20210417_14_53_12_Pro,11,0.400000,0.600000,0.000000
WIN_20210417_14_53_12_Pro,12,0.121212,0.878788,0.000000
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.000000


In [26]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [27]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,diapo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Test_pour_AFPA,1,1.617647,0,2,2.0,0.739152,2.0,2.0,0.684926,-1.555231
Test_pour_AFPA,8,0.900000,0,2,1.0,0.567646,1.0,1.0,0.302021,-0.076839
Test_pour_AFPA,9,1.000000,0,2,1.0,0.561951,1.0,1.0,0.333333,0.000000
Test_pour_AFPA,10,1.333333,1,2,1.0,0.487950,1.0,2.0,-1.500000,0.707107
Test_pour_AFPA,11,1.150000,0,2,1.0,0.670820,1.0,2.0,-0.706063,-0.163677
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.571429,0,1,1.0,0.513553,0.0,1.0,-1.916667,-0.288675
WIN_20210417_14_53_12_Pro,11,0.600000,0,1,1.0,0.502625,0.0,1.0,-1.833333,-0.408248
WIN_20210417_14_53_12_Pro,12,0.878788,0,1,1.0,0.331434,1.0,1.0,3.387931,-2.321192
WIN_20210417_14_53_12_Pro,17,0.428571,0,1,0.0,0.534522,0.0,1.0,-1.916667,0.288675


In [28]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,0.0
238,WIN_20210417_14_53_12_Pro,17,1.0


In [29]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [30]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,"(ypredict, mean)","(ypredict, min)","(ypredict, max)","(ypredict, median)","(ypredict, std)","(ypredict, percentil25)","(ypredict, percentil75)","(ypredict, kurtosis)","(ypredict, skew)"
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Test_pour_AFPA,1,1.617647,0,2,2.0,0.739152,2.0,2.0,0.684926,-1.555231
Test_pour_AFPA,8,0.900000,0,2,1.0,0.567646,1.0,1.0,0.302021,-0.076839
Test_pour_AFPA,9,1.000000,0,2,1.0,0.561951,1.0,1.0,0.333333,0.000000
Test_pour_AFPA,10,1.333333,1,2,1.0,0.487950,1.0,2.0,-1.500000,0.707107
Test_pour_AFPA,11,1.150000,0,2,1.0,0.670820,1.0,2.0,-0.706063,-0.163677
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.571429,0,1,1.0,0.513553,0.0,1.0,-1.916667,-0.288675
WIN_20210417_14_53_12_Pro,11,0.600000,0,1,1.0,0.502625,0.0,1.0,-1.833333,-0.408248
WIN_20210417_14_53_12_Pro,12,0.878788,0,1,1.0,0.331434,1.0,1.0,3.387931,-2.321192
WIN_20210417_14_53_12_Pro,17,0.428571,0,1,0.0,0.534522,0.0,1.0,-1.916667,0.288675


In [34]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y


#model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_jobs=-1)
parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestClassifier(random_state = 42, n_jobs=-1)
#parameters = {'n_estimators': [100, 150, 200], 'max_depth':[10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' , 
                             'balanced_accuracy_score' : 'balanced_accuracy', 
                             'precision' : 'precision_weighted', 'recall' : 'recall_weighted'}, 
                    refit='f1_score',#'accuracy_score',
                    cv=cv_loo, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])
print('balanced accuracy (mean, std)', clf.cv_results_['mean_test_balanced_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_balanced_accuracy_score'][clf.best_index_])
print('precision (mean, std)', clf.cv_results_['mean_test_precision'][clf.best_index_], clf.cv_results_['std_test_precision'][clf.best_index_])
print('recall (mean, std)', clf.cv_results_['mean_test_recall'][clf.best_index_], clf.cv_results_['std_test_recall'][clf.best_index_])

Fitting 30 folds for each of 44 candidates, totalling 1320 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.46468374218374214
Best params {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
accuracy (mean, std) 0.4583333333333333 0.18633899812498245
f1 (mean, std) 0.46468374218374214 0.2081739450618909
balanced accuracy (mean, std) 0.4529100529100529 0.1997839739228038
precision (mean, std) 0.564484126984127 0.26785592298370076
recall (mean, std) 0.4583333333333333 0.18633899812498245
[Parallel(n_jobs=1)]: Done 1320 out of 1320 | elapsed:   18.2s finished


In [35]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','diapo','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo_2.csv')

In [36]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[56, 45,  0,  0],
       [58, 53,  2,  0],
       [15,  9,  1,  0],
       [ 1,  0,  0,  0]])

In [33]:
#print(X_no_name.columns[np.argsort(clf.best_estimator_.feature_importances_)[:-20:-1]])

## Stress global

### En utilisant le stress prédit des diapos

In [37]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [38]:
ypredict_stress_diapo # un peu nul - le modèle prédit 1 partout

diapo,1,8,9,10,11,12,17,18
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Test_pour_AFPA,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Video_1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
WIN_20210329_10_16_02_Pro,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,0.0,0.0,1.0,2.0,2.0
WIN_20210403_18_49_15_Pro,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
WIN_20210404_10_58_27_Pro,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [39]:
df_annotations_stress = pd.read_csv('annotations.csv')
#df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
#df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
#df_annotations_stress.columns = ['video_name','diapo','stress']
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
#df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0_level_0,1,8,9,10,11,12,17,18,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
# En utilisant les annotations dees stress des diapos comme X
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']}

X = df_annotations_stress.iloc[:,:-1]
#X = ypredict_stress_diapo
y = df_annotations_stress.iloc[:,-1]

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)
clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' , 
                             'balanced_accuracy_score' : 'balanced_accuracy', 
                             'precision' : 'precision_weighted', 'recall' : 'recall_weighted'}, 
                    refit='f1_score',
                    cv=5, verbose=1)
clf.fit(X, y)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])
print('balanced accuracy (mean, std)', clf.cv_results_['mean_test_balanced_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_balanced_accuracy_score'][clf.best_index_])
print('precision (mean, std)', clf.cv_results_['mean_test_precision'][clf.best_index_], clf.cv_results_['std_test_precision'][clf.best_index_])
print('recall (mean, std)', clf.cv_results_['mean_test_recall'][clf.best_index_], clf.cv_results_['std_test_recall'][clf.best_index_])
# Prediction
#clf.best_estimator_.fit(X,y)
#ypredict_stress_global = clf.best_estimator_.predict(ypredict_stress_diapo)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.7651587301587301
Best params {'C': 0.1, 'class_weight': None}
accuracy (mean, std) 0.8 0.0666666666666667
f1 (mean, std) 0.7651587301587301 0.11176124986563704
balanced accuracy (mean, std) 0.8 0.0666666666666667
precision (mean, std) 0.8016666666666665 0.16758635142870432
recall (mean, std) 0.8 0.0666666666666667
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    0.7s finished


In [45]:
clf.best_estimator_.fit(X,y)
ypredict_stress_global = clf.best_estimator_.predict(ypredict_stress_diapo)
ypredict_stress_global

array([0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 1., 0., 1., 0., 0., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.])

In [46]:
from sklearn.metrics import accuracy_score, f1_score
print('Accuracy',accuracy_score(y.values,ypredict_stress_global))
print('F1',f1_score(y.values,ypredict_stress_global, average='weighted'))

Accuracy 0.3333333333333333
F1 0.2710239651416122


#### Autre méthode

In [51]:
# En utilisant les prédictions des stress des diapos comme X (et non les annotations)
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

#X = df_annotations_stress.iloc[:,:-1]
X = ypredict_stress_diapo
y = df_annotations_stress.iloc[:,-1]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']}

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_jobs=-1)
parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
model = RandomForestClassifier(random_state = 42, n_jobs=-1)
parameters = {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' , 
                             'balanced_accuracy_score' : 'balanced_accuracy', 
                             'precision' : 'precision_weighted', 'recall' : 'recall_weighted'}, 
                    refit='f1_score',
                    cv=cv_loo, verbose=1)
clf.fit(X, y)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])
print('balanced accuracy (mean, std)', clf.cv_results_['mean_test_balanced_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_balanced_accuracy_score'][clf.best_index_])
print('precision (mean, std)', clf.cv_results_['mean_test_precision'][clf.best_index_], clf.cv_results_['std_test_precision'][clf.best_index_])
print('recall (mean, std)', clf.cv_results_['mean_test_recall'][clf.best_index_], clf.cv_results_['std_test_recall'][clf.best_index_])



Fitting 30 folds for each of 56 candidates, totalling 1680 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4666666666666667
Best params {'class_weight': None, 'max_depth': 3, 'n_estimators': 100}
accuracy (mean, std) 0.4666666666666667 0.49888765156985887
f1 (mean, std) 0.4666666666666667 0.49888765156985887
balanced accuracy (mean, std) 0.4666666666666667 0.49888765156985887
precision (mean, std) 0.4666666666666667 0.49888765156985887
recall (mean, std) 0.4666666666666667 0.49888765156985887
[Parallel(n_jobs=1)]: Done 1680 out of 1680 | elapsed:  3.3min finished


In [52]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo_2bis.csv')

### En utilisant le stress prédit des time windows 5s

In [53]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.174603,0.417989,0.407407
Video_1,0.344371,0.655629,0.0
WIN_20210323_19_17_40_Pro,0.369048,0.630952,0.0
WIN_20210329_10_16_02_Pro,0.095238,0.904762,0.0
WIN_20210330_13_10_29_Pro,0.006623,0.986755,0.006623
WIN_20210331_21_22_52_Pro,0.288235,0.711765,0.0
WIN_20210402_14_27_50_Pro,0.016129,0.962366,0.021505
WIN_20210402_19_04_53_Pro,0.931429,0.062857,0.005714
WIN_20210403_18_49_15_Pro,0.281768,0.718232,0.0
WIN_20210404_10_58_27_Pro,0.65,0.255,0.095


In [73]:
#df_ypredict[['video_name','diapo','ypredict']].groupby(['video_name','diapo']).agg({'mean','min','max', 'median', 'std', #percentil25, percentil75, kurtosis, skew})

In [54]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [55]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [56]:
Xy

Unnamed: 0,video_name,0,1,2,stress_global
0,Test_pour_AFPA,0.174603,0.417989,0.407407,1.0
1,Video_1,0.344371,0.655629,0.0,0.0
2,WIN_20210323_19_17_40_Pro,0.369048,0.630952,0.0,1.0
3,WIN_20210329_10_16_02_Pro,0.095238,0.904762,0.0,1.0
4,WIN_20210330_13_10_29_Pro,0.006623,0.986755,0.006623,0.0
5,WIN_20210331_21_22_52_Pro,0.288235,0.711765,0.0,1.0
6,WIN_20210402_14_27_50_Pro,0.016129,0.962366,0.021505,1.0
7,WIN_20210402_19_04_53_Pro,0.931429,0.062857,0.005714,2.0
8,WIN_20210403_18_49_15_Pro,0.281768,0.718232,0.0,2.0
9,WIN_20210404_10_58_27_Pro,0.65,0.255,0.095,1.0


In [60]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

#model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_jobs=-1)
parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestClassifier(random_state = 42, n_jobs=-1)
#parameters = {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted', 
                             'balanced_accuracy_score' : 'balanced_accuracy', 
                             'precision' : 'precision_weighted', 'recall' : 'recall_weighted' }, 
                    refit='f1_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])
print('balanced accuracy (mean, std)', clf.cv_results_['mean_test_balanced_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_balanced_accuracy_score'][clf.best_index_])
print('precision (mean, std)', clf.cv_results_['mean_test_precision'][clf.best_index_], clf.cv_results_['std_test_precision'][clf.best_index_])
print('recall (mean, std)', clf.cv_results_['mean_test_recall'][clf.best_index_], clf.cv_results_['std_test_recall'][clf.best_index_])


Fitting 5 folds for each of 44 candidates, totalling 220 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.35174603174603175
Best params {'n_neighbors': 4, 'p': 2, 'weights': 'uniform'}
accuracy (mean, std) 0.4333333333333333 0.08164965809277261
f1 (mean, std) 0.35174603174603175 0.05540205551332949
balanced accuracy (mean, std) 0.42222222222222217 0.07535922203472521
precision (mean, std) 0.34777777777777774 0.10373280015096716
recall (mean, std) 0.4333333333333333 0.08164965809277261
[Parallel(n_jobs=1)]: Done 220 out of 220 | elapsed:    2.6s finished


In [61]:
from sklearn.model_selection import cross_val_predict
ypredict_stress_global = cross_val_predict(clf.best_estimator_,X_no_name,y_no_name,cv=5)

In [62]:
ypredict_stress_global

array([0., 0., 0., 0., 1., 0., 1., 2., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 1., 0., 0., 2., 0., 0., 0.])

In [64]:
from sklearn.metrics import accuracy_score, f1_score
print('Accuracy',accuracy_score(y_no_name.values,ypredict_stress_global))
print('F1',f1_score(y_no_name,ypredict_stress_global, average='weighted'))

Accuracy 0.43333333333333335
F1 0.40176470588235297


In [65]:
pd.concat([y_no_name, pd.DataFrame(ypredict_stress_global, columns=['stress_global_predict'])],axis=1)

Unnamed: 0,stress_global,stress_global_predict
0,1.0,0.0
1,0.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0
5,1.0,0.0
6,1.0,1.0
7,2.0,2.0
8,2.0,0.0
9,1.0,0.0


### Autre approche 

In [66]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
#df_ypredict = df_ypredict[['video_name','diapo','ypredict']].groupby(['video_name','diapo']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew})
df_ypredict = df_ypredict[['video_name','ypredict']].groupby(['video_name']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,percentil75,min,std,percentil25,mean,skew,max,kurtosis,median
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,2.0,0.0,0.728437,1.0,1.232804,-0.387119,2.0,-1.041292,1.0
Video_1,1.0,0.0,0.476744,0.0,0.655629,-0.655056,1.0,-1.570901,1.0
WIN_20210323_19_17_40_Pro,1.0,0.0,0.483989,0.0,0.630952,-0.542755,1.0,-1.705417,1.0
WIN_20210329_10_16_02_Pro,1.0,0.0,0.294547,1.0,0.904762,-2.757764,1.0,5.605263,1.0
WIN_20210330_13_10_29_Pro,1.0,0.0,0.11547,1.0,1.0,0.0,2.0,72.5,1.0
WIN_20210331_21_22_52_Pro,1.0,0.0,0.454279,0.0,0.711765,-0.935065,1.0,-1.125654,1.0
WIN_20210402_14_27_50_Pro,1.0,0.0,0.194445,1.0,1.005376,0.654047,2.0,23.535147,1.0
WIN_20210402_19_04_53_Pro,0.0,0.0,0.284001,0.0,0.074286,3.975646,2.0,16.400354,0.0
WIN_20210403_18_49_15_Pro,1.0,0.0,0.451109,0.0,0.718232,-0.97022,1.0,-1.058673,1.0
WIN_20210404_10_58_27_Pro,1.0,0.0,0.6627,0.0,0.445,1.189224,2.0,0.169115,0.0


In [67]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [68]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]
Xy

Unnamed: 0,video_name,"(ypredict, percentil75)","(ypredict, min)","(ypredict, std)","(ypredict, percentil25)","(ypredict, mean)","(ypredict, skew)","(ypredict, max)","(ypredict, kurtosis)","(ypredict, median)",stress_global
0,Test_pour_AFPA,2.0,0.0,0.728437,1.0,1.232804,-0.387119,2.0,-1.041292,1.0,1.0
1,Video_1,1.0,0.0,0.476744,0.0,0.655629,-0.655056,1.0,-1.570901,1.0,0.0
2,WIN_20210323_19_17_40_Pro,1.0,0.0,0.483989,0.0,0.630952,-0.542755,1.0,-1.705417,1.0,1.0
3,WIN_20210329_10_16_02_Pro,1.0,0.0,0.294547,1.0,0.904762,-2.757764,1.0,5.605263,1.0,1.0
4,WIN_20210330_13_10_29_Pro,1.0,0.0,0.11547,1.0,1.0,0.0,2.0,72.5,1.0,0.0
5,WIN_20210331_21_22_52_Pro,1.0,0.0,0.454279,0.0,0.711765,-0.935065,1.0,-1.125654,1.0,1.0
6,WIN_20210402_14_27_50_Pro,1.0,0.0,0.194445,1.0,1.005376,0.654047,2.0,23.535147,1.0,1.0
7,WIN_20210402_19_04_53_Pro,0.0,0.0,0.284001,0.0,0.074286,3.975646,2.0,16.400354,0.0,2.0
8,WIN_20210403_18_49_15_Pro,1.0,0.0,0.451109,0.0,0.718232,-0.97022,1.0,-1.058673,1.0,2.0
9,WIN_20210404_10_58_27_Pro,1.0,0.0,0.6627,0.0,0.445,1.189224,2.0,0.169115,0.0,1.0


In [73]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

#from sklearn.neighbors import KNeighborsClassifier
#model = KNeighborsClassifier(n_jobs=-1)
#parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestClassifier(random_state = 42, n_jobs=-1)
#parameters = {'n_estimators': [50, 100, 200], 'max_depth':[3,4, 5,6, 10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' , 
                             'balanced_accuracy_score' : 'balanced_accuracy', 
                             'precision' : 'precision_weighted', 'recall' : 'recall_weighted'}, 
                    refit='f1_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])
print('balanced accuracy (mean, std)', clf.cv_results_['mean_test_balanced_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_balanced_accuracy_score'][clf.best_index_])
print('precision (mean, std)', clf.cv_results_['mean_test_precision'][clf.best_index_], clf.cv_results_['std_test_precision'][clf.best_index_])
print('recall (mean, std)', clf.cv_results_['mean_test_recall'][clf.best_index_], clf.cv_results_['std_test_recall'][clf.best_index_])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.3699206349206349
Best params {'C': 0.5, 'class_weight': None}
accuracy (mean, std) 0.4333333333333333 0.2260776661041756
f1 (mean, std) 0.3699206349206349 0.20778929721787115
balanced accuracy (mean, std) 0.4111111111111111 0.20667861375264746
precision (mean, std) 0.3488888888888889 0.21697698575120591
recall (mean, std) 0.4333333333333333 0.2260776661041756
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    2.0s finished


In [74]:
from sklearn.model_selection import cross_val_predict
ypredict_stress_global = cross_val_predict(clf.best_estimator_,X_no_name,y_no_name,cv=5)