In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib

## Loading data

In [2]:
directory_path = '../../04_-_Dev/videos'
features = 'eGeMAPS'

In [3]:
df_total = pd.read_pickle(directory_path + '/audio_' + features + '_data.p')

In [4]:
df_total.shape

(2573581, 77)

In [5]:
df_total.video_name.nunique()

30

In [6]:
df_total[df_total.isna().any(axis=1)]

Unnamed: 0,frameIndex,frameTime,Loudness_sma3,alphaRatio_sma3,hammarbergIndex_sma3,slope0-500_sma3,slope500-1500_sma3,spectralFlux_sma3,mfcc1_sma3,mfcc2_sma3,...,F3frequency_sma3nz_de,F3frequency_sma3nz_de_de,F3amplitudeLogRelF0_sma3nz_de,F3amplitudeLogRelF0_sma3nz_de_de,video_name,stress_global,type_candidat,sexe,stress,diapo


## Data processing

In [7]:
time_window = 5
df_total['frameTimeWindow'] = df_total.frameTime.apply(lambda x : np.floor(x / time_window) * time_window).astype(int)

In [8]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

In [9]:
# 5 seconds windows
X = df_total.iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]

In [10]:
y = df_total.iloc[:,3:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'min'}).iloc[:,-1]

In [11]:
X.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

In [12]:
from scipy.stats import kurtosis, skew
X_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]
y_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'mean'}).iloc[:,-1]

In [13]:
X_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

## Modèles

In [14]:
X = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

X_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

In [15]:
X = X.fillna(0)
X_audio = X_audio.fillna(0)

## Stress par diapos
### All diapos

In [17]:
parameters = {'n_estimators': [100, 150, 200, 250, 300], 'max_depth':[10, 15, 20, 25, 30], 'class_weight':[None,'balanced']}

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = RandomForestClassifier(random_state = 42, n_jobs=-1)

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit= 'f1_score' ,#'accuracy_score',
                    cv=cv_loo, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1500 out of 1500 | elapsed: 104.5min finished
Best results 0.45449414449983017
Best params {'class_weight': 'balanced', 'max_depth': 10, 'n_estimators': 150}
accuracy (mean, std) 0.44510016364811356 0.18494293958689356
f1 (mean, std) 0.45449414449983017 0.18411403803477927


In [18]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

#model = RandomForestClassifier(random_state = 42, n_jobs=-1, max_depth= 10, n_estimators= 50)
model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_2.csv')

In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[ 941,  920,  203,    0],
       [ 918, 1329,  343,    0],
       [ 206,  371,   76,    0],
       [  16,    4,    0,    0]])

In [20]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


In [21]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.000000,0.000000,1.000000
Test_pour_AFPA,8,0.200000,0.400000,0.400000
Test_pour_AFPA,9,0.300000,0.150000,0.550000
Test_pour_AFPA,10,0.066667,0.266667,0.666667
Test_pour_AFPA,11,0.200000,0.250000,0.550000
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.571429,0.428571,0.000000
WIN_20210417_14_53_12_Pro,11,0.750000,0.250000,0.000000
WIN_20210417_14_53_12_Pro,12,0.181818,0.818182,0.000000
WIN_20210417_14_53_12_Pro,17,0.857143,0.142857,0.000000


In [22]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [23]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,diapo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Test_pour_AFPA,1,2.000000,2,2,2.0,0.000000,2.0,2.00,-3.000000,0.000000
Test_pour_AFPA,8,1.200000,0,2,1.0,0.788811,1.0,2.00,-1.153061,-0.343622
Test_pour_AFPA,9,1.250000,0,2,2.0,0.910465,0.0,2.00,-1.537415,-0.509776
Test_pour_AFPA,10,1.600000,0,2,2.0,0.632456,1.0,2.00,0.505102,-1.262546
Test_pour_AFPA,11,1.350000,0,2,2.0,0.812728,1.0,2.00,-1.054047,-0.707642
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.428571,0,1,0.0,0.513553,0.0,1.00,-1.916667,0.288675
WIN_20210417_14_53_12_Pro,11,0.250000,0,1,0.0,0.444262,0.0,0.25,-0.666667,1.154701
WIN_20210417_14_53_12_Pro,12,0.818182,0,1,1.0,0.391675,1.0,1.00,0.722222,-1.649916
WIN_20210417_14_53_12_Pro,17,0.142857,0,1,0.0,0.377964,0.0,0.00,2.166667,2.041241


In [24]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,0.0
238,WIN_20210417_14_53_12_Pro,17,1.0


In [25]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [26]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,"(ypredict, mean)","(ypredict, min)","(ypredict, max)","(ypredict, median)","(ypredict, std)","(ypredict, percentil25)","(ypredict, percentil75)","(ypredict, kurtosis)","(ypredict, skew)"
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Test_pour_AFPA,1,2.000000,2,2,2.0,0.000000,2.0,2.00,-3.000000,0.000000
Test_pour_AFPA,8,1.200000,0,2,1.0,0.788811,1.0,2.00,-1.153061,-0.343622
Test_pour_AFPA,9,1.250000,0,2,2.0,0.910465,0.0,2.00,-1.537415,-0.509776
Test_pour_AFPA,10,1.600000,0,2,2.0,0.632456,1.0,2.00,0.505102,-1.262546
Test_pour_AFPA,11,1.350000,0,2,2.0,0.812728,1.0,2.00,-1.054047,-0.707642
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.428571,0,1,0.0,0.513553,0.0,1.00,-1.916667,0.288675
WIN_20210417_14_53_12_Pro,11,0.250000,0,1,0.0,0.444262,0.0,0.25,-0.666667,1.154701
WIN_20210417_14_53_12_Pro,12,0.818182,0,1,1.0,0.391675,1.0,1.00,0.722222,-1.649916
WIN_20210417_14_53_12_Pro,17,0.142857,0,1,0.0,0.377964,0.0,0.00,2.166667,2.041241


In [30]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y


model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

#from sklearn.neighbors import KNeighborsClassifier
#model = KNeighborsClassifier(n_jobs=-1)
#parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestClassifier(random_state = 42, n_jobs=-1)
#parameters = {'n_estimators': [100, 150, 200], 'max_depth':[10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',#'accuracy_score',
                    cv=cv_loo, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 30 folds for each of 16 candidates, totalling 480 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4806971731971731
Best params {'C': 0.1, 'class_weight': None}
accuracy (mean, std) 0.5208333333333334 0.2261160591869189
f1 (mean, std) 0.4806971731971731 0.2522482688304581
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:   13.6s finished


In [31]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','diapo','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo_2.csv')

In [32]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[38, 63,  0,  0],
       [26, 87,  0,  0],
       [ 7, 18,  0,  0],
       [ 1,  0,  0,  0]])

In [33]:
#print(X_no_name.columns[np.argsort(clf.best_estimator_.feature_importances_)[:-20:-1]])

## Stress global

### En utilisant le stress prédit des diapos

In [34]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [35]:
ypredict_stress_diapo # un peu nul - le modèle prédit 1 partout

diapo,1,8,9,10,11,12,17,18
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Test_pour_AFPA,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
WIN_20210323_19_17_40_Pro,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
WIN_20210331_21_22_52_Pro,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
WIN_20210402_19_04_53_Pro,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
WIN_20210404_10_58_27_Pro,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


In [36]:
df_annotations_stress = pd.read_csv('annotations.csv')
#df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
#df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
#df_annotations_stress.columns = ['video_name','diapo','stress']
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
#df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0_level_0,1,8,9,10,11,12,17,18,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [37]:
# En utilisant les annotations dees stress des diapos comme X
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']}

X = df_annotations_stress.iloc[:,:-1]
#X = ypredict_stress_diapo
y = df_annotations_stress.iloc[:,-1]

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)
clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',
                    cv=5, verbose=1)
clf.fit(X, y)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)

# Prediction
#clf.best_estimator_.fit(X,y)
#ypredict_stress_global = clf.best_estimator_.predict(ypredict_stress_diapo)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.7651587301587301
Best params {'C': 0.1, 'class_weight': None}
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    0.6s finished


In [58]:
from sklearn.model_selection import cross_val_predict
ypredict_stress_global = cross_val_predict(clf.best_estimator_,X,y,cv=5)

In [59]:
ypredict_stress_global

array([0., 0., 0., 0., 0., 0., 2., 0., 0., 2., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0., 0., 2.])

In [60]:
df_ypredict_stress_global = pd.concat([ypredict_stress_diapo.reset_index(), pd.DataFrame(ypredict_stress_global,columns=['predicted_stress_global'])], axis=1) 
df_ypredict_stress_global = df_ypredict_stress_global.set_index('video_name').sort_index()
df_ypredict_stress_global = df_ypredict_stress_global.iloc[:,-1]

In [63]:
from sklearn.metrics import accuracy_score, f1_score
print('Accuracy',accuracy_score(y.values,ypredict_stress_global))
print('F1',f1_score(y.values,ypredict_stress_global, average='weighted'))

Accuracy 0.3333333333333333
F1 0.21621621621621623


In [66]:
pd.concat([y, pd.DataFrame(ypredict_stress_global,columns=['stress_global_predict'])],axis=1)

Unnamed: 0,stress_global,stress_global_predict
0,1.0,0.0
1,0.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,0.0
5,1.0,0.0
6,1.0,2.0
7,2.0,0.0
8,2.0,0.0
9,1.0,2.0


#### Autre méthode

In [70]:
# En utilisant les prédictions des stress des diapos comme X (et non les annotations)
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

#X = df_annotations_stress.iloc[:,:-1]
X = ypredict_stress_diapo
y = df_annotations_stress.iloc[:,-1]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']}

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

#from sklearn.neighbors import KNeighborsClassifier
#model = KNeighborsClassifier(n_jobs=-1)
#parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestClassifier(random_state = 42, n_jobs=-1)
#parameters = {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',
                    cv=cv_loo, verbose=1)
clf.fit(X, y)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])



Fitting 30 folds for each of 12 candidates, totalling 360 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4 0.4898979485566357
f1 (mean, std) 0.4 0.4898979485566357
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:    4.0s finished


In [71]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo_2bis.csv')

### En utilisant le stress prédit des time windows 5s

In [72]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.079365,0.116402,0.804233
Video_1,0.788079,0.211921,0.0
WIN_20210323_19_17_40_Pro,0.505952,0.488095,0.005952
WIN_20210329_10_16_02_Pro,0.006803,0.986395,0.006803
WIN_20210330_13_10_29_Pro,0.231788,0.708609,0.059603
WIN_20210331_21_22_52_Pro,0.529412,0.441176,0.029412
WIN_20210402_14_27_50_Pro,0.032258,0.446237,0.521505
WIN_20210402_19_04_53_Pro,0.502857,0.462857,0.034286
WIN_20210403_18_49_15_Pro,0.441989,0.558011,0.0
WIN_20210404_10_58_27_Pro,0.385,0.055,0.56


In [73]:
#df_ypredict[['video_name','diapo','ypredict']].groupby(['video_name','diapo']).agg({'mean','min','max', 'median', 'std', #percentil25, percentil75, kurtosis, skew})

In [74]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [75]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [76]:
Xy

Unnamed: 0,video_name,0,1,2,stress_global
0,Test_pour_AFPA,0.079365,0.116402,0.804233,1.0
1,Video_1,0.788079,0.211921,0.0,0.0
2,WIN_20210323_19_17_40_Pro,0.505952,0.488095,0.005952,1.0
3,WIN_20210329_10_16_02_Pro,0.006803,0.986395,0.006803,1.0
4,WIN_20210330_13_10_29_Pro,0.231788,0.708609,0.059603,0.0
5,WIN_20210331_21_22_52_Pro,0.529412,0.441176,0.029412,1.0
6,WIN_20210402_14_27_50_Pro,0.032258,0.446237,0.521505,1.0
7,WIN_20210402_19_04_53_Pro,0.502857,0.462857,0.034286,2.0
8,WIN_20210403_18_49_15_Pro,0.441989,0.558011,0.0,2.0
9,WIN_20210404_10_58_27_Pro,0.385,0.055,0.56,1.0


In [79]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_jobs=-1)
parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
model = RandomForestClassifier(random_state = 42, n_jobs=-1)
parameters = {'n_estimators': [50, 100, 150, 200], 'max_depth':[3, 4, 5, 6, 10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.3944444444444445
Best params {'class_weight': None, 'max_depth': 6, 'n_estimators': 50}
accuracy (mean, std) 0.4 0.27080128015453203
f1 (mean, std) 0.3944444444444445 0.27603900243298507
[Parallel(n_jobs=1)]: Done 280 out of 280 | elapsed:   34.1s finished


In [80]:
from sklearn.model_selection import cross_val_predict
ypredict_stress_global = cross_val_predict(clf.best_estimator_,X_no_name,y_no_name,cv=5)

In [81]:
ypredict_stress_global

array([1., 0., 1., 2., 2., 1., 2., 1., 2., 1., 2., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 2., 2., 1., 0., 2., 0., 2., 2.])

In [82]:
from sklearn.metrics import accuracy_score, f1_score
print('Accuracy',accuracy_score(y_no_name.values,ypredict_stress_global))
print('F1',f1_score(y_no_name,ypredict_stress_global, average='weighted'))

Accuracy 0.4
F1 0.4081220043572985


In [84]:
pd.concat([y_no_name, pd.DataFrame(ypredict_stress_global, columns=['stress_global_predict'])],axis=1)

Unnamed: 0,stress_global,stress_global_predict
0,1.0,1.0
1,0.0,0.0
2,1.0,1.0
3,1.0,2.0
4,0.0,2.0
5,1.0,1.0
6,1.0,2.0
7,2.0,1.0
8,2.0,2.0
9,1.0,1.0


### Autre approche 

In [85]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
#df_ypredict = df_ypredict[['video_name','diapo','ypredict']].groupby(['video_name','diapo']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew})
df_ypredict = df_ypredict[['video_name','ypredict']].groupby(['video_name']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,percentil75,std,percentil25,skew,max,mean,kurtosis,median,min
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,2.0,0.600058,2.0,-2.028772,2.0,1.724868,2.762736,2.0,0.0
Video_1,0.0,0.410029,0.0,1.409843,1.0,0.211921,-0.012342,0.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.513296,0.0,0.133228,2.0,0.5,-1.654959,0.0,0.0
WIN_20210329_10_16_02_Pro,1.0,0.117041,1.0,0.0,2.0,1.0,70.5,1.0,0.0
WIN_20210330_13_10_29_Pro,1.0,0.513311,1.0,-0.238038,2.0,0.827815,0.240391,1.0,0.0
WIN_20210331_21_22_52_Pro,1.0,0.557361,0.0,0.514135,2.0,0.5,-0.802721,0.0,0.0
WIN_20210402_14_27_50_Pro,2.0,0.562228,1.0,-0.506664,2.0,1.489247,-0.77708,2.0,0.0
WIN_20210402_19_04_53_Pro,1.0,0.565163,0.0,0.451128,2.0,0.531429,-0.799954,0.0,0.0
WIN_20210403_18_49_15_Pro,1.0,0.498001,0.0,-0.233622,1.0,0.558011,-1.945421,1.0,0.0
WIN_20210404_10_58_27_Pro,2.0,0.958629,0.0,-0.355013,2.0,1.175,-1.811922,2.0,0.0


In [86]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [87]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]
Xy

Unnamed: 0,video_name,"(ypredict, percentil75)","(ypredict, std)","(ypredict, percentil25)","(ypredict, skew)","(ypredict, max)","(ypredict, mean)","(ypredict, kurtosis)","(ypredict, median)","(ypredict, min)",stress_global
0,Test_pour_AFPA,2.0,0.600058,2.0,-2.028772,2.0,1.724868,2.762736,2.0,0.0,1.0
1,Video_1,0.0,0.410029,0.0,1.409843,1.0,0.211921,-0.012342,0.0,0.0,0.0
2,WIN_20210323_19_17_40_Pro,1.0,0.513296,0.0,0.133228,2.0,0.5,-1.654959,0.0,0.0,1.0
3,WIN_20210329_10_16_02_Pro,1.0,0.117041,1.0,0.0,2.0,1.0,70.5,1.0,0.0,1.0
4,WIN_20210330_13_10_29_Pro,1.0,0.513311,1.0,-0.238038,2.0,0.827815,0.240391,1.0,0.0,0.0
5,WIN_20210331_21_22_52_Pro,1.0,0.557361,0.0,0.514135,2.0,0.5,-0.802721,0.0,0.0,1.0
6,WIN_20210402_14_27_50_Pro,2.0,0.562228,1.0,-0.506664,2.0,1.489247,-0.77708,2.0,0.0,1.0
7,WIN_20210402_19_04_53_Pro,1.0,0.565163,0.0,0.451128,2.0,0.531429,-0.799954,0.0,0.0,2.0
8,WIN_20210403_18_49_15_Pro,1.0,0.498001,0.0,-0.233622,1.0,0.558011,-1.945421,1.0,0.0,2.0
9,WIN_20210404_10_58_27_Pro,2.0,0.958629,0.0,-0.355013,2.0,1.175,-1.811922,2.0,0.0,1.0


In [92]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_jobs=-1)
parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestClassifier(random_state = 42, n_jobs=-1)
#parameters = {'n_estimators': [50, 100, 200], 'max_depth':[3,4, 5,6, 10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4127777777777778
Best params {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
accuracy (mean, std) 0.4666666666666667 0.06666666666666668
f1 (mean, std) 0.4127777777777778 0.05245329300040219
[Parallel(n_jobs=1)]: Done 220 out of 220 | elapsed:    2.4s finished


In [93]:
from sklearn.model_selection import cross_val_predict
ypredict_stress_global = cross_val_predict(clf.best_estimator_,X_no_name,y_no_name,cv=5)