In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib

## Loading data

In [2]:
directory_path = '../../04_-_Dev/videos'
features = 'emobase'

In [None]:
df_total = pd.read_pickle(directory_path + '/audio_' + features + '_data.p')

In [4]:
df_total.shape

(2573701, 60)

In [5]:
df_total.video_name.nunique()

30

In [7]:
df_total[df_total.isna().any(axis=1)]

Unnamed: 0,frameIndex,frameTime,pcm_intensity_sma,pcm_loudness_sma,mfcc_sma[1],mfcc_sma[2],mfcc_sma[3],mfcc_sma[4],mfcc_sma[5],mfcc_sma[6],...,pcm_zcr_sma_de,voiceProb_sma_de,F0_sma_de,F0env_sma_de,video_name,stress_global,type_candidat,sexe,stress,diapo


## Data processing

In [3]:
time_window = 5
df_total['frameTimeWindow'] = df_total.frameTime.apply(lambda x : np.floor(x / time_window) * time_window).astype(int)

NameError: name 'df_total' is not defined

In [3]:
from scipy.stats import kurtosis, skew

def percentil25(x): 
    return np.percentile(x, q=25)

def percentil75(x): 
    return np.percentile(x, q=75)

In [11]:
# 5 seconds windows
X = df_total.iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]

In [16]:
y = df_total.iloc[:,3:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'min'}).iloc[:,-1]

In [18]:
X.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

In [21]:
from scipy.stats import kurtosis, skew
X_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew}).iloc[:,:-18]
y_audio = df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]),:].iloc[:,2:].groupby(['video_name','diapo','frameTimeWindow']).agg({'stress':'mean'}).iloc[:,-1]

In [22]:
X_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio.to_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

## Modèles

In [4]:
X = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X.p')
y = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y.p')

X_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_X_audio_questions_only.p')
y_audio = pd.read_pickle(directory_path + '/audio_' + features + '_tw5_data_y_audio_questions_only.p')

In [5]:
X = X.fillna(0)
X_audio = X_audio.fillna(0)

## Stress par diapos
### All diapos

In [5]:
parameters = {'n_estimators': [100, 150, 200, 250, 300], 'max_depth':[10, 15, 20, 25, 30], 'class_weight':[None,'balanced', 'balanced_subsample']}

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = RandomForestClassifier(random_state = 42, n_jobs=-1)

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit= 'f1_score' ,#'accuracy_score',
                    cv=cv_loo, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 30 folds for each of 75 candidates, totalling 2250 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2250 out of 2250 | elapsed: 157.5min finished
Best results 0.4529982783484758
Best params {'class_weight': 'balanced', 'max_depth': 15, 'n_estimators': 200}
accuracy (mean, std) 0.4603521332304298 0.2114706580772435
f1 (mean, std) 0.4529982783484758 0.20974192525992463


In [6]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

#model = RandomForestClassifier(random_state = 42, n_jobs=-1, max_depth= 10, n_estimators= 50)
model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_2.csv')

In [7]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[ 654, 1239,  171,    0],
       [ 814, 1749,   27,    0],
       [ 316,  307,   30,    0],
       [   7,   13,    0,    0]])

In [8]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


In [8]:
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
X_sum = X.sum(axis=1).values.copy()
for col_number in range(len(X.columns)):
    X.iloc[:,col_number] = X.iloc[:,col_number] / X_sum
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.088235,0.147059,0.764706
Test_pour_AFPA,8,0.100000,0.600000,0.300000
Test_pour_AFPA,9,0.050000,0.800000,0.150000
Test_pour_AFPA,10,0.000000,0.733333,0.266667
Test_pour_AFPA,11,0.050000,0.750000,0.200000
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.285714,0.714286,0.000000
WIN_20210417_14_53_12_Pro,11,0.500000,0.500000,0.000000
WIN_20210417_14_53_12_Pro,12,0.151515,0.848485,0.000000
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.000000


In [11]:
# Autre méthode
X = df_ypredict.groupby(['video_name','diapo']).agg({'ypredict':['mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew]})

In [12]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,median,std,percentil25,percentil75,kurtosis,skew
video_name,diapo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Test_pour_AFPA,1,0.823529,0,1,1.0,0.386953,1.0,1.0,0.880952,-1.697337
Test_pour_AFPA,8,0.800000,0,1,1.0,0.421637,1.0,1.0,0.250000,-1.500000
Test_pour_AFPA,9,1.050000,1,2,1.0,0.223607,1.0,1.0,15.052632,4.129483
Test_pour_AFPA,10,1.000000,1,1,1.0,0.000000,1.0,1.0,-3.000000,0.000000
Test_pour_AFPA,11,0.950000,0,1,1.0,0.223607,1.0,1.0,15.052632,-4.129483
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.571429,0,1,1.0,0.513553,0.0,1.0,-1.916667,-0.288675
WIN_20210417_14_53_12_Pro,11,0.400000,0,1,0.0,0.502625,0.0,1.0,-1.833333,0.408248
WIN_20210417_14_53_12_Pro,12,0.848485,0,1,1.0,0.364110,1.0,1.0,1.778571,-1.943855
WIN_20210417_14_53_12_Pro,17,0.428571,0,1,0.0,0.534522,0.0,1.0,-1.916667,0.288675


In [13]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
#df_annotations_stress = df_annotations_stress.set_index(['video_name','diapo'])
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,0.0
238,WIN_20210417_14_53_12_Pro,17,1.0


In [14]:
Xy = X.merge(df_annotations_stress, how='inner', on=['video_name','diapo'])
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])
y = Xy.iloc[:,-1]

In [15]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,"(ypredict, mean)","(ypredict, min)","(ypredict, max)","(ypredict, median)","(ypredict, std)","(ypredict, percentil25)","(ypredict, percentil75)","(ypredict, kurtosis)","(ypredict, skew)"
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Test_pour_AFPA,1,0.823529,0,1,1.0,0.386953,1.0,1.0,0.880952,-1.697337
Test_pour_AFPA,8,0.800000,0,1,1.0,0.421637,1.0,1.0,0.250000,-1.500000
Test_pour_AFPA,9,1.050000,1,2,1.0,0.223607,1.0,1.0,15.052632,4.129483
Test_pour_AFPA,10,1.000000,1,1,1.0,0.000000,1.0,1.0,-3.000000,0.000000
Test_pour_AFPA,11,0.950000,0,1,1.0,0.223607,1.0,1.0,15.052632,-4.129483
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.571429,0,1,1.0,0.513553,0.0,1.0,-1.916667,-0.288675
WIN_20210417_14_53_12_Pro,11,0.400000,0,1,0.0,0.502625,0.0,1.0,-1.833333,0.408248
WIN_20210417_14_53_12_Pro,12,0.848485,0,1,1.0,0.364110,1.0,1.0,1.778571,-1.943855
WIN_20210417_14_53_12_Pro,17,0.428571,0,1,0.0,0.534522,0.0,1.0,-1.916667,0.288675


In [26]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y


model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

#from sklearn.neighbors import KNeighborsClassifier
#model = KNeighborsClassifier(n_jobs=-1)
#parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestClassifier(random_state = 42, n_jobs=-1)
#parameters = {'n_estimators': [100, 150, 200], 'max_depth':[10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',#'accuracy_score',
                    cv=cv_loo, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 30 folds for each of 16 candidates, totalling 480 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.45559810559810554
Best params {'C': 2, 'class_weight': None}
accuracy (mean, std) 0.4666666666666667 0.20649186155606444
f1 (mean, std) 0.45559810559810554 0.22814809352070492
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:   12.7s finished


In [27]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','diapo','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo_2.csv')

In [28]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_predict)

array([[36, 65,  0,  0],
       [33, 76,  4,  0],
       [ 5, 20,  0,  0],
       [ 0,  1,  0,  0]])

In [29]:
print(X_no_name.columns[np.argsort(clf.best_estimator_.feature_importances_)[:-20:-1]])

AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

## Stress global

### En utilisant le stress prédit des diapos

In [6]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_diapo_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
ypredict_stress_diapo = df_ypredict.pivot_table(values='ypredict', columns='diapo',index='video_name',aggfunc='mean')


In [7]:
ypredict_stress_diapo # un peu nul - le modèle prédit 1 partout

diapo,1,8,9,10,11,12,17,18
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Test_pour_AFPA,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
Video_1,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
WIN_20210329_10_16_02_Pro,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
WIN_20210330_13_10_29_Pro,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
WIN_20210331_21_22_52_Pro,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,0.0,1.0,1.0,2.0,1.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
WIN_20210404_10_58_27_Pro,0.0,0.0,1.0,2.0,1.0,1.0,0.0,2.0


In [8]:
df_annotations_stress = pd.read_csv('annotations.csv')
#df_annotations_stress = df_annotations_stress.drop(df_annotations_stress.columns[-1],axis=1).set_index('video_name').stack()
#df_annotations_stress = pd.DataFrame(df_annotations_stress).reset_index()
#df_annotations_stress.columns = ['video_name','diapo','stress']
df_annotations_stress = df_annotations_stress.set_index(['video_name'])
#df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0_level_0,1,8,9,10,11,12,17,18,stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# En utilisant les annotations dees stress des diapos comme X
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']}

X = df_annotations_stress.iloc[:,:-1]
#X = ypredict_stress_diapo
y = df_annotations_stress.iloc[:,-1]

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)
clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',
                    cv=5, verbose=1)
clf.fit(X, y)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)

# Prediction
clf.best_estimator_.fit(X,y)
ypredict_stress_global = clf.best_estimator_.predict(ypredict_stress_diapo)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.7651587301587301
Best params {'C': 0.1, 'class_weight': None}
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    0.9s finished


In [10]:
df_ypredict_stress_global = pd.concat([ypredict_stress_diapo.reset_index(), pd.DataFrame(ypredict_stress_global,columns=['predicted_stress_global'])], axis=1) 
df_ypredict_stress_global = df_ypredict_stress_global.set_index('video_name').sort_index()
df_ypredict_stress_global = df_ypredict_stress_global.iloc[:,-1]

In [11]:
from sklearn.metrics import accuracy_score, f1_score
print('Accuracy',accuracy_score(y.sort_index().values,df_ypredict_stress_global))
print('F1',f1_score(y.sort_index().values,df_ypredict_stress_global, average='weighted'))

Accuracy 0.43333333333333335
F1 0.40387464387464383


In [12]:
pd.concat([y, df_ypredict_stress_global],axis=1)

Unnamed: 0_level_0,stress_global,predicted_stress_global
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Test_pour_AFPA,1.0,1.0
Video_1,0.0,1.0
WIN_20210323_19_17_40_Pro,1.0,0.0
WIN_20210329_10_16_02_Pro,1.0,0.0
WIN_20210330_13_10_29_Pro,0.0,1.0
WIN_20210331_21_22_52_Pro,1.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0
WIN_20210402_19_04_53_Pro,2.0,2.0
WIN_20210403_18_49_15_Pro,2.0,1.0
WIN_20210404_10_58_27_Pro,1.0,1.0


#### Autre méthode

In [15]:
# En utilisant les prédictions des stress des diapos comme X (et non les annotations)
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

#X = df_annotations_stress.iloc[:,:-1]
X = ypredict_stress_diapo
y = df_annotations_stress.iloc[:,-1]

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2], 'class_weight' : [None, 'balanced']}

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)
clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',
                    cv=cv_loo, verbose=1)
clf.fit(X, y)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])



Fitting 30 folds for each of 12 candidates, totalling 360 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.4 0.4898979485566357
f1 (mean, std) 0.4 0.4898979485566357
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:    5.0s finished


In [17]:
# Getting predictions with a leave one interview out
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)

model = clf.best_estimator_

ytest_predict = np.zeros(len(y))
for train_index, test_index in loo.split(X, y, groups):
    #print(train_index, test_index)
    Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
    ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

    model.fit(Xtrain, ytrain)
    ytest_predict_temp = model.predict(Xtest)
    #print(ytest_predict_temp.shape)
    #print(test_index.shape)
    ytest_predict[test_index] = ytest_predict_temp
y_predict = ytest_predict

df_ypredict = pd.concat([X.reset_index()[['video_name']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo_2bis.csv')

### En utilisant le stress prédit des time windows 5s

In [18]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
for col_number in range(len(df_ypredict.columns)):
    df_ypredict.iloc[:,col_number] = df_ypredict.iloc[:,col_number] / df_ypredict_sum
#df_ypredict = df_ypredict.reset_index()
df_ypredict


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.185185,0.798942,0.015873
Video_1,0.357616,0.642384,0.0
WIN_20210323_19_17_40_Pro,0.303571,0.690476,0.005952
WIN_20210329_10_16_02_Pro,0.190476,0.809524,0.0
WIN_20210330_13_10_29_Pro,0.271523,0.728477,0.0
WIN_20210331_21_22_52_Pro,0.329412,0.670588,0.0
WIN_20210402_14_27_50_Pro,0.005376,0.994624,0.0
WIN_20210402_19_04_53_Pro,0.897143,0.08,0.022857
WIN_20210403_18_49_15_Pro,0.287293,0.712707,0.0
WIN_20210404_10_58_27_Pro,0.81,0.115,0.075


In [71]:
#df_ypredict[['video_name','diapo','ypredict']].groupby(['video_name','diapo']).agg({'mean','min','max', 'median', 'std', #percentil25, percentil75, kurtosis, skew})

Unnamed: 0_level_0,Unnamed: 1_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,Unnamed: 1_level_1,max,std,mean,percentil75,percentil25,skew,min,median,kurtosis
video_name,diapo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Test_pour_AFPA,1,1,0.386953,0.823529,1.0,1.0,-1.697337,0,1.0,0.880952
Test_pour_AFPA,8,1,0.421637,0.800000,1.0,1.0,-1.500000,0,1.0,0.250000
Test_pour_AFPA,9,2,0.223607,1.050000,1.0,1.0,4.129483,1,1.0,15.052632
Test_pour_AFPA,10,1,0.000000,1.000000,1.0,1.0,0.000000,1,1.0,-3.000000
Test_pour_AFPA,11,1,0.223607,0.950000,1.0,1.0,-4.129483,0,1.0,15.052632
...,...,...,...,...,...,...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,1,0.513553,0.571429,1.0,0.0,-0.288675,0,1.0,-1.916667
WIN_20210417_14_53_12_Pro,11,1,0.502625,0.400000,1.0,0.0,0.408248,0,0.0,-1.833333
WIN_20210417_14_53_12_Pro,12,1,0.364110,0.848485,1.0,1.0,-1.943855,0,1.0,1.778571
WIN_20210417_14_53_12_Pro,17,1,0.534522,0.428571,1.0,0.0,0.288675,0,0.0,-1.916667


In [19]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [20]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]

In [21]:
Xy

Unnamed: 0,video_name,0,1,2,stress_global
0,Test_pour_AFPA,0.185185,0.798942,0.015873,1.0
1,Video_1,0.357616,0.642384,0.0,0.0
2,WIN_20210323_19_17_40_Pro,0.303571,0.690476,0.005952,1.0
3,WIN_20210329_10_16_02_Pro,0.190476,0.809524,0.0,1.0
4,WIN_20210330_13_10_29_Pro,0.271523,0.728477,0.0,0.0
5,WIN_20210331_21_22_52_Pro,0.329412,0.670588,0.0,1.0
6,WIN_20210402_14_27_50_Pro,0.005376,0.994624,0.0,1.0
7,WIN_20210402_19_04_53_Pro,0.897143,0.08,0.022857,2.0
8,WIN_20210403_18_49_15_Pro,0.287293,0.712707,0.0,2.0
9,WIN_20210404_10_58_27_Pro,0.81,0.115,0.075,1.0


In [22]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='accuracy_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.4999999999999999
Best params {'C': 10, 'class_weight': None}
accuracy (mean, std) 0.4999999999999999 0.14907119849998596
f1 (mean, std) 0.4666666666666666 0.14622830112321866
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.8s finished


In [23]:
clf.best_estimator_.coef_

array([[-0.09728546, -0.44356063,  0.54014903],
       [-0.32105667,  1.15486685, -0.83380717],
       [ 0.41834213, -0.71130622,  0.29365814]])

In [24]:
clf.best_estimator_.intercept_

array([ 0.51909082, -0.61004525,  0.09095443])

In [25]:
ypredict_stress_global = clf.best_estimator_.predict(df_ypredict)

In [26]:
df_ypredict_stress_global = pd.concat([ypredict_stress_diapo.reset_index(), pd.DataFrame(ypredict_stress_global,columns=['predicted_stress_global'])], axis=1) 
df_ypredict_stress_global = df_ypredict_stress_global.set_index('video_name').sort_index()
df_ypredict_stress_global = df_ypredict_stress_global.iloc[:,-1]

In [27]:
df_ypredict_stress_global

video_name
Test_pour_AFPA                     1.0
Video_1                            0.0
WIN_20210323_19_17_40_Pro          0.0
WIN_20210329_10_16_02_Pro          1.0
WIN_20210330_13_10_29_Pro          0.0
WIN_20210331_21_22_52_Pro          0.0
WIN_20210402_14_27_50_Pro          1.0
WIN_20210402_19_04_53_Pro          2.0
WIN_20210403_18_49_15_Pro          0.0
WIN_20210404_10_58_27_Pro          0.0
WIN_20210404_21_41_12_Pro          0.0
WIN_20210405_15_09_16_Pro          0.0
WIN_20210406_15_06_15_Pro          1.0
WIN_20210406_18_35_52_Pro          0.0
WIN_20210406_18_49_10_Pro          1.0
WIN_20210406_21_05_52_Pro          0.0
WIN_20210407_09_04_05_Pro          0.0
WIN_20210407_14_54_56_Pro_edit2    0.0
WIN_20210408_11_48_58_Pro          0.0
WIN_20210408_14_00_44_Pro          1.0
WIN_20210408_14_02_19_Pro          0.0
WIN_20210408_14_11_32_Pro          1.0
WIN_20210408_15_20_51_Pro          0.0
WIN_20210408_16_04_32_Pro          1.0
WIN_20210409_10_26_11_Pro          0.0
WIN_20210413_1

In [28]:
from sklearn.metrics import accuracy_score, f1_score
print('Accuracy',accuracy_score(y.sort_index().values,df_ypredict_stress_global))
print('F1',f1_score(y.sort_index().values,df_ypredict_stress_global, average='weighted'))

Accuracy 0.6
F1 0.5733333333333334


In [29]:
pd.concat([y, df_ypredict_stress_global.reset_index()],axis=1)

Unnamed: 0,stress_global,video_name,predicted_stress_global
0,1.0,Test_pour_AFPA,1.0
1,0.0,Video_1,0.0
2,1.0,WIN_20210323_19_17_40_Pro,0.0
3,1.0,WIN_20210329_10_16_02_Pro,1.0
4,0.0,WIN_20210330_13_10_29_Pro,0.0
5,1.0,WIN_20210331_21_22_52_Pro,0.0
6,1.0,WIN_20210402_14_27_50_Pro,1.0
7,2.0,WIN_20210402_19_04_53_Pro,2.0
8,2.0,WIN_20210403_18_49_15_Pro,0.0
9,1.0,WIN_20210404_10_58_27_Pro,0.0


### Autre approche 

In [30]:
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5_2.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
#df_ypredict = df_ypredict[['video_name','diapo','ypredict']].groupby(['video_name','diapo']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew})
df_ypredict = df_ypredict[['video_name','ypredict']].groupby(['video_name']).agg({'mean','min','max', 'median', 'std', percentil25, percentil75, kurtosis, skew})
df_ypredict


Unnamed: 0_level_0,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict,ypredict
Unnamed: 0_level_1,std,skew,max,mean,median,percentil75,percentil25,kurtosis,min
video_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Test_pour_AFPA,0.416303,-1.07429,2.0,0.830688,1.0,1.0,1.0,0.987649,0.0
Video_1,0.480893,-0.594136,1.0,0.642384,1.0,1.0,0.0,-1.647003,0.0
WIN_20210323_19_17_40_Pro,0.471455,-0.712361,2.0,0.702381,1.0,1.0,0.0,-1.029843,0.0
WIN_20210329_10_16_02_Pro,0.394019,-1.576482,1.0,0.809524,1.0,1.0,1.0,0.485294,0.0
WIN_20210330_13_10_29_Pro,0.446225,-1.02745,1.0,0.728477,1.0,1.0,0.0,-0.944346,0.0
WIN_20210331_21_22_52_Pro,0.471388,-0.725908,1.0,0.670588,1.0,1.0,0.0,-1.473058,0.0
WIN_20210402_14_27_50_Pro,0.073324,-13.527949,1.0,0.994624,1.0,1.0,1.0,181.005405,0.0
WIN_20210402_19_04_53_Pro,0.395625,3.293186,2.0,0.125714,0.0,0.0,0.0,10.58608,0.0
WIN_20210403_18_49_15_Pro,0.453755,-0.940144,1.0,0.712707,1.0,1.0,0.0,-1.11613,0.0
WIN_20210404_10_58_27_Pro,0.588649,2.085987,2.0,0.265,0.0,0.0,0.0,3.033203,0.0


In [31]:
df_annotations_stress = pd.read_csv('annotations.csv')
df_annotations_stress = df_annotations_stress[['video_name','stress_global']]

In [32]:
Xy = df_ypredict.merge(df_annotations_stress, on='video_name')
X = Xy.iloc[:,:-1].set_index('video_name')
y = Xy.iloc[:,-1]
Xy

Unnamed: 0,video_name,"(ypredict, std)","(ypredict, skew)","(ypredict, max)","(ypredict, mean)","(ypredict, median)","(ypredict, percentil75)","(ypredict, percentil25)","(ypredict, kurtosis)","(ypredict, min)",stress_global
0,Test_pour_AFPA,0.416303,-1.07429,2.0,0.830688,1.0,1.0,1.0,0.987649,0.0,1.0
1,Video_1,0.480893,-0.594136,1.0,0.642384,1.0,1.0,0.0,-1.647003,0.0,0.0
2,WIN_20210323_19_17_40_Pro,0.471455,-0.712361,2.0,0.702381,1.0,1.0,0.0,-1.029843,0.0,1.0
3,WIN_20210329_10_16_02_Pro,0.394019,-1.576482,1.0,0.809524,1.0,1.0,1.0,0.485294,0.0,1.0
4,WIN_20210330_13_10_29_Pro,0.446225,-1.02745,1.0,0.728477,1.0,1.0,0.0,-0.944346,0.0,0.0
5,WIN_20210331_21_22_52_Pro,0.471388,-0.725908,1.0,0.670588,1.0,1.0,0.0,-1.473058,0.0,1.0
6,WIN_20210402_14_27_50_Pro,0.073324,-13.527949,1.0,0.994624,1.0,1.0,1.0,181.005405,0.0,1.0
7,WIN_20210402_19_04_53_Pro,0.395625,3.293186,2.0,0.125714,0.0,0.0,0.0,10.58608,0.0,2.0
8,WIN_20210403_18_49_15_Pro,0.453755,-0.940144,1.0,0.712707,1.0,1.0,0.0,-1.11613,0.0,2.0
9,WIN_20210404_10_58_27_Pro,0.588649,2.085987,2.0,0.265,0.0,0.0,0.0,3.033203,0.0,1.0


In [35]:
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20], 'class_weight' : [None, 'balanced']}
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_jobs=-1)
parameters = {'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12,  15, 20], 'weights' : ['uniform', 'distance'], 'p': [1, 2]}

#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestClassifier(random_state = 42, n_jobs=-1)
#parameters = {'n_estimators': [50, 100, 200], 'max_depth':[3,4, 5,6, 10, 15, 20], 'class_weight':[None,'balanced']}

clf = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    scoring={'accuracy_score' : 'accuracy', 'f1_score' : 'f1_weighted' }, 
                    refit='f1_score',
                    cv=5, verbose=1)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][clf.best_index_], clf.cv_results_['std_test_accuracy_score'][clf.best_index_])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][clf.best_index_], clf.cv_results_['std_test_f1_score'][clf.best_index_])

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.385079365079365
Best params {'n_neighbors': 6, 'p': 1, 'weights': 'uniform'}
accuracy (mean, std) 0.4666666666666667 0.19436506316151003
f1 (mean, std) 0.385079365079365 0.20965027448427734
[Parallel(n_jobs=1)]: Done 220 out of 220 | elapsed:    2.3s finished
