In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import pathlib

## Loading data

In [2]:
# Tag of the audio feature set; it is embedded in every input/output file name below.
features = 'emobase'
# Folder (relative to this notebook) that holds the pickled feature tables.
directory_path = '../../04_-_Dev/videos'

In [None]:
# Load the frame-level audio feature table for the selected feature set.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df_total = pd.read_pickle(f'{directory_path}/audio_{features}_data.p')

In [4]:
# Display the (rows, columns) dimensions of the loaded table.
df_total.shape

(2573701, 60)

In [5]:
# Number of distinct videos present in the table.
df_total['video_name'].nunique()

30

In [7]:
# Show every row that contains at least one missing value.
df_total.loc[df_total.isna().any(axis=1)]

Unnamed: 0,frameIndex,frameTime,pcm_intensity_sma,pcm_loudness_sma,mfcc_sma[1],mfcc_sma[2],mfcc_sma[3],mfcc_sma[4],mfcc_sma[5],mfcc_sma[6],...,pcm_zcr_sma_de,voiceProb_sma_de,F0_sma_de,F0env_sma_de,video_name,stress_global,type_candidat,sexe,stress,diapo


## Data processing

In [8]:
# Width of the aggregation windows, in the same unit as `frameTime`.
time_window = 5
# Label every frame with the start of its window: floor frameTime down to a multiple of
# time_window. This is computed on the whole column at once; the previous row-wise
# `apply` with a Python lambda produced the same values one frame at a time, which is
# needlessly slow on a table with millions of rows.
df_total['frameTimeWindow'] = (
    np.floor(df_total['frameTime'] / time_window) * time_window
).astype(int)

In [10]:
from scipy.stats import kurtosis, skew

def percentil25(x):
    """Return the 25th percentile (first quartile) of the values in ``x``."""
    return np.quantile(x, 0.25)

def percentil75(x):
    """Return the 75th percentile (third quartile) of the values in ``x``."""
    return np.quantile(x, 0.75)

In [11]:
# 5 seconds windows: summarise every column over each (video_name, diapo, frameTimeWindow) group.
# The statistics are passed as a list rather than a set: iteration order of a set of
# strings/functions is not reproducible between Python sessions, so the column order of X
# (and anything that depends on it) could change from one run to the next.
X = (
    df_total.iloc[:, 2:]  # skip the first two columns of the raw table
    .groupby(['video_name', 'diapo', 'frameTimeWindow'])
    .agg(['mean', 'min', 'max', 'median', 'std', percentil25, percentil75, kurtosis, skew])
    # Drop the last 18 aggregated columns, i.e. 9 statistics for each of the last two
    # source columns. NOTE(review): presumably these are the stress labels - confirm,
    # a positional drop silently breaks if the column layout changes.
    .iloc[:, :-18]
)

In [16]:
# Target: the minimum `stress` value of each (video_name, diapo, frameTimeWindow) group,
# returned as a Series named 'stress' indexed by the three grouping keys.
# The column is selected by name; the earlier positional `iloc[:,3:]` copied almost the
# whole frame for nothing and did not match the `iloc[:,2:]` slice used to build X.
y = df_total.groupby(['video_name', 'diapo', 'frameTimeWindow'])['stress'].min()

In [18]:
# Persist the windowed features and targets so later sections can reload them directly.
X.to_pickle(f'{directory_path}/audio_{features}_tw5_data_X.p')
y.to_pickle(f'{directory_path}/audio_{features}_tw5_data_y.p')

In [21]:
from scipy.stats import kurtosis, skew

# Same windowed aggregation, restricted to the slides listed below (the output files
# label this subset "questions only").
# The filtered frame is grouped once and reused for features and target instead of
# repeating the filter + groupby twice on the full table.
question_groups = (
    df_total.loc[df_total['diapo'].isin([8, 9, 10, 11, 17]), :]
    .iloc[:, 2:]
    .groupby(['video_name', 'diapo', 'frameTimeWindow'])
)
# Statistics are given as a list (not a set) so the column order is reproducible.
# The last 18 aggregated columns (9 statistics x last two source columns) are dropped.
X_audio = question_groups.agg(
    ['mean', 'min', 'max', 'median', 'std', percentil25, percentil75, kurtosis, skew]
).iloc[:, :-18]
# NOTE(review): the target uses the group *mean* here, whereas the all-slides target
# uses the group minimum - confirm this difference is intended.
y_audio = question_groups.agg({'stress': 'mean'}).iloc[:, -1]
# Release the reference to the filtered copy of the frame.
del question_groups

In [22]:
# Persist the questions-only features and targets.
X_audio.to_pickle(f'{directory_path}/audio_{features}_tw5_data_X_audio_questions_only.p')
y_audio.to_pickle(f'{directory_path}/audio_{features}_tw5_data_y_audio_questions_only.p')

## Modèles

In [3]:
# Reload the cached 5 s window datasets (all slides, then questions only).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
X = pd.read_pickle(f'{directory_path}/audio_{features}_tw5_data_X.p')
y = pd.read_pickle(f'{directory_path}/audio_{features}_tw5_data_y.p')

X_audio = pd.read_pickle(f'{directory_path}/audio_{features}_tw5_data_X_audio_questions_only.p')
y_audio = pd.read_pickle(f'{directory_path}/audio_{features}_tw5_data_y_audio_questions_only.p')

In [4]:
# Number of distinct videos in the full dataset and in the questions-only subset.
(
    X.index.get_level_values('video_name').nunique(),
    X_audio.index.get_level_values('video_name').nunique(),
)

(30, 30)

## Stress par diapos
### All diapos

In [8]:
import warnings

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, cross_val_predict

# Silences every warning from here on (global filter: it also affects later cells).
warnings.filterwarnings("ignore")

# Hyper-parameter grid explored for the random forest.
parameters = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [10, 15, 20, 25, 30],
}

# Leave-one-group-out splits with the video name as group: all windows of a video
# are held out together.
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Both metrics are recorded; the negative MSE decides which candidate is refitted.
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring={'neg_mean_squared_error': 'neg_mean_squared_error', 'r2': 'r2'},
    refit='neg_mean_squared_error',
    cv=cv_loo,
    verbose=1,
)
clf.fit(X_no_name, y_no_name)
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)

Fitting 30 folds for each of 25 candidates, totalling 750 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [10]:
# Getting out-of-fold predictions with a leave-one-interview-out scheme.
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)  # not consumed below; kept for cells that may reference it

#model = RandomForestClassifier(random_state = 42, n_jobs=-1, max_depth= 10, n_estimators= 50)
model = clf.best_estimator_

# cross_val_predict fits a fresh clone of `model` on each training split and predicts
# the held-out video. The previous hand-written loop called model.fit() directly on
# clf.best_estimator_, so the grid search's refitted model was overwritten by a model
# trained on the last fold only.
# The cast to float keeps the same dtype as the former np.zeros buffer (and hence the
# same CSV formatting) whatever the estimator returns.
ytest_predict = cross_val_predict(model, X, y, groups=groups, cv=loo).astype(float)
y_predict = ytest_predict

# One row per window: grouping keys + out-of-fold prediction, saved for the next sections.
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo','frameTimeWindow']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5.csv')

In [6]:
# Reload the saved window-level predictions; the first CSV column is the index that
# to_csv wrote, so it is discarded.
df_ypredict = pd.read_csv(f'ypredict_{features}_tw5.csv').iloc[:, 1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
# NOTE(review): astype(int) truncates toward zero; if the stored predictions are
# fractional (regressor output) this is not a rounding to the nearest class - confirm.
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)


In [7]:
# For every (video_name, diapo), count how many windows fall in each predicted class
# (one column per class, 0 where a class never occurs).
X = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index=['video_name','diapo'], aggfunc='count', fill_value=0)
# Total number of windows per row.
X_sum = X.sum(axis=1).values.copy()
# Turn counts into per-row proportions in one vectorised division. The former loop
# wrote float results column by column into the integer count columns through iloc,
# which relies on implicit dtype upcasting during assignment.
X = X.div(X_sum, axis=0)
X
 

Unnamed: 0_level_0,ypredict,0,1,2
video_name,diapo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_pour_AFPA,1,0.088235,0.147059,0.764706
Test_pour_AFPA,8,0.100000,0.600000,0.300000
Test_pour_AFPA,9,0.050000,0.800000,0.150000
Test_pour_AFPA,10,0.000000,0.733333,0.266667
Test_pour_AFPA,11,0.050000,0.750000,0.200000
...,...,...,...,...
WIN_20210417_14_53_12_Pro,10,0.285714,0.714286,0.000000
WIN_20210417_14_53_12_Pro,11,0.500000,0.500000,0.000000
WIN_20210417_14_53_12_Pro,12,0.151515,0.848485,0.000000
WIN_20210417_14_53_12_Pro,17,0.571429,0.428571,0.000000


In [11]:
# Reshape the wide annotation table into long format: one row per (video_name, diapo)
# with its stress label.
df_annotations_stress = pd.read_csv('annotations.csv')
# The last column is removed before stacking, so only the remaining columns become rows.
df_annotations_stress = (
    df_annotations_stress
    .drop(df_annotations_stress.columns[-1], axis=1)
    .set_index('video_name')
    .stack()
)
df_annotations_stress = df_annotations_stress.to_frame().reset_index()
df_annotations_stress.columns = ['video_name','diapo','stress']
# The stacked column labels become the `diapo` values; convert them to integers.
df_annotations_stress['diapo'] = df_annotations_stress['diapo'].astype(int)
df_annotations_stress

Unnamed: 0,video_name,diapo,stress
0,Test_pour_AFPA,1,1.0
1,Test_pour_AFPA,8,1.0
2,Test_pour_AFPA,9,0.0
3,Test_pour_AFPA,10,0.0
4,Test_pour_AFPA,11,0.0
...,...,...,...
235,WIN_20210417_14_53_12_Pro,10,0.0
236,WIN_20210417_14_53_12_Pro,11,0.0
237,WIN_20210417_14_53_12_Pro,12,0.0
238,WIN_20210417_14_53_12_Pro,17,1.0


In [22]:
# Attach the annotated stress label to each (video_name, diapo) row; rows without a
# match on either side are discarded (inner join).
Xy = X.merge(df_annotations_stress, on=['video_name','diapo'], how='inner')
# Last column -> target, everything before it -> features indexed by (video_name, diapo).
# NOTE(review): this cell overwrites X and y, so it cannot be re-run on its own output.
y = Xy.iloc[:,-1]
X = Xy.iloc[:,:-1].set_index(['video_name','diapo'])

In [29]:
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, cross_val_predict

# Silences every warning from here on (global filter: it also affects later cells).
warnings.filterwarnings("ignore")

# Regularisation strengths and class weighting schemes to compare.
parameters = {
    'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20],
    'class_weight': [None, 'balanced'],
}

# Leave-one-group-out splits with the video name as group.
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

# Accuracy and weighted F1 are both recorded; accuracy selects the refitted model.
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring={'accuracy_score': 'accuracy', 'f1_score': 'f1_weighted'},
    refit='accuracy_score',
    cv=cv_loo,
    verbose=1,
)
clf.fit(X_no_name, y_no_name)

# Report the selected candidate with the mean/std of both metrics across folds.
best = clf.best_index_
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][best], clf.cv_results_['std_test_accuracy_score'][best])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][best], clf.cv_results_['std_test_f1_score'][best])

Fitting 30 folds for each of 16 candidates, totalling 480 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.45
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.45 0.24706611800622655
f1 (mean, std) 0.32875735375735365 0.25289381017017226
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:    9.8s finished


In [31]:
# Getting out-of-fold predictions with a leave-one-interview-out scheme.
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from sklearn.linear_model import LogisticRegression

groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)  # not consumed below; kept for cells that may reference it

model = clf.best_estimator_

# cross_val_predict fits a fresh clone of `model` on each training split and predicts
# the held-out video. The previous hand-written loop called model.fit() directly on
# clf.best_estimator_, so the grid search's refitted model was overwritten by a model
# trained on the last fold only.
# The cast to float keeps the same dtype as the former np.zeros buffer (and hence the
# same CSV formatting).
ytest_predict = cross_val_predict(model, X, y, groups=groups, cv=loo).astype(float)
y_predict = ytest_predict

# One row per (video_name, diapo) with its out-of-fold prediction, saved for later use.
df_ypredict = pd.concat([X.reset_index()[['video_name','diapo']],pd.DataFrame(y_predict, columns=['ypredict'])],axis=1)
df_ypredict.columns = ['video_name','diapo','ypredict']
df_ypredict.to_csv('ypredict_' + features + '_tw5_diapo.csv')

In [28]:
import warnings

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, cross_val_predict

# Silences every warning from here on (global filter: it also affects later cells).
warnings.filterwarnings("ignore")

# Hyper-parameter grid for the random forest classifier.
# (A smaller grid tried earlier: n_estimators in [50, 100], max_depth in [3, 4, 5, 6].)
parameters = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 4, 5, 10, 15, 20, 25],
}

# Leave-one-group-out splits with the video name as group.
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Accuracy and weighted F1 are both recorded; accuracy selects the refitted model.
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring={'accuracy_score': 'accuracy', 'f1_score': 'f1_weighted'},
    refit='accuracy_score',
    cv=cv_loo,
    verbose=1,
)
clf.fit(X_no_name, y_no_name)

# Report the selected candidate with the mean/std of both metrics across folds.
best = clf.best_index_
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][best], clf.cv_results_['std_test_accuracy_score'][best])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][best], clf.cv_results_['std_test_f1_score'][best])

Fitting 30 folds for each of 28 candidates, totalling 840 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 840 out of 840 | elapsed:  2.1min finished
Best results 0.4083333333333333
Best params {'max_depth': 15, 'n_estimators': 200}
accuracy (mean, std) 0.4083333333333333 0.21874007914011145
f1 (mean, std) 0.39981900044400037 0.2234510476716193


## Stress global

### En utilisant le stress prédit des diapos

In [36]:
# Reload the slide-level predictions; the first CSV column is the index written by
# to_csv, so it is discarded.
df_ypredict = pd.read_csv(f'ypredict_{features}_tw5_diapo.csv').iloc[:, 1:]
df_ypredict.columns = ['video_name','diapo','ypredict']
# Wide table: one row per video, one column per slide, mean prediction in each cell.
ypredict_stress_diapo = df_ypredict.pivot_table(
    index='video_name',
    columns='diapo',
    values='ypredict',
    aggfunc='mean',
)


In [37]:
ypredict_stress_diapo # not great - the model predicts 1 everywhere

diapo,1,8,9,10,11,12,17,18
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Test_pour_AFPA,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Video_1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210323_19_17_40_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210329_10_16_02_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210330_13_10_29_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210331_21_22_52_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210402_19_04_53_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
WIN_20210404_10_58_27_Pro,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [38]:
# Load the annotations and keep them in their original wide layout.
df_annotations_stress = pd.read_csv('annotations.csv')
# The reshaping to long format (drop last column, stack, rename to
# video_name/diapo/stress, cast diapo to int) is intentionally not applied here.
df_annotations_stress

Unnamed: 0,video_name,1,8,9,10,11,12,17,18,stress_global
0,Test_pour_AFPA,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,Video_1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,WIN_20210323_19_17_40_Pro,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,WIN_20210329_10_16_02_Pro,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,WIN_20210330_13_10_29_Pro,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,WIN_20210331_21_22_52_Pro,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
6,WIN_20210402_14_27_50_Pro,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
7,WIN_20210402_19_04_53_Pro,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0
8,WIN_20210403_18_49_15_Pro,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0
9,WIN_20210404_10_58_27_Pro,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### En utilisant le stress prédit des time windows 5s

In [43]:
# Reload the window-level predictions; the first CSV column is the index written by
# to_csv, so it is discarded.
df_ypredict = pd.read_csv('ypredict_' + features + '_tw5.csv')
df_ypredict = df_ypredict.iloc[:,1:]
df_ypredict.columns = ['video_name','diapo','frameTimeWindow','ypredict']
# NOTE(review): astype(int) truncates toward zero; confirm the stored predictions are
# already class labels and not fractional regressor outputs.
df_ypredict['ypredict'] = df_ypredict['ypredict'].astype(int)
# Per video, count the windows predicted in each class (0 where a class never occurs).
df_ypredict = df_ypredict.pivot_table(values='frameTimeWindow', columns='ypredict', index='video_name', aggfunc='count', fill_value=0)
# Total number of windows per video.
df_ypredict_sum = df_ypredict.sum(axis=1).values.copy()
# Turn counts into per-video proportions in one vectorised division. The former loop
# wrote float results column by column into the integer count columns through iloc,
# which relies on implicit dtype upcasting during assignment.
df_ypredict = df_ypredict.div(df_ypredict_sum, axis=0)
df_ypredict = df_ypredict.reset_index()


ypredict,0,1,2
video_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Test_pour_AFPA,0.15873,0.433862,0.407407
Video_1,0.350993,0.649007,0.0
WIN_20210323_19_17_40_Pro,0.321429,0.678571,0.0
WIN_20210329_10_16_02_Pro,0.231293,0.768707,0.0
WIN_20210330_13_10_29_Pro,0.264901,0.735099,0.0
WIN_20210331_21_22_52_Pro,0.329412,0.670588,0.0
WIN_20210402_14_27_50_Pro,0.005376,0.978495,0.016129
WIN_20210402_19_04_53_Pro,0.88,0.12,0.0
WIN_20210403_18_49_15_Pro,0.19337,0.80663,0.0
WIN_20210404_10_58_27_Pro,0.84,0.11,0.05


In [48]:
# Keep only the video identifier and its global stress label from the annotations.
df_annotations_stress = pd.read_csv('annotations.csv').loc[:, ['video_name','stress_global']]

In [50]:
# Join the per-video class proportions with the global stress label (inner join on video_name).
Xy = df_ypredict.merge(df_annotations_stress, on='video_name', how='inner')
# Last column -> target, everything before it -> features indexed by video_name.
y = Xy.iloc[:,-1]
X = Xy.iloc[:,:-1].set_index('video_name')

In [51]:
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, cross_val_predict

# Silences every warning from here on (global filter: it also affects later cells).
warnings.filterwarnings("ignore")

# Regularisation strengths and class weighting schemes to compare.
parameters = {
    'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 10, 20],
    'class_weight': [None, 'balanced'],
}

# NOTE(review): the leave-one-group-out objects below are built but the grid search
# is run with cv=5, so they are not used - confirm which scheme is intended.
groups = X.reset_index()['video_name']
loo = LeaveOneGroupOut()
cv_loo = loo.split(X, y, groups)
X_no_name = X
y_no_name = y

model = LogisticRegression(multi_class='multinomial', fit_intercept=True, random_state=42)

# Accuracy and weighted F1 are both recorded; accuracy selects the refitted model.
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring={'accuracy_score': 'accuracy', 'f1_score': 'f1_weighted'},
    refit='accuracy_score',
    cv=5,
    verbose=1,
)
clf.fit(X_no_name, y_no_name)

# Report the selected candidate with the mean/std of both metrics across folds.
best = clf.best_index_
print('Best results', clf.best_score_)
print('Best params', clf.best_params_)
print('accuracy (mean, std)', clf.cv_results_['mean_test_accuracy_score'][best], clf.cv_results_['std_test_accuracy_score'][best])
print('f1 (mean, std)', clf.cv_results_['mean_test_f1_score'][best], clf.cv_results_['std_test_f1_score'][best])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Best results 0.39999999999999997
Best params {'C': 0.01, 'class_weight': None}
accuracy (mean, std) 0.39999999999999997 0.08164965809277261
f1 (mean, std) 0.2333333333333333 0.0816496580927726
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.0s finished


In [52]:
# Display the coefficients of the model refitted by the grid search.
clf.best_estimator_.coef_

array([[ 0.00145937, -0.00313367,  0.00167404],
       [-0.00639217,  0.00611232,  0.00028031],
       [ 0.0049328 , -0.00297866, -0.00195435]])

In [53]:
# Display the intercepts of the model refitted by the grid search.
clf.best_estimator_.intercept_

array([ 0.19725725,  0.01208429, -0.20934153])

In [34]:
# Print the names of the features with the largest importances, most important first.
# The slice keeps 19 names. NOTE(review): if a top-20 was intended, use [:20].
# NOTE(review): this needs `clf` to wrap an estimator exposing feature_importances_
# (e.g. a random forest); it fails if the last fitted search used logistic regression.
print(X_no_name.columns[np.argsort(clf.best_estimator_.feature_importances_)[::-1][:19]])

MultiIndex([(      'mfcc_sma[8]', 'percentil75'),
            (   'lspFreq_sma[7]',      'median'),
            (      'mfcc_sma[3]',        'mean'),
            (      'mfcc_sma[7]', 'percentil75'),
            (      'mfcc_sma[3]',      'median'),
            (      'mfcc_sma[3]', 'percentil75'),
            (      'mfcc_sma[8]',      'median'),
            (      'mfcc_sma[7]',      'median'),
            ('lspFreq_sma_de[3]', 'percentil25'),
            (     'mfcc_sma[11]',        'mean'),
            ('lspFreq_sma_de[7]',         'min'),
            (   'lspFreq_sma[3]', 'percentil75'),
            (   'lspFreq_sma[5]',         'max'),
            ('lspFreq_sma_de[7]',         'std'),
            (     'mfcc_sma[10]', 'percentil75'),
            (     'mfcc_sma[11]', 'percentil75'),
            (      'mfcc_sma[7]',        'mean'),
            (   'lspFreq_sma[0]', 'percentil75'),
            (   'lspFreq_sma[3]',      'median')],
           )
