In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

from sklearn.metrics import confusion_matrix
from keras.layers import Dense, Activation, Dropout
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import random

from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.utils.class_weight import compute_class_weight
from itertools import combinations
import joblib

In [2]:
df1 = pd.read_csv('/Users/ohidabinteamin/Documents/Stress Prediction Project Three Datasets/StudentLife/week 01/Audio/raw_audio_features_studentlife.csv')
df1 = df1.rename(columns={'Date': 'date'})
df1 = df1.drop('Unnamed: 0', axis=1)
df2 = pd.read_csv('/Users/ohidabinteamin/Documents/Stress Prediction Project Three Datasets/StudentLife/week 01/Stress/recreating_dailystress_features.csv')
df2 = df2.drop('Unnamed: 0', axis=1)

In [3]:
df = pd.merge(df1, df2, on=['uid', 'date'])
print(df.columns)

df = df.sort_values(by='date')

Index(['uid', 'date', 'morning_noise_duration', 'morning_silent_duration',
       'morning_voice_duration', 'afternoon_noise_duration',
       'afternoon_silent_duration', 'afternoon_voice_duration',
       'evening_noise_duration', 'evening_silent_duration',
       'evening_voice_duration', 'night_noise_duration',
       'night_silent_duration', 'night_voice_duration', 'stress_ratings'],
      dtype='object')


In [4]:
df.isnull().sum()

uid                          0
date                         0
morning_noise_duration       0
morning_silent_duration      0
morning_voice_duration       0
afternoon_noise_duration     0
afternoon_silent_duration    0
afternoon_voice_duration     0
evening_noise_duration       0
evening_silent_duration      0
evening_voice_duration       0
night_noise_duration         0
night_silent_duration        0
night_voice_duration         0
stress_ratings               0
dtype: int64

In [5]:
df = df.dropna()
print(len(df))

1216


In [6]:
df.columns

Index(['uid', 'date', 'morning_noise_duration', 'morning_silent_duration',
       'morning_voice_duration', 'afternoon_noise_duration',
       'afternoon_silent_duration', 'afternoon_voice_duration',
       'evening_noise_duration', 'evening_silent_duration',
       'evening_voice_duration', 'night_noise_duration',
       'night_silent_duration', 'night_voice_duration', 'stress_ratings'],
      dtype='object')

In [7]:
len(df.columns)

15

In [8]:
df['stress_ratings'].value_counts()

stress_ratings
medium stress    484
low stress       366
high stress      366
Name: count, dtype: int64

In [9]:
binary_lh_data = df[df['stress_ratings'].isin(['low stress', 'high stress'])]

In [10]:
print(len(binary_lh_data['uid'].unique()))
binary_lh_data['uid'].value_counts()

46


uid
u59    54
u16    47
u19    41
u49    33
u10    29
u33    29
u04    27
u58    26
u00    25
u57    24
u08    23
u44    20
u43    20
u52    20
u46    18
u32    18
u22    17
u45    17
u35    16
u01    14
u12    14
u51    13
u02    13
u56    13
u36    12
u25    12
u03    12
u07    12
u24    11
u53    11
u30    11
u27    10
u42    10
u14     9
u54     9
u41     6
u31     6
u23     6
u47     5
u18     5
u17     4
u05     3
u34     2
u20     2
u15     2
u09     1
Name: count, dtype: int64

In [11]:
# uid_counts = binary_lh_data['uid'].value_counts()
# uids_to_keep = uid_counts[uid_counts >= 5].index
# binary_lh_data = binary_lh_data[binary_lh_data['uid'].isin(uids_to_keep)]

# print('Length of Data: ', len(binary_lh_data))
# binary_lh_data = binary_lh_data.sort_values(by=['uid', 'date'])
# print(binary_lh_data['uid'].unique())

In [12]:
X = binary_lh_data.drop(columns=['stress_ratings', 'uid', 'date'])
y = binary_lh_data['stress_ratings']
groups = binary_lh_data['uid']

stress_map = {'low stress': 0, 'high stress': 1}
y_encoded = y.map(stress_map).values 

In [13]:
print(f"Number of features in X: {X.shape[1]}")

Number of features in X: 12


In [14]:
logo = LeaveOneGroupOut()

best_thresholds = []
balanced_accs = []
auc_scores = []

In [15]:
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)
tf.random.set_seed(seed_value)

In [16]:
param_grid = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'sag']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'penalty': ['l1', 'l2'],
     'solver': ['liblinear']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'l1_ratio': [0.2, 0.5, 0.8],
     'penalty': ['elasticnet'],
     'solver': ['saga']}
]

In [17]:
scaler = StandardScaler()
model_logistic_rfe = LogisticRegression(max_iter=5000)

rfe = RFE(model_logistic_rfe, n_features_to_select=15)  
X_ = scaler.fit_transform(X)
rfe.fit(X_, y_encoded) 

best_features = np.where(rfe.support_)[0]
print(f"Selected Features: {best_features}")

num_splits = len(np.unique(groups))
print(num_splits)

with tqdm(total=num_splits, desc="LOSO CV Progress", unit="fold", bar_format="{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}] {percentage:3.0f}%") as pbar:
    for train_idx, test_idx in logo.split(X, y_encoded, groups=groups):
        X_train, X_test = X.iloc[train_idx, best_features], X.iloc[test_idx, best_features]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        print('X and y train shapes: ')
        print(X_train.shape)
        print(y_train.shape)

        print('X and y test shapes: ')
        print(X_test.shape)
        print(y_test.shape)

        scaler = StandardScaler()
        X_train_normalized = scaler.fit_transform(X_train)
        X_test_normalized = scaler.transform(X_test)
        
        model_logistic = GridSearchCV(LogisticRegression(max_iter=5000, class_weight='balanced'),
                                      param_grid=param_grid, cv=3, verbose=True, n_jobs=-1, scoring='roc_auc')
        if len(np.unique(y_train)) > 1 and len(np.unique(y_test)) > 1:
            model_logistic.fit(X_train_normalized, y_train)
    
            y_test_pred_proba = model_logistic.predict_proba(X_test_normalized)[:, 1]
            thresholds = np.arange(0.01, 1.0, 0.01)
            best_threshold = 0.5
            best_metric = 0.0
    
            for threshold in thresholds:
                y_test_pred_binary = (y_test_pred_proba > threshold).astype(int)
                metric_value = balanced_accuracy_score(y_test, y_test_pred_binary)
                if metric_value > best_metric:
                    best_metric = metric_value
                    best_threshold = threshold
            if len(np.unique(y_test)) > 1:
                auc_score = roc_auc_score(y_test, y_test_pred_proba)
                auc_scores.append(auc_score)
                print(f"AUC Score: {auc_score}")
            else:
                auc_scores.append(None)
                print(f"Skipping AUC computation for this fold as y_test contains only one class: {np.unique(y_test)}")
    
            y_test_pred_binary = (y_test_pred_proba > best_threshold).astype(int)
            balanced_acc = balanced_accuracy_score(y_test, y_test_pred_binary)
            balanced_accs.append(balanced_acc)
        
            print(f"Balanced Accuracy: {balanced_acc}")

        else:
            print(f"Skipping this subject for having single class: y_train = {np.unique(y_train)}, y_test = {np.unique(y_test)}")
        
        pbar.update(1)



Selected Features: [ 0  1  2  3  4  5  6  7  8  9 10 11]
46


LOSO CV Progress:   0%|                                    | 0/46 [00:00<?]   0%

X and y train shapes: 
(707, 12)
(707,)
X and y test shapes: 
(25, 12)
(25,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:   4%|█▎                              | 2/46 [00:01<00:19]   4%

AUC Score: 0.6346153846153846
Balanced Accuracy: 0.7147435897435898
X and y train shapes: 
(718, 12)
(718,)
X and y test shapes: 
(14, 12)
(14,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.7111111111111111
Balanced Accuracy: 0.6444444444444444
X and y train shapes: 
(719, 12)
(719,)
X and y test shapes: 
(13, 12)
(13,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:   9%|██▋                             | 4/46 [00:01<00:09]   9%

AUC Score: 0.475
Balanced Accuracy: 0.625
X and y train shapes: 
(720, 12)
(720,)
X and y test shapes: 
(12, 12)
(12,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5142857142857143
Balanced Accuracy: 0.5714285714285714
X and y train shapes: 
(705, 12)
(705,)
X and y test shapes: 
(27, 12)
(27,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  13%|████                            | 6/46 [00:01<00:06]  13%

AUC Score: 0.7637362637362637
Balanced Accuracy: 0.7005494505494505
X and y train shapes: 
(729, 12)
(729,)
X and y test shapes: 
(3, 12)
(3,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 1.0
Balanced Accuracy: 0.75
X and y train shapes: 
(720, 12)
(720,)
X and y test shapes: 
(12, 12)
(12,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  17%|█████▍                          | 8/46 [00:01<00:05]  17%

AUC Score: 0.3428571428571428
Balanced Accuracy: 0.5285714285714286
X and y train shapes: 
(709, 12)
(709,)
X and y test shapes: 
(23, 12)
(23,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.525
Balanced Accuracy: 0.5375
X and y train shapes: 
(731, 12)
(731,)
X and y test shapes: 
(1, 12)
(1,)
Skipping this subject for having single class: y_train = [0 1], y_test = [1]
X and y train shapes: 
(703, 12)
(703,)
X and y test shapes: 
(29, 12)
(29,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  22%|██████▌                        | 10/46 [00:01<00:03]  22%

AUC Score: 0.4805194805194805
Balanced Accuracy: 0.5454545454545454
X and y train shapes: 
(718, 12)
(718,)
X and y test shapes: 
(14, 12)
(14,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.9583333333333334
Balanced Accuracy: 0.8541666666666667
X and y train shapes: 
(723, 12)
(723,)
X and y test shapes: 
(9, 12)
(9,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.8
Balanced Accuracy: 0.75


LOSO CV Progress:  30%|█████████▏                     | 14/46 [00:02<00:03]  30%

X and y train shapes: 
(730, 12)
(730,)
X and y test shapes: 
(2, 12)
(2,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.0
Balanced Accuracy: 0.5
X and y train shapes: 
(685, 12)
(685,)
X and y test shapes: 
(47, 12)
(47,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.6333333333333334
Balanced Accuracy: 0.6058823529411765


LOSO CV Progress:  35%|██████████▍                    | 16/46 [00:02<00:02]  35%

X and y train shapes: 
(728, 12)
(728,)
X and y test shapes: 
(4, 12)
(4,)
Skipping this subject for having single class: y_train = [0 1], y_test = [0]
X and y train shapes: 
(727, 12)
(727,)
X and y test shapes: 
(5, 12)
(5,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 1.0
Balanced Accuracy: 1.0
X and y train shapes: 
(691, 12)
(691,)
X and y test shapes: 
(41, 12)
(41,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  39%|███████████▋                   | 18/46 [00:02<00:02]  39%

AUC Score: 0.36451612903225805
Balanced Accuracy: 0.5129032258064516
X and y train shapes: 
(730, 12)
(730,)
X and y test shapes: 
(2, 12)
(2,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 1.0
Balanced Accuracy: 1.0
X and y train shapes: 
(715, 12)
(715,)
X and y test shapes: 
(17, 12)
(17,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  43%|█████████████                  | 20/46 [00:02<00:02]  43%

AUC Score: 0.6
Balanced Accuracy: 0.6666666666666666
X and y train shapes: 
(726, 12)
(726,)
X and y test shapes: 
(6, 12)
(6,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.6
Balanced Accuracy: 0.8
X and y train shapes: 
(721, 12)
(721,)
X and y test shapes: 
(11, 12)
(11,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.3333333333333333
Balanced Accuracy: 0.5833333333333333


LOSO CV Progress:  48%|██████████████▎                | 22/46 [00:03<00:02]  48%

X and y train shapes: 
(720, 12)
(720,)
X and y test shapes: 
(12, 12)
(12,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.65625
Balanced Accuracy: 0.75
X and y train shapes: 
(722, 12)
(722,)
X and y test shapes: 
(10, 12)
(10,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  52%|███████████████▋               | 24/46 [00:03<00:02]  52%

AUC Score: 0.5
Balanced Accuracy: 0.6875
X and y train shapes: 
(721, 12)
(721,)
X and y test shapes: 
(11, 12)
(11,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.7222222222222222
Balanced Accuracy: 0.75
X and y train shapes: 
(726, 12)
(726,)
X and y test shapes: 
(6, 12)
(6,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  59%|█████████████████▌             | 27/46 [00:03<00:01]  59%

AUC Score: 0.6
Balanced Accuracy: 0.8
X and y train shapes: 
(714, 12)
(714,)
X and y test shapes: 
(18, 12)
(18,)
Skipping this subject for having single class: y_train = [0 1], y_test = [0]
X and y train shapes: 
(703, 12)
(703,)
X and y test shapes: 
(29, 12)
(29,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.31249999999999994
Balanced Accuracy: 0.5
X and y train shapes: 
(730, 12)
(730,)
X and y test shapes: 
(2, 12)
(2,)
Skipping this subject for having single class: y_train = [0 1], y_test = [0]
X and y train shapes: 
(716, 12)
(716,)
X and y test shapes: 
(16, 12)
(16,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.19999999999999996
Balanced Accuracy: 0.5909090909090909


LOSO CV Progress:  67%|████████████████████▏          | 31/46 [00:03<00:01]  67%

X and y train shapes: 
(720, 12)
(720,)
X and y test shapes: 
(12, 12)
(12,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.75
Balanced Accuracy: 0.7
X and y train shapes: 
(726, 12)
(726,)
X and y test shapes: 
(6, 12)
(6,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.7777777777777778
Balanced Accuracy: 0.6666666666666666


LOSO CV Progress:  72%|█████████████████████▌         | 33/46 [00:04<00:01]  72%

X and y train shapes: 
(722, 12)
(722,)
X and y test shapes: 
(10, 12)
(10,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.8095238095238095
Balanced Accuracy: 0.7619047619047619
X and y train shapes: 
(712, 12)
(712,)
X and y test shapes: 
(20, 12)
(20,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5490196078431373
Balanced Accuracy: 0.6274509803921569


LOSO CV Progress:  76%|██████████████████████▊        | 35/46 [00:04<00:01]  76%

X and y train shapes: 
(712, 12)
(712,)
X and y test shapes: 
(20, 12)
(20,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.20833333333333331
Balanced Accuracy: 0.5
X and y train shapes: 
(715, 12)
(715,)
X and y test shapes: 
(17, 12)
(17,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.6944444444444444
Balanced Accuracy: 0.7638888888888888
X and y train shapes: 
(714, 12)
(714,)
X and y test shapes: 
(18, 12)
(18,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5
Balanced Accuracy: 0.525
X and y train shapes: 
(727, 12)
(727,)
X and y test shapes: 
(5, 12)
(5,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  80%|████████████████████████▏      | 37/46 [00:04<00:00]  80%

AUC Score: 0.8333333333333334
Balanced Accuracy: 0.75
X and y train shapes: 
(699, 12)
(699,)
X and y test shapes: 
(33, 12)
(33,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.41735537190082644
Balanced Accuracy: 0.5681818181818182
X and y train shapes: 
(719, 12)
(719,)
X and y test shapes: 
(13, 12)
(13,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  87%|██████████████████████████     | 40/46 [00:04<00:00]  87%

AUC Score: 0.8333333333333334
Balanced Accuracy: 0.9166666666666667
X and y train shapes: 
(712, 12)
(712,)
X and y test shapes: 
(20, 12)
(20,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.4270833333333333
Balanced Accuracy: 0.5
X and y train shapes: 
(721, 12)
(721,)
X and y test shapes: 
(11, 12)
(11,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  91%|███████████████████████████▍   | 42/46 [00:05<00:00]  91%

AUC Score: 0.38888888888888884
Balanced Accuracy: 0.5555555555555556
X and y train shapes: 
(723, 12)
(723,)
X and y test shapes: 
(9, 12)
(9,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.8
Balanced Accuracy: 0.775
X and y train shapes: 
(719, 12)
(719,)
X and y test shapes: 
(13, 12)
(13,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.47619047619047616
Balanced Accuracy: 0.5952380952380952


LOSO CV Progress:  96%|████████████████████████████▋  | 44/46 [00:05<00:00]  96%

X and y train shapes: 
(708, 12)
(708,)
X and y test shapes: 
(24, 12)
(24,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.8403361344537814
Balanced Accuracy: 0.7142857142857143
X and y train shapes: 
(706, 12)
(706,)
X and y test shapes: 
(26, 12)
(26,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress: 100%|██████████████████████████████ | 46/46 [00:05<00:00] 100%

AUC Score: 0.5583333333333333
Balanced Accuracy: 0.6333333333333333
X and y train shapes: 
(678, 12)
(678,)
X and y test shapes: 
(54, 12)
(54,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.6680761099365751
Balanced Accuracy: 0.6691331923890064





In [18]:
print('Logistic Regression: ')
print('Median, 25th Percentile, 75th Percentile: ')

if auc_scores:
    auc_scores_valid = [score for score in auc_scores if score is not None] 

    auc_median = np.median(auc_scores_valid)
    auc_25_percentile = np.percentile(auc_scores_valid, 25)
    auc_75_percentile = np.percentile(auc_scores_valid, 75)
    print(f"AUC Score - Median: {auc_median:.4f}, 25th Percentile: {auc_25_percentile:.4f}, 75th Percentile: {auc_75_percentile:.4f}")

balanced_accs_valid = [acc for acc in balanced_accs if acc is not None] 

balanced_acc_median = np.median(balanced_accs)
balanced_acc_25_percentile = np.percentile(balanced_accs, 25)
balanced_acc_75_percentile = np.percentile(balanced_accs, 75)

print(f"Balanced Accuracy - Median: {balanced_acc_median:.4f}, 25th Percentile: {balanced_acc_25_percentile:.4f}, 75th Percentile: {balanced_acc_75_percentile:.4f}")

Logistic Regression: 
Median, 25th Percentile, 75th Percentile: 
AUC Score - Median: 0.6000, 25th Percentile: 0.4753, 75th Percentile: 0.7743
Balanced Accuracy - Median: 0.6667, 25th Percentile: 0.5690, 75th Percentile: 0.7500
