In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

from sklearn.metrics import confusion_matrix
from keras.layers import Dense, Activation, Dropout
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import random

from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.utils.class_weight import compute_class_weight
from itertools import combinations
import joblib

In [2]:
df1 = pd.read_csv('/Users/ohidabinteamin/Documents/Stress Prediction Project Three Datasets/StudentLife/week 01/Social_Interaction/recreating_social_features_studentlife.csv')
df1 = df1.rename(columns={'Date': 'date'})
df1 = df1.drop('Unnamed: 0', axis=1)
df2 = pd.read_csv('/Users/ohidabinteamin/Documents/Stress Prediction Project Three Datasets/StudentLife/week 01/Stress/recreating_dailystress_features.csv')
df2 = df2.drop('Unnamed: 0', axis=1)

In [3]:
df = pd.merge(df1, df2, on=['uid', 'date'])
print(df.columns)

df = df.sort_values(by='date')

Index(['Unnamed: 0.1', 'uid', 'date', 'app usage in morning',
       'app usage in afternoon', 'app usage in evening', 'app usage in night',
       'number of call in morning', 'number of call in afternoon',
       'number of call in evening', 'number of call in night',
       'number of Bluetooth contacts morning',
       'number of Bluetooth contacts afternoon',
       'number of Bluetooth contacts evening',
       'number of Bluetooth contacts night', 'conversation in morning',
       'conversation in afternoon', 'conversation in evening',
       'conversation in night', 'stress_ratings'],
      dtype='object')


In [4]:
df.isnull().sum()

Unnamed: 0.1                              0
uid                                       0
date                                      0
app usage in morning                      0
app usage in afternoon                    0
app usage in evening                      0
app usage in night                        0
number of call in morning                 0
number of call in afternoon               0
number of call in evening                 0
number of call in night                   0
number of Bluetooth contacts morning      0
number of Bluetooth contacts afternoon    0
number of Bluetooth contacts evening      0
number of Bluetooth contacts night        0
conversation in morning                   0
conversation in afternoon                 0
conversation in evening                   0
conversation in night                     0
stress_ratings                            0
dtype: int64

In [5]:
df = df.dropna()
print(len(df))

495


In [6]:
df.columns

Index(['Unnamed: 0.1', 'uid', 'date', 'app usage in morning',
       'app usage in afternoon', 'app usage in evening', 'app usage in night',
       'number of call in morning', 'number of call in afternoon',
       'number of call in evening', 'number of call in night',
       'number of Bluetooth contacts morning',
       'number of Bluetooth contacts afternoon',
       'number of Bluetooth contacts evening',
       'number of Bluetooth contacts night', 'conversation in morning',
       'conversation in afternoon', 'conversation in evening',
       'conversation in night', 'stress_ratings'],
      dtype='object')

In [7]:
len(df.columns)

20

In [8]:
df['stress_ratings'].value_counts()

stress_ratings
medium stress    193
high stress      162
low stress       140
Name: count, dtype: int64

In [9]:
binary_lh_data = df[df['stress_ratings'].isin(['low stress', 'high stress'])]

In [10]:
print(len(binary_lh_data['uid'].unique()))
binary_lh_data['uid'].value_counts()

18


uid
u59    51
u49    27
u57    24
u58    21
u08    21
u00    20
u52    20
u46    18
u12    14
u02    13
u51    13
u56    12
u24    11
u53    11
u36     9
u54     7
u31     5
u47     5
Name: count, dtype: int64

In [11]:
# uid_counts = binary_lh_data['uid'].value_counts()
# uids_to_keep = uid_counts[uid_counts >= 5].index
# binary_lh_data = binary_lh_data[binary_lh_data['uid'].isin(uids_to_keep)]

# print('Length of Data: ', len(binary_lh_data))
# binary_lh_data = binary_lh_data.sort_values(by=['uid', 'date'])
# print(binary_lh_data['uid'].unique())

In [12]:
X = binary_lh_data.drop(columns=['stress_ratings', 'uid', 'date'])
y = binary_lh_data['stress_ratings']
groups = binary_lh_data['uid']

stress_map = {'low stress': 0, 'high stress': 1}
y_encoded = y.map(stress_map).values 

In [13]:
print(f"Number of features in X: {X.shape[1]}")

Number of features in X: 17


In [14]:
logo = LeaveOneGroupOut()

best_thresholds = []
balanced_accs = []
auc_scores = []

In [15]:
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)
tf.random.set_seed(seed_value)

In [16]:
param_grid = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'sag']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'penalty': ['l1', 'l2'],
     'solver': ['liblinear']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'l1_ratio': [0.2, 0.5, 0.8],
     'penalty': ['elasticnet'],
     'solver': ['saga']}
]

In [17]:
scaler = StandardScaler()
model_logistic_rfe = LogisticRegression(max_iter=5000)

rfe = RFE(model_logistic_rfe, n_features_to_select=15)  
X_ = scaler.fit_transform(X)
rfe.fit(X_, y_encoded) 

best_features = np.where(rfe.support_)[0]
print(f"Selected Features: {best_features}")

num_splits = len(np.unique(groups))
print(num_splits)

with tqdm(total=num_splits, desc="LOSO CV Progress", unit="fold", bar_format="{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}] {percentage:3.0f}%") as pbar:
    for train_idx, test_idx in logo.split(X, y_encoded, groups=groups):
        X_train, X_test = X.iloc[train_idx, best_features], X.iloc[test_idx, best_features]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        print('X and y train shapes: ')
        print(X_train.shape)
        print(y_train.shape)

        print('X and y test shapes: ')
        print(X_test.shape)
        print(y_test.shape)

        scaler = StandardScaler()
        X_train_normalized = scaler.fit_transform(X_train)
        X_test_normalized = scaler.transform(X_test)
        
        model_logistic = GridSearchCV(LogisticRegression(max_iter=5000, class_weight='balanced'),
                                      param_grid=param_grid, cv=3, verbose=True, n_jobs=-1, scoring='roc_auc')
        if len(np.unique(y_train)) > 1 and len(np.unique(y_test)) > 1:
            model_logistic.fit(X_train_normalized, y_train)
    
            y_test_pred_proba = model_logistic.predict_proba(X_test_normalized)[:, 1]
            thresholds = np.arange(0.01, 1.0, 0.01)
            best_threshold = 0.5
            best_metric = 0.0
    
            for threshold in thresholds:
                y_test_pred_binary = (y_test_pred_proba > threshold).astype(int)
                metric_value = balanced_accuracy_score(y_test, y_test_pred_binary)
                if metric_value > best_metric:
                    best_metric = metric_value
                    best_threshold = threshold
            if len(np.unique(y_test)) > 1:
                auc_score = roc_auc_score(y_test, y_test_pred_proba)
                auc_scores.append(auc_score)
                print(f"AUC Score: {auc_score}")
            else:
                auc_scores.append(None)
                print(f"Skipping AUC computation for this fold as y_test contains only one class: {np.unique(y_test)}")
    
            y_test_pred_binary = (y_test_pred_proba > best_threshold).astype(int)
            balanced_acc = balanced_accuracy_score(y_test, y_test_pred_binary)
            balanced_accs.append(balanced_acc)
        
            print(f"Balanced Accuracy: {balanced_acc}")

        else:
            print(f"Skipping this subject for having single class: y_train = {np.unique(y_train)}, y_test = {np.unique(y_test)}")
        
        pbar.update(1)

Selected Features: [ 0  2  3  4  5  6  8  9 10 11 12 13 14 15 16]
18


LOSO CV Progress:   0%|                                    | 0/18 [00:00<?]   0%

X and y train shapes: 
(282, 15)
(282,)
X and y test shapes: 
(20, 15)
(20,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  11%|███▍                            | 2/18 [00:00<00:06]  11%

AUC Score: 0.13131313131313133
Balanced Accuracy: 0.5
X and y train shapes: 
(289, 15)
(289,)
X and y test shapes: 
(13, 15)
(13,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.7000000000000001
Balanced Accuracy: 0.7
X and y train shapes: 
(281, 15)
(281,)
X and y test shapes: 
(21, 15)
(21,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  22%|██████▉                         | 4/18 [00:01<00:02]  22%

AUC Score: 0.34444444444444444
Balanced Accuracy: 0.5833333333333334
X and y train shapes: 
(288, 15)
(288,)
X and y test shapes: 
(14, 15)
(14,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.6041666666666667
Balanced Accuracy: 0.5833333333333334
X and y train shapes: 
(291, 15)
(291,)
X and y test shapes: 
(11, 15)
(11,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.2777777777777778
Balanced Accuracy: 0.5555555555555556


LOSO CV Progress:  39%|████████████                    | 7/18 [00:01<00:01]  39%

X and y train shapes: 
(297, 15)
(297,)
X and y test shapes: 
(5, 15)
(5,)
Skipping this subject for having single class: y_train = [0 1], y_test = [0]
X and y train shapes: 
(293, 15)
(293,)
X and y test shapes: 
(9, 15)
(9,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.07142857142857142
Balanced Accuracy: 0.5
X and y train shapes: 
(284, 15)
(284,)
X and y test shapes: 
(18, 15)
(18,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  50%|███████████████▌                | 9/18 [00:01<00:01]  50%

AUC Score: 0.725
Balanced Accuracy: 0.7
X and y train shapes: 
(297, 15)
(297,)
X and y test shapes: 
(5, 15)
(5,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5
Balanced Accuracy: 0.5
X and y train shapes: 
(275, 15)
(275,)
X and y test shapes: 
(27, 15)
(27,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.6111111111111112
Balanced Accuracy: 0.6388888888888888
X and y train shapes: 
(289, 15)
(289,)
X and y test shapes: 
(13, 15)
(13,)


LOSO CV Progress:  61%|██████████████████▎            | 11/18 [00:01<00:00]  61%

Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.8333333333333334
Balanced Accuracy: 0.5
X and y train shapes: 
(282, 15)
(282,)
X and y test shapes: 
(20, 15)
(20,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  72%|█████████████████████▋         | 13/18 [00:02<00:00]  72%

AUC Score: 0.6875
Balanced Accuracy: 0.5833333333333334
X and y train shapes: 
(291, 15)
(291,)
X and y test shapes: 
(11, 15)
(11,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.4444444444444444
Balanced Accuracy: 0.6666666666666666
X and y train shapes: 
(295, 15)
(295,)
X and y test shapes: 
(7, 15)
(7,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.25
Balanced Accuracy: 0.5416666666666666


LOSO CV Progress:  83%|█████████████████████████      | 15/18 [00:02<00:00]  83%

X and y train shapes: 
(290, 15)
(290,)
X and y test shapes: 
(12, 15)
(12,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.4857142857142857
Balanced Accuracy: 0.5285714285714286
X and y train shapes: 
(278, 15)
(278,)
X and y test shapes: 
(24, 15)
(24,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  94%|████████████████████████████▎  | 17/18 [00:02<00:00]  94%

AUC Score: 0.6302521008403361
Balanced Accuracy: 0.5798319327731092
X and y train shapes: 
(281, 15)
(281,)
X and y test shapes: 
(21, 15)
(21,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.625
Balanced Accuracy: 0.7
X and y train shapes: 
(251, 15)
(251,)
X and y test shapes: 
(51, 15)
(51,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.6204545454545455
Balanced Accuracy: 0.5943181818181817


LOSO CV Progress: 100%|██████████████████████████████ | 18/18 [00:02<00:00] 100%


In [18]:
print('Logistic Regression: ')
print('Median, 25th Percentile, 75th Percentile: ')

if auc_scores:
    auc_scores_valid = [score for score in auc_scores if score is not None] 

    auc_median = np.median(auc_scores_valid)
    auc_25_percentile = np.percentile(auc_scores_valid, 25)
    auc_75_percentile = np.percentile(auc_scores_valid, 75)
    print(f"AUC Score - Median: {auc_median:.4f}, 25th Percentile: {auc_25_percentile:.4f}, 75th Percentile: {auc_75_percentile:.4f}")

balanced_accs_valid = [acc for acc in balanced_accs if acc is not None] 

balanced_acc_median = np.median(balanced_accs)
balanced_acc_25_percentile = np.percentile(balanced_accs, 25)
balanced_acc_75_percentile = np.percentile(balanced_accs, 75)

print(f"Balanced Accuracy - Median: {balanced_acc_median:.4f}, 25th Percentile: {balanced_acc_25_percentile:.4f}, 75th Percentile: {balanced_acc_75_percentile:.4f}")

Logistic Regression: 
Median, 25th Percentile, 75th Percentile: 
AUC Score - Median: 0.6042, 25th Percentile: 0.3444, 75th Percentile: 0.6303
Balanced Accuracy - Median: 0.5833, 25th Percentile: 0.5286, 75th Percentile: 0.6389
