In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

from sklearn.metrics import confusion_matrix
from keras.layers import Dense, Activation, Dropout
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import random

from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.utils.class_weight import compute_class_weight
from itertools import combinations
import joblib

In [2]:
df1 = pd.read_csv('/Users/ohidabinteamin/Documents/Stress Prediction Project Three Datasets/StudentLife/week 01/PhoneLog/raw_phonelog_features_studentlife.csv')
df1 = df1.rename(columns={'Date': 'date'})
df1 = df1.drop('Unnamed: 0', axis=1)
df2 = pd.read_csv('/Users/ohidabinteamin/Documents/Stress Prediction Project Three Datasets/StudentLife/week 01/Stress/recreating_dailystress_features.csv')
df2 = df2.drop('Unnamed: 0', axis=1)

In [3]:
df = pd.merge(df1, df2, on=['uid', 'date'])
print(df.columns)

df = df.sort_values(by='date')

Index(['date', 'uid', 'morning_phonecharge_duration',
       'morning_phonecharge_frequency', 'morning_phonelock_duration',
       'morning_phonelock_frequency', 'morning_dark_duration',
       'afternoon_phonecharge_duration', 'afternoon_phonecharge_frequency',
       'afternoon_phonelock_duration', 'afternoon_phonelock_frequency',
       'afternoon_dark_duration', 'evening_phonecharge_duration',
       'evening_phonecharge_frequency', 'evening_phonelock_duration',
       'evening_phonelock_frequency', 'evening_dark_duration',
       'night_phonecharge_duration', 'night_phonecharge_frequency',
       'night_phonelock_duration', 'night_phonelock_frequency',
       'night_dark_duration', 'stress_ratings'],
      dtype='object')


In [4]:
df.isnull().sum()

date                                 0
uid                                  0
morning_phonecharge_duration       205
morning_phonecharge_frequency      205
morning_phonelock_duration           5
morning_phonelock_frequency          5
morning_dark_duration               47
afternoon_phonecharge_duration       0
afternoon_phonecharge_frequency      0
afternoon_phonelock_duration         0
afternoon_phonelock_frequency        0
afternoon_dark_duration              0
evening_phonecharge_duration         5
evening_phonecharge_frequency        5
evening_phonelock_duration           0
evening_phonelock_frequency          0
evening_dark_duration                0
night_phonecharge_duration           5
night_phonecharge_frequency          5
night_phonelock_duration             0
night_phonelock_frequency            0
night_dark_duration                  0
stress_ratings                       0
dtype: int64

In [5]:
df = df.dropna()
print(len(df))

3349


In [6]:
df.columns

Index(['date', 'uid', 'morning_phonecharge_duration',
       'morning_phonecharge_frequency', 'morning_phonelock_duration',
       'morning_phonelock_frequency', 'morning_dark_duration',
       'afternoon_phonecharge_duration', 'afternoon_phonecharge_frequency',
       'afternoon_phonelock_duration', 'afternoon_phonelock_frequency',
       'afternoon_dark_duration', 'evening_phonecharge_duration',
       'evening_phonecharge_frequency', 'evening_phonelock_duration',
       'evening_phonelock_frequency', 'evening_dark_duration',
       'night_phonecharge_duration', 'night_phonecharge_frequency',
       'night_phonelock_duration', 'night_phonelock_frequency',
       'night_dark_duration', 'stress_ratings'],
      dtype='object')

In [7]:
len(df.columns)

23

In [8]:
df['stress_ratings'].value_counts()

stress_ratings
medium stress    1328
high stress      1043
low stress        978
Name: count, dtype: int64

In [9]:
binary_lh_data = df[df['stress_ratings'].isin(['low stress', 'high stress'])]

In [10]:
print(len(binary_lh_data['uid'].unique()))
binary_lh_data['uid'].value_counts()

43


uid
u59    152
u16    128
u19    126
u49    111
u04     97
u33     87
u57     84
u58     82
u08     79
u00     71
u52     68
u46     60
u44     54
u45     54
u22     48
u12     48
u32     47
u01     43
u02     40
u07     39
u36     38
u51     37
u30     35
u24     34
u35     32
u27     32
u53     31
u25     30
u54     30
u42     29
u14     28
u56     27
u31     22
u23     21
u18     16
u17     12
u41     12
u47     11
u05      9
u34      5
u15      5
u20      4
u09      3
Name: count, dtype: int64

In [11]:
# uid_counts = binary_lh_data['uid'].value_counts()
# uids_to_keep = uid_counts[uid_counts >= 5].index
# binary_lh_data = binary_lh_data[binary_lh_data['uid'].isin(uids_to_keep)]

# print('Length of Data: ', len(binary_lh_data))
# binary_lh_data = binary_lh_data.sort_values(by=['uid', 'date'])
# print(binary_lh_data['uid'].unique())

In [12]:
X = binary_lh_data.drop(columns=['stress_ratings', 'uid', 'date'])
y = binary_lh_data['stress_ratings']
groups = binary_lh_data['uid']

stress_map = {'low stress': 0, 'high stress': 1}
y_encoded = y.map(stress_map).values 

In [13]:
print(f"Number of features in X: {X.shape[1]}")

Number of features in X: 20


In [14]:
logo = LeaveOneGroupOut()

best_thresholds = []
balanced_accs = []
auc_scores = []

In [15]:
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)
tf.random.set_seed(seed_value)

In [16]:
param_grid = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'sag']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'penalty': ['l1', 'l2'],
     'solver': ['liblinear']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100],
     'l1_ratio': [0.2, 0.5, 0.8],
     'penalty': ['elasticnet'],
     'solver': ['saga']}
]

In [17]:
scaler = StandardScaler()
model_logistic_rfe = LogisticRegression(max_iter=5000)

rfe = RFE(model_logistic_rfe, n_features_to_select=15)  
X_ = scaler.fit_transform(X)
rfe.fit(X_, y_encoded) 

best_features = np.where(rfe.support_)[0]
print(f"Selected Features: {best_features}")

num_splits = len(np.unique(groups))
print(num_splits)

with tqdm(total=num_splits, desc="LOSO CV Progress", unit="fold", bar_format="{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}] {percentage:3.0f}%") as pbar:
    for train_idx, test_idx in logo.split(X, y_encoded, groups=groups):
        X_train, X_test = X.iloc[train_idx, best_features], X.iloc[test_idx, best_features]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        print('X and y train shapes: ')
        print(X_train.shape)
        print(y_train.shape)

        print('X and y test shapes: ')
        print(X_test.shape)
        print(y_test.shape)

        scaler = StandardScaler()
        X_train_normalized = scaler.fit_transform(X_train)
        X_test_normalized = scaler.transform(X_test)
        
        model_logistic = GridSearchCV(LogisticRegression(max_iter=5000, class_weight='balanced'),
                                      param_grid=param_grid, cv=3, verbose=True, n_jobs=-1, scoring='roc_auc')
        if len(np.unique(y_train)) > 1 and len(np.unique(y_test)) > 1:
            model_logistic.fit(X_train_normalized, y_train)
    
            y_test_pred_proba = model_logistic.predict_proba(X_test_normalized)[:, 1]
            thresholds = np.arange(0.01, 1.0, 0.01)
            best_threshold = 0.5
            best_metric = 0.0
    
            for threshold in thresholds:
                y_test_pred_binary = (y_test_pred_proba > threshold).astype(int)
                metric_value = balanced_accuracy_score(y_test, y_test_pred_binary)
                if metric_value > best_metric:
                    best_metric = metric_value
                    best_threshold = threshold
            if len(np.unique(y_test)) > 1:
                auc_score = roc_auc_score(y_test, y_test_pred_proba)
                auc_scores.append(auc_score)
                print(f"AUC Score: {auc_score}")
            else:
                auc_scores.append(None)
                print(f"Skipping AUC computation for this fold as y_test contains only one class: {np.unique(y_test)}")
    
            y_test_pred_binary = (y_test_pred_proba > best_threshold).astype(int)
            balanced_acc = balanced_accuracy_score(y_test, y_test_pred_binary)
            balanced_accs.append(balanced_acc)
        
            print(f"Balanced Accuracy: {balanced_acc}")

        else:
            print(f"Skipping this subject for having single class: y_train = {np.unique(y_train)}, y_test = {np.unique(y_test)}")
        
        pbar.update(1)

Selected Features: [ 2  3  5  6  7  8  9 10 11 12 13 14 15 16 18]
43


LOSO CV Progress:   0%|                                    | 0/43 [00:00<?]   0%

X and y train shapes: 
(1950, 15)
(1950,)
X and y test shapes: 
(71, 15)
(71,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:   5%|█▍                              | 2/43 [00:01<00:18]   5%

AUC Score: 0.5166666666666666
Balanced Accuracy: 0.5707317073170732
X and y train shapes: 
(1978, 15)
(1978,)
X and y test shapes: 
(43, 15)
(43,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5701357466063348
Balanced Accuracy: 0.6119909502262444
X and y train shapes: 
(1981, 15)
(1981,)
X and y test shapes: 
(40, 15)
(40,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:   9%|██▉                             | 4/43 [00:01<00:10]   9%

AUC Score: 0.5831202046035805
Balanced Accuracy: 0.6214833759590792
X and y train shapes: 
(1924, 15)
(1924,)
X and y test shapes: 
(97, 15)
(97,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5358974358974359
Balanced Accuracy: 0.5587606837606838
X and y train shapes: 
(2012, 15)
(2012,)
X and y test shapes: 
(9, 15)
(9,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  14%|████▎                           | 6/43 [00:01<00:07]  14%

AUC Score: 0.6111111111111112
Balanced Accuracy: 0.6666666666666666
X and y train shapes: 
(1982, 15)
(1982,)
X and y test shapes: 
(39, 15)
(39,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.4841269841269841
Balanced Accuracy: 0.5476190476190476
X and y train shapes: 
(1942, 15)
(1942,)
X and y test shapes: 
(79, 15)
(79,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  16%|█████                           | 7/43 [00:01<00:06]  16%

AUC Score: 0.5416666666666666
Balanced Accuracy: 0.5516381766381766
X and y train shapes: 
(2018, 15)
(2018,)
X and y test shapes: 
(3, 15)
(3,)
Skipping this subject for having single class: y_train = [0 1], y_test = [1]
X and y train shapes: 
(1973, 15)
(1973,)
X and y test shapes: 
(48, 15)
(48,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  23%|██████▉                        | 10/43 [00:02<00:05]  23%

AUC Score: 0.6392857142857143
Balanced Accuracy: 0.6178571428571429
X and y train shapes: 
(1993, 15)
(1993,)
X and y test shapes: 
(28, 15)
(28,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.7846153846153847
Balanced Accuracy: 0.7615384615384615
X and y train shapes: 
(2016, 15)
(2016,)
X and y test shapes: 
(5, 15)
(5,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  26%|███████▋                       | 11/43 [00:02<00:05]  26%

AUC Score: 0.6666666666666666
Balanced Accuracy: 0.8333333333333333
X and y train shapes: 
(1893, 15)
(1893,)
X and y test shapes: 
(128, 15)
(128,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  28%|████████▎                      | 12/43 [00:02<00:06]  28%

AUC Score: 0.4528545595453371
Balanced Accuracy: 0.5246706277447688
X and y train shapes: 
(2009, 15)
(2009,)
X and y test shapes: 
(12, 15)
(12,)
Skipping this subject for having single class: y_train = [0 1], y_test = [0]
X and y train shapes: 
(2005, 15)
(2005,)
X and y test shapes: 
(16, 15)
(16,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  33%|█████████▊                     | 14/43 [00:03<00:05]  33%

AUC Score: 0.6000000000000001
Balanced Accuracy: 0.6333333333333333
X and y train shapes: 
(1895, 15)
(1895,)
X and y test shapes: 
(126, 15)
(126,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  37%|███████████▏                   | 16/43 [00:03<00:05]  37%

AUC Score: 0.4347222222222223
Balanced Accuracy: 0.5197916666666667
X and y train shapes: 
(2017, 15)
(2017,)
X and y test shapes: 
(4, 15)
(4,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 1.0
Balanced Accuracy: 1.0
X and y train shapes: 
(1973, 15)
(1973,)
X and y test shapes: 
(48, 15)
(48,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  42%|████████████▌                  | 18/43 [00:03<00:04]  42%

AUC Score: 0.5488372093023256
Balanced Accuracy: 0.6418604651162791
X and y train shapes: 
(2000, 15)
(2000,)
X and y test shapes: 
(21, 15)
(21,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5735294117647058
Balanced Accuracy: 0.6397058823529411
X and y train shapes: 
(1987, 15)
(1987,)
X and y test shapes: 
(34, 15)
(34,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  47%|█████████████▉                 | 20/43 [00:04<00:03]  47%

AUC Score: 0.5211640211640212
Balanced Accuracy: 0.5952380952380952
X and y train shapes: 
(1991, 15)
(1991,)
X and y test shapes: 
(30, 15)
(30,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.4575
Balanced Accuracy: 0.55
X and y train shapes: 
(1989, 15)
(1989,)
X and y test shapes: 
(32, 15)
(32,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  51%|███████████████▎               | 22/43 [00:04<00:03]  51%

AUC Score: 0.5833333333333334
Balanced Accuracy: 0.6282051282051282
X and y train shapes: 
(1986, 15)
(1986,)
X and y test shapes: 
(35, 15)
(35,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5966666666666667
Balanced Accuracy: 0.6166666666666667
X and y train shapes: 
(1999, 15)
(1999,)
X and y test shapes: 
(22, 15)
(22,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  58%|█████████████████▍             | 25/43 [00:04<00:02]  58%

AUC Score: 0.5347222222222222
Balanced Accuracy: 0.5833333333333334
X and y train shapes: 
(1974, 15)
(1974,)
X and y test shapes: 
(47, 15)
(47,)
Skipping this subject for having single class: y_train = [0 1], y_test = [0]
X and y train shapes: 
(1934, 15)
(1934,)
X and y test shapes: 
(87, 15)
(87,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5641783029001075
Balanced Accuracy: 0.5631041890440387
X and y train shapes: 
(2016, 15)
(2016,)
X and y test shapes: 
(5, 15)
(5,)
Skipping this subject for having single class: y_train = [0 1], y_test = [0]
X and y train shapes: 
(1989, 15)
(1989,)
X and y test shapes: 
(32, 15)
(32,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  65%|███████████████████▌           | 28/43 [00:05<00:01]  65%

AUC Score: 0.7619047619047618
Balanced Accuracy: 0.699134199134199
X and y train shapes: 
(1983, 15)
(1983,)
X and y test shapes: 
(38, 15)
(38,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.2604166666666667
Balanced Accuracy: 0.515625
X and y train shapes: 
(2009, 15)
(2009,)
X and y test shapes: 
(12, 15)
(12,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  70%|████████████████████▉          | 30/43 [00:05<00:02]  70%

AUC Score: 0.5571428571428572
Balanced Accuracy: 0.6428571428571428
X and y train shapes: 
(1992, 15)
(1992,)
X and y test shapes: 
(29, 15)
(29,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.6161616161616161
Balanced Accuracy: 0.648989898989899
X and y train shapes: 
(1967, 15)
(1967,)
X and y test shapes: 
(54, 15)
(54,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  74%|██████████████████████▎        | 32/43 [00:06<00:01]  74%

AUC Score: 0.4453125
Balanced Accuracy: 0.5454545454545454
X and y train shapes: 
(1967, 15)
(1967,)
X and y test shapes: 
(54, 15)
(54,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.43956043956043955
Balanced Accuracy: 0.5274725274725275
X and y train shapes: 
(1961, 15)
(1961,)
X and y test shapes: 
(60, 15)
(60,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  79%|███████████████████████▋       | 34/43 [00:06<00:01]  79%

AUC Score: 0.5516273849607183
Balanced Accuracy: 0.5925925925925926
X and y train shapes: 
(2010, 15)
(2010,)
X and y test shapes: 
(11, 15)
(11,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.7
Balanced Accuracy: 0.7166666666666667
X and y train shapes: 
(1910, 15)
(1910,)
X and y test shapes: 
(111, 15)
(111,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  81%|████████████████████████▍      | 35/43 [00:06<00:01]  81%

AUC Score: 0.44753521126760565
Balanced Accuracy: 0.5170774647887324
X and y train shapes: 
(1984, 15)
(1984,)
X and y test shapes: 
(37, 15)
(37,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  86%|█████████████████████████▊     | 37/43 [00:07<00:01]  86%

AUC Score: 0.676470588235294
Balanced Accuracy: 0.6862745098039216
X and y train shapes: 
(1953, 15)
(1953,)
X and y test shapes: 
(68, 15)
(68,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.567857142857143
Balanced Accuracy: 0.5607142857142857
X and y train shapes: 
(1990, 15)
(1990,)
X and y test shapes: 
(31, 15)
(31,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  88%|██████████████████████████▌    | 38/43 [00:07<00:01]  88%

AUC Score: 0.47010869565217395
Balanced Accuracy: 0.5652173913043478
X and y train shapes: 
(1991, 15)
(1991,)
X and y test shapes: 
(30, 15)
(30,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  93%|███████████████████████████▉   | 40/43 [00:07<00:00]  93%

AUC Score: 0.5294117647058824
Balanced Accuracy: 0.6131221719457014
X and y train shapes: 
(1994, 15)
(1994,)
X and y test shapes: 
(27, 15)
(27,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5617283950617284
Balanced Accuracy: 0.6388888888888888
X and y train shapes: 
(1937, 15)
(1937,)
X and y test shapes: 
(84, 15)
(84,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress:  98%|█████████████████████████████▎ | 42/43 [00:08<00:00]  98%

AUC Score: 0.6259946949602122
Balanced Accuracy: 0.6180371352785146
X and y train shapes: 
(1939, 15)
(1939,)
X and y test shapes: 
(82, 15)
(82,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
AUC Score: 0.5687782805429865
Balanced Accuracy: 0.579185520361991
X and y train shapes: 
(1869, 15)
(1869,)
X and y test shapes: 
(152, 15)
(152,)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


LOSO CV Progress: 100%|██████████████████████████████ | 43/43 [00:08<00:00] 100%

AUC Score: 0.5636718749999999
Balanced Accuracy: 0.5927083333333334





In [18]:
print('Logistic Regression: ')
print('Median, 25th Percentile, 75th Percentile: ')

if auc_scores:
    auc_scores_valid = [score for score in auc_scores if score is not None] 

    auc_median = np.median(auc_scores_valid)
    auc_25_percentile = np.percentile(auc_scores_valid, 25)
    auc_75_percentile = np.percentile(auc_scores_valid, 75)
    print(f"AUC Score - Median: {auc_median:.4f}, 25th Percentile: {auc_25_percentile:.4f}, 75th Percentile: {auc_75_percentile:.4f}")

balanced_accs_valid = [acc for acc in balanced_accs if acc is not None] 

balanced_acc_median = np.median(balanced_accs)
balanced_acc_25_percentile = np.percentile(balanced_accs, 25)
balanced_acc_75_percentile = np.percentile(balanced_accs, 75)

print(f"Balanced Accuracy - Median: {balanced_acc_median:.4f}, 25th Percentile: {balanced_acc_25_percentile:.4f}, 75th Percentile: {balanced_acc_75_percentile:.4f}")

Logistic Regression: 
Median, 25th Percentile, 75th Percentile: 
AUC Score - Median: 0.5637, 25th Percentile: 0.5189, 75th Percentile: 0.6056
Balanced Accuracy - Median: 0.6120, 25th Percentile: 0.5597, 75th Percentile: 0.6408
