In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.svm import SVC
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix
from keras.layers import Dense, Activation, Dropout
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import random

from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import LeaveOneGroupOut
import joblib

In [2]:
df1 = pd.read_csv('/Users/ohidabinteamin/Documents/Stress Prediction Project Three Datasets/StudentLife/week 01/Social_Interaction/recreating_social_features_studentlife.csv')
df1 = df1.rename(columns={'Date': 'date'})
df1 = df1.drop('Unnamed: 0', axis=1)
df2 = pd.read_csv('/Users/ohidabinteamin/Documents/Stress Prediction Project Three Datasets/StudentLife/week 01/Stress/recreating_dailystress_features.csv')
df2 = df2.drop('Unnamed: 0', axis=1)

In [3]:
df = pd.merge(df1, df2, on=['uid', 'date'])
print(df.columns)

df = df.sort_values(by='date')

Index(['Unnamed: 0.1', 'uid', 'date', 'app usage in morning',
       'app usage in afternoon', 'app usage in evening', 'app usage in night',
       'number of call in morning', 'number of call in afternoon',
       'number of call in evening', 'number of call in night',
       'number of Bluetooth contacts morning',
       'number of Bluetooth contacts afternoon',
       'number of Bluetooth contacts evening',
       'number of Bluetooth contacts night', 'conversation in morning',
       'conversation in afternoon', 'conversation in evening',
       'conversation in night', 'stress_ratings'],
      dtype='object')


In [4]:
df.isnull().sum()

Unnamed: 0.1                              0
uid                                       0
date                                      0
app usage in morning                      0
app usage in afternoon                    0
app usage in evening                      0
app usage in night                        0
number of call in morning                 0
number of call in afternoon               0
number of call in evening                 0
number of call in night                   0
number of Bluetooth contacts morning      0
number of Bluetooth contacts afternoon    0
number of Bluetooth contacts evening      0
number of Bluetooth contacts night        0
conversation in morning                   0
conversation in afternoon                 0
conversation in evening                   0
conversation in night                     0
stress_ratings                            0
dtype: int64

In [5]:
df = df.dropna()
print(len(df))

495


In [6]:
df.columns

Index(['Unnamed: 0.1', 'uid', 'date', 'app usage in morning',
       'app usage in afternoon', 'app usage in evening', 'app usage in night',
       'number of call in morning', 'number of call in afternoon',
       'number of call in evening', 'number of call in night',
       'number of Bluetooth contacts morning',
       'number of Bluetooth contacts afternoon',
       'number of Bluetooth contacts evening',
       'number of Bluetooth contacts night', 'conversation in morning',
       'conversation in afternoon', 'conversation in evening',
       'conversation in night', 'stress_ratings'],
      dtype='object')

In [7]:
df['stress_ratings'].value_counts()

stress_ratings
medium stress    193
high stress      162
low stress       140
Name: count, dtype: int64

In [8]:
binary_lh_data = df[df['stress_ratings'].isin(['low stress', 'high stress'])]

In [9]:
X = binary_lh_data.drop(columns=['stress_ratings', 'uid', 'date'])
y = binary_lh_data['stress_ratings']
groups = binary_lh_data['uid']

stress_map = {'low stress': 0, 'high stress': 1}
y_encoded = y.map(stress_map).values 

In [10]:
logo = LeaveOneGroupOut()

best_thresholds = []
balanced_accs = []
auc_scores = []

In [11]:
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)
tf.random.set_seed(seed_value)

In [12]:
num_splits = len(np.unique(groups))
logo = LeaveOneGroupOut()

In [13]:
num_splits = len(np.unique(groups))
class_weights = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
class_weight_dict = dict(enumerate(class_weights))

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 0.01, 0.001],
    'class_weight': [class_weight_dict]
}

best_thresholds = []
balanced_accs = []
auc_scores = []
auprc_scores = []
best_params_list = []

In [14]:
with tqdm(total=num_splits, desc="LOSO CV Progress", unit="fold", bar_format="{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}] {percentage:3.0f}%") as pbar:
    for train_idx, test_idx in logo.split(X, y_encoded, groups=groups):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]  

        print(f"Train shape: {X_train.shape}, {y_train.shape}")

        scaler = StandardScaler()
        X_train_normalized = scaler.fit_transform(X_train)
        X_test_normalized = scaler.transform(X_test)

        print(f"Normalized shapes: Train {X_train_normalized.shape}, Test {X_test_normalized.shape}")

        print('Performing Grid Search for SVM')
        grid_search = GridSearchCV(
            SVC(kernel='rbf', probability=True),
            param_grid,
            cv=5,
            scoring='balanced_accuracy',
            n_jobs=-1 
        )

        grid_search.fit(X_train_normalized, y_train)
        model_svm_rbf = grid_search.best_estimator_
        
        print("Best parameters:", grid_search.best_params_)
        print("Best cross-validation score:", grid_search.best_score_)
        best_params_list.append(grid_search.best_params_)

        y_test_pred_proba = model_svm_rbf.predict_proba(X_test_normalized)[:, 1]  
        thresholds = np.arange(0.01, 1.0, 0.01)
        best_threshold = max(thresholds, key=lambda t: balanced_accuracy_score(y_test, (y_test_pred_proba > t).astype(int)))

        if len(np.unique(y_test)) > 1:
            auc_score = roc_auc_score(y_test, y_test_pred_proba)
            auprc = average_precision_score(y_test, y_test_pred_proba)
            auc_scores.append(auc_score)
            auprc_scores.append(auprc)
            print(f"AUC Score: {auc_score}")
            print(f"AUPRC Score: {auprc}")
        else:
            auc_scores.append(None)
            auprc_scores.append(None)
            print(f"Skipping AUC and AUPRC computation for this fold as y_test contains only one class: {np.unique(y_test)}")

        y_test_pred_binary = (y_test_pred_proba > best_threshold).astype(int)
        balanced_acc = balanced_accuracy_score(y_test, y_test_pred_binary)
        balanced_accs.append(balanced_acc)
        best_thresholds.append(best_threshold)
        
        print(f"Balanced Accuracy: {balanced_acc}")
        print(f"Best Threshold: {best_threshold}")

        pbar.update(1)

LOSO CV Progress:   0%|                                    | 0/18 [00:00<?]   0%

Train shape: (282, 17), (282,)
Normalized shapes: Train (282, 17), Test (20, 17)
Performing Grid Search for SVM


LOSO CV Progress:   6%|█▋                              | 1/18 [00:01<00:17]   6%

Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.6047339398952303
AUC Score: 0.20707070707070707
AUPRC Score: 0.4368556530250874
Balanced Accuracy: 0.5
Best Threshold: 0.01
Train shape: (289, 17), (289,)
Normalized shapes: Train (289, 17), Test (13, 17)
Performing Grid Search for SVM
Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5610703290138774
AUC Score: 0.775
AUPRC Score: 0.7533333333333332
Balanced Accuracy: 0.7
Best Threshold: 0.55


LOSO CV Progress:  11%|███▍                            | 2/18 [00:01<00:08]  11%

Train shape: (281, 17), (281,)
Normalized shapes: Train (281, 17), Test (21, 17)
Performing Grid Search for SVM
Best parameters: {'C': 10, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5588870967741936


LOSO CV Progress:  22%|██████▉                         | 4/18 [00:01<00:04]  22%

AUC Score: 0.5888888888888889
AUPRC Score: 0.38730158730158726
Balanced Accuracy: 0.6166666666666667
Best Threshold: 0.55
Train shape: (288, 17), (288,)
Normalized shapes: Train (288, 17), Test (14, 17)
Performing Grid Search for SVM
Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5927745611616579
AUC Score: 0.5625
AUPRC Score: 0.6482142857142857
Balanced Accuracy: 0.6458333333333333
Best Threshold: 0.53
Train shape: (291, 17), (291,)
Normalized shapes: Train (291, 17), Test (11, 17)
Performing Grid Search for SVM


LOSO CV Progress:  28%|████████▌                       | 5/18 [00:01<00:03]  28%

Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 'scale'}
Best cross-validation score: 0.5697465437788019
AUC Score: 0.1111111111111111
AUPRC Score: 0.7363235529902197
Balanced Accuracy: 0.5555555555555556
Best Threshold: 0.61
Train shape: (297, 17), (297,)
Normalized shapes: Train (297, 17), Test (5, 17)
Performing Grid Search for SVM


LOSO CV Progress:  33%|██████████▎                     | 6/18 [00:02<00:02]  33%

Best parameters: {'C': 100, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.001}
Best cross-validation score: 0.5817655723905724
Skipping AUC and AUPRC computation for this fold as y_test contains only one class: [0]
Balanced Accuracy: 1.0
Best Threshold: 0.63
Train shape: (293, 17), (293,)
Normalized shapes: Train (293, 17), Test (9, 17)
Performing Grid Search for SVM


LOSO CV Progress:  39%|████████████                    | 7/18 [00:02<00:02]  39%

Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5908388803550093
AUC Score: 0.07142857142857142
AUPRC Score: 0.6441043083900226
Balanced Accuracy: 0.5
Best Threshold: 0.01
Train shape: (284, 17), (284,)
Normalized shapes: Train (284, 17), Test (18, 17)
Performing Grid Search for SVM


LOSO CV Progress:  44%|█████████████▊                  | 8/18 [00:02<00:02]  44%

Best parameters: {'C': 10, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5641687344913151
AUC Score: 0.39999999999999997
AUPRC Score: 0.44885236291486286
Balanced Accuracy: 0.5
Best Threshold: 0.01
Train shape: (297, 17), (297,)
Normalized shapes: Train (297, 17), Test (5, 17)
Performing Grid Search for SVM


LOSO CV Progress:  50%|███████████████▌                | 9/18 [00:02<00:02]  50%

Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 'scale'}
Best cross-validation score: 0.5604497354497354
AUC Score: 0.5
AUPRC Score: 0.5
Balanced Accuracy: 0.6666666666666666
Best Threshold: 0.5700000000000001
Train shape: (275, 17), (275,)
Normalized shapes: Train (275, 17), Test (27, 17)
Performing Grid Search for SVM


LOSO CV Progress:  56%|████████████████▋              | 10/18 [00:02<00:01]  56%

Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5755708531570601
AUC Score: 0.537037037037037
AUPRC Score: 0.7717901859349228
Balanced Accuracy: 0.5833333333333334
Best Threshold: 0.59
Train shape: (289, 17), (289,)
Normalized shapes: Train (289, 17), Test (13, 17)
Performing Grid Search for SVM


LOSO CV Progress:  61%|██████████████████▎            | 11/18 [00:03<00:01]  61%

Best parameters: {'C': 10, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5644973544973546
AUC Score: 0.3333333333333333
AUPRC Score: 0.9252666315166315
Balanced Accuracy: 0.6666666666666666
Best Threshold: 0.55
Train shape: (282, 17), (282,)
Normalized shapes: Train (282, 17), Test (20, 17)
Performing Grid Search for SVM
Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5751282051282052


LOSO CV Progress:  67%|████████████████████           | 12/18 [00:03<00:01]  67%

AUC Score: 0.39583333333333337
AUPRC Score: 0.6067086787481525
Balanced Accuracy: 0.5
Best Threshold: 0.01
Train shape: (291, 17), (291,)
Normalized shapes: Train (291, 17), Test (11, 17)
Performing Grid Search for SVM
Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.5796202423621779


LOSO CV Progress:  72%|█████████████████████▋         | 13/18 [00:03<00:01]  72%

AUC Score: 0.5
AUPRC Score: 0.88658810325477
Balanced Accuracy: 0.7222222222222222
Best Threshold: 0.51
Train shape: (295, 17), (295,)
Normalized shapes: Train (295, 17), Test (7, 17)
Performing Grid Search for SVM
Best parameters: {'C': 0.1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 'scale'}
Best cross-validation score: 0.5941041559993174


LOSO CV Progress:  78%|███████████████████████▎       | 14/18 [00:03<00:00]  78%

AUC Score: 0.3333333333333333
AUPRC Score: 0.5428571428571428
Balanced Accuracy: 0.5
Best Threshold: 0.01
Train shape: (290, 17), (290,)
Normalized shapes: Train (290, 17), Test (12, 17)
Performing Grid Search for SVM
Best parameters: {'C': 1, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.1}
Best cross-validation score: 0.585304659498208


LOSO CV Progress:  89%|██████████████████████████▋    | 16/18 [00:04<00:00]  89%

AUC Score: 0.5142857142857142
AUPRC Score: 0.6386363636363637
Balanced Accuracy: 0.5571428571428572
Best Threshold: 0.61
Train shape: (278, 17), (278,)
Normalized shapes: Train (278, 17), Test (24, 17)
Performing Grid Search for SVM
Best parameters: {'C': 10, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.01}
Best cross-validation score: 0.6044061302681992
AUC Score: 0.5294117647058825
AUPRC Score: 0.7692213542428915
Balanced Accuracy: 0.6092436974789917
Best Threshold: 0.46
Train shape: (281, 17), (281,)
Normalized shapes: Train (281, 17), Test (21, 17)
Performing Grid Search for SVM


LOSO CV Progress: 100%|██████████████████████████████ | 18/18 [00:04<00:00] 100%

Best parameters: {'C': 100, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.001}
Best cross-validation score: 0.5776117496807152
AUC Score: 0.575
AUPRC Score: 0.7762498431543517
Balanced Accuracy: 0.675
Best Threshold: 0.53
Train shape: (251, 17), (251,)
Normalized shapes: Train (251, 17), Test (51, 17)
Performing Grid Search for SVM
Best parameters: {'C': 100, 'class_weight': {0: 1.0785714285714285, 1: 0.9320987654320988}, 'gamma': 0.001}
Best cross-validation score: 0.59752688172043
AUC Score: 0.5704545454545454
AUPRC Score: 0.41370940135036494
Balanced Accuracy: 0.5863636363636364
Best Threshold: 0.71





In [15]:
print('SVM with Radial basis function (RBF) kernel')
print('Median, 25th Percentile, 75th Percentile: ')

if auc_scores:
    auc_scores_valid = [score for score in auc_scores if score is not None] 

    auc_median = np.median(auc_scores_valid)
    auc_25_percentile = np.percentile(auc_scores_valid, 25)
    auc_75_percentile = np.percentile(auc_scores_valid, 75)
    print(f"AUC Score - Median: {auc_median}, 25th Percentile: {auc_25_percentile}, 75th Percentile: {auc_75_percentile}")

if auprc_scores:
    auprc_scores_valid = [score for score in auprc_scores if score is not None] 

    auprc_median = np.median(auprc_scores_valid)
    auprc_25_percentile = np.percentile(auprc_scores_valid, 25)
    auprc_75_percentile = np.percentile(auprc_scores_valid, 75)
    print(f"AUPRC Score - Median: {auprc_median}, 25th Percentile: {auprc_25_percentile}, 75th Percentile: {auprc_75_percentile}")

balanced_acc_median = np.median(balanced_accs)
balanced_acc_25_percentile = np.percentile(balanced_accs, 25)
balanced_acc_75_percentile = np.percentile(balanced_accs, 75)

print(f"Balanced Accuracy - Median: {balanced_acc_median}, 25th Percentile: {balanced_acc_25_percentile}, 75th Percentile: {balanced_acc_75_percentile}")

SVM with Radial basis function (RBF) kernel
Median, 25th Percentile, 75th Percentile: 
AUC Score - Median: 0.5, 25th Percentile: 0.3333333333333333, 75th Percentile: 0.5625
AUPRC Score - Median: 0.6441043083900226, 25th Percentile: 0.5, 75th Percentile: 0.7692213542428915
Balanced Accuracy - Median: 0.597803666921314, 25th Percentile: 0.5138888888888888, 75th Percentile: 0.6666666666666666
