In [1]:
import pytz
import os


DEFAULT_TZ = pytz.FixedOffset(540)  # GMT+09:00; Asia/Seoul

PATH_DATA = 'data/D'
PATH_ESM = os.path.join(PATH_DATA, 'EsmResponse.csv')
PATH_PARTICIPANT = os.path.join(PATH_DATA, 'UserInfo.csv')
PATH_SENSOR = os.path.join(PATH_DATA, 'Sensor')

PATH_INTERMEDIATE = os.path.join('/home/uzair/Stress/StressDetection_Collaboration/Data_Processing_D1/data/intermediate')
RANDOM_STATE = 42

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import cloudpickle
import ray
from datetime import datetime
from contextlib import contextmanager
import warnings
import time


def load(path: str):
    with open(path, mode='rb') as f:
        return cloudpickle.load(f)

    
def dump(obj, path: str):
    with open(path, mode='wb') as f:
        cloudpickle.dump(obj, f)
        
    
def log(msg: any):
    print('[{}] {}'.format(datetime.now().strftime('%y-%m-%d %H:%M:%S'), msg))


def summary(x):
    x = np.asarray(x)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        n = len(x)
        # Here, uppercase np.dtype.kind corresponds to non-numeric data.
        # Also, we view the boolean data as dichotomous categorical data.
        if x.dtype.kind.isupper() or x.dtype.kind == 'b': 
            cnt = pd.Series(x).value_counts(dropna=False)
            card = len(cnt)
            cnt = cnt[:20]                
            cnt_str = ', '.join([f'{u}:{c}' for u, c in zip(cnt.index, cnt)])
            if card > 30:
                cnt_str = f'{cnt_str}, ...'
            return {
                'n': n,
                'cardinality': card,
                'value_count': cnt_str
            }
        else: 
            x_nan = x[np.isnan(x)]
            x_norm = x[~np.isnan(x)]
            
            tot = np.sum(x_norm)
            m = np.mean(x_norm)
            me = np.median(x_norm)
            s = np.std(x_norm, ddof=1)
            l, u = np.min(x_norm), np.max(x)
            conf_l, conf_u = st.t.interval(0.95, len(x_norm) - 1, loc=m, scale=st.sem(x_norm))
            n_nan = len(x_nan)
            
            return {
                'n': n,
                'sum': tot,
                'mean': m,
                'SD': s,
                'med': me,
                'range': (l, u),
                'conf.': (conf_l, conf_u),
                'nan_count': n_nan
            }

In [3]:
p = os.path.join(PATH_INTERMEDIATE, 'stress-fixed.pkl')
X, y, groups, t, datetimes = load(p)

In [4]:
PARTICIPANTS = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'PARTICIPANT_INFO.csv'),index_col = 'pcode')
PINFO = PARTICIPANTS.assign(
    BFI_OPN=lambda x: x['openness'],
    BFI_CON=lambda x: x['conscientiousness'],
    BFI_NEU=lambda x: x['neuroticism'],
    BFI_EXT=lambda x: x['extraversion'],
    BFI_AGR=lambda x: x['agreeableness'],
)[[
    'BFI_OPN', 'BFI_CON', 'BFI_NEU', 'BFI_EXT', 'BFI_AGR'
]]
PINFO = pd.get_dummies(PINFO, prefix_sep='=', dtype=bool)

In [5]:
LABELS_PROC = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'LABELS_PROC.csv'), index_col=['pcode','timestamp'],parse_dates=True)

In [6]:
_df =LABELS_PROC
_df.reset_index(level='timestamp', inplace=True)
print('First timestamp:', _df['timestamp'].min())
print('Last timestamp:', _df['timestamp'].max())

First timestamp: 2019-04-30 10:03:28+09:00
Last timestamp: 2019-05-22 22:02:03+09:00


In [7]:
time_ranges = _df.groupby('pcode')['timestamp'].agg(['min', 'max'])

In [8]:
list_pid = set(LABELS_PROC.index.get_level_values('pcode').values)

In [9]:
PINFO_valid = PINFO.loc[PINFO.index.isin(list_pid)]

In [10]:
#Divide the features into different categories
feat_current = X.loc[:,[('#VAL' in str(x)) or ('ESM#LastLabel' in str(x)) for x in X.keys()]]  
feat_dsc = X.loc[:,[('#DSC' in str(x))  for x in X.keys()]]  
feat_yesterday = X.loc[:,[('Yesterday' in str(x))  for x in X.keys()]]  
feat_today = X.loc[:,[('Today' in str(x))  for x in X.keys()]]  
feat_sleep = X.loc[:,[('Sleep' in str(x))  for x in X.keys()]]  
feat_time = X.loc[:,[('Time' in str(x))  for x in X.keys()]]  
feat_pif = X.loc[:,[('PIF' in str(x))  for x in X.keys()]]  
feat_ImmediatePast = X.loc[:,[('ImmediatePast_15' in str(x))  for x in X.keys()]]
#Divide the time window features into sensor/past stress label
feat_current_sensor = X.loc[:,[('#VAL' in str(x))  for x in X.keys()]]  
feat_current_ESM = X.loc[:,[('ESM#LastLabel' in str(x)) for x in X.keys()]]  
feat_ImmediatePast_sensor = feat_ImmediatePast.loc[:,[('ESM' not in str(x)) for x in feat_ImmediatePast.keys()]]  
feat_ImmediatePast_ESM = feat_ImmediatePast.loc[:,[('ESM'  in str(x)) for x in feat_ImmediatePast.keys()]]  
feat_today_sensor = feat_today.loc[:,[('ESM' not in str(x))  for x in feat_today.keys()]]  
feat_today_ESM = feat_today.loc[:,[('ESM'  in str(x)) for x in feat_today.keys()]]  
feat_yesterday_sensor = feat_yesterday.loc[:,[('ESM' not in str(x)) for x in feat_yesterday.keys()]]  
feat_yesterday_ESM = feat_yesterday.loc[:,[('ESM'  in str(x)) for x in feat_yesterday.keys()]]
feat_baseline = pd.concat([ feat_time,feat_dsc,feat_current_sensor, feat_ImmediatePast_sensor],axis=1)
feat_final = pd.concat([feat_baseline  ], axis=1)
X = feat_final
cats = X.columns[X.dtypes == bool]

In [11]:
groups

array(['P01', 'P01', 'P01', ..., 'P80', 'P80', 'P80'], dtype='<U3')

In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTENC, SMOTE
from tqdm import tqdm
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold, TimeSeriesSplit, LeavePGroupsOut, train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

In [13]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone

normalize =True
# Feature selection method
selector = SelectFromModel(
        estimator=LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=RANDOM_STATE, max_iter=4000),
        threshold=0.005
#         estimator=LinearSVC(
#         penalty='l1',
#         loss='squared_hinge',
#         dual=False,
#         tol=1e-3,
#         C=1e-2,
#         max_iter=5000,
#         random_state=RANDOM_STATE
#     )  
    )
select = [clone(selector)]
oversample = True

In [14]:
C_cat = np.asarray(sorted(cats))
C_num = np.asarray(sorted(X.columns[~X.columns.isin(C_cat)]))

In [15]:
skip_users = ["P05", "P09"]

In [16]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score

# Lists to collect the metrics for each user
all_accs = []
all_aucs = []
all_f1s = []
all_f1p = []
all_precisions = []
all_recalls = []

# Loop through each user group
for user in PINFO_valid.index:
    if user in skip_users:
        continue
    print(f"Processing user: {user}")
    user_mask = groups == user
    X_user, y_user = X[user_mask], np.array(y)[user_mask]
    group_user = np.array(groups)[user_mask]
    
    # Count the number of 1s in y_user
    count_ones = np.sum(y_user == 1)
    print("\n Number of 1s in y_user ")
    print(count_ones)

    
    user_accs = []
    user_aucs = []
    user_f1s = []
    user_f1p = []
    user_precisions = []
    user_recalls = []
    
    # Splitting the data into 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X_user, y_user, test_size=0.2, random_state=RANDOM_STATE, stratify=y_user)
    
    count_ones = np.sum(y_train == 1)
    print("\n Number of 1s in y_train ")
    print(count_ones)
    
    count_ones = np.sum(y_test == 1)
    print("\n Number of 1s in y_test ")
    print(count_ones)
    
    # Split 20% of training set as eval set
    X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.2, random_state=RANDOM_STATE,  stratify=y_train)

    count_ones = np.sum(y_eval == 1)
    print("\n Number of 1s in y_eval ")
    print(count_ones)
    
    if normalize:
        # Normalize numeric features
        scaler = StandardScaler().fit(X_train[C_num])
        X_train[C_num] = scaler.transform(X_train[C_num].copy())
        X_eval[C_num] = scaler.transform(X_eval[C_num].copy())
        X_test[C_num] = scaler.transform(X_test[C_num].copy())

    if select:
        if isinstance(select, SelectFromModel):
            select = [select]
        for i, s in enumerate(select):
            # Fit feature selector only on training data
            s.fit(X_train, y_train)
            selected_features = X_train.columns[s.get_support()].tolist()

            # Apply feature selection to train, eval, and test
            X_train = X_train[selected_features].copy()
            X_eval = X_eval[selected_features].copy()
            X_test = X_test[selected_features].copy()

    if oversample:
        # Determine categorical features for SMOTENC
        if len(C_cat):
            M = np.isin(X_train.columns, C_cat)
            sampler = SMOTENC(categorical_features=M, random_state=RANDOM_STATE)
        else:
            minority_class_size = np.sum(y_train == 1)  # Assuming 1 is the minority class
            if minority_class_size > 3:
                sampler = SMOTE(random_state=RANDOM_STATE, k_neighbors=2)
        # Only oversample training data
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    # Train the XGBoost model
    dtrain = xgb.DMatrix(X_train, label=y_train)
    deval = xgb.DMatrix(X_eval, label=y_eval)
    dtest = xgb.DMatrix(X_test, label=y_test)


    # XGBoost parameters
    param = {
        "learning_rate": 0.01,
        "seed": RANDOM_STATE,
        "objective": 'binary:logistic',
        'eval_metric': 'auc',
        'verbosity': 0,
    }
    evallist = [(dtrain, 'train'), (deval, 'test')]
    bst = xgb.train(param, dtrain, early_stopping_rounds=10, evals=evallist)

    y_real = dtest.get_label()
    y_score = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

    # Predict binary outcomes instead of probabilities
    y_pred = [1 if score >= 0.5 else 0 for score in y_score]

    # Calculate metrics
    user_accs.append(accuracy_score(y_real, y_pred))

    if len(np.unique(y_real)) > 1:
        user_aucs.append(roc_auc_score(y_true=y_real, y_score=y_score, average=None))
    else:
        user_aucs.append(np.nan)  # NaN indicates that ROC AUC couldn't be calculated

    user_f1s.append(f1_score(y_true=y_real, y_pred=y_pred, pos_label=1, average='macro', zero_division=0))
    user_precisions.append(precision_score(y_true=y_real, y_pred=y_pred, pos_label=1, average='macro', zero_division=0))
    user_recalls.append(recall_score(y_true=y_real, y_pred=y_pred, pos_label=1, average='macro', zero_division=0))
    user_f1p.append(f1_score(y_true=y_real, y_pred=y_pred, pos_label=1, average='binary', zero_division=0))

        
all_accs.extend(user_accs)
all_aucs.extend(user_aucs)
all_f1s.extend(user_f1s)
all_f1p.extend(user_f1p)
all_precisions.extend(user_precisions)
all_recalls.extend(user_recalls)

print(f"User: {user}, Avg. Accuracy: {np.mean(user_accs):.4f}, Avg. AUC: {np.nanmean(user_aucs):.4f}, Avg. F1: {np.mean(user_f1s):.4f}")

# Overall metrics for all users
print(f"\nOverall Metrics:")
print(f"Average Accuracy: {np.mean(all_accs):.4f}")
print(f"Average AUC: {np.nanmean(all_aucs):.4f}")
print(f"Average Macro F1 Score: {np.mean(all_f1s):.4f}")
print(f"Average Macro Precision: {np.mean(all_precisions):.4f}")
print(f"Average Macro Recall: {np.mean(all_recalls):.4f}")

Processing user: P01

 Number of 1s in y_user 
11

 Number of 1s in y_train 
9

 Number of 1s in y_test 
2

 Number of 1s in y_eval 
2
[0]	train-auc:0.95679	test-auc:0.50000
[1]	train-auc:0.99846	test-auc:0.30000
[2]	train-auc:0.99846	test-auc:0.40000
[3]	train-auc:0.99846	test-auc:0.30000
[4]	train-auc:0.99846	test-auc:0.40000
[5]	train-auc:0.99846	test-auc:0.30000
[6]	train-auc:0.99846	test-auc:0.40000
[7]	train-auc:0.99846	test-auc:0.40000
[8]	train-auc:0.99846	test-auc:0.40000
[9]	train-auc:0.99846	test-auc:0.40000
Processing user: P02

 Number of 1s in y_user 
9

 Number of 1s in y_train 
7

 Number of 1s in y_test 
2

 Number of 1s in y_eval 
1
[0]	train-auc:0.99112	test-auc:0.28571
[1]	train-auc:0.99704	test-auc:0.28571
[2]	train-auc:0.99408	test-auc:0.28571
[3]	train-auc:0.99112	test-auc:0.28571
[4]	train-auc:0.99408	test-auc:0.28571
[5]	train-auc:0.99112	test-auc:0.28571
[6]	train-auc:0.99408	test-auc:0.28571
[7]	train-auc:0.99112	test-auc:0.28571
[8]	train-auc:1.00000	test-au



[0]	train-auc:0.99537	test-auc:0.83333
[1]	train-auc:1.00000	test-auc:0.83333
[2]	train-auc:1.00000	test-auc:0.83333
[3]	train-auc:1.00000	test-auc:0.83333
[4]	train-auc:1.00000	test-auc:0.83333
[5]	train-auc:1.00000	test-auc:0.83333
[6]	train-auc:1.00000	test-auc:0.83333
[7]	train-auc:1.00000	test-auc:0.83333
[8]	train-auc:1.00000	test-auc:0.83333
[9]	train-auc:1.00000	test-auc:0.83333
Processing user: P06

 Number of 1s in y_user 
14

 Number of 1s in y_train 
11

 Number of 1s in y_test 
3

 Number of 1s in y_eval 
2
[0]	train-auc:0.94215	test-auc:0.70833
[1]	train-auc:0.94215	test-auc:0.70833
[2]	train-auc:0.94215	test-auc:0.70833
[3]	train-auc:0.94215	test-auc:0.70833
[4]	train-auc:0.94215	test-auc:0.70833
[5]	train-auc:0.94215	test-auc:0.70833
[6]	train-auc:0.94215	test-auc:0.70833
[7]	train-auc:0.94215	test-auc:0.70833
[8]	train-auc:0.94215	test-auc:0.70833
[9]	train-auc:0.94215	test-auc:0.70833
Processing user: P08

 Number of 1s in y_user 
48

 Number of 1s in y_train 
38

 Nu



[0]	train-auc:0.95333	test-auc:0.37500
[1]	train-auc:0.98500	test-auc:0.39583
[2]	train-auc:0.98722	test-auc:0.35417
[3]	train-auc:0.99000	test-auc:0.39583
[4]	train-auc:0.98778	test-auc:0.39583
[5]	train-auc:0.99000	test-auc:0.39583
[6]	train-auc:0.98778	test-auc:0.39583
[7]	train-auc:0.98778	test-auc:0.39583
[8]	train-auc:0.98778	test-auc:0.39583
[9]	train-auc:0.98778	test-auc:0.39583
Processing user: P10

 Number of 1s in y_user 
6

 Number of 1s in y_train 
5

 Number of 1s in y_test 
1

 Number of 1s in y_eval 
1




ValueError: Expected n_neighbors <= n_samples,  but n_samples = 4, n_neighbors = 6

In [None]:
print(f"\nOverall Metrics:")
print(f"Average Accuracy: {np.mean(all_accs):.4f}")
print(f"Average AUC: {np.nanmean(all_aucs):.4f}")
print(f"Average Macro F1 Score: {np.mean(all_f1s):.4f}")
print(f"Average Macro Precision: {np.mean(all_precisions):.4f}")
print(f"Average Macro Recall: {np.mean(all_recalls):.4f}")
print(f"Average Positive F1 Score: {np.mean(all_f1p):.4f}")