In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import shap
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.model_selection import GroupKFold
from collections import defaultdict
from sklearn.svm import SVC

## Read Data

In [2]:
activity = pd.read_csv('activity.txt', sep='\t') 
activity = activity.drop(['gender', 'race', 'caloriesBMR'], axis = 1)

In [3]:
sleep = pd.read_csv('sleep.txt', sep='\t') 

In [4]:
#divide into main sleep and naps
def classify_sleep(row):
    main_sleep_conditions = all(pd.isna(row[key]) for key in ['asleep_count', 'asleep_minutes', 
                                                             'awake_count', 'awake_minutes',
                                                             'restless_minutes', 'restless_count'])
    nap_conditions = all(pd.isna(row[key]) for key in ['deep_count', 'deep_minutes', 'deep_thirtyDayAvgMinutes', 
                                                      'light_count', 'light_minutes', 'light_thirtyDayAvgMinutes',
                                                      'rem_count', 'rem_minutes', 'rem_thirtyDayAvgMinutes',
                                                      'wake_count', 'wake_minutes', 'wake_thirtyDayAvgMinutes'])
    if nap_conditions:
        return "Nap"
    elif main_sleep_conditions:
        return "Main Sleep"
    else:
        return "Inconclusive" 

sleep['classification'] = sleep.apply(classify_sleep, axis=1)

In [5]:
mainsleep = sleep[sleep['classification']=='Main Sleep']
mainsleep = mainsleep.drop(['isMainSleep', 'classification',
                'gender', 'race'], axis=1) 
mainsleep = mainsleep.dropna(axis = 1)

## activity + heart rate + main sleep

In [7]:
common_features = ['label', 'participant', 'age', 'date']
df = pd.merge(activity, mainsleep, on=common_features, how='inner')

In [8]:
df = df.drop(['date', 'age'], axis=1)

## Feature Selection

In [9]:
X = df.drop(columns=['label'])  
y = df['label']  

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols = X.columns[X.columns != 'participant']
X[cols] = scaler.fit_transform(X[cols])

In [11]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=20)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X['participant'] = X['participant'].astype(int)

In [13]:
alpha = 0.05
group_kfold = GroupKFold(n_splits=5)

models = []
auc_scores = []
feature_occurrences = {} 
shaps = []

for i, (train_outer_ix, test_outer_ix) in enumerate(group_kfold.split(X, y, groups=X['participant'])):
    X_train, y_train = X.iloc[train_outer_ix].drop(columns='participant'), y.iloc[train_outer_ix]
    X_test, y_test = X.iloc[test_outer_ix].drop(columns='participant'), y.iloc[test_outer_ix]
    
    #model
    model = SVC(probability=True, kernel='linear', random_state=7) 
    model.fit(X_train, y_train)

    #SHAP in training
    explainer = shap.KernelExplainer(model.predict_proba, X_train)
    shap_values = explainer.shap_values(X_train) # 3D
    averaged_shap_values = np.mean(np.abs(shap_values), axis=0) #2D
    shap_importance = np.mean(averaged_shap_values, axis=0) #1D
    shaps.extend(shap_importance)
    important_features = np.array(X_train.columns)[shap_importance > alpha]

    #count
    for feature in important_features:
        if feature in feature_occurrences:
            feature_occurrences[feature] += 1
        else:
            feature_occurrences[feature] = 1

    #test with selected features
    X_train_selected = X_train[important_features]
    X_test_selected = X_test[important_features]
    model.fit(X_train_selected, y_train)
    yhat_probs = model.predict_proba(X_test_selected)[:, 1] 
    try:
        auc = roc_auc_score(y_test, yhat_probs)
        auc_scores.append(auc)
    except ValueError:
        print("Skipping AUC calculation for this fold due to single class in y_true.")
        auc_scores.append(float('nan'))  
    
    models.append(model)

  0%|          | 0/461 [00:00<?, ?it/s]

  0%|          | 0/449 [00:00<?, ?it/s]

  0%|          | 0/452 [00:00<?, ?it/s]

  0%|          | 0/460 [00:00<?, ?it/s]

  0%|          | 0/458 [00:00<?, ?it/s]

In [14]:
print('The mean AUC of the 5 folds is ', np.nanmean(auc_scores))
print('The std AUC of the 5 folds is ', np.nanstd(auc_scores))

The mean AUC of the 5 folds is  0.4106970158766082
The std AUC of the 5 folds is  0.1419256225781693


In [15]:
final_selected_features = [feature for feature, count in feature_occurrences.items() if count >= 2]

In [17]:
X = X.drop('participant', axis = 1)

In [18]:
X_filtered = X[final_selected_features]
print(X_filtered.shape)
print(y.shape)

(570, 15)
(570,)


## Evaluation

In [19]:
X_filtered = X_filtered.copy()
X_filtered['participant'] = df['participant']

### CatBoost

In [50]:
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['participant'])

models = []
auc_per_fold = []
SHAP_values_per_fold = []
all_X_tests = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='participant'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='participant'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['participant']
    
    # model
    model = CatBoostClassifier(iterations=300, learning_rate=0.01, depth=8, l2_leaf_reg=3, random_seed=7, 
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)
    
    # test
    yhat = model.predict(X_test)
    
    # initialization
    positive_predictions = defaultdict(int)
    total_predictions = defaultdict(int)
    
    for i, group in enumerate(groups_test):
        if yhat[i] == 1:  
            positive_predictions[group] += 1
        total_predictions[group] += 1
    
    # calculate the fraction of days predicted as positive for each user
    predicted_probs = {user: positive_predictions[user] / total_predictions[user] for user in groups_test.unique()}
    
    # extract the true labels for each user
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in groups_test.unique()}
    
    # auc
    if len(set(true_labels.values())) > 1:
        auc = roc_auc_score(list(true_labels.values()), list(predicted_probs.values()))
        auc_per_fold.append(auc)
    else:
        auc_per_fold.append(float('nan'))  
    
    models.append(model)

In [51]:
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority voting:", mean_auc)

Mean AUC using majority voting: 0.5833333333333333


### XGBoost

In [52]:
import xgboost as xgb

group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['participant'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='participant'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='participant'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['participant']
    
    # model   
    model = xgb.XGBClassifier(n_estimators=400, learning_rate=0.01, max_depth=8, 
                              random_state=7, use_label_encoder=False, eval_metric="logloss")
    model.fit(X_train, y_train, verbose=0)
    
    # test
    yhat = model.predict(X_test)
    
    # initialization
    positive_predictions = defaultdict(int)
    total_predictions = defaultdict(int)
    
    for i, group in enumerate(groups_test):
        if yhat[i] == 1:  
            positive_predictions[group] += 1
        total_predictions[group] += 1
    
    # calculate the fraction of days predicted as positive for each user
    predicted_probs = {user: positive_predictions[user] / total_predictions[user] for user in groups_test.unique()}
    
    # extract the true labels for each user
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in groups_test.unique()}
    
    predicted_labels = {user: 1 if prob >= 0.5 else 0 for user, prob in predicted_probs.items()}
    
    # auc
    if len(set(true_labels.values())) > 1:
        auc = roc_auc_score(list(true_labels.values()), list(predicted_probs.values()))
        auc_per_fold.append(auc)
    else:
        auc_per_fold.append(float('nan'))  
    
    models.append(model)

In [53]:
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.6833333333333333


### RF

In [54]:
from sklearn.ensemble import RandomForestClassifier

group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['participant'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='participant'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='participant'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['participant']
    
    # model
    model = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=7) 
    model.fit(X_train, y_train)
    
    # test
    yhat = model.predict(X_test)
    
    # initialization
    positive_predictions = defaultdict(int)
    total_predictions = defaultdict(int)
    
    for i, group in enumerate(groups_test):
        if yhat[i] == 1:  
            positive_predictions[group] += 1
        total_predictions[group] += 1
    
    # calculate the fraction of days predicted as positive for each user
    predicted_probs = {user: positive_predictions[user] / total_predictions[user] for user in groups_test.unique()}
    
    # extract the true labels for each user
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in groups_test.unique()}
    
    predicted_labels = {user: 1 if prob >= 0.5 else 0 for user, prob in predicted_probs.items()}
    
    # auc
    if len(set(true_labels.values())) > 1:
        auc = roc_auc_score(list(true_labels.values()), list(predicted_probs.values()))
        auc_per_fold.append(auc)
    else:
        auc_per_fold.append(float('nan'))  
    
    models.append(model)

In [55]:
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.6833333333333333


### SVM

In [56]:
from sklearn.svm import SVC

group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['participant'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='participant'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='participant'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['participant']
    
    # model   
    model = SVC(probability=True, kernel='linear', random_state=7) 
    model.fit(X_train, y_train)
    
    # test
    yhat = model.predict(X_test)
    
    # initialization
    positive_predictions = defaultdict(int)
    total_predictions = defaultdict(int)
    
    for i, group in enumerate(groups_test):
        if yhat[i] == 1:  
            positive_predictions[group] += 1
        total_predictions[group] += 1
    
    # calculate the fraction of days predicted as positive for each user
    predicted_probs = {user: positive_predictions[user] / total_predictions[user] for user in groups_test.unique()}
    
    # extract the true labels for each user
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in groups_test.unique()}
    
    predicted_labels = {user: 1 if prob >= 0.5 else 0 for user, prob in predicted_probs.items()}
    
    # auc
    if len(set(true_labels.values())) > 1:
        auc = roc_auc_score(list(true_labels.values()), list(predicted_probs.values()))
        auc_per_fold.append(auc)
    else:
        auc_per_fold.append(float('nan'))  
    
    models.append(model)


In [57]:
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.8666666666666666


### LR

In [58]:
from sklearn.linear_model import LogisticRegression

group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['participant'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='participant'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='participant'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['participant']
    
    # model   
    model = LogisticRegression(random_state=7, max_iter=100)
    model.fit(X_train, y_train)
    
    # test
    yhat = model.predict(X_test)
    
    # initialization
    positive_predictions = defaultdict(int)
    total_predictions = defaultdict(int)
    
    for i, group in enumerate(groups_test):
        if yhat[i] == 1:  
            positive_predictions[group] += 1
        total_predictions[group] += 1
    
    # calculate the fraction of days predicted as positive for each user
    predicted_probs = {user: positive_predictions[user] / total_predictions[user] for user in groups_test.unique()}
    
    # extract the true labels for each user
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in groups_test.unique()}
    
    predicted_labels = {user: 1 if prob >= 0.5 else 0 for user, prob in predicted_probs.items()}
    
    # auc
    if len(set(true_labels.values())) > 1:
        auc = roc_auc_score(list(true_labels.values()), list(predicted_probs.values()))
        auc_per_fold.append(auc)
    else:
        auc_per_fold.append(float('nan'))  
    
    models.append(model)

In [59]:
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.8333333333333334


### GNB

In [60]:
from sklearn.naive_bayes import GaussianNB

group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['participant'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='participant'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='participant'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['participant']
    
    # model   
    model = GaussianNB()
    model.fit(X_train, y_train)
    
    # test
    yhat = model.predict(X_test)
    
    # initialization
    positive_predictions = defaultdict(int)
    total_predictions = defaultdict(int)
    
    for i, group in enumerate(groups_test):
        if yhat[i] == 1:  
            positive_predictions[group] += 1
        total_predictions[group] += 1
    
    # calculate the fraction of days predicted as positive for each user
    predicted_probs = {user: positive_predictions[user] / total_predictions[user] for user in groups_test.unique()}
    
    # extract the true labels for each user
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in groups_test.unique()}
    
    # auc
    if len(set(true_labels.values())) > 1:
        auc = roc_auc_score(list(true_labels.values()), list(predicted_probs.values()))
        auc_per_fold.append(auc)
    else:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))  
    
    models.append(model)

In [61]:
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.5499999999999999
