In [1]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import sklearn
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Data

In [2]:
# Load the tab-separated activity table and discard six columns right away,
# so they never reach the feature matrix.
activities = (
    pd.read_csv('activities_final.txt', sep='\t')
    .drop(columns=['activity_Run', 'activity_Walk', 'race', 'gender', 'activeScore', 'caloriesBMR'])
)

In [3]:
# Tab-separated table with 'User' and 'Day' columns; it is inner-joined with the
# activity table in the next cell to keep only the listed user/day pairs.
valid_days = pd.read_csv('included_days.txt', sep='\t') 

In [4]:
# Inner join (the merge default): only activity rows whose (User, Date) matches a
# (User, Day) row of valid_days survive. The right-hand key column 'Day' is then dropped.
activity = (
    activities
    .merge(valid_days, left_on=['User', 'Date'], right_on=['User', 'Day'])
    .drop(columns=['Day'])
)

In [5]:
# Tab-separated sleep records; each row is labelled as nap / main sleep further down.
sleep = pd.read_csv('sleep_final.txt', sep='\t')

In [6]:
def classify_sleep(row):
    """Label one sleep record according to which group of fields is entirely missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the asleep/awake/restless fields and the
        deep/light/rem/wake stage fields listed below.

    Returns
    -------
    str
        "Nap" when every stage field is NaN; otherwise "Main Sleep" when every
        asleep/awake/restless field is NaN; otherwise "Inconclusive".
        The stage check wins when both groups are completely missing.
    """
    classic_fields = (
        'asleep_count', 'asleep_minutes',
        'awake_count', 'awake_minutes',
        'restless_minutes', 'restless_count',
    )
    stage_fields = (
        'deep_count', 'deep_minutes', 'deep_thirtyDayAvgMinutes',
        'light_count', 'light_minutes', 'light_thirtyDayAvgMinutes',
        'rem_count', 'rem_minutes', 'rem_thirtyDayAvgMinutes',
        'wake_count', 'wake_minutes', 'wake_thirtyDayAvgMinutes',
    )

    def every_field_missing(fields):
        # Lazy on purpose: stops at the first field that holds a value.
        return all(pd.isna(row[name]) for name in fields)

    classic_missing = every_field_missing(classic_fields)
    stages_missing = every_field_missing(stage_fields)

    if stages_missing:
        return "Nap"
    if classic_missing:
        return "Main Sleep"
    return "Inconclusive"

# Row-wise labelling: each record gets "Nap", "Main Sleep" or "Inconclusive".
sleep['classification'] = sleep.apply(classify_sleep, axis=1)

In [7]:
# Main-sleep records only. Bookkeeping and demographic columns are discarded, then every
# column that still contains a missing value is removed. For these rows that includes the
# asleep/awake/restless fields, which classify_sleep required to be entirely NaN.
is_main_sleep = sleep['classification'] == 'Main Sleep'
mainsleep = (
    sleep.loc[is_main_sleep]
    .drop(columns=['startTime', 'endTime', 'gender', 'race',
                   'duration', 'isMainSleep', 'classification'])
    .dropna(axis='columns')
)

## Activities + Heart rate + MainSleep

In [8]:
# Pair each retained activity day with the main-sleep record(s) sharing the same
# label, user, age and date; rows without a partner on either side are dropped.
common_features = ['label', 'User', 'age', 'Date']
df = activity.merge(mainsleep, how='inner', on=common_features)

In [9]:
# 'Date' and 'age' served as join keys above; they are removed before modelling.
df = df.drop(['Date', 'age'], axis=1)

## Feature Selection

In [10]:
# X still carries 'User': it is the grouping key for the cross-validation splits below
# and is dropped from the per-fold matrices before any model is fitted.
X = df.drop(columns=['label'])  
y = df['label']  

In [11]:
# Standardise every column except the 'User' identifier.
# StandardScaler is already imported in the first cell, so it is not re-imported here.
# NOTE(review): the scaler is fitted on all rows before the cross-validation split, so
# rows of held-out users contribute to the mean/std applied to the training folds.
scaler = StandardScaler()
cols = X.columns[X.columns != 'User']
X[cols] = scaler.fit_transform(X[cols])

In [12]:
# Fill the remaining gaps from the 20 nearest rows. fit_transform returns a bare array,
# so the frame is rebuilt with the original column names (and a fresh default index),
# after which 'User' is cast back to int.
# KNNImputer is already imported in the first cell, so it is not re-imported here.
# NOTE(review): 'User' is still a column of X at this point, so the unscaled user id
# takes part in the neighbour distance. The imputer is also fitted on all rows before
# the cross-validation split. Confirm both are intended.
imputer = KNNImputer(n_neighbors=20)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X['User'] = X['User'].astype(int)

In [13]:
# SHAP-based feature screening with a linear SVM under user-grouped 5-fold CV.
# In every fold a feature counts as important when its mean |SHAP| over the training
# rows exceeds `alpha`; feature_occurrences records in how many folds that happened.
# The SVM is then refitted on the important features only and scored on the test fold.
alpha = 0.05
group_kfold = GroupKFold(n_splits=5)

models = []
auc_scores = []
feature_occurrences = {}
shaps = []

for train_outer_ix, test_outer_ix in group_kfold.split(X, y, groups=X['User']):
    X_train, y_train = X.iloc[train_outer_ix].drop(columns='User'), y.iloc[train_outer_ix]
    X_test, y_test = X.iloc[test_outer_ix].drop(columns='User'), y.iloc[test_outer_ix]

    # model on the full feature set (used to compute the SHAP values)
    model = SVC(probability=True, kernel='linear', random_state=7)
    model.fit(X_train, y_train)

    # SHAP in training
    explainer = shap.KernelExplainer(model.predict_proba, X_train)
    shap_values = explainer.shap_values(X_train)
    abs_shap = np.abs(np.asarray(shap_values))
    if abs_shap.ndim == 3 and abs_shap.shape[:2] == X_train.shape:
        # Recent SHAP releases return (n_samples, n_features, n_classes) instead of a
        # per-class list of (n_samples, n_features) arrays. Move the class axis to the
        # front so both layouts are reduced the same way.
        abs_shap = np.moveaxis(abs_shap, -1, 0)
    # Collapse leading axes one at a time (classes, then samples) until a single
    # value per feature remains.
    while abs_shap.ndim > 1:
        abs_shap = abs_shap.mean(axis=0)
    shap_importance = abs_shap
    shaps.extend(shap_importance)
    important_features = np.array(X_train.columns)[shap_importance > alpha]

    # count
    for feature in important_features:
        feature_occurrences[feature] = feature_occurrences.get(feature, 0) + 1

    # test with selected features
    if important_features.size == 0:
        # Nothing passed the threshold: refitting on zero columns would raise.
        print("No feature exceeded alpha in this fold. Skipping refit and AUC calculation.")
        auc_scores.append(float('nan'))
        models.append(model)
        continue

    X_train_selected = X_train[important_features]
    X_test_selected = X_test[important_features]
    model.fit(X_train_selected, y_train)
    yhat_probs = model.predict_proba(X_test_selected)[:, 1]

    # AUC is undefined when the test fold holds a single class. Check for it
    # explicitly so that unrelated errors are not silently swallowed.
    if y_test.nunique() < 2:
        print("Skipping AUC calculation for this fold due to single class in y_true.")
        auc_scores.append(float('nan'))
    else:
        auc = roc_auc_score(y_test, yhat_probs)
        auc_scores.append(auc)

    models.append(model)

  0%|          | 0/427 [00:00<?, ?it/s]

  0%|          | 0/412 [00:00<?, ?it/s]

  0%|          | 0/414 [00:00<?, ?it/s]

  0%|          | 0/420 [00:00<?, ?it/s]

  0%|          | 0/423 [00:00<?, ?it/s]

In [14]:
# nanmean/nanstd skip folds whose AUC was stored as NaN (single-class test fold).
print('The mean AUC of the 5 folds is ', np.nanmean(auc_scores))
print('The std AUC of the 5 folds is ', np.nanstd(auc_scores))

The mean AUC of the 5 folds is  0.5279252717624345
The std AUC of the 5 folds is  0.20340585129616828


In [15]:
# Keep the features that exceeded the SHAP threshold in at least two folds.
final_selected_features = [feature for feature, count in feature_occurrences.items() if count >= 2]

In [16]:
# Remove the grouping key from X; it is re-attached to X_filtered from df before evaluation.
# Not idempotent: running this cell twice raises KeyError because 'User' is already gone.
X = X.drop('User', axis = 1)

In [17]:
# Restrict the design matrix to the selected features and print both shapes as a sanity check.
X_filtered = X[final_selected_features]
print(X_filtered.shape)
print(y.shape)

(524, 13)
(524,)


In [18]:
# Show the names of the retained features.
X_filtered.columns

Index(['caloriesOut', 'lightlyActiveMinutes', 'restingHeartRate',
       'heartRateZone_Out_ofRange_caloriesOut',
       'heartRateZone_Out_ofRange_minutes', 'light_thirtyDayAvgMinutes',
       'marginalCalories', 'activity_veryActive',
       'heartRateZone_Fat_Burn_caloriesOut', 'heartRateZone_Fat_Burn_minutes',
       'heartRateZone_Cardio_caloriesOut', 'heartRateZone_Cardio_minutes',
       'rem_thirtyDayAvgMinutes'],
      dtype='object')

## Evaluation

In [19]:
# Copy first so the new column is not written into a slice of X.
# 'User' is assigned by index label: X was rebuilt from the imputer's array and df comes
# straight from a merge, so both are expected to carry the same default index (verify if
# either frame is ever filtered or re-sorted in between).
X_filtered = X_filtered.copy()
X_filtered['User'] = df['User']

### CatBoost

In [20]:
# CatBoost under user-grouped 5-fold CV. Each test user is scored by the share of their
# days predicted as class 1, and the AUC of a fold is computed across its users.
# SHAP values and the test matrices of every fold are kept for later inspection.
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models, auc_per_fold = [], []
SHAP_values_per_fold, all_X_tests = [], []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    train_rows, test_rows = X_filtered.iloc[train_idx], X_filtered.iloc[test_idx]
    X_train, X_test = train_rows.drop(columns='User'), test_rows.drop(columns='User')
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_test = test_rows['User']

    # model
    model = CatBoostClassifier(iterations=200, learning_rate=0.01, depth=8, l2_leaf_reg=3, random_seed=7,
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)

    # test
    yhat = model.predict(X_test)

    # share of each user's test days predicted as class 1
    fold_users = groups_test.unique()
    row_user = groups_test.to_numpy()
    predicted_positive = np.asarray(yhat).ravel() == 1
    predicted_probs = {}
    for user in fold_users:
        own_rows = row_user == user
        predicted_probs[user] = int(predicted_positive[own_rows].sum()) / int(own_rows.sum())

    # label of each user's first test row (labels are presumably constant per user; verify)
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in fold_users}

    # auc
    user_truth = list(true_labels.values())
    if len(set(user_truth)) <= 1:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))
    else:
        auc = roc_auc_score(user_truth, list(predicted_probs.values()))
        auc_per_fold.append(auc)

    # SHAP
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    SHAP_values_per_fold.append(shap_values)
    all_X_tests.append(X_test)

    models.append(model)

In [21]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
# The score being ranked is each user's share of positive-predicted days, not a thresholded vote.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority voting:", mean_auc)

Mean AUC using majority voting: 0.8333333333333334


### XGBoost

In [22]:
# XGBoost under user-grouped 5-fold CV. Each test user is scored by the share of their
# days predicted as class 1, and the AUC of a fold is computed across its users.
# (xgboost is imported in the first cell.)
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='User'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='User'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['User']

    # model
    model = xgb.XGBClassifier(n_estimators=400, learning_rate=0.01, max_depth=8,
                              random_state=7, use_label_encoder=False, eval_metric="logloss")
    model.fit(X_train, y_train, verbose=0)

    # test
    yhat = model.predict(X_test)

    # One row per test user, built positionally so it does not rely on index alignment:
    # the label of the user's first test row and the share of days predicted positive.
    per_user = (
        pd.DataFrame({'user': groups_test.to_numpy(),
                      'label': y_test.to_numpy(),
                      'predicted_positive': np.asarray(yhat).ravel() == 1})
        .groupby('user', sort=False)
        .agg(label=('label', 'first'), positive_share=('predicted_positive', 'mean'))
    )
    predicted_probs = per_user['positive_share'].to_dict()
    true_labels = per_user['label'].to_dict()

    # auc (undefined when all test users share one label)
    if per_user['label'].nunique() > 1:
        auc = roc_auc_score(per_user['label'], per_user['positive_share'])
        auc_per_fold.append(auc)
    else:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))

    models.append(model)

In [23]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.775


### RF

In [24]:
# Random forest under user-grouped 5-fold CV. Each test user is scored by the share of
# their days predicted as class 1, and the AUC of a fold is computed across its users.
# (RandomForestClassifier is imported in the first cell.)
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='User'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='User'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['User']

    # model
    model = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=7)
    model.fit(X_train, y_train)

    # test
    yhat = model.predict(X_test)

    # One row per test user, built positionally so it does not rely on index alignment:
    # the label of the user's first test row and the share of days predicted positive.
    per_user = (
        pd.DataFrame({'user': groups_test.to_numpy(),
                      'label': y_test.to_numpy(),
                      'predicted_positive': np.asarray(yhat).ravel() == 1})
        .groupby('user', sort=False)
        .agg(label=('label', 'first'), positive_share=('predicted_positive', 'mean'))
    )
    predicted_probs = per_user['positive_share'].to_dict()
    true_labels = per_user['label'].to_dict()

    # auc (undefined when all test users share one label)
    if per_user['label'].nunique() > 1:
        auc = roc_auc_score(per_user['label'], per_user['positive_share'])
        auc_per_fold.append(auc)
    else:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))

    models.append(model)

In [25]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.8


### SVM

In [26]:
from sklearn.svm import SVC

# RBF-kernel SVM under user-grouped 5-fold CV. Each test user is scored by the share of
# their days predicted as class 1. Besides the per-fold AUC, the per-user outcomes of
# all folds are pooled in the all_* lists.
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models, auc_per_fold = [], []
all_pred_labels, all_pred_probs, all_true_labels, all_users = [], [], [], []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    train_rows, test_rows = X_filtered.iloc[train_idx], X_filtered.iloc[test_idx]
    X_train, X_test = train_rows.drop(columns='User'), test_rows.drop(columns='User')
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_test = test_rows['User']

    # model
    model = SVC(probability=True, kernel='rbf', random_state=7)
    model.fit(X_train, y_train)

    # test
    yhat = model.predict(X_test)

    # share of each user's test days predicted as class 1
    fold_users = groups_test.unique()
    row_user = groups_test.to_numpy()
    predicted_positive = np.asarray(yhat).ravel() == 1
    predicted_probs = {}
    for user in fold_users:
        own_rows = row_user == user
        predicted_probs[user] = int(predicted_positive[own_rows].sum()) / int(own_rows.sum())

    # label of each user's first test row (labels are presumably constant per user; verify)
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in fold_users}

    # majority vote: a user is positive when at least half of their days are
    predicted_labels = {user: int(share >= 0.5) for user, share in predicted_probs.items()}

    # pool the per-user results across folds
    all_pred_labels += list(predicted_labels.values())
    all_pred_probs += list(predicted_probs.values())
    all_true_labels += list(true_labels.values())
    all_users += list(true_labels.keys())

    # auc
    user_truth = list(true_labels.values())
    if len(set(user_truth)) <= 1:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))
    else:
        auc = roc_auc_score(user_truth, list(predicted_probs.values()))
        auc_per_fold.append(auc)

    models.append(model)


In [27]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.8333333333333334


### LR

In [28]:
# Logistic regression under user-grouped 5-fold CV. Each test user is scored by the share
# of their days predicted as class 1, and the AUC of a fold is computed across its users.
# (LogisticRegression is imported in the first cell.)
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='User'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='User'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['User']

    # model
    model = LogisticRegression(random_state=7, max_iter=100)
    model.fit(X_train, y_train)

    # test
    yhat = model.predict(X_test)

    # One row per test user, built positionally so it does not rely on index alignment:
    # the label of the user's first test row and the share of days predicted positive.
    per_user = (
        pd.DataFrame({'user': groups_test.to_numpy(),
                      'label': y_test.to_numpy(),
                      'predicted_positive': np.asarray(yhat).ravel() == 1})
        .groupby('user', sort=False)
        .agg(label=('label', 'first'), positive_share=('predicted_positive', 'mean'))
    )
    predicted_probs = per_user['positive_share'].to_dict()
    true_labels = per_user['label'].to_dict()

    # auc (undefined when all test users share one label)
    if per_user['label'].nunique() > 1:
        auc = roc_auc_score(per_user['label'], per_user['positive_share'])
        auc_per_fold.append(auc)
    else:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))

    models.append(model)

In [29]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.7333333333333333


### GNB

In [30]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes under user-grouped 5-fold CV. Each test user is scored by the share
# of their days predicted as class 1, and the AUC of a fold is computed across its users.
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models, auc_per_fold = [], []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    train_rows, test_rows = X_filtered.iloc[train_idx], X_filtered.iloc[test_idx]
    X_train, X_test = train_rows.drop(columns='User'), test_rows.drop(columns='User')
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_test = test_rows['User']

    # model
    model = GaussianNB()
    model.fit(X_train, y_train)

    # test
    yhat = model.predict(X_test)

    # share of each user's test days predicted as class 1
    fold_users = groups_test.unique()
    row_user = groups_test.to_numpy()
    predicted_positive = np.asarray(yhat).ravel() == 1
    predicted_probs = {}
    for user in fold_users:
        own_rows = row_user == user
        predicted_probs[user] = int(predicted_positive[own_rows].sum()) / int(own_rows.sum())

    # label of each user's first test row (labels are presumably constant per user; verify)
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in fold_users}

    # auc
    user_truth = list(true_labels.values())
    if len(set(user_truth)) <= 1:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))
    else:
        auc = roc_auc_score(user_truth, list(predicted_probs.values()))
        auc_per_fold.append(auc)

    models.append(model)

In [31]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.5999999999999999


## Activities + Heart rate + Naps

In [32]:
# Nap records only. Bookkeeping and demographic columns are discarded, then every column
# that still contains a missing value is removed. For these rows that includes the
# deep/light/rem/wake stage fields, which classify_sleep required to be entirely NaN.
is_nap = sleep['classification'] == 'Nap'
naps = (
    sleep.loc[is_nap]
    .drop(columns=['startTime', 'endTime', 'duration', 'isMainSleep', 'classification', 'gender', 'race'])
    .dropna(axis='columns')
)

In [33]:
# Pair each retained activity day with the nap record(s) sharing the same
# label, user, age and date; rows without a partner on either side are dropped.
common_features = ['label', 'User', 'age', 'Date']
df = activity.merge(naps, how='inner', on=common_features)

In [34]:
# 'Date' and 'age' served as join keys above; they are removed before modelling.
df = df.drop(['Date', 'age'], axis=1)

## Feature Selection

In [35]:
# X still carries 'User': it is the grouping key for the cross-validation splits below
# and is dropped from the per-fold matrices before any model is fitted.
X = df.drop(columns=['label']) 
y = df['label']  

In [36]:
# Standardise every column except the 'User' identifier.
# StandardScaler is already imported in the first cell, so it is not re-imported here.
# NOTE(review): the scaler is fitted on all rows before the cross-validation split, so
# rows of held-out users contribute to the mean/std applied to the training folds.
scaler = StandardScaler()
cols = X.columns[X.columns != 'User']
X[cols] = scaler.fit_transform(X[cols])

In [37]:
# Fill the remaining gaps from the 10 nearest rows. fit_transform returns a bare array,
# so the frame is rebuilt with the original column names (and a fresh default index),
# after which 'User' is cast back to int.
# KNNImputer is already imported in the first cell, so it is not re-imported here.
# NOTE(review): 'User' is still a column of X at this point, so the unscaled user id
# takes part in the neighbour distance. The imputer is also fitted on all rows before
# the cross-validation split. Confirm both are intended.
imputer = KNNImputer(n_neighbors=10)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X['User'] = X['User'].astype(int)

In [38]:
# SHAP-based feature screening with a linear SVM under user-grouped 5-fold CV.
# In every fold a feature counts as important when its mean |SHAP| over the training
# rows exceeds `alpha`; feature_occurrences records in how many folds that happened.
# The SVM is then refitted on the important features only and scored on the test fold.
alpha = 0.05
group_kfold = GroupKFold(n_splits=5)

models = []
auc_scores = []
feature_occurrences = {}
shaps = []

for train_outer_ix, test_outer_ix in group_kfold.split(X, y, groups=X['User']):
    X_train, y_train = X.iloc[train_outer_ix].drop(columns='User'), y.iloc[train_outer_ix]
    X_test, y_test = X.iloc[test_outer_ix].drop(columns='User'), y.iloc[test_outer_ix]

    # model on the full feature set (used to compute the SHAP values)
    model = SVC(probability=True, kernel='linear', random_state=7)
    model.fit(X_train, y_train)

    # SHAP in training
    explainer = shap.KernelExplainer(model.predict_proba, X_train)
    shap_values = explainer.shap_values(X_train)
    abs_shap = np.abs(np.asarray(shap_values))
    if abs_shap.ndim == 3 and abs_shap.shape[:2] == X_train.shape:
        # Recent SHAP releases return (n_samples, n_features, n_classes) instead of a
        # per-class list of (n_samples, n_features) arrays. Move the class axis to the
        # front so both layouts are reduced the same way.
        abs_shap = np.moveaxis(abs_shap, -1, 0)
    # Collapse leading axes one at a time (classes, then samples) until a single
    # value per feature remains.
    while abs_shap.ndim > 1:
        abs_shap = abs_shap.mean(axis=0)
    shap_importance = abs_shap
    shaps.extend(shap_importance)
    important_features = np.array(X_train.columns)[shap_importance > alpha]

    # count
    for feature in important_features:
        feature_occurrences[feature] = feature_occurrences.get(feature, 0) + 1

    # test with selected features
    if important_features.size == 0:
        # Nothing passed the threshold: refitting on zero columns would raise.
        print("No feature exceeded alpha in this fold. Skipping refit and AUC calculation.")
        auc_scores.append(float('nan'))
        models.append(model)
        continue

    X_train_selected = X_train[important_features]
    X_test_selected = X_test[important_features]
    model.fit(X_train_selected, y_train)
    yhat_probs = model.predict_proba(X_test_selected)[:, 1]

    # AUC is undefined when the test fold holds a single class. Check for it
    # explicitly so that unrelated errors are not silently swallowed.
    if y_test.nunique() < 2:
        print("Skipping AUC calculation for this fold due to single class in y_true.")
        auc_scores.append(float('nan'))
    else:
        auc = roc_auc_score(y_test, yhat_probs)
        auc_scores.append(auc)

    models.append(model)

  0%|          | 0/130 [00:00<?, ?it/s]

Skipping AUC calculation for this fold due to single class in y_true.


  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

Skipping AUC calculation for this fold due to single class in y_true.


In [39]:
# nanmean/nanstd skip folds whose AUC was stored as NaN (single-class test fold),
# so these statistics cover only the folds where an AUC could be computed.
print('The mean AUC of the 5 folds is ', np.nanmean(auc_scores))
print('The std AUC of the 5 folds is ', np.nanstd(auc_scores))

The mean AUC of the 5 folds is  0.8882225553889721
The std AUC of the 5 folds is  0.09484930284162234


In [40]:
# Keep the features that exceeded the SHAP threshold in at least two folds.
final_selected_features = [feature for feature, count in feature_occurrences.items() if count >= 2]

In [41]:
# Remove the grouping key from X; it is re-attached to X_filtered from df before evaluation.
# Not idempotent: running this cell twice raises KeyError because 'User' is already gone.
X = X.drop('User', axis = 1)

In [42]:
# Restrict the design matrix to the selected features and print both shapes as a sanity check.
X_filtered = X[final_selected_features]
print(X_filtered.shape)
print(y.shape)

(171, 13)
(171,)


In [43]:
# Show the names of the retained features.
X_filtered.columns

Index(['caloriesOut', 'lightlyActiveMinutes', 'sedentaryMinutes', 'steps',
       'veryActiveMinutes', 'heartRateZone_Out_ofRange_caloriesOut',
       'heartRateZone_Out_ofRange_minutes',
       'heartRateZone_Fat_Burn_caloriesOut', 'heartRateZone_Cardio_minutes',
       'restingHeartRate', 'heartRateZone_Out_ofRange_min',
       'heartRateZone_Fat_Burn_max', 'totalTimeInBed'],
      dtype='object')

## Evaluation

In [44]:
# Copy first so the new column is not written into a slice of X.
# 'User' is assigned by index label: X was rebuilt from the imputer's array and df comes
# straight from a merge, so both are expected to carry the same default index (verify if
# either frame is ever filtered or re-sorted in between).
X_filtered = X_filtered.copy()
X_filtered['User'] = df['User']

### CatBoost

In [45]:
# CatBoost under user-grouped 5-fold CV. Each test user is scored by the share of their
# days predicted as class 1, and the AUC of a fold is computed across its users.
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models = []
auc_per_fold = []

# Folds are numbered from 1, like in every other evaluation cell of this notebook.
for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='User'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='User'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['User']

    # model
    model = CatBoostClassifier(iterations=10, learning_rate=0.01, depth=8, l2_leaf_reg=3, random_seed=6,
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)

    # test
    yhat = model.predict(X_test)

    # One row per test user, built positionally so it does not rely on index alignment:
    # the label of the user's first test row and the share of days predicted positive.
    per_user = (
        pd.DataFrame({'user': groups_test.to_numpy(),
                      'label': y_test.to_numpy(),
                      'predicted_positive': np.asarray(yhat).ravel() == 1})
        .groupby('user', sort=False)
        .agg(label=('label', 'first'), positive_share=('predicted_positive', 'mean'))
    )
    predicted_probs = per_user['positive_share'].to_dict()
    true_labels = per_user['label'].to_dict()

    # auc (undefined when all test users share one label)
    if per_user['label'].nunique() > 1:
        auc = roc_auc_score(per_user['label'], per_user['positive_share'])
        auc_per_fold.append(auc)
    else:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))

    models.append(model)

Only one class present in fold 0. Skipping AUC calculation.
Only one class present in fold 4. Skipping AUC calculation.


In [46]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
# The score being ranked is each user's share of positive-predicted days, not a thresholded vote.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority voting:", mean_auc)

Mean AUC using majority voting: 0.6666666666666666


### XGBoost

In [47]:
import xgboost as xgb

# XGBoost under user-grouped 5-fold CV. Each test user is scored by the share of their
# days predicted as class 1, and the AUC of a fold is computed across its users.
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models, auc_per_fold = [], []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    train_rows, test_rows = X_filtered.iloc[train_idx], X_filtered.iloc[test_idx]
    X_train, X_test = train_rows.drop(columns='User'), test_rows.drop(columns='User')
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_test = test_rows['User']

    # model
    model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.01, max_depth=8,
                              random_state=9, use_label_encoder=False, eval_metric="logloss")
    model.fit(X_train, y_train, verbose=10)

    # test
    yhat = model.predict(X_test)

    # share of each user's test days predicted as class 1
    fold_users = groups_test.unique()
    row_user = groups_test.to_numpy()
    predicted_positive = np.asarray(yhat).ravel() == 1
    predicted_probs = {}
    for user in fold_users:
        own_rows = row_user == user
        predicted_probs[user] = int(predicted_positive[own_rows].sum()) / int(own_rows.sum())

    # label of each user's first test row (labels are presumably constant per user; verify)
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in fold_users}

    # auc
    user_truth = list(true_labels.values())
    if len(set(user_truth)) <= 1:
        print(f"Warning: Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))
    else:
        auc = roc_auc_score(user_truth, list(predicted_probs.values()))
        auc_per_fold.append(auc)

    models.append(model)



In [48]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.5


### RF

In [49]:
from sklearn.ensemble import RandomForestClassifier

# Random forest under user-grouped 5-fold CV. Each test user is scored by the share of
# their days predicted as class 1, and the AUC of a fold is computed across its users.
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models, auc_per_fold = [], []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    train_rows, test_rows = X_filtered.iloc[train_idx], X_filtered.iloc[test_idx]
    X_train, X_test = train_rows.drop(columns='User'), test_rows.drop(columns='User')
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_test = test_rows['User']

    # model
    model = RandomForestClassifier(n_estimators=10, max_depth=8, random_state=9)
    model.fit(X_train, y_train)

    # test
    yhat = model.predict(X_test)

    # share of each user's test days predicted as class 1
    fold_users = groups_test.unique()
    row_user = groups_test.to_numpy()
    predicted_positive = np.asarray(yhat).ravel() == 1
    predicted_probs = {}
    for user in fold_users:
        own_rows = row_user == user
        predicted_probs[user] = int(predicted_positive[own_rows].sum()) / int(own_rows.sum())

    # label of each user's first test row (labels are presumably constant per user; verify)
    true_labels = {user: y_test[groups_test == user].iloc[0] for user in fold_users}

    # auc
    user_truth = list(true_labels.values())
    if len(set(user_truth)) <= 1:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))
    else:
        auc = roc_auc_score(user_truth, list(predicted_probs.values()))
        auc_per_fold.append(auc)

    models.append(model)

Only one class present in fold 1. Skipping AUC calculation.
Only one class present in fold 5. Skipping AUC calculation.


In [50]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.5833333333333334


### SVM

In [51]:
# Linear SVM under user-grouped 5-fold CV. Each test user is scored by the share of their
# days predicted as class 1, and the AUC of a fold is computed across its users.
# (SVC is imported in the first cell.)
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='User'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='User'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['User']

    # model
    model = SVC(probability=True, kernel='linear', random_state=7)
    model.fit(X_train, y_train)

    # test
    yhat = model.predict(X_test)

    # One row per test user, built positionally so it does not rely on index alignment:
    # the label of the user's first test row and the share of days predicted positive.
    per_user = (
        pd.DataFrame({'user': groups_test.to_numpy(),
                      'label': y_test.to_numpy(),
                      'predicted_positive': np.asarray(yhat).ravel() == 1})
        .groupby('user', sort=False)
        .agg(label=('label', 'first'), positive_share=('predicted_positive', 'mean'))
    )
    predicted_probs = per_user['positive_share'].to_dict()
    true_labels = per_user['label'].to_dict()

    # auc (undefined when all test users share one label)
    if per_user['label'].nunique() > 1:
        auc = roc_auc_score(per_user['label'], per_user['positive_share'])
        auc_per_fold.append(auc)
    else:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))

    models.append(model)

Only one class present in fold 1. Skipping AUC calculation.
Only one class present in fold 5. Skipping AUC calculation.


In [52]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.75


### LR

In [53]:
# Logistic regression under user-grouped 5-fold CV. Each test user is scored by the share
# of their days predicted as class 1, and the AUC of a fold is computed across its users.
# (LogisticRegression is imported in the first cell.)
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='User'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='User'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['User']

    # model
    model = LogisticRegression(random_state=7, max_iter=200)
    model.fit(X_train, y_train)

    # test
    yhat = model.predict(X_test)

    # One row per test user, built positionally so it does not rely on index alignment:
    # the label of the user's first test row and the share of days predicted positive.
    per_user = (
        pd.DataFrame({'user': groups_test.to_numpy(),
                      'label': y_test.to_numpy(),
                      'predicted_positive': np.asarray(yhat).ravel() == 1})
        .groupby('user', sort=False)
        .agg(label=('label', 'first'), positive_share=('predicted_positive', 'mean'))
    )
    predicted_probs = per_user['positive_share'].to_dict()
    true_labels = per_user['label'].to_dict()

    # auc (undefined when all test users share one label)
    if per_user['label'].nunique() > 1:
        auc = roc_auc_score(per_user['label'], per_user['positive_share'])
        auc_per_fold.append(auc)
    else:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))

    models.append(model)

Only one class present in fold 1. Skipping AUC calculation.
Only one class present in fold 5. Skipping AUC calculation.


In [54]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.75


### GNB

In [55]:
# Gaussian naive Bayes under user-grouped 5-fold CV. Each test user is scored by the share
# of their days predicted as class 1, and the AUC of a fold is computed across its users.
# (GaussianNB is imported in the first cell.)
group_kfold = GroupKFold(n_splits=5)
splits = group_kfold.split(X_filtered, y, groups=X_filtered['User'])

models = []
auc_per_fold = []

for fold_num, (train_idx, test_idx) in enumerate(splits, start=1):
    X_train, y_train = X_filtered.iloc[train_idx].drop(columns='User'), y.iloc[train_idx]
    X_test, y_test = X_filtered.iloc[test_idx].drop(columns='User'), y.iloc[test_idx]
    groups_test = X_filtered.iloc[test_idx]['User']

    # model
    model = GaussianNB()
    model.fit(X_train, y_train)

    # test
    yhat = model.predict(X_test)

    # One row per test user, built positionally so it does not rely on index alignment:
    # the label of the user's first test row and the share of days predicted positive.
    per_user = (
        pd.DataFrame({'user': groups_test.to_numpy(),
                      'label': y_test.to_numpy(),
                      'predicted_positive': np.asarray(yhat).ravel() == 1})
        .groupby('user', sort=False)
        .agg(label=('label', 'first'), positive_share=('predicted_positive', 'mean'))
    )
    predicted_probs = per_user['positive_share'].to_dict()
    true_labels = per_user['label'].to_dict()

    # auc (undefined when all test users share one label)
    if per_user['label'].nunique() > 1:
        auc = roc_auc_score(per_user['label'], per_user['positive_share'])
        auc_per_fold.append(auc)
    else:
        print(f"Only one class present in fold {fold_num}. Skipping AUC calculation.")
        auc_per_fold.append(float('nan'))

    models.append(model)

Only one class present in fold 1. Skipping AUC calculation.
Only one class present in fold 5. Skipping AUC calculation.


In [56]:
# Average of the per-fold user-level AUCs; folds stored as NaN are ignored.
mean_auc = np.nanmean(auc_per_fold)
print("Mean AUC using majority vote:", mean_auc)

Mean AUC using majority vote: 0.5
