In [6]:
import pandas as pd
import numpy as np
import featuretools as ft
import matplotlib.pyplot as plt
import sklearn
import shap
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import roc_curve, roc_auc_score, auc

In [7]:
import warnings
# Silence warnings whose message starts with "is_categorical_dtype"
# (the second positional argument of filterwarnings is the message pattern).
warnings.filterwarnings("ignore", "is_categorical_dtype")

## Read Data

In [11]:
# Load the tab-separated activity table and discard the columns not used downstream.
activity = pd.read_csv('activity.txt', sep='\t').drop(
    columns=['gender', 'race', 'caloriesBMR']
)

In [12]:
# Load the tab-separated sleep table.
sleep = pd.read_csv('sleep.txt', sep='\t') 

In [13]:
#divide into main sleep and naps
def classify_sleep(row):
    """Label one sleep record as "Nap", "Main Sleep" or "Inconclusive".

    `row` is anything indexable by column name (e.g. the Series handed over by
    `DataFrame.apply(..., axis=1)`).

    * "Nap"          - every deep/light/rem/wake stage column is missing.
    * "Main Sleep"   - otherwise, every asleep/awake/restless column is missing.
    * "Inconclusive" - neither group is entirely missing.
    """
    classic_fields = (
        'asleep_count', 'asleep_minutes',
        'awake_count', 'awake_minutes',
        'restless_minutes', 'restless_count',
    )
    stage_fields = (
        'deep_count', 'deep_minutes', 'deep_thirtyDayAvgMinutes',
        'light_count', 'light_minutes', 'light_thirtyDayAvgMinutes',
        'rem_count', 'rem_minutes', 'rem_thirtyDayAvgMinutes',
        'wake_count', 'wake_minutes', 'wake_thirtyDayAvgMinutes',
    )

    def _all_missing(fields):
        # True when every listed field of `row` is missing; stops at the first present value.
        for name in fields:
            if not pd.isna(row[name]):
                return False
        return True

    # Both groups are checked up front, classic fields first.
    no_classic_data = _all_missing(classic_fields)
    no_stage_data = _all_missing(stage_fields)

    # The nap test takes precedence when both groups are entirely missing.
    if no_stage_data:
        return "Nap"
    if no_classic_data:
        return "Main Sleep"
    return "Inconclusive"

# Label every sleep record row by row as "Nap", "Main Sleep" or "Inconclusive".
sleep['classification'] = sleep.apply(classify_sleep, axis=1)

In [14]:
# Keep the records classified as main sleep, drop bookkeeping/demographic columns,
# then drop every column that still holds a missing value.
mainsleep = (
    sleep.loc[sleep['classification'] == 'Main Sleep']
    .drop(columns=['isMainSleep', 'classification', 'gender', 'race'])
    .dropna(axis='columns')
)

## activity + heart rate + main sleep

In [16]:
# Inner join: keep only activity and main-sleep rows that agree on all four key columns.
common_features = ['label', 'participant', 'age', 'date']
df = activity.merge(mainsleep, how='inner', on=common_features)

In [18]:
# One row per distinct (participant, label) pair, indexed by participant.
unique_df = (
    df[['participant', 'label']]
    .drop_duplicates()
    .set_index('participant')
)

In [20]:
# Remove the label and date columns before the table is handed to featuretools.
df = df.drop(columns=['label', 'date'])

## Featuretools

In [21]:
# EntitySet holding the merged table as 'AHM'; make_index=True asks featuretools to
# create the 'ahm_index' column as the row index.
es = ft.EntitySet(id='data_activities_heart_mainsleep')
es.add_dataframe(
    dataframe_name='AHM',
    dataframe=df,
    index='ahm_index',
    make_index=True,
)

Entityset: data_activities_heart_mainsleep
  DataFrames:
    AHM [Rows: 570, Columns: 58]
  Relationships:
    No relationships

In [22]:
# Split out a 'person' dataframe keyed by participant; 'age' is moved onto it.
es.normalize_dataframe(
    base_dataframe_name='AHM',
    new_dataframe_name='person',
    index='participant',
    additional_columns=['age'],
)

Entityset: data_activities_heart_mainsleep
  DataFrames:
    AHM [Rows: 570, Columns: 57]
    person [Rows: 20, Columns: 2]
  Relationships:
    AHM.participant -> person.participant

In [23]:
# Deep feature synthesis targeting 'person': one row of generated features per participant.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name='person',
    max_depth=2,
    verbose=1,
)

Built 332 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████


## Feature Selection

In [24]:
# Feature matrix produced by featuretools, indexed by participant.
X = feature_matrix
# Re-order the labels to X's participant index: the CV loops in this notebook pair rows
# of X and y purely by position (.iloc), so both must list participants in the same order.
y = unique_df['label'].reindex(X.index)
assert y.notna().all(), "feature_matrix contains participants without a label in unique_df"

In [93]:
# SHAP-based feature selection inside a 5-fold CV.
# For every fold: fit CatBoost on the training part, keep the features whose mean |SHAP|
# on the training rows exceeds `alpha`, count how often each feature is kept, then refit
# on the kept features only and score the held-out part.
alpha = 0.02
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the fold indices once.
ix_training, ix_test = [], []
for train_ix, test_ix in kf.split(X):
    ix_training.append(train_ix)
    ix_test.append(test_ix)

models = []
auc_scores = []
feature_occurrences = {}  # feature name -> number of folds in which it passed the threshold

for train_outer_ix, test_outer_ix in zip(ix_training, ix_test):
    X_train, y_train = X.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = CatBoostClassifier(iterations=200, learning_rate=0.01, depth=6, l2_leaf_reg=3, random_seed=8,
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)

    # SHAP on the training rows only.
    # NOTE(review): assumes shap_values is (n_samples, n_features) so that axis=0 yields
    # one importance per feature - confirm for the installed shap version.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    shap_importance = np.mean(np.abs(np.array(shap_values)), axis=0)
    important_features = np.array(X_train.columns)[shap_importance > alpha]

    for feature in important_features:
        feature_occurrences[feature] = feature_occurrences.get(feature, 0) + 1

    # Refit the same model object on the selected features and score the held-out fold.
    X_train_selected = X_train[important_features]
    X_test_selected = X_test[important_features]
    model.fit(X_train_selected, y_train, cat_features=[], verbose=0)

    yhat_probs = model.predict_proba(X_test_selected)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test fold holding a single class);
    # such folds are recorded as NaN. The score is named `fold_auc` so the `auc`
    # function imported from sklearn.metrics is not rebound here.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    auc_scores.append(fold_auc)

    models.append(model)


In [94]:
# Mean and standard deviation of the per-fold AUCs; the nan-aware reducers skip NaN entries.
print('The mean AUC of the 5 folds is ', np.nanmean(auc_scores))
print('The std AUC of the 5 folds is ', np.nanstd(auc_scores))

The mean AUC of the 5 folds is  1.0
The std AUC of the 5 folds is  0.0


In [95]:
# Keep the features that passed the SHAP threshold in at least two folds.
final_selected_features = []
for feature, count in feature_occurrences.items():
    if count >= 2:
        final_selected_features.append(feature)

In [96]:
# Restrict the feature matrix to the selected columns.
# NOTE(review): the selection counted folds that together cover every row of X, so any
# later cross-validation on X_filtered scores rows that influenced the selection.
X_filtered = X[final_selected_features]

In [97]:
# Sanity check: X_filtered and y should have the same number of rows.
print(X_filtered.shape)
print(y.shape)

(20, 17)
(20,)


In [99]:
from sklearn.preprocessing import StandardScaler

# Standardise every selected feature and rebuild the DataFrame with the original
# participant index and column names.
# NOTE(review): the scaler is fitted on all rows before the CV splits below are drawn.
original_index = X_filtered.index
scaler_z = StandardScaler()
scaled_data = scaler_z.fit_transform(X_filtered)
X_filtered = pd.DataFrame(
    data=scaled_data,
    columns=X_filtered.columns,
    index=original_index,
)

## Evaluation

### Catboost

In [105]:
# Repeated K-fold CV (n_splits=5, n_repeats=5) with CatBoost on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []
all_shap_values = []  # SHAP values computed on each test split
all_X_tests = []      # the matching test-split rows

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = CatBoostClassifier(iterations=200, learning_rate=0.01, depth=8, l2_leaf_reg=3, random_seed=8,
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    # SHAP values for the current test split.
    explainer = shap.Explainer(model)
    shap_values = explainer.shap_values(X_test)
    all_shap_values.append(shap_values)
    all_X_tests.append(X_test)

    models.append(model)

In [106]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Kfold after feature selection:", mean_auc)

Mean AUC using Kfold after feature selection: 1.0


In [112]:
# Leave-one-out CV with CatBoost: each row is held out once and its predicted
# probability is pooled for a single AUC computed afterwards.
cv = LeaveOneOut()

models = []
all_y_test = []
all_yhat_probs = []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_ix], y.iloc[train_ix]
    X_test, y_test = X_filtered.iloc[test_ix], y.iloc[test_ix]

    model = CatBoostClassifier(iterations=200, learning_rate=0.01, depth=8, l2_leaf_reg=3, random_seed=7,
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    all_y_test.extend(y_test)
    all_yhat_probs.extend(yhat_probs)

    models.append(model)

In [113]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 1.0


### RF

In [116]:
from sklearn.ensemble import RandomForestClassifier

# Repeated K-fold CV (n_splits=5, n_repeats=5) with a random forest on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=7)
    model.fit(X_train, y_train)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [117]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.9880952380952381


In [120]:
# Leave-one-out CV with a random forest; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=7)
    model.fit(X_train, y_train)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)

In [121]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.9791666666666666


### XGBoost

In [122]:
import xgboost as xgb

# Repeated K-fold CV (n_splits=5, n_repeats=5) with XGBoost on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=8,
                              random_state=7, use_label_encoder=False, eval_metric="logloss")
    model.fit(X_train, y_train, verbose=0)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [123]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.9166666666666666


In [124]:
# Leave-one-out CV with XGBoost; held-out labels and probabilities are pooled.
cv = LeaveOneOut()

models = []
all_y_test = []
all_yhat_probs = []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_ix], y.iloc[train_ix]
    X_test, y_test = X_filtered.iloc[test_ix], y.iloc[test_ix]

    model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=8,
                              random_state=7, use_label_encoder=False, eval_metric="logloss")
    # verbose=0 to match every other XGBoost fit in this notebook.
    model.fit(X_train, y_train, verbose=0)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    all_y_test.extend(y_test)
    all_yhat_probs.extend(yhat_probs)

    models.append(model)

In [125]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.8333333333333333


### SVM

In [126]:
from sklearn.svm import SVC

# Repeated K-fold CV (n_splits=5, n_repeats=5) with a linear SVM on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    # probability=True is required for predict_proba on SVC.
    model = SVC(probability=True, kernel='linear', random_state=7)
    model.fit(X_train, y_train)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [127]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.9484126984126983


In [130]:
# Leave-one-out CV with a linear SVM; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = SVC(probability=True, kernel='linear', random_state=7)
    model.fit(X_train, y_train)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)

In [131]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.9270833333333334


### LR

In [132]:
from sklearn.linear_model import LogisticRegression

# Repeated K-fold CV (n_splits=5, n_repeats=5) with logistic regression on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = LogisticRegression(random_state=7, max_iter=200)
    model.fit(X_train, y_train)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [133]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.9603174603174602


In [134]:
# Leave-one-out CV with logistic regression; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = LogisticRegression(random_state=7, max_iter=200)
    model.fit(X_train, y_train)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)

In [135]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.9791666666666666


### GNB

In [136]:
from sklearn.naive_bayes import GaussianNB

# Repeated K-fold CV (n_splits=5, n_repeats=5) with Gaussian naive Bayes on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = GaussianNB()
    model.fit(X_train, y_train)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [137]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.9841269841269842


In [138]:
# Leave-one-out CV with Gaussian naive Bayes; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = GaussianNB()
    model.fit(X_train, y_train)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)

In [139]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.9375


## activity + heart rate + naps

In [140]:
# Keep the records classified as naps, drop bookkeeping/demographic columns,
# then drop every column that still holds a missing value.
naps = (
    sleep.loc[sleep['classification'] == 'Nap']
    .drop(columns=['isMainSleep', 'classification', 'gender', 'race'])
    .dropna(axis='columns')
)

In [141]:
# Inner join: keep only activity and nap rows that agree on all four key columns.
common_features = ['label', 'participant', 'age', 'date']
df = activity.merge(naps, how='inner', on=common_features)

In [143]:
# One row per distinct (participant, label) pair, indexed by participant.
unique_df = (
    df[['participant', 'label']]
    .drop_duplicates()
    .set_index('participant')
)

In [144]:
# Remove the label and date columns before the table is handed to featuretools.
df = df.drop(columns=['label', 'date'])

## Featuretools

In [145]:
# EntitySet holding the merged table as 'AHN'; make_index=True asks featuretools to
# create the 'ahn_index' column as the row index.
es = ft.EntitySet(id='data_activities_heart_nap')
es.add_dataframe(
    dataframe_name='AHN',
    dataframe=df,
    index='ahn_index',
    make_index=True,
)

Entityset: data_activities_heart_nap
  DataFrames:
    AHN [Rows: 204, Columns: 52]
  Relationships:
    No relationships

In [147]:
# Split out a 'person' dataframe keyed by participant; 'age' is moved onto it.
es.normalize_dataframe(
    base_dataframe_name='AHN',
    new_dataframe_name='person',
    index='participant',
    additional_columns=['age'],
)

Entityset: data_activities_heart_nap
  DataFrames:
    AHN [Rows: 204, Columns: 51]
    person [Rows: 15, Columns: 2]
  Relationships:
    AHN.participant -> person.participant

In [148]:
# Deep feature synthesis targeting 'person': one row of generated features per participant.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name='person',
    max_depth=2,
    verbose=1,
)

Built 296 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████


## Feature Selection

In [149]:
# Feature matrix produced by featuretools, indexed by participant.
X = feature_matrix
# Re-order the labels to X's participant index: the CV loops in this notebook pair rows
# of X and y purely by position (.iloc), so both must list participants in the same order.
y = unique_df['label'].reindex(X.index)
assert y.notna().all(), "feature_matrix contains participants without a label in unique_df"

In [167]:
# SHAP-based feature selection inside a 5-fold CV.
# For every fold: fit CatBoost on the training part, keep the features whose mean |SHAP|
# on the training rows exceeds `alpha`, count how often each feature is kept, then refit
# on the kept features only and score the held-out part.
alpha = 0.02
kf = KFold(n_splits=5, shuffle=True, random_state=40)

# Materialise the fold indices once.
ix_training, ix_test = [], []
for train_ix, test_ix in kf.split(X):
    ix_training.append(train_ix)
    ix_test.append(test_ix)

models = []
auc_scores = []
feature_occurrences = {}  # feature name -> number of folds in which it passed the threshold

for train_outer_ix, test_outer_ix in zip(ix_training, ix_test):
    X_train, y_train = X.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = CatBoostClassifier(iterations=200, learning_rate=0.01, depth=6, l2_leaf_reg=3, random_seed=7,
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)

    # SHAP on the training rows only.
    # NOTE(review): assumes shap_values is (n_samples, n_features) so that axis=0 yields
    # one importance per feature - confirm for the installed shap version.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    shap_importance = np.mean(np.abs(np.array(shap_values)), axis=0)
    important_features = np.array(X_train.columns)[shap_importance > alpha]

    for feature in important_features:
        feature_occurrences[feature] = feature_occurrences.get(feature, 0) + 1

    # Refit the same model object on the selected features and score the held-out fold.
    X_train_selected = X_train[important_features]
    X_test_selected = X_test[important_features]
    model.fit(X_train_selected, y_train, cat_features=[], verbose=0)

    yhat_probs = model.predict_proba(X_test_selected)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test fold holding a single class);
    # such folds are recorded as NaN. The score is named `fold_auc` so the `auc`
    # function imported from sklearn.metrics is not rebound here.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    auc_scores.append(fold_auc)

    models.append(model)

In [168]:
# Mean and standard deviation of the per-fold AUCs; the nan-aware reducers skip NaN entries.
print('The mean AUC of the 5 folds is ', np.nanmean(auc_scores))
print('The std AUC of the 5 folds is ', np.nanstd(auc_scores))

The mean AUC of the 5 folds is  0.6666666666666666
The std AUC of the 5 folds is  0.23570226039551584


In [169]:
# Keep the features that passed the SHAP threshold in at least two folds.
final_selected_features = []
for feature, count in feature_occurrences.items():
    if count >= 2:
        final_selected_features.append(feature)

In [170]:
# Restrict the feature matrix to the selected columns and print both shapes as a sanity check.
# NOTE(review): the selection counted folds that together cover every row of X, so any
# later cross-validation on X_filtered scores rows that influenced the selection.
X_filtered = X[final_selected_features]
print(X_filtered.shape)
print(y.shape)

(15, 19)
(15,)


In [173]:
from sklearn.preprocessing import StandardScaler

# Standardise every selected feature and rebuild the DataFrame with the original
# participant index and column names.
# NOTE(review): the scaler is fitted on all rows before the CV splits below are drawn.
original_index = X_filtered.index
scaler_z = StandardScaler()
scaled_data = scaler_z.fit_transform(X_filtered)
X_filtered = pd.DataFrame(
    data=scaled_data,
    columns=X_filtered.columns,
    index=original_index,
)

In [174]:
from sklearn.impute import KNNImputer

# Fill missing feature values with a 4-nearest-neighbour imputer and rebuild the
# DataFrame with the original participant index and column names.
# NOTE(review): the imputer is fitted on all rows before the CV splits below are drawn.
imputer = KNNImputer(n_neighbors=4)
original_index = X_filtered.index
X_imputed = imputer.fit_transform(X_filtered)
X_filtered = pd.DataFrame(
    data=X_imputed,
    columns=X_filtered.columns,
    index=original_index,
)

## Evaluation

### CatBoost

In [176]:
# Repeated K-fold CV (n_splits=5, n_repeats=5) with CatBoost on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []
all_shap_values = []
all_X_tests = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = CatBoostClassifier(iterations=200, learning_rate=0.01, depth=8, l2_leaf_reg=3, random_seed=7,
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [177]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Kfold after feature selection:", mean_auc)

Mean AUC using Kfold after feature selection: 0.9444444444444444


In [178]:
# Leave-one-out CV with CatBoost: each row is held out once and its predicted
# probability is pooled for a single AUC computed afterwards.
cv = LeaveOneOut()

models = []
all_y_test = []
all_yhat_probs = []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_ix], y.iloc[train_ix]
    X_test, y_test = X_filtered.iloc[test_ix], y.iloc[test_ix]

    model = CatBoostClassifier(iterations=200, learning_rate=0.01, depth=8, l2_leaf_reg=3, random_seed=7,
                               loss_function='Logloss')
    model.fit(X_train, y_train, cat_features=[], verbose=0)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    all_y_test.extend(y_test)
    all_yhat_probs.extend(yhat_probs)

    models.append(model)

In [179]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.9545454545454546


### RF

In [180]:
from sklearn.ensemble import RandomForestClassifier

# Repeated K-fold CV (n_splits=5, n_repeats=5) with a random forest on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=7)
    model.fit(X_train, y_train)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [181]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.9166666666666666


In [182]:
# Leave-one-out CV with a random forest; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=7)
    model.fit(X_train, y_train)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)

In [183]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.8636363636363636


### XGBoost

In [184]:
import xgboost as xgb

# Repeated K-fold CV (n_splits=5, n_repeats=5) with XGBoost on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=8,
                              random_state=7, use_label_encoder=False, eval_metric="logloss")
    model.fit(X_train, y_train, verbose=0)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [185]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.9444444444444444


In [186]:
# Leave-one-out CV with XGBoost; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=8,
                              random_state=7, use_label_encoder=False, eval_metric="logloss")
    model.fit(X_train, y_train, verbose=0)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)

In [187]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.8181818181818182


### SVM

In [188]:
from sklearn.svm import SVC

# Repeated K-fold CV (n_splits=5, n_repeats=5) with a linear SVM on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    # probability=True is required for predict_proba on SVC.
    model = SVC(probability=True, kernel='linear', random_state=7)
    model.fit(X_train, y_train)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [189]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.8888888888888888


In [190]:
# Leave-one-out CV with a linear SVM; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = SVC(probability=True, kernel='linear', random_state=7)
    model.fit(X_train, y_train)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)

In [191]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.8863636363636364


### LR

In [193]:
from sklearn.linear_model import LogisticRegression

# Repeated K-fold CV (n_splits=5, n_repeats=5) with logistic regression on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = LogisticRegression(random_state=8, max_iter=200)
    model.fit(X_train, y_train)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [194]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.9166666666666666


In [195]:
# Leave-one-out CV with logistic regression; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = LogisticRegression(random_state=8, max_iter=200)
    model.fit(X_train, y_train)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)

In [196]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.9545454545454546


### GNB

In [198]:
from sklearn.naive_bayes import GaussianNB

# Repeated K-fold CV (n_splits=5, n_repeats=5) with Gaussian naive Bayes on the selected features.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []
r_auc_scores = []

for train_outer_ix, test_outer_ix in rkf.split(X_filtered):
    X_train, y_train = X_filtered.iloc[train_outer_ix], y.iloc[train_outer_ix]
    X_test, y_test = X_filtered.iloc[test_outer_ix], y.iloc[test_outer_ix]

    model = GaussianNB()
    model.fit(X_train, y_train)

    # Second column of predict_proba.
    yhat_probs = model.predict_proba(X_test)[:, 1]

    # roc_auc_score can raise ValueError (e.g. a test split holding a single class);
    # such splits are recorded as NaN. `fold_auc` avoids rebinding the `auc` function
    # imported from sklearn.metrics.
    try:
        fold_auc = roc_auc_score(y_test, yhat_probs)
    except ValueError:
        fold_auc = float('nan')
    r_auc_scores.append(fold_auc)

    models.append(model)

In [199]:
# Mean AUC over all repeated K-fold splits; NaN entries are skipped by nanmean.
mean_auc = np.nanmean(r_auc_scores)
print("Mean AUC using Repeated Kfold after feature selection:", mean_auc)

Mean AUC using Repeated Kfold after feature selection: 0.6388888888888888


In [200]:
# Leave-one-out CV with Gaussian naive Bayes; held-out labels and probabilities are pooled.
cv = LeaveOneOut()
models, all_y_test, all_yhat_probs = [], [], []

for train_ix, test_ix in cv.split(X_filtered):
    X_train, X_test = X_filtered.iloc[train_ix], X_filtered.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = GaussianNB()
    model.fit(X_train, y_train)
    models.append(model)

    # Second column of predict_proba for the held-out row.
    yhat_probs = model.predict_proba(X_test)[:, 1]
    all_yhat_probs.extend(yhat_probs)
    all_y_test.extend(y_test)


In [201]:
# One AUC over the pooled held-out predictions collected by the leave-one-out loop.
# NOTE(review): this rebinds `auc`, which the imports cell binds to sklearn.metrics.auc.
auc = roc_auc_score(all_y_test, all_yhat_probs)
print("AUC Score across all LOOCV iterations:", auc)

AUC Score across all LOOCV iterations: 0.5909090909090908
