In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedGroupKFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

# conda install -c conda-forge lightgbm
# conda install -c conda-forge catboost

# https://www.kaggle.com/code/ihelon/titanic-hyperparameter-tuning-with-gridsearchcv/notebook

In [2]:
def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)
    
SEED = 42
set_seed(SEED)

In [6]:
df = pd.read_csv('data/baby/order_relative_abundance_otu_drop_baby.csv')
df.patientID = df.patientID.astype('category').cat.codes
metadata_list = ['sampleID', 'EverCovid', 'CovidStatus', 'CovidLabel', 'Timepoint']  # keep patientID for later use
df0 = df.loc[df['Timepoint'].isin([1, 2, 3])]  # all three time points T1,2,3
df1 = df0.loc[df0['CovidLabel'] == 1]  # case-negatives (CN) only exist in T1&2
df2 = df0.loc[df0['CovidLabel'] == 2]  # case-positives (CP) only exist in T3
# reformat y values from [1, 2] to [0, 1], to work with XGB
df1['CovidLabel'] = 0
df2['CovidLabel'] = 1
df_CN_CP = pd.concat([df1, df2], ignore_index=True)  # reset the index after concat
df_CN_CP_otu = df_CN_CP.drop(columns=metadata_list)
X = df_CN_CP_otu.to_numpy()
y = df_CN_CP['CovidLabel'].to_numpy()
groups = df_CN_CP.patientID.to_list()  # for outer_cv use

# log transform
addons = np.min(X[:, :-3][np.nonzero(X[:, :-3])]) / 2  # find the min non-zero value, based on RF - Gut bowel disease paper
X[:, :-3] += addons  # only add values & log transform on OTU data
X[:, :-3] = np.log2(X[:, :-3])  # based on "A Fair Comparison" paper, they used log2

groups = [int(x) for x in groups]  # groups must be integers
cv = StratifiedGroupKFold(n_splits=2, random_state=SEED, shuffle=True)
# pca = PCA(n_components=5)
cross_valid_scores = {}
categorical_columns = ['HIVstatus']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['CovidLabel'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['CovidLabel'] = 1


In [None]:
for fold, (train_index, test_index) in enumerate(cv.split(X, y, groups)):
    # set up train set from each fold
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    X_train, y_train = ADASYN(random_state=SEED, n_neighbors=2).fit_resample(X_train, y_train)

In [5]:
# Decision Tree
    parameters = {"max_depth": [3, 5, 7, 9, 11, 13]}
    model_desicion_tree = DecisionTreeClassifier(random_state=SEED, class_weight='balanced')
    model_desicion_tree = GridSearchCV(
        model_desicion_tree, 
        parameters, 
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1)
    model_desicion_tree.fit(X_train[:, :-1], y_train)
    print('-----')
    print(f'Best parameters {model_desicion_tree.best_params_}')
    print(
        f'Mean cross-validated score of the best_estimator: ' + \
        f'{model_desicion_tree.best_score_:.3f}')
    cross_valid_scores['desicion_tree'] = model_desicion_tree.best_score_
    print('-----')

IndentationError: unexpected indent (1848366407.py, line 2)

In [None]:
# Random Forest
    parameters = {
        'n_estimators': [3, 5, 10, 50],
        'max_features': [0.05, 0.1, 0.5, 0.8],
        'max_depth': [3, 4, 5],
        'max_samples': [0.3, 0.5, 0.8]}
    model_random_forest = RandomForestClassifier(random_state=SEED, class_weight='balanced')
    model_random_forest = GridSearchCV(
        model_random_forest, 
        parameters, 
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1)
    model_random_forest.fit(X_train[:, :-1], y_train)
    print('-----')
    print(f'Best parameters {model_random_forest.best_params_}')
    print(
        f'Mean cross-validated score of the best_estimator: '+ \
        f'{model_random_forest.best_score_:.3f}')
    cross_valid_scores['random_forest'] = model_random_forest.best_score_
    print('-----')

In [None]:
# AdaBoost
    parameters = {
        "n_estimators": [5, 10, 15, 20, 25, 50, 75, 100], 
        "learning_rate": [0.001, 0.01, 0.1, 1.]}
    model_adaboost = AdaBoostClassifier(random_state=SEED)
    model_adaboost = GridSearchCV(
        model_adaboost, 
        parameters, 
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1)
    model_adaboost.fit(X_train[:, :-1], y_train)
    print('-----')
    print(f'Best parameters {model_adaboost.best_params_}')
    print(
        f'Mean cross-validated score of the best_estimator: '+ \
        f'{model_adaboost.best_score_:.3f}')
    cross_valid_scores['ada_boost'] = model_adaboost.best_score_
    print('-----')

In [None]:
# XGBoost
    parameters = {
        'max_depth': [3, 5, 7, 9], 
        'n_estimators': [5, 10, 15, 20, 25, 50, 100],
        'learning_rate': [0.01, 0.05, 0.1]}
    model_xgb = xgb.XGBClassifier(random_state=SEED)
    model_xgb = GridSearchCV(
        model_xgb, 
        parameters, 
        cv=5,
        scoring='roc_auc',
        n_jobs=-1)
    model_xgb.fit(X_train[:, :-1], y_train)
    print('-----')
    print(f'Best parameters {model_xgb.best_params_}')
    print(
        f'Mean cross-validated score of the best_estimator: ' + 
        f'{model_xgb.best_score_:.3f}')
    cross_valid_scores['xgboost'] = model_xgb.best_score_
    print('-----')

In [None]:
# LightGBM
    parameters = {
        'n_estimators': [5, 10, 15, 20, 25, 50, 100],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [7, 15, 31]}
    model_lgbm = lgbm.LGBMClassifier(random_state=SEED, class_weight='balanced')
    model_lgbm = GridSearchCV(
        model_lgbm, 
        parameters, 
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1)
    model_lgbm.fit(X_train[:, :-1], y_train, categorical_feature=categorical_columns)
    print('-----')
    print(f'Best parameters {model_lgbm.best_params_}')
    print(
        f'Mean cross-validated accuracy score of the best_estimator: ' + 
        f'{model_lgbm.best_score_:.3f}')
    cross_valid_scores['lightgbm'] = model_lgbm.best_score_
    print('-----')

In [None]:
# CatBoost
    parameters = {
        'iterations': [5, 10, 15, 20, 25, 50, 100],
        'learning_rate': [0.01, 0.05, 0.1],
        'depth': [3, 5, 7, 9, 11, 13]}
    model_catboost = cb.CatBoostClassifier(verbose=False)
    model_catboost = GridSearchCV(
        model_catboost, 
        parameters, 
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1)
    model_catboost.fit(X_train[:, :-1], y_train)
    print('-----')
    print(f'Best parameters {model_catboost.best_params_}')
    print(
        f'Mean cross-validated accuracy score of the best_estimator: ' + 
        f'{model_catboost.best_score_:.3f}')
    cross_valid_scores['catboost'] = model_catboost.best_score_
    print('-----')

In [None]:
pd.DataFrame(cross_valid_scores, index=['cross_valid_score']).T

In [None]:
def create_submission(model, X_test, test_passenger_id, model_name):
    y_pred_test = model.predict_proba(X_test)[:, 1]
    submission = pd.DataFrame(
        {
            'PassengerId': test_passenger_id, 
            'Survived': (y_pred_test >= 0.5).astype(int),
        }
    )
    submission.to_csv(f"submission_{model_name}.csv", index=False)
    
    return y_pred_test