# Import libraries

In [6]:
import shap
import pymrmr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif,RFE
from sklearn.model_selection import train_test_split, GridSearchCV,RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score, roc_curve,precision_recall_curve

# Read in data

In [7]:
# JIVE dataframes
cbcl_jive = pd.read_csv('cbcl.jive.sc.fc.out.csv') # [463 rows x 116 columns]> [drop first c]
cbcl_jive_embed = pd.read_csv('cbcl.jive.embed.csv') # [463 rows x 197 columns]> [drop first c]
cbcl_df = pd.read_csv('cbcl.sc.match.csv') # [463 rows x 3748 columns]>
cbcl_label = cbcl_df['CBCL'] # Name: CBCL, Length: 463, dtype: int64>

# Embedding dataframes
cbcl_fc_embed = pd.read_csv('cbcl_fc_node2vec_32embeddings_20wl.csv')# [463 rows x 3200 columns]>
cbcl_sc_embed = pd.read_csv('cbcl_node2vec_32embeddings_20wl.csv') # [602 rows x 2784 columns]>

# Vectorized dataframes
cbcl_fc_df = pd.read_csv('updated_flattened_fc_matrices_level_150.csv') # 463 rows × 4952 columns
cbcl_fc = cbcl_fc_df.filter(like='feature') # [463 rows x 4950 columns]>
cbcl_fc_label = cbcl_fc_df['CBCL'] # Name: CBCL, Length: 463, dtype: float64>

cbcl_sc_df = pd.read_csv('merged_dataset_cbcl.csv') # 602 rows × 3746 columns  
cbcl_sc = cbcl_sc_df.filter(like='V') # [602 rows x 3741 columns]>
cbcl_sc_label = cbcl_sc_df['CBCL'] # Name: CBCL, Length: 602, dtype: int64>

# Helpers

In [8]:
def find_best_threshold(y_true, y_prob):
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    f1_scores = 2 * (precision * recall) / (precision + recall)
    best_threshold = thresholds[np.argmax(f1_scores)]
    return best_threshold

In [9]:
def xgboost_4jive_pipeline(X, y, k_best=10, test_size=0.4, random_state=42, n_splits=5, n_repeats=3):
    # Ensure all data is numeric
    X = X.apply(pd.to_numeric, errors='coerce')
    y = pd.to_numeric(y, errors='coerce')

    # Drop any rows with NaN values resulting from the coercion
    X.dropna(inplace=True)
    y = y[y.index.isin(X.index)]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Feature selection using SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k_best)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Cross-validation and training for XGBoost
    xgb = XGBClassifier(random_state=random_state, eval_metric='logloss')
    xgb_param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_child_weight': [1, 2, 4],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # Repeated Stratified K-Fold Cross Validation
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    # Perform cross-validation
    grid_search = GridSearchCV(xgb, xgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train_selected, y_train)

    # Get the best estimator
    best_pipeline = grid_search.best_estimator_

    # Predict probabilities on the training set
    y_train_prob = best_pipeline.predict_proba(X_train_selected)[:, 1]

    # Find the best threshold based on the training set
    best_threshold = find_best_threshold(y_train, y_train_prob)

    # Predict probabilities on the test set
    y_test_prob = best_pipeline.predict_proba(X_test_selected)[:, 1]

    # Apply the best threshold to the test set
    y_pred_xgb = (y_test_prob >= best_threshold).astype(int)

    # Evaluate the model
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    report_xgb = classification_report(y_test, y_pred_xgb)
    auc_xgb = roc_auc_score(y_test, y_test_prob)

    return best_pipeline, grid_search.best_params_, accuracy_xgb, auc_xgb, report_xgb, best_threshold


In [24]:
def xgboost_pipeline(X, y, n_components=150, k_best=10, test_size=0.4, random_state=42, n_splits=5, n_repeats=3):
    # Ensure all data is numeric
    X = X.apply(pd.to_numeric, errors='coerce')
    y = pd.to_numeric(y, errors='coerce')

    # Drop any rows with NaN values resulting from the coercion
    X.dropna(inplace=True)
    y = y[y.index.isin(X.index)]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, shuffle=True)

    # Constant Feature Elimination (CFE)
    cfe = VarianceThreshold()
    X_train_cfe = cfe.fit_transform(X_train)
    X_test_cfe = cfe.transform(X_test)

    # Perform PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_cfe)
    X_test_pca = pca.transform(X_test_cfe)

    # Feature selection using SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k_best)
    X_train_selected = selector.fit_transform(X_train_pca, y_train)
    X_test_selected = selector.transform(X_test_pca)

    # Cross-validation and training for XGBoost
    xgb = XGBClassifier(random_state=random_state, eval_metric='logloss')
    xgb_param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_child_weight': [1, 2, 4],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # Repeated Stratified K-Fold Cross Validation
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    # Perform cross-validation
    grid_search = GridSearchCV(xgb, xgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train_selected, y_train)

    # Get the best estimator
    best_pipeline = grid_search.best_estimator_

    # Predict probabilities on the training set
    y_train_prob = best_pipeline.predict_proba(X_train_selected)[:, 1]

    # Find the best threshold based on the training set
    best_threshold = find_best_threshold(y_train, y_train_prob)

    # Predict probabilities on the test set
    y_test_prob = best_pipeline.predict_proba(X_test_selected)[:, 1]

    # Apply the best threshold to the test set
    y_pred_xgb = (y_test_prob >= best_threshold).astype(int)

    # Evaluate the model
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    report_xgb = classification_report(y_test, y_pred_xgb)
    auc_xgb = roc_auc_score(y_test, y_test_prob)

    return best_pipeline, grid_search.best_params_, accuracy_xgb, auc_xgb, report_xgb, best_threshold

# Main Codes

## [JIVE SC&FC]

In [11]:
best_pipeline, best_params, accuracy, auc, report, best_threshold = xgboost_4jive_pipeline(cbcl_jive.iloc[: , 1:], cbcl_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)
print("Best Threshold:", best_threshold)

Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.489247311827957
AUC: 0.49536178107606677
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.62      0.56        98
           1       0.45      0.34      0.39        88

    accuracy                           0.49       186
   macro avg       0.48      0.48      0.47       186
weighted avg       0.48      0.49      0.48       186

Best Threshold: 0.57564163


## [JIVE Embedding SC&FC]

In [12]:
best_pipeline, best_params, accuracy, auc, report, best_threshold = xgboost_4jive_pipeline(cbcl_jive_embed.iloc[: , 1:], cbcl_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)
print("Best Threshold:", best_threshold)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.5376344086021505
AUC: 0.5295686456400742
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.46      0.51        98
           1       0.51      0.62      0.56        88

    accuracy                           0.54       186
   macro avg       0.54      0.54      0.54       186
weighted avg       0.54      0.54      0.53       186

Best Threshold: 0.43193695


## [SC]

In [25]:
# pc 150
best_pipeline, best_params, accuracy, auc, report, best_threshold = xgboost_pipeline(cbcl_sc, cbcl_sc_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)
print("Best Threshold:", best_threshold)

Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1.0}
Accuracy: 0.5186721991701245
AUC: 0.5416666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.50      0.51       121
           1       0.52      0.54      0.53       120

    accuracy                           0.52       241
   macro avg       0.52      0.52      0.52       241
weighted avg       0.52      0.52      0.52       241

Best Threshold: 0.44762605


## [SC Embedding]

In [26]:
# pc 150
best_pipeline, best_params, accuracy, auc, report, best_threshold = xgboost_pipeline(cbcl_sc_embed, cbcl_sc_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)
print("Best Threshold:", best_threshold)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.5394190871369294
AUC: 0.5409090909090909
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.61      0.57       121
           1       0.54      0.47      0.50       120

    accuracy                           0.54       241
   macro avg       0.54      0.54      0.54       241
weighted avg       0.54      0.54      0.54       241

Best Threshold: 0.49100077


## [FC]

In [27]:
# pc 150
best_pipeline, best_params, accuracy, auc, report, best_threshold = xgboost_pipeline(cbcl_fc, cbcl_fc_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)
print("Best Threshold:", best_threshold)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.8}
Accuracy: 0.5860215053763441
AUC: 0.5812904717853841
Classification Report:
               precision    recall  f1-score   support

         0.0       0.58      0.60      0.59        92
         1.0       0.59      0.57      0.58        94

    accuracy                           0.59       186
   macro avg       0.59      0.59      0.59       186
weighted avg       0.59      0.59      0.59       186

Best Threshold: 0.4558845


## [FC Embedding]

In [28]:
# pc 150
best_pipeline, best_params, accuracy, auc, report, best_threshold = xgboost_pipeline(cbcl_fc_embed, cbcl_fc_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)
print("Best Threshold:", best_threshold)

Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.46774193548387094
AUC: 0.4825393154486587
Classification Report:
               precision    recall  f1-score   support

         0.0       0.47      0.62      0.54        92
         1.0       0.46      0.32      0.38        94

    accuracy                           0.47       186
   macro avg       0.47      0.47      0.46       186
weighted avg       0.47      0.47      0.46       186

Best Threshold: 0.5066455
