# Import libraries

In [23]:
import shap
import pymrmr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif,RFE
from sklearn.model_selection import train_test_split, GridSearchCV,RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score, roc_curve,precision_recall_curve

# Read in data

In [25]:
# JIVE dataframes
cbcl_jive = pd.read_csv('cbcl.jive.sc.fc.out.csv') # [463 rows x 116 columns]> [drop first c]
cbcl_jive_embed = pd.read_csv('cbcl.jive.embed.csv') # [463 rows x 197 columns]> [drop first c]
cbcl_df = pd.read_csv('cbcl.sc.match.csv') # [463 rows x 3748 columns]>
cbcl_label = cbcl_df['CBCL'] # Name: CBCL, Length: 463, dtype: int64>

# Embedding dataframes
cbcl_fc_embed = pd.read_csv('cbcl_fc_node2vec_32embeddings_20wl.csv')# [463 rows x 3200 columns]>
cbcl_sc_embed = pd.read_csv('cbcl_node2vec_32embeddings_20wl.csv') # [602 rows x 2784 columns]>

# Vectorized dataframes
cbcl_fc_df = pd.read_csv('updated_flattened_fc_matrices_level_150.csv') # 463 rows × 4952 columns
cbcl_fc = cbcl_fc_df.filter(like='feature') # [463 rows x 4950 columns]>
cbcl_fc_label = cbcl_fc_df['CBCL'] # Name: CBCL, Length: 463, dtype: float64>

cbcl_sc_df = pd.read_csv('merged_dataset_cbcl.csv') # 602 rows × 3746 columns  
cbcl_sc = cbcl_sc_df.filter(like='V') # [602 rows x 3741 columns]>
cbcl_sc_label = cbcl_sc_df['CBCL'] # Name: CBCL, Length: 602, dtype: int64>

# Helpers

In [26]:
def find_best_threshold(y_true, y_prob):
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    f1_scores = 2 * (precision * recall) / (precision + recall)
    best_threshold = thresholds[np.argmax(f1_scores)]
    return best_threshold

In [27]:
def xgboost_4jive_pipeline(X, y, k_best=10, test_size=0.4, random_state=42, n_splits=5, n_repeats=3):
    # Ensure all data is numeric
    X = X.apply(pd.to_numeric, errors='coerce')
    y = pd.to_numeric(y, errors='coerce')

    # Drop any rows with NaN values resulting from the coercion
    X.dropna(inplace=True)
    y = y[y.index.isin(X.index)]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Feature selection using SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k_best)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Cross-validation and training for XGBoost
    xgb = XGBClassifier(random_state=random_state, eval_metric='logloss')
    xgb_param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_child_weight': [1, 2, 4],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # Repeated Stratified K-Fold Cross Validation
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    # Perform cross-validation
    grid_search = GridSearchCV(xgb, xgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train_selected, y_train)

    # Get the best estimator
    best_pipeline = grid_search.best_estimator_

    # Predict and evaluate XGBoost
    y_pred_prob_xgb = best_pipeline.predict_proba(X_test_selected)[:, 1]
    y_pred_xgb = best_pipeline.predict(X_test_selected)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    report_xgb = classification_report(y_test, y_pred_xgb)
    auc_xgb = roc_auc_score(y_test, y_pred_prob_xgb)

    return best_pipeline, grid_search.best_params_, accuracy_xgb, auc_xgb, report_xgb

In [28]:
def xgboost_pipeline(X, y, n_components=150, k_best=10, test_size=0.4, random_state=42, n_splits=5, n_repeats=3):
    # Ensure all data is numeric
    X = X.apply(pd.to_numeric, errors='coerce')
    y = pd.to_numeric(y, errors='coerce')

    # Drop any rows with NaN values resulting from the coercion
    X.dropna(inplace=True)
    y = y[y.index.isin(X.index)]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, shuffle=True)

    # Constant Feature Elimination (CFE)
    cfe = VarianceThreshold()
    X_train_cfe = cfe.fit_transform(X_train)
    X_test_cfe = cfe.transform(X_test)

    # Perform PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_cfe)
    X_test_pca = pca.transform(X_test_cfe)

    # Feature selection using SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k_best)
    X_train_selected = selector.fit_transform(X_train_pca, y_train)
    X_test_selected = selector.transform(X_test_pca)

    # Cross-validation and training for XGBoost
    xgb = XGBClassifier(random_state=random_state, eval_metric='logloss')
    xgb_param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_child_weight': [1, 2, 4],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # Repeated Stratified K-Fold Cross Validation
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    # Perform cross-validation
    grid_search = GridSearchCV(xgb, xgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train_selected, y_train)

    # Get the best estimator
    best_pipeline = grid_search.best_estimator_

    # Predict and evaluate XGBoost
    y_pred_prob_xgb = best_pipeline.predict_proba(X_test_selected)[:, 1]
    y_pred_xgb = best_pipeline.predict(X_test_selected)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    report_xgb = classification_report(y_test, y_pred_xgb)
    auc_xgb = roc_auc_score(y_test, y_pred_prob_xgb)

    return best_pipeline, grid_search.best_params_, accuracy_xgb, auc_xgb, report_xgb

# Main Codes

## [JIVE SC&FC]

In [21]:
best_pipeline, best_params, accuracy, auc, report = xgboost_4jive_pipeline(cbcl_jive.iloc[: , 1:], cbcl_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)

Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.4946236559139785
AUC: 0.49536178107606677
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.61      0.56        98
           1       0.46      0.36      0.41        88

    accuracy                           0.49       186
   macro avg       0.49      0.49      0.48       186
weighted avg       0.49      0.49      0.49       186



## [JIVE Embedding SC&FC]

In [9]:
best_pipeline, best_params, accuracy, auc, report = xgboost_4jive_pipeline(cbcl_jive_embed.iloc[: , 1:], cbcl_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.5268817204301075
AUC: 0.5295686456400742
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.63      0.58        98
           1       0.50      0.41      0.45        88

    accuracy                           0.53       186
   macro avg       0.52      0.52      0.52       186
weighted avg       0.52      0.53      0.52       186



## [SC]

In [29]:
best_pipeline, best_params, accuracy, auc, report = xgboost_pipeline(cbcl_sc, cbcl_sc_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.8}
Accuracy: 0.4854771784232365
AUC: 0.47741046831955924
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.56      0.52       121
           1       0.48      0.41      0.44       120

    accuracy                           0.49       241
   macro avg       0.48      0.49      0.48       241
weighted avg       0.48      0.49      0.48       241



## [SC Embedding]

In [30]:
best_pipeline, best_params, accuracy, auc, report = xgboost_pipeline(cbcl_sc_embed, cbcl_sc_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.5145228215767634
AUC: 0.5166666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.58      0.54       121
           1       0.51      0.45      0.48       120

    accuracy                           0.51       241
   macro avg       0.51      0.51      0.51       241
weighted avg       0.51      0.51      0.51       241



## [FC]

In [31]:
best_pipeline, best_params, accuracy, auc, report = xgboost_pipeline(cbcl_fc, cbcl_fc_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.5053763440860215
AUC: 0.4928307123034228
Classification Report:
               precision    recall  f1-score   support

         0.0       0.50      0.63      0.56        92
         1.0       0.51      0.38      0.44        94

    accuracy                           0.51       186
   macro avg       0.51      0.51      0.50       186
weighted avg       0.51      0.51      0.50       186



## [FC Embedding]

In [32]:
best_pipeline, best_params, accuracy, auc, report = xgboost_pipeline(cbcl_fc_embed, cbcl_fc_label)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Classification Report:\n", report)

Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}
Accuracy: 0.45698924731182794
AUC: 0.4610314523589269
Classification Report:
               precision    recall  f1-score   support

         0.0       0.46      0.58      0.51        92
         1.0       0.45      0.34      0.39        94

    accuracy                           0.46       186
   macro avg       0.46      0.46      0.45       186
weighted avg       0.46      0.46      0.45       186

