# Core (Always run)

In [None]:
import shap
import mygene
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, learning_curve, cross_validate, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, label_binarize
from itertools import cycle
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, RFE, SelectFromModel
from sklearn.pipeline import Pipeline

Global Variables

In [None]:
# Directory holding the input data. The load cell builds the full path by plain string
# concatenation, so the trailing slash is required.
path_to_data = "data/"
# File name of the parquet dataset located inside `path_to_data`.
dataset_file_name = "dataset.pq"

# Preprocess

## Load datasets

In [None]:
# Read the parquet dataset from the configured data directory.
df = pd.read_parquet(f"{path_to_data}{dataset_file_name}")

## Cleaning

Transpose

In [None]:
# Swap rows and columns of the loaded frame.
# NOTE(review): this rebinds `df`, so running the cell a second time transposes it back —
# re-run the load cell first when re-executing.
df = df.T

print(f'Dataframe shape after transpose: {df.shape}')

df.head()

Apply subtypes

In [None]:
excell_sheet_df = pd.read_excel('./assets/subtype_sheet.xlsx', sheet_name='RNA-Seq 1148')

# Build a Sample ID -> PAM50 lookup. When a sample appears more than once in the sheet the
# first row wins, matching the previous `.values[0]` behaviour.
subtype_by_sample = (
    excell_sheet_df
    .drop_duplicates(subset='Sample ID', keep='first')
    .set_index('Sample ID')['PAM50']
)

# Vectorised assignment: one aligned lookup instead of a Python loop that scanned the sheet
# and wrote one cell per sample. Samples missing from the sheet get NaN, which the
# "Look for NaN" step handles afterwards.
df['Subtype'] = subtype_by_sample.reindex(df.index).to_numpy()

# One summary line instead of one or two printed lines per sample.
n_labelled = int(df['Subtype'].notna().sum())
print(f'Subtype assigned to {n_labelled} of {len(df)} samples')

df.head()

Look for NaN

In [None]:
# Total number of missing cells across the whole frame.
missing_total = df.isna().sum().sum()

if missing_total <= 0:
    print("Dataframe does not contain missing values.")
else:
    print("Dataframe contains missing values. Dropping missing values.")
    print(f'Number of missing values: {missing_total}')

    # Drops every row (sample) that has at least one missing value.
    df = df.dropna()

    print("Missing values dropped.")
    remaining_missing = df.isna().sum().sum()
    print(f'Number of remaining missing values: {remaining_missing}')

# Exploratory Data Analysis (EDA)

Plot 1: Subtype distribution plot

More info about the subtypes in this paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC6985186/

In [None]:
# Work on a copy so columns added for plotting never end up in the modelling frame.
plot_df = df.copy()

# Bars are ordered from the most to the least frequent subtype.
subtype_order = plot_df['Subtype'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=plot_df, x='Subtype', order=subtype_order, ax=ax)
ax.set_title('Distribution of Subtypes')
ax.set_xlabel('Subtype')
ax.set_ylabel('Count')
plt.show()

Plot 2: Scatter plot

Observation: Contains a few outliers, not entirely sure what to do about them.

https://stats.stackexchange.com/questions/533503/when-should-you-remove-outliers-entire-dataset-or-train-dataset

In [None]:
# Numeric expression columns only. PCA1/PCA2 are excluded explicitly because this cell writes
# them into `plot_df` below; without the exclusion, re-running the cell would feed the previous
# projection back into the PCA and change the result.
numeric_features = (
    plot_df
    .select_dtypes(include=np.number)
    .drop(columns=['PCA1', 'PCA2'], errors='ignore')
)
x_log_transformed = np.log1p(numeric_features)

# Standardise each feature before PCA so that no feature dominates purely through its scale.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(x_log_transformed)

# Project onto the first two principal components for a 2-D view of the samples.
PCA_model = PCA(n_components=2)
pca_result = PCA_model.fit_transform(df_scaled)
plot_df['PCA1'] = pca_result[:, 0]
plot_df['PCA2'] = pca_result[:, 1]

plt.figure(figsize=(10, 6))
sns.scatterplot(data=plot_df, x='PCA1', y='PCA2', hue='Subtype', palette='Set2')
plt.title('PCA Scatter Plot Colored by Subtype')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend(title='Subtype')
plt.show()

Plot 2.1: Scatter plot with outliers removed

In [None]:
# Keep only samples whose first principal component is below 2000 (PCA2 is not filtered).
keep_mask = plot_df['PCA1'] < 2000
filtered_plot_df = plot_df[keep_mask]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=filtered_plot_df, x='PCA1', y='PCA2', hue='Subtype', palette='Set2', ax=ax)
ax.set_title('PCA Scatter Plot with PCA1 < 2000 Colored by Subtype')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.legend(title='Subtype')
plt.show()

# Training

## Setup

Stratified K fold

In [None]:
# Number of parallel workers handed to scikit-learn / XGBoost (-1 = all available cores).
n_jobs = -1

# Shared, seeded stratified 5-fold splitter so every model is evaluated on the same folds.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Untuned baseline estimators, one per model family compared below.
lr_base_model = LogisticRegression(
    max_iter=7500,
    random_state=42,
)
rf_base_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=n_jobs,
)
xgb_base_model = XGBClassifier(
    objective='multi:softmax',
    random_state=42,
    n_jobs=n_jobs,
    eval_metric='mlogloss',
)

Model training function

In [None]:
def setup_pipeline(model, type: str) -> Pipeline:
    """Build the preprocessing + feature-selection + model pipeline.

    Every pipeline has the same four steps, in order:
    ``scaler`` (StandardScaler) -> ``variance_threshold`` (drops zero-variance features)
    -> ``feature_selection`` -> ``model``.

    Args:
        model: Estimator used as the final ``model`` step. It is inserted as-is (not cloned).
        type: Feature-selection strategy key:
            * ``'rfe'`` - recursive feature elimination driven by a LogisticRegression,
              keeping 50 features and removing 10% of the features per iteration.
            * ``'sbm'`` - SelectFromModel driven by a RandomForestClassifier, capped at
              50 features via ``max_features``.
              NOTE(review): SelectFromModel's default importance threshold is left
              untouched, so fewer than 50 features may be kept - confirm this is intended.
            * ``'skb'`` - SelectKBest with the ANOVA F-test (``f_classif``), k=50.

    Returns:
        An unfitted ``Pipeline``.

    Raises:
        ValueError: If ``type`` is not one of ``'rfe'``, ``'sbm'`` or ``'skb'``.
    """
    if type == 'rfe':
        feature_selector = RFE(
            estimator=LogisticRegression(max_iter=1500, random_state=42),
            n_features_to_select=50,
            step=0.1,
        )
    elif type == 'sbm':
        feature_selector = SelectFromModel(
            estimator=RandomForestClassifier(n_estimators=100, random_state=42),
            max_features=50,
        )
    elif type == 'skb':
        feature_selector = SelectKBest(score_func=f_classif, k=50)
    else:
        # The message lists the keys that are actually accepted above.
        raise ValueError("Invalid feature selection type. Choose from 'rfe', 'sbm', or 'skb'.")

    # Only the selector differs between strategies, so the pipeline is assembled once.
    return Pipeline([
        ('scaler', StandardScaler()),
        ('variance_threshold', VarianceThreshold(threshold=0.0)),
        ('feature_selection', feature_selector),
        ('model', model),
    ])
    
def print_score(scores):
    """Print the per-fold scores followed by their mean and standard deviation.

    Args:
        scores: Sequence or array of numeric scores, one entry per fold.
    """
    report_lines = (
        f"Scores for each fold: {scores}",
        f"Average score: {np.mean(scores)}",
        f"Standard deviation: {np.std(scores)}",
    )
    for report_line in report_lines:
        print(report_line)

Labeling

In [None]:
# Feature matrix: every column except the target label.
X = df.drop(columns=['Subtype'])

# Integer-encode the subtype labels; `encoder.classes_` keeps the original names for plots.
encoder = LabelEncoder()
y = encoder.fit_transform(df['Subtype'])

Normalization - log2

In [None]:
# log2(x + 1) transform of the feature columns. It is derived from `df` rather than from `X`
# itself so that re-running this cell cannot apply the logarithm a second time.
X = np.log2(df.drop(columns=['Subtype']) + 1)

Train test split

In [None]:
# Hold out 20% of the samples as a test set. `stratify=y` keeps the subtype proportions equal
# in both partitions, so a rare subtype cannot end up under-represented in either split.
# (Stratification needs at least two samples per class; scikit-learn raises otherwise.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Training Logistic Regression using LR, RF and KBest for feature selection

Logistic Regression - RFE(LogisticRegression) Feature Selection

In [None]:
# Logistic regression behind the 'rfe' feature-selection variant.
logreg_pipeline_rfe = setup_pipeline(lr_base_model, 'rfe')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
logreg_rfe_result = cross_validate(
    logreg_pipeline_rfe, X_train, y_train,
    cv=skf, scoring='f1_macro', n_jobs=n_jobs, return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(logreg_rfe_result['test_score'])

Logistic Regression - SelectFromModel(RandomForest) Feature Selection

In [None]:
# logreg_classifier_sbm = setup_pipeline(model=LogisticRegression(max_iter=1500, random_state=42), type='sbm').fit(X_train, y_train)
# logreg_predictions_sbm = logreg_classifier_sbm.predict(X_test)

# accuracy_logreg_sbm = accuracy_score(y_test, logreg_predictions_sbm)

# print(f'Logistic Regression with SBM Feature Selection Accuracy: {accuracy_logreg_sbm:.4f}')

# Logistic regression behind the 'sbm' feature-selection variant.
logreg_pipeline_sbm = setup_pipeline(lr_base_model, 'sbm')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
logreg_sbm_result = cross_validate(
    logreg_pipeline_sbm, X_train, y_train,
    cv=skf, scoring='f1_macro', n_jobs=n_jobs, return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(logreg_sbm_result['test_score'])

Logistic Regression - SelectKBest Feature Selection

In [None]:
# logreg_classifier_kbest = setup_pipeline(model=LogisticRegression(max_iter=1500, random_state=42), type='skb').fit(X_train, y_train)
# logreg_predictions_kbest = logreg_classifier_kbest.predict(X_test)

# accuracy_logreg_kbest = accuracy_score(y_test, logreg_predictions_kbest)

# print(f'Logistic Regression with KBest Feature Selection Accuracy: {accuracy_logreg_kbest:.4f}')

# Logistic regression behind the 'skb' feature-selection variant.
logreg_pipeline_kbest = setup_pipeline(lr_base_model, 'skb')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
logreg_kbest_result = cross_validate(
    logreg_pipeline_kbest, X_train, y_train,
    cv=skf, scoring='f1_macro', n_jobs=n_jobs, return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(logreg_kbest_result['test_score'])

## Training Random Forest using LR, RF and KBest for feature selection

Random Forest - RFE(LogisticRegression) Feature Selection

In [None]:
# rf_classifier_rfe = setup_pipeline(model=RandomForestClassifier(n_estimators=100, random_state=42), type='rfe').fit(X_train, y_train)
# rf_predictions_rfe = rf_classifier_rfe.predict(X_test)

# accuracy_rf_rfe = accuracy_score(y_test, rf_predictions_rfe)

# print(f'Random Forest with RFE Feature Selection Accuracy: {accuracy_rf_rfe:.4f}')

# Random forest behind the 'rfe' feature-selection variant.
rf_pipeline_rfe = setup_pipeline(rf_base_model, 'rfe')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
rf_rfe_result = cross_validate(
    rf_pipeline_rfe, X_train, y_train,
    cv=skf, scoring='f1_macro', n_jobs=n_jobs, return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(rf_rfe_result['test_score'])

Random Forest - SelectFromModel(RandomForest) Feature Selection

In [None]:
# rf_classifier_sbm = setup_pipeline(model=RandomForestClassifier(n_estimators=100, random_state=42), type='sbm').fit(X_train, y_train)
# rf_predictions_sbm = rf_classifier_sbm.predict(X_test)

# accuracy_rf_sbm = accuracy_score(y_test, rf_predictions_sbm)

# print(f'Random Forest with SBM Feature Selection Accuracy: {accuracy_rf_sbm:.4f}')

# Random forest behind the 'sbm' feature-selection variant.
rf_pipeline_sbm = setup_pipeline(rf_base_model, 'sbm')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
rf_sbm_result = cross_validate(
    rf_pipeline_sbm, X_train, y_train,
    cv=skf, scoring='f1_macro', n_jobs=n_jobs, return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(rf_sbm_result['test_score'])

Random Forest - SelectKBest Feature Selection

In [None]:
# rf_classifier_kbest = setup_pipeline(model=RandomForestClassifier(n_estimators=100, random_state=42), type='skb').fit(X_train, y_train)
# rf_predictions_kbest = rf_classifier_kbest.predict(X_test)

# accuracy_rf_kbest = accuracy_score(y_test, rf_predictions_kbest)

# print(f'Random Forest with KBest Feature Selection Accuracy: {accuracy_rf_kbest:.4f}')

# Random forest behind the 'skb' feature-selection variant.
rf_pipeline_kbest = setup_pipeline(rf_base_model, 'skb')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
rf_kbest_result = cross_validate(
    rf_pipeline_kbest, X_train, y_train,
    cv=skf, scoring='f1_macro', n_jobs=n_jobs, return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(rf_kbest_result['test_score'])

## Training XGBoost using LR, RF and KBest for feature selection

XGBoost - RFE(LogisticRegression) Feature Selection

In [None]:
# xgb_classifier_rfe = setup_pipeline(model=XGBClassifier(eval_metric='mlogloss', random_state=42), type='rfe').fit(X_train, y_train)
# xgb_predictions_rfe = xgb_classifier_rfe.predict(X_test)

# accuracy_xgb_rfe = accuracy_score(y_test, xgb_predictions_rfe)

# print(f'XGBoost with RFE Feature Selection Accuracy: {accuracy_xgb_rfe:.4f}')

# XGBoost behind the 'rfe' feature-selection variant.
xgb_pipeline_rfe = setup_pipeline(xgb_base_model, 'rfe')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
xgb_rfe_result = cross_validate(
    estimator=xgb_pipeline_rfe,
    X=X_train,
    y=y_train,
    cv=skf,
    scoring='f1_macro',
    n_jobs=n_jobs,
    return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(xgb_rfe_result['test_score'])

XGBoost - SelectFromModel(RandomForest) Feature Selection

In [None]:
# xgb_classifier_sbm = setup_pipeline(model=XGBClassifier(eval_metric='mlogloss', random_state=42), type='sbm').fit(X_train, y_train)
# xgb_predictions_sbm = xgb_classifier_sbm.predict(X_test)

# accuracy_xgb_sbm = accuracy_score(y_test, xgb_predictions_sbm)

# print(f'XGBoost with SBM Feature Selection Accuracy: {accuracy_xgb_sbm:.4f}')

# XGBoost behind the 'sbm' feature-selection variant.
xgb_pipeline_sbm = setup_pipeline(xgb_base_model, 'sbm')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
xgb_sbm_result = cross_validate(
    xgb_pipeline_sbm, X_train, y_train,
    cv=skf, scoring='f1_macro', n_jobs=n_jobs, return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(xgb_sbm_result['test_score'])

XGBoost - SelectKBest Feature Selection

In [None]:
# xgb_classifier_skbest = setup_pipeline(model=XGBClassifier(eval_metric='mlogloss', random_state=42), type='skb').fit(X_train, y_train)
# xgb_predictions_kbest = xgb_classifier_skbest.predict(X_test)

# accuracy_xgb_kbest = accuracy_score(y_test, xgb_predictions_kbest)

# print(f'XGBoost with SKB Feature Selection Accuracy: {accuracy_xgb_kbest:.4f}')

# XGBoost behind the 'skb' feature-selection variant.
xgb_pipeline_kbest = setup_pipeline(xgb_base_model, 'skb')

# Cross-validate on the training split only; train scores are kept for the overfitting report.
xgb_kbest_result = cross_validate(
    xgb_pipeline_kbest, X_train, y_train,
    cv=skf, scoring='f1_macro', n_jobs=n_jobs, return_train_score=True,
)

# 'test_score' holds the validation-fold scores.
print_score(xgb_kbest_result['test_score'])

## Final training score

In [None]:
# One row per evaluated combination: (report name, cross_validate output, base estimator,
# feature-selection key understood by `setup_pipeline`).
evaluated_runs = [
    ('logistic_regression_rfe', logreg_rfe_result, lr_base_model, 'rfe'),
    ('logistic_regression_sbm', logreg_sbm_result, lr_base_model, 'sbm'),
    ('logistic_regression_skb', logreg_kbest_result, lr_base_model, 'skb'),
    ('random_forest_rfe', rf_rfe_result, rf_base_model, 'rfe'),
    ('random_forest_sbm', rf_sbm_result, rf_base_model, 'sbm'),
    ('random_forest_skb', rf_kbest_result, rf_base_model, 'skb'),
    ('xgboost_rfe', xgb_rfe_result, xgb_base_model, 'rfe'),
    ('xgboost_sbm', xgb_sbm_result, xgb_base_model, 'sbm'),
    ('xgboost_skb', xgb_kbest_result, xgb_base_model, 'skb'),
]

# Lookup used by later cells to rebuild the pipeline of the selected configuration.
all_scores = {
    run_name: {'result': cv_result, 'base_model': base_estimator, 'type': selector_key}
    for run_name, cv_result, base_estimator, selector_key in evaluated_runs
}

# Summarise every run: mean/std of the validation F1, mean train F1, and their difference.
results = []
for model_name, data in all_scores.items():
    fold_scores = data['result']
    mean_val_score = np.mean(fold_scores['test_score'])
    mean_train_score = np.mean(fold_scores['train_score'])

    results.append({
        'model': model_name,
        'mean_val_f1': mean_val_score,
        'std_dev': np.std(fold_scores['test_score']),
        'mean_train_f1': mean_train_score,
        # Positive gap = the pipeline scores better on the folds it was trained on.
        'overfitting_gap': mean_train_score - mean_val_score
    })

pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', None)

# Smallest train/validation gap first.
report_df = pd.DataFrame(results).sort_values(by='overfitting_gap', ascending=True)

print(report_df.to_string(index=False))

# Training Results Analysis

Learning Curve plot

In [None]:
# Select the configuration with the smallest train/validation F1 gap. Using `idxmin` on the
# gap column (rather than `iloc[0]`) keeps the choice independent of how `report_df` happens
# to be sorted when this cell runs.
# NOTE(review): the criterion is the smallest overfitting gap, not the highest validation F1 —
# confirm that this is the intended definition of "best".
best_model_name = report_df.loc[report_df['overfitting_gap'].idxmin(), 'model']

# Rebuild an unfitted pipeline for the selected configuration.
best_pipeline = setup_pipeline(
    model=all_scores[best_model_name]['base_model'],
    type=all_scores[best_model_name]['type']
)

# `scoring='f1_macro'` is passed explicitly: without it `learning_curve` falls back to the
# estimator's default scorer (accuracy for classifiers), which contradicts the axis label below
# and the metric used everywhere else in this notebook.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_pipeline,
    X=X_train,
    y=y_train,
    cv=skf,
    scoring='f1_macro',
    n_jobs=n_jobs,
    train_sizes=np.linspace(0.1, 1.0, 5)
)

# Aggregate across folds (axis 1) for each training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.xlabel('Number of Training Samples')
plt.ylabel('F1-Macro Score')

plt.plot(train_sizes, train_scores_mean, 'o-', color='blue', label='Training score')
plt.plot(train_sizes, test_scores_mean, 'o-', color='green', label='Cross-validation score')

# Shaded bands show +/- one standard deviation across the folds.
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='blue')
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color='green')

plt.legend(loc='best')
plt.title(f'Learning Curve for Best Model: {best_model_name}')
plt.grid()
plt.show()

Confusion matrix

In [None]:
# Out-of-fold class predictions for every training sample (each sample is predicted by the
# model that did not see it during fitting).
cv_predict_scores = cross_val_predict(
    best_pipeline,
    X_train,
    y_train,
    cv=skf,
    n_jobs=n_jobs
)

# Pin the matrix to the full encoded label set so its rows/columns always line up with
# `encoder.classes_`, even if a class is absent from both `y_train` and the predictions.
encoded_labels = np.arange(len(encoder.classes_))
cm = confusion_matrix(y_train, cv_predict_scores, labels=encoded_labels)
plt.figure(figsize=(10, 8))

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.title(f'Confusion Matrix for Best Model: {best_model_name}')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Hyper Parameter Tuning

In [None]:
# Candidate numbers of features kept by the feature-selection step.
n_features = [50, 75, 100]

# The grids below tune LogisticRegression-specific parameters (`penalty`, `solver`, `C`),
# so fail early with a clear message if the selected pipeline ends in a different estimator.
if not isinstance(best_pipeline.named_steps['model'], LogisticRegression):
    raise TypeError(
        f"The hyper-parameter grid targets LogisticRegression, but the selected model "
        f"'{best_model_name}' uses {best_pipeline.named_steps['model'].__class__.__name__}."
    )

# Two grids because the l1 penalty is only paired with the 'saga' solver here, while the
# l2 penalty is tried with three different solvers.
param_grid = [
    {
        'model__penalty': ['l1'],
        'model__solver': ['saga'],
        'model__C': [0.01, 0.1, 1, 10, 50],
        'model__class_weight': ['balanced', None]
    },
    {
        'model__penalty': ['l2'],
        'model__solver': ['lbfgs', 'newton-cg', 'newton-cholesky'],
        'model__C': [0.01, 0.1, 1, 10, 50],
        'model__class_weight': ['balanced', None]
    }
]

# Each feature-selection strategy exposes its "number of features" under a different name.
selector_param_by_type = {
    'rfe': 'feature_selection__n_features_to_select',
    'sbm': 'feature_selection__max_features',
    'skb': 'feature_selection__k',
}

# Resolve the parameter name once (it does not depend on the grid) and add it to every grid.
best_selector_type = all_scores[best_model_name]['type']
if best_selector_type not in selector_param_by_type:
    raise ValueError("Invalid feature selection type. Choose from 'rfe', 'sbm', or 'skb'.")

for grid in param_grid:
    grid[selector_param_by_type[best_selector_type]] = n_features

grid_search = GridSearchCV(
    estimator=best_pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=skf,
    n_jobs=n_jobs,
    verbose=2
)

grid_search.fit(X_train, y_train)

print(f"Best F1-Macro Score: {grid_search.best_score_:.4f}")
print("Best Parameters Found:")
print(grid_search.best_params_)

# Pipeline refitted with the best parameter combination.
best_model = grid_search.best_estimator_

# Hyper Parameter Tuning Results Analysis

Learning curve plot

In [None]:
# `scoring='f1_macro'` is passed explicitly: without it `learning_curve` falls back to the
# estimator's default scorer (accuracy for classifiers), which contradicts the axis label below
# and the metric optimised by the grid search.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_model,
    X=X_train,
    y=y_train,
    cv=skf,
    scoring='f1_macro',
    n_jobs=n_jobs,
    train_sizes=np.linspace(0.1, 1.0, 5)
)

# Aggregate across folds (axis 1) for each training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.xlabel('Number of Training Samples')
plt.ylabel('F1-Macro Score')

plt.plot(train_sizes, train_scores_mean, 'o-', color='blue', label='Training score')
plt.plot(train_sizes, test_scores_mean, 'o-', color='green', label='Cross-validation score')

# Shaded bands show +/- one standard deviation across the folds.
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='blue')
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color='green')

plt.legend(loc='best')
plt.title(f'Learning Curve for Best Model: {best_model_name}')
plt.grid()
plt.show()

Confusion matrix

In [None]:
# Out-of-fold class predictions of the tuned pipeline for every training sample.
cv_predict_scores = cross_val_predict(
    best_model,
    X_train,
    y_train,
    cv=skf,
    n_jobs=n_jobs
)

# Pin the matrix to the full encoded label set so its rows/columns always line up with
# `encoder.classes_`, even if a class is absent from both `y_train` and the predictions.
encoded_labels = np.arange(len(encoder.classes_))
cm = confusion_matrix(y_train, cv_predict_scores, labels=encoded_labels)
plt.figure(figsize=(10, 8))

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.title(f'Confusion Matrix for Best Model: {best_model_name}')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Shap Analysis

In [None]:
# Pull the fitted steps out of the tuned pipeline so they can be applied one by one.
scaler_step = best_model.named_steps['scaler']
vt_step = best_model.named_steps['variance_threshold']
selector_step = best_model.named_steps['feature_selection']
model_step = best_model.named_steps['model']

gene_ids_array = np.array(X_train.columns)

# Trace the column names through both filtering steps: first the variance threshold ...
vt_mask = vt_step.get_support()
genes_after_vt = gene_ids_array[vt_mask]

# ... then the feature selector (whatever its type; the `skb_` prefix is historical).
skb_mask = selector_step.get_support()
final_selected_gene_ids = genes_after_vt[skb_mask]

# Strip everything after the first '.' from each id (presumably the Ensembl version
# suffix — the ids are queried with scopes='ensembl.gene' below).
final_selected_gene_ids_transformed = [gene_id.split('.')[0] for gene_id in final_selected_gene_ids]

mg = mygene.MyGeneInfo()
query_results = mg.querymany(
    final_selected_gene_ids_transformed,
    scopes='ensembl.gene',
    fields='symbol',
    species='human',
    verbose=False
)

# Map each queried id to its gene symbol; ids without a symbol keep the id itself.
gene_symbol_mapping = {res['query']: res.get('symbol', res['query']) for res in query_results}

# Fall back to the id when a query is missing from the response altogether, so the feature
# names never contain None (which would break the SHAP index and name lookups downstream).
final_feature_names = [
    gene_symbol_mapping.get(gene_id, gene_id)
    for gene_id in final_selected_gene_ids_transformed
]

# Re-apply the fitted preprocessing so the explainer sees exactly what the model was fed.
X_train_scaled = scaler_step.transform(X_train)
X_train_vt = vt_step.transform(X_train_scaled)
X_train_transformed = selector_step.transform(X_train_vt)

In [None]:
# Build a linear SHAP explainer for the pipeline's final estimator, using the selected
# training features as background data.
# NOTE(review): LinearExplainer presumes `model_step` is a linear model (e.g. LogisticRegression) —
# confirm the tuned pipeline's final step is linear before running this cell.
explainer = shap.LinearExplainer(model_step, X_train_transformed)
# Explain every training sample; the raw attributions are read from `.values` below.
shap_values = explainer(X_train_transformed)

In [None]:
# Magnitude of every attribution, regardless of its sign.
abs_attributions = np.abs(shap_values.values)

# Average over axis 0 first, then add up axis 1 of what remains; the result is one score per
# entry of `final_feature_names`.
mean_abs_attributions = abs_attributions.mean(axis=0)
global_importance_score = mean_abs_attributions.sum(axis=1)

# Label the scores with the gene names and rank them from most to least important.
feature_importance_series = pd.Series(global_importance_score, index=final_feature_names)
feature_importance_series = feature_importance_series.sort_values(ascending=False)

print("Top 10 Important Features based on SHAP values:")
print(feature_importance_series.head(10))

Simple ROPN1 Influence by Subtype - Bar Chart

In [None]:
# Fail with an explicit message when ROPN1 did not survive feature selection.
if 'ROPN1' not in final_feature_names:
    raise ValueError("ROPN1 is not among the selected features, so its SHAP influence cannot be plotted.")

ropn1_idx = final_feature_names.index('ROPN1')
class_names = encoder.classes_

# Mean absolute SHAP value of ROPN1 for each class. The attribution array is indexed as
# (sample, feature, class) — the same layout the global-importance cell relies on when it
# averages over axis 0 and labels the remaining first axis with `final_feature_names`.
# The previous `[:, class_idx, ropn1_idx]` indexing had the last two axes swapped.
influence_scores = [
    np.mean(np.abs(shap_values.values[:, ropn1_idx, class_idx]))
    for class_idx in range(len(class_names))
]

plt.figure(figsize=(10, 6))
colors = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6']
bars = plt.bar(class_names, influence_scores, color=colors, edgecolor='black', linewidth=2)

# Annotate every bar with its value.
for bar, value in zip(bars, influence_scores):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
            f'{value:.4f}',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.ylabel('Mean Absolute SHAP Value\n(Influence Strength)', fontsize=13, fontweight='bold')
plt.xlabel('Subtype', fontsize=13, fontweight='bold')
plt.title('ROPN1 Gene: Influence by Cancer Subtype', fontsize=15, fontweight='bold', pad=20)
plt.grid(axis='y', alpha=0.3, linestyle='--')
plt.tight_layout()

# Text ranking, strongest influence first.
print("ROPN1 Influence Ranking:")
print("-" * 40)
ranking = sorted(zip(class_names, influence_scores), key=lambda x: x[1], reverse=True)
for rank, (subtype, score) in enumerate(ranking, 1):
    print(f"{rank}. {subtype:10s}: {score:.4f}")

print(f"\n🎯 HIGHEST influence: {ranking[0][0]} ({ranking[0][1]:.4f})")

plt.show()

Biological check

In [None]:
# all_gene_names = X.columns

# pam50_list = [
#     "UBE2T", "BIRC5", "NUF2", "CDC6", "CCNB1", "TYMS", "MYBL2", "CEP55", 
#     "MELK", "NDC80", "RRM2", "UBE2C", "CENPF", "PTTG1", "EXO1", "ORC6L", 
#     "ANLN", "CCNE1", "CDC20", "MKI67", "KIF2C", "ACTR3B", "MYC", "EGFR", 
#     "KRT5", "PHGDH", "CDH3", "MIA", "KRT17", "FOXC1", "SFRP1", "KRT14", 
#     "ESR1", "SLC39A6", "BAG1", "MAPT", "PGR", "CXXC5", "MLPH", "BCL2", 
#     "MDM2", "NAT1", "FOXA1", "BLVRA", "MMP11", "GPR160", "FGFR4", "GRB7", 
#     "TMEM45B", "ERBB2"
# ]

# pam50_set = set(pam50_list)

# selector = best_model.named_steps['feature_selection']
# lr_model = best_model.named_steps['model']

# selected_indices = selector.get_support(indices=True)
# print(f"Selected {len(selected_indices)} feature indices")

# selected_gene_names = [all_gene_names[i] for i in selected_indices]
# print(f"Selected {len(selected_gene_names)} genes")

# # Modify to look for symbols instead of Ensembl IDs with version numbers
# selected_gene_names_modified = [gene.split('.')[0] for gene in selected_gene_names]
# print(f"Selected gene names after removing version numbers: {selected_gene_names_modified[:10]}")

# mg = mygene.MyGeneInfo()
# gene_info = mg.querymany(
#     selected_gene_names_modified,
#     scopes='ensembl.gene',
#     fields='symbol',
#     species='human',
#     verbose=False
# )

# # Print first 5 entries of gene_info for verification
# print(gene_info[:5])

# ensembl_to_symbol = {}
# for result in gene_info:
#     ensembl_id = result['query']
#     gene_symbol = result.get('symbol', ensembl_id)
#     ensembl_to_symbol[ensembl_id] = gene_symbol

# gene_symbols = [ensembl_to_symbol.get(gene, gene) for gene in selected_gene_names_modified]

# # Identify PAM50 genes in the selected genes
# pam50_selected_genes = [gene for gene in gene_symbols if gene in pam50_set]

# print(gene_symbols)
# print(pam50_set)
# print(pam50_set.intersection(set(gene_symbols)))

# print(f"Number of PAM50 genes selected: {len(pam50_selected_genes)}")
# print("PAM50 genes selected:")
# for gene in pam50_selected_genes:
#     print(gene)

# TRAINING LEGACY

Scaler

In [None]:
# standard_scaler = StandardScaler()
# X_train_scaled = standard_scaler.fit_transform(X_train)
# X_test_scaled = standard_scaler.transform(X_test)

Variance Threshold

In [None]:
# variance_filter = VarianceThreshold(threshold=0.1)
# X_train_filtered = variance_filter.fit_transform(X_train_scaled)
# X_test_filtered = variance_filter.transform(X_test_scaled)

Feature selection (SelectKBest)

In [None]:
# SelectKBest_model = SelectKBest(score_func=f_classif, k=50)
# X_train_selected = SelectKBest_model.fit_transform(X_train_filtered, y_train)
# X_test_selected = SelectKBest_model.transform(X_test_filtered)

Random Forest

In [None]:
# rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_classifier.fit(X_train_selected, y_train)

# y_pred_rf = rf_classifier.predict(X_test_selected)

# accuracy_rf = accuracy_score(y_test, y_pred_rf)

# print(f'Random Forest Classifier Accuracy: {accuracy_rf:.6f}')

Logistic Regression

In [None]:
# logreg_classifier = LogisticRegression(max_iter=1500, random_state=42)
# logreg_classifier.fit(X_train_selected, y_train)

# y_pred_lr = logreg_classifier.predict(X_test_selected)

# accuracy_lr = accuracy_score(y_test, y_pred_lr)

# print(f'Logistic Regression Classifier Accuracy: {accuracy_lr:.6f}')

XGBoost

Parameters for XGBClassifier: https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py

In [None]:
# xgb_classifier = XGBClassifier(
#     tree_method='auto',
#     n_estimators=100,
#     eval_metric='mlogloss', 
#     random_state=42,
#     max_depth=6,
# )
# xgb_classifier.fit(X_train_selected, y_train)

# y_pred_xgb = xgb_classifier.predict(X_test_selected)

# accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# print(f'XGBoost Classifier Accuracy: {accuracy_xgb:.6f}')

# Training Results Analysis

## Classification Report and Confusion Matrix

### Setup

In [None]:
# def generate_classification_report(y_true, y_pred):
#     report = classification_report(
#         y_true,
#         y_pred
#     )
#     print("Classification Report:")
#     print(report)

# def generate_confusion_matrix(y_true, y_pred, class_names, title_additional=""):
#     cm = confusion_matrix(y_true, y_pred)
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
#     plt.xlabel('Predicted')
#     plt.ylabel('True')
#     if title_additional:
#         plt.title(f'Confusion Matrix - {title_additional}')
#     else:
#         plt.title('Confusion Matrix')
#     plt.show()

# def plot_cm_on_ax(ax, y_true, y_pred, class_names, title=""):
#     cm = confusion_matrix(y_true, y_pred)
    
#     sns.heatmap(cm, 
#                 annot=True, 
#                 fmt='d', 
#                 cmap='Blues', 
#                 xticklabels=class_names, 
#                 yticklabels=class_names,
#                 ax=ax,          # <<< Plots on the provided subplot
#                 cbar=True)      # You can set this to False if you want
    
#     ax.set_xlabel('Predicted')
#     ax.set_ylabel('True')
#     ax.set_title(title, fontsize=14)

### Classification Reports - Logistic Regression

In [None]:
# all_titles = ['Model (LR-RFE)', 'Model (RF-Select)', 'Model (KBest)']

In [None]:
# all_preds_lr = [logreg_predictions_rfe, logreg_predictions_sbm, logreg_predictions_kbest]

# fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 6))

# for ax, y_pred, title in zip(axes, all_preds_lr, all_titles):
#     plot_cm_on_ax(ax, 
#                   y_true=y_test, 
#                   y_pred=y_pred, 
#                   class_names=encoder.classes_, 
#                   title=title
#                 )

# fig.suptitle('Logistic Regression Model - Feature Selection', fontsize=20, y=1.05)
# plt.tight_layout()
# plt.show()

### Classification Reports - Random Forest

In [None]:
# all_preds_rf = [rf_predictions_rfe, rf_predictions_sbm, rf_predictions_kbest]

# fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 6))

# for ax, y_pred, title in zip(axes, all_preds_rf, all_titles):
#     plot_cm_on_ax(ax, 
#                   y_true=y_test, 
#                   y_pred=y_pred, 
#                   class_names=encoder.classes_, 
#                   title=title
#                 )

# fig.suptitle('Random Forest Model - Feature Selection', fontsize=20, y=1.05)
# plt.tight_layout()
# plt.show()

### Classification Reports - XGBoost

In [None]:
# all_preds_xgb = [xgb_predictions_rfe, xgb_predictions_sbm, xgb_predictions_kbest]

# fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 6))
# for ax, y_pred, title in zip(axes, all_preds_xgb, all_titles):
#     plot_cm_on_ax(ax, 
#                   y_true=y_test, 
#                   y_pred=y_pred, 
#                   class_names=encoder.classes_, 
#                   title=title
#                 )

# fig.suptitle('XGBoost Model - Feature Selection', fontsize=20, y=1.05)
# plt.tight_layout()
# plt.show()

## Train ACC vs Test ACC

Setup

In [None]:
# def train_test_acc_compare(model, X_train, y_train, y_test, y_pred):
#     acc_score = accuracy_score(y_test, y_pred)
#     train_pred = model.predict(X_train)
#     train_f1 = f1_score(y_train, train_pred, average='weighted')
#     test_f1 = f1_score(y_test, y_pred, average='weighted')

#     print(f'Model name: {model.named_steps["model"].__class__.__name__}')
#     print(f'Feature Selection Method: {model.named_steps["feature_selection"].__class__.__name__}')
#     print('---')
#     print(f'Accuracy Score: {acc_score:.4f}')
#     print('---')
#     print(f'Training F1 Score: {train_f1:.4f}')
#     print(f'Testing F1 Score: {test_f1:.4f}')
#     print(f'F1 Score Difference (Train - Test): {train_f1 - test_f1:.4f}')

Logistic Regression

In [None]:
# train_test_acc_compare(logreg_classifier_rfe, X_train, y_train, y_test, logreg_predictions_rfe)
# print("\n")
# train_test_acc_compare(logreg_classifier_sbm, X_train, y_train, y_test, logreg_predictions_sbm)
# print("\n")
# train_test_acc_compare(logreg_classifier_kbest, X_train, y_train, y_test, logreg_predictions_kbest)

Random Forest

In [None]:
# train_test_acc_compare(rf_classifier_rfe, X_train, y_train, y_test, rf_predictions_rfe)
# print("\n")
# train_test_acc_compare(rf_classifier_sbm, X_train, y_train, y_test, rf_predictions_sbm)
# print("\n")
# train_test_acc_compare(rf_classifier_kbest, X_train, y_train, y_test, rf_predictions_kbest)

XGBoost

In [None]:
# train_test_acc_compare(xgb_classifier_rfe, X_train, y_train, y_test, xgb_predictions_rfe)
# print("\n")
# train_test_acc_compare(xgb_classifier_sbm, X_train, y_train, y_test, xgb_predictions_sbm)
# print("\n")
# train_test_acc_compare(xgb_classifier_skbest, X_train, y_train, y_test, xgb_predictions_kbest)

# Hyperparameter Tuning

## Tuning process

Possible scoring values are: 'accuracy', 'f1_macro', 'f1_weighted'

In [None]:
# scoring = 'f1_macro' # metric for evaluation
# n_features = [50, 75, 100]
# n_jobs = 6

In [None]:
# logreg_sbm_pipeline = setup_pipeline(model=LogisticRegression(max_iter=7500, random_state=42), type='sbm')

# param_grid_logreg_sbm = [
#     {
#         'feature_selection__max_features': n_features,
#         'model__penalty': ['l1'],
#         'model__solver': ['saga'],
#         'model__C': [0.001, 0.01, 0.1, 1, 10],
#         'model__class_weight': ['balanced']
#     },
#     {
#         'feature_selection__max_features': n_features,
#         'model__penalty': ['l2'],
#         'model__solver': ['lbfgs', 'newton-cg', 'newton-cholesky'],
#         'model__C': [0.001, 0.01, 0.1, 1, 10],
#         'model__class_weight': ['balanced']
#     }
# ]

# grid_search_logreg_sbm = GridSearchCV(estimator=logreg_sbm_pipeline, param_grid=param_grid_logreg_sbm, scoring=scoring, cv=5, n_jobs=n_jobs, verbose=2)

# grid_search_logreg_sbm.fit(X_train, y_train)

# print(f'Best parameters for Logistic Regression with SBM: {grid_search_logreg_sbm.best_params_}')
# print(f'Best {scoring} score for Logistic Regression with SBM: {grid_search_logreg_sbm.best_score_:.4f}')

In [None]:
# best_model = grid_search_logreg_sbm.best_estimator_

# all_gene_names = X.columns

# pam50_list = [
#     "UBE2T", "BIRC5", "NUF2", "CDC6", "CCNB1", "TYMS", "MYBL2", "CEP55", 
#     "MELK", "NDC80", "RRM2", "UBE2C", "CENPF", "PTTG1", "EXO1", "ORC6L", 
#     "ANLN", "CCNE1", "CDC20", "MKI67", "KIF2C", "ACTR3B", "MYC", "EGFR", 
#     "KRT5", "PHGDH", "CDH3", "MIA", "KRT17", "FOXC1", "SFRP1", "KRT14", 
#     "ESR1", "SLC39A6", "BAG1", "MAPT", "PGR", "CXXC5", "MLPH", "BCL2", 
#     "MDM2", "NAT1", "FOXA1", "BLVRA", "MMP11", "GPR160", "FGFR4", "GRB7", 
#     "TMEM45B", "ERBB2"
# ]

# pam50_set = set(pam50_list)

# selector = best_model.named_steps['feature_selection']
# lr_model = best_model.named_steps['model']

# selected_indices = selector.get_support(indices=True)
# print(f"Selected {len(selected_indices)} feature indices")

# selected_gene_names = [all_gene_names[i] for i in selected_indices]
# print(f"Selected {len(selected_gene_names)} genes")

# # Modify to look for symbols instead of Ensembl IDs with version numbers
# selected_gene_names = [gene.split('.')[0] for gene in selected_gene_names]
# print(f"Selected gene names after removing version numbers: {selected_gene_names[:10]}")

# mg = mygene.MyGeneInfo()
# gene_info = mg.querymany(
#     selected_gene_names,
#     scopes='ensembl.gene',
#     fields='symbol',
#     species='human',
#     verbose=False
# )

# # Print first 5 entries of gene_info for verification
# print(gene_info[:5])

# ensembl_to_symbol = {}
# for result in gene_info:
#     ensembl_id = result['query']
#     gene_symbol = result.get('symbol', ensembl_id)
#     ensembl_to_symbol[ensembl_id] = gene_symbol

# gene_symbols = [ensembl_to_symbol.get(gene, gene) for gene in selected_gene_names]

# # Identify PAM50 genes in the selected genes
# pam50_selected_genes = [gene for gene in gene_symbols if gene in pam50_set]

# print(gene_symbols)
# print(pam50_set)
# print(pam50_set.intersection(set(gene_symbols)))


In [None]:
# Disabled: rebuild an 'sbm' pipeline around the tuned Logistic Regression, refit it on
# the training split and report test accuracy.
# NOTE(review): only the 'model' step of best_estimator_ is carried over. The tuned
# `feature_selection__max_features` value from the grid is not passed to setup_pipeline,
# so the refit pipeline may select a different number of features than the one the grid
# search scored — confirm against setup_pipeline's defaults.
# NOTE(review): the GridSearchCV above is built without `refit=...`, so (default
# refit=True) best_estimator_ is already fitted on X_train and could be used directly.
# tuned_logreg_sbm_model = grid_search_logreg_sbm.best_estimator_['model']

# print(f'Tuned Logistic Regression Model: {tuned_logreg_sbm_model}')

# main_classifier = setup_pipeline(model=tuned_logreg_sbm_model, type='sbm').fit(X_train, y_train)
# main_predictions = main_classifier.predict(X_test)

# accuracy_main = accuracy_score(y_test, main_predictions)

# print(f'Main Classifier Accuracy after Tuning: {accuracy_main:.4f}')

# Hyperparameter Tuning Analysis

## Train ACC vs Test ACC

In [None]:
# train_test_acc_compare(main_classifier, X_train, y_train, y_test, main_predictions)

## SHAP Analysis

Setup

In [None]:
# Disabled: look up which genes the tuned pipeline's 'feature_selection' step kept.
# NOTE(review): this repeats the selector-to-gene mapping from the last cell of the
# tuning section above; a shared helper would keep the two copies from drifting apart.
# NOTE(review): get_support(indices=True) returns positions relative to the input of the
# 'feature_selection' step. Indexing X.columns with them is only valid if no earlier
# pipeline step drops columns — TODO confirm against setup_pipeline.
# all_gene_names = X.columns

# estimator = grid_search_logreg_sbm.best_estimator_

# selector = estimator.named_steps['feature_selection']
# lr_model = estimator.named_steps['model']

# selected_indices = selector.get_support(indices=True)
# print(f"Selected {len(selected_indices)} feature indices")

# selected_gene_names = [all_gene_names[i] for i in selected_indices]
# print(f"Selected {len(selected_gene_names)} genes")

# # Modify to look for symbols instead of Ensembl IDs with version numbers
# selected_gene_names_modified = [gene.split('.')[0] for gene in selected_gene_names]
# print(f"Selected gene names after removing version numbers: {selected_gene_names_modified[:10]}")

# mg = mygene.MyGeneInfo()
# gene_info = mg.querymany(
#     selected_gene_names_modified,
#     scopes='ensembl.gene',
#     fields='symbol',
#     species='human',
#     verbose=False
# )

# # Print first 5 entries of gene_info for verification
# print(gene_info[:5])

# ensembl_to_symbol = {}
# for result in gene_info:
#     ensembl_id = result['query']
#     gene_symbol = result.get('symbol', ensembl_id)
#     ensembl_to_symbol[ensembl_id] = gene_symbol

# gene_symbols = [ensembl_to_symbol.get(gene, gene) for gene in selected_gene_names_modified]

# NOTE(review): hard-coded label order; it must match the fitted model's class order
# (presumably encoder.classes_) — TODO confirm, or derive the list from the encoder.
# class_names = ['Basal', 'Her2', 'LumA', 'LumB', 'Normal']

Explainer

In [None]:
# Disabled: SHAP values for the tuned linear model on the selected gene columns.
# NOTE(review): `lr_model` was fitted inside the pipeline, i.e. on the output of the steps
# that precede 'model', while the explainer here receives raw `X_train` / `X_test` columns.
# If setup_pipeline scales or otherwise transforms the features, pass the transformed
# matrices instead — TODO confirm.
# explainer = shap.LinearExplainer(lr_model, X_train[selected_gene_names])
# shap_values = explainer.shap_values(X_test[selected_gene_names])

Boxplot explainer

# Classification Reports (LEGACY)

Classification Report - LR + LR Selection

In [None]:
# generate_classification_report(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(logreg_predictions_lr),
# )

# generate_confusion_matrix(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(logreg_predictions_lr),
#     class_names=encoder.classes_
# )

Classification Report - LR + RF Selection

In [None]:
# generate_classification_report(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(logreg_predictions_rf),
# )

# generate_confusion_matrix(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(logreg_predictions_rf),
#     class_names=encoder.classes_
# )

Classification Report - LR + KBest Selection

In [None]:
# generate_classification_report(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(logreg_predictions_kbest),
# )

# generate_confusion_matrix(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(logreg_predictions_kbest),
#     class_names=encoder.classes_
# )

### Classification Reports for RF Classifier with LR, RF and KBest as selection methods

Classification Report - RF + LR Selection

In [None]:
# generate_classification_report(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(rf_predictions_lr),
# )

# generate_confusion_matrix(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(rf_predictions_lr),
#     class_names=encoder.classes_
# )

Classification Report - RF + RF Selection

In [None]:
# generate_classification_report(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(rf_predictions_rf),
# )

# generate_confusion_matrix(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(rf_predictions_rf),
#     class_names=encoder.classes_
# )

Classification Report - RF + KBest Selection

In [None]:
# generate_classification_report(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(rf_predictions_kbest),
# )

# generate_confusion_matrix(
#     encoder.inverse_transform(y_test),
#     encoder.inverse_transform(rf_predictions_kbest),
#     class_names=encoder.classes_
# )

# REPORT (LEGACY)

In [None]:
# print("Classification Report for Random Forest:")
# print(classification_report(encoder.inverse_transform(y_test), encoder.inverse_transform(y_pred_rf)))

In [None]:
# cm = confusion_matrix(encoder.inverse_transform(y_test), encoder.inverse_transform(y_pred_rf))
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
# plt.title('Confusion Matrix for Random Forest Classifier')
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.show()

In [None]:
# print("Classification Report for Logistic Regression:")
# print(classification_report(encoder.inverse_transform(y_test), encoder.inverse_transform(y_pred_lr)))

In [None]:
# cm = confusion_matrix(encoder.inverse_transform(y_test), encoder.inverse_transform(y_pred_lr))
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
# plt.title('Confusion Matrix for Logistic Regression Classifier')
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.show()

In [None]:
# print("Classification Report for XGBoost Classifier:")
# print(classification_report(encoder.inverse_transform(y_test), encoder.inverse_transform(y_pred_xgb)))

In [None]:
# cm = confusion_matrix(encoder.inverse_transform(y_test), encoder.inverse_transform(y_pred_xgb))
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Oranges', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
# plt.title('Confusion Matrix for XGBoost Classifier')
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.show()

## ROC Curve

In [None]:
# classes = range(len(encoder.classes_))
# class_labels = encoder.classes_
# number_of_classes = len(class_labels)

Random Forest

In [None]:
# Disabled: one-vs-rest ROC curve per subtype for the Random Forest classifier.
# NOTE(review): this cell calls `roc_curve` and `auc`, which the notebook's top import
# cell does not import from sklearn.metrics — add them before re-enabling (confirm no
# later cell already imports them).
# y_pred_proba_rf = rf_classifier.predict_proba(X_test_selected)
# y_test_binarized = label_binarize(y_test, classes=classes)

# plt.figure(figsize=(10, 8))
# colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red'])

# for i, color in zip(range(number_of_classes), colors):
#     labels_for_class = y_test_binarized[:, i]
#     probs_for_class = y_pred_proba_rf[:, i]

#     fpr, tpr, _ = roc_curve(labels_for_class, probs_for_class)
#     roc_auc = auc(fpr, tpr)

#     plt.plot(fpr, tpr, color=color, lw=2, label=f'ROC curve of class {class_labels[i]} (area = {roc_auc:.2f})')

# plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance (AUC = 0.50)')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curves for Random Forest Classifier')
# plt.legend(loc='lower right')
# plt.grid(True)
# plt.show()

Logistic Regression

In [None]:
# Disabled: one-vs-rest ROC curve per subtype for the Logistic Regression classifier.
# NOTE(review): this cell calls `roc_curve` and `auc`, which the notebook's top import
# cell does not import from sklearn.metrics — add them before re-enabling (confirm no
# later cell already imports them).
# y_pred_proba_lr = logreg_classifier.predict_proba(X_test_selected)
# y_test_binarized = label_binarize(y_test, classes=classes)

# plt.figure(figsize=(10, 8))
# colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red'])

# for i, color in zip(range(number_of_classes), colors):
#     labels_for_class = y_test_binarized[:, i]
#     probs_for_class = y_pred_proba_lr[:, i]

#     fpr, tpr, _ = roc_curve(labels_for_class, probs_for_class)
#     roc_auc = auc(fpr, tpr)

#     plt.plot(fpr, tpr, color=color, lw=2, label=f'ROC curve of class {class_labels[i]} (area = {roc_auc:.2f})')

# plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance (AUC = 0.50)')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curves for Logistic Regression Classifier')
# plt.legend(loc='lower right')
# plt.grid(True)
# plt.show()

XGBoost

In [None]:
# Disabled: one-vs-rest ROC curve per subtype for the XGBoost classifier.
# NOTE(review): this cell calls `roc_curve` and `auc`, which the notebook's top import
# cell does not import from sklearn.metrics — add them before re-enabling (confirm no
# later cell already imports them).
# y_pred_proba_xgb = xgb_classifier.predict_proba(X_test_selected)
# y_test_binarized = label_binarize(y_test, classes=classes)

# plt.figure(figsize=(10, 8))
# colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red'])

# for i, color in zip(range(number_of_classes), colors):
#     labels_for_class = y_test_binarized[:, i]
#     probs_for_class = y_pred_proba_xgb[:, i]

#     fpr, tpr, _ = roc_curve(labels_for_class, probs_for_class)
#     roc_auc = auc(fpr, tpr)

#     plt.plot(fpr, tpr, color=color, lw=2, label=f'ROC curve of class {class_labels[i]} (area = {roc_auc:.2f})')

# plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance (AUC = 0.50)')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curves for XGBoost Classifier')
# plt.legend(loc='lower right')
# plt.grid(True)
# plt.show()

## Train F1 vs Test F1

Random Forest

In [None]:
# train_pred_rf = rf_classifier.predict(X_train_selected)
# train_f1_rf = f1_score(y_train, train_pred_rf, average='weighted')
# test_f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

# print(f'Training F1 Score for Random Forest: {train_f1_rf:.6f}')
# print(f'Test F1 Score for Random Forest: {test_f1_rf:.6f}')
# print(f'F1 Score Difference for Random Forest: {train_f1_rf - test_f1_rf:.6f}')

Logistic Regression

In [None]:
# train_pred_lr = logreg_classifier.predict(X_train_selected)
# train_f1_lr = f1_score(y_train, train_pred_lr, average='weighted')
# test_f1_lr = f1_score(y_test, y_pred_lr, average='weighted')

# print(f'Training F1 Score for Logistic Regression: {train_f1_lr:.6f}')
# print(f'Test F1 Score for Logistic Regression: {test_f1_lr:.6f}')
# print(f'F1 Score Difference for Logistic Regression: {train_f1_lr - test_f1_lr:.6f}')

XGBoost

In [None]:
# train_pred_xgb = xgb_classifier.predict(X_train_selected)
# train_f1_xgb = f1_score(y_train, train_pred_xgb, average='weighted')
# test_f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')

# print(f'Training F1 Score for XGBoost: {train_f1_xgb:.6f}')
# print(f'Test F1 Score for XGBoost: {test_f1_xgb:.6f}')
# print(f'F1 Score Difference for XGBoost: {train_f1_xgb - test_f1_xgb:.6f}')

# Hyperparameter Tuning (LEGACY)

## GridSearchCV

Global variables

In [None]:
# core_usage = 6 # -1 = ALL Cores
# scoring = "f1_macro"
# select_k_list = [50, 75, 100]

Define global pipeline

In [None]:
# Disabled: preprocessing steps prepended to the legacy grid-search pipeline
# (standardise, variance filter, univariate ANOVA-F selection).
# NOTE(review): 'filter' runs after 'scaler', so every non-constant feature already has
# unit variance when VarianceThreshold sees it; threshold=0.1 can therefore only drop
# constant features. Put the filter before the scaler if a real variance cut was intended.
# preprocessing_steps = [
#     ('scaler', StandardScaler()),
#     ('filter', VarianceThreshold(threshold=0.1)),
#     ('selector', SelectKBest(score_func=f_classif)),
# ]

Logistic Regression

In [None]:
# max_iteration = 5000
# random_state = 42

# pipeline_lr = Pipeline(preprocessing_steps + [
#     ('model', LogisticRegression(max_iter=max_iteration, random_state=random_state))
# ])

# param_grid_lr = {
#     'selector__k': select_k_list,
#     'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
#     'model__solver': ['lbfgs', 'sag', 'saga'],
#     'model__class_weight': ['balanced', None]
# }

# grid_search_lr = GridSearchCV(
#     estimator=pipeline_lr,
#     param_grid=param_grid_lr,
#     scoring=scoring,
#     cv=5,
#     n_jobs=core_usage,
#     verbose=2
# )

# grid_search_lr.fit(X_train, y_train)

# print(f'Best parameters found: {grid_search_lr.best_params_}')
# print(f'Best {scoring} score: {grid_search_lr.best_score_:.6f}')

## GridSearchCV Analysis Results

Logistic Regression

In [None]:
# Disabled: fit a standalone SelectKBest using the k chosen by the grid search.
# NOTE(review): `X_train_filtered` / `X_test_filtered` are not defined in this section and
# presumably come from an earlier cell — confirm they received the same scaling and
# variance filtering as the 'scaler' and 'filter' steps of the tuned pipeline; otherwise the
# features selected here can differ from the ones the grid search evaluated.
# SelectKBest_model = SelectKBest(score_func=f_classif, k=grid_search_lr.best_params_['selector__k'])
# X_train_selected = SelectKBest_model.fit_transform(X_train_filtered, y_train)
# X_test_selected = SelectKBest_model.transform(X_test_filtered)

# logreg_classifier = LogisticRegression(
#     max_iter=max_iteration, 
#     random_state=random_state,
#     C=grid_search_lr.best_params_['model__C'],
#     solver=grid_search_lr.best_params_['model__solver'],
#     class_weight=grid_search_lr.best_params_['model__class_weight']
# )

# logreg_classifier.fit(X_train_selected, y_train)

# y_pred_lr = logreg_classifier.predict(X_test_selected)

# accuracy_lr = accuracy_score(y_test, y_pred_lr)

# print(f'Logistic Regression Classifier Accuracy: {accuracy_lr:.6f}')

# train_pred_lr = logreg_classifier.predict(X_train_selected)
# train_f1_lr = f1_score(y_train, train_pred_lr, average='macro')
# test_f1_lr = f1_score(y_test, y_pred_lr, average='macro')

# print(f'Training F1 Score for Logistic Regression: {train_f1_lr:.6f}')
# print(f'Test F1 Score for Logistic Regression: {test_f1_lr:.6f}')
# print(f'F1 Score Difference for Logistic Regression: {train_f1_lr - test_f1_lr:.6f}')

# SHAP Analysis

Setup

In [None]:
# Disabled: recover the names of the genes kept by the tuned pipeline's SelectKBest step.
# NOTE(review): 'selector' sits after the 'filter' (VarianceThreshold) step in the legacy
# pipeline, so get_support(indices=True) returns positions in the filter's output, not in
# df.columns. If the filter dropped any column, `all_gene_names[i]` names the wrong gene;
# map the indices through the filter's own support mask first.
# NOTE(review): `df.columns` may also still contain the 'Subtype' label column, which is
# not a feature — TODO confirm, or use the feature frame's columns instead.
# all_gene_names = list(df.columns)

# best_model = grid_search_lr.best_estimator_

# selector = best_model.named_steps['selector']
# lr_model = best_model.named_steps['model']
# selected_indices = selector.get_support(indices=True)

# selected_gene_names = [all_gene_names[i] for i in selected_indices]
# print(f"Selected {len(selected_gene_names)} genes")

# # Print first 10 selected gene names for verification
# print(selected_gene_names[:10]) 

# selected_gene_names = [gene.split('.')[0] for gene in selected_gene_names]

# # Print first 10 selected gene names after removing version numbers for verification
# print(f"Selected gene names after removing version numbers: {selected_gene_names[:10]}")

# # Gene symbol mapping using mygene
# mg = mygene.MyGeneInfo()
# gene_info = mg.querymany(
#     selected_gene_names,
#     scopes='ensembl.gene',
#     fields='symbol',
#     species='human',
#     verbose=False
# )

# # Print first 5 entries of gene_info for verification
# print(gene_info[:5])

# ensembl_to_symbol = {}
# for result in gene_info:
#     ensembl_id = result['query']
#     gene_symbol = result.get('symbol', ensembl_id)
#     ensembl_to_symbol[ensembl_id] = gene_symbol

# gene_symbols = [ensembl_to_symbol.get(gene, gene) for gene in selected_gene_names]

# NOTE(review): hard-coded label order; it must match the fitted model's class order
# (presumably encoder.classes_) — TODO confirm, or derive the list from the encoder.
# class_names = ['Basal', 'Her2', 'LumA', 'LumB', 'Normal']

Explainer

In [None]:
# Disabled: SHAP values for the legacy tuned Logistic Regression.
# NOTE(review): `lr_model` is taken from grid_search_lr.best_estimator_, whereas
# `X_train_selected` / `X_test_selected` come from a separately fitted SelectKBest in the
# grid-search analysis cell above — confirm both use the same columns in the same order,
# otherwise the SHAP values are attributed to the wrong features.
# explainer = shap.LinearExplainer(lr_model, X_train_selected)
# shap_values = explainer.shap_values(X_test_selected)

Bar plot

In [None]:
# plt.figure(figsize=(14, 10))
# shap.summary_plot(
#     shap_values,
#     X_test_selected,
#     feature_names=gene_symbols,
#     class_names=class_names,
#     plot_type="bar"
# )
# plt.tight_layout()
# plt.show()

Top Genes per Subtype

In [None]:
# Disabled: print the ten genes with the largest mean |SHAP| value for each subtype.
# NOTE(review): `shap_values[class_idx]` assumes one (samples x genes) array per class;
# some SHAP releases return a single stacked array for multiclass models instead —
# check the shape of `shap_values` before re-enabling.
# for class_idx, class_name in enumerate(class_names):
#     # Calculate mean absolute SHAP value per gene for this class
#     mean_abs_shap = np.abs(shap_values[class_idx]).mean(axis=0)
    
#     # Get top 10
#     top_indices = np.argsort(mean_abs_shap)[-10:][::-1]
    
#     print(f"\n{class_name.upper()}:")
#     print("-" * 70)
#     for rank, idx in enumerate(top_indices, 1):
#         gene_symbol = gene_symbols[idx]
#         ensembl_id = selected_gene_names[idx]
#         importance = mean_abs_shap[idx]
#         print(f"  {rank:2d}. {gene_symbol:15s} ({ensembl_id}) - Importance: {importance:.4f}")

# Alternative selection method: Random Forest Selection (LEGACY)

In [None]:
# y = df['Subtype']
# X = df.drop(columns=['Subtype'])

In [None]:
# X = np.log2(X + 1)

In [None]:
# NOTE(review): the split is not stratified (no `stratify=y`), so subtype proportions can
# differ between the train and test sets — TODO confirm this is intended.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

Random Forest Selection

In [None]:
# rf_selector = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=6)

# rf_selector.fit(X_train_scaled, y_train)

# NOTE(review): the slice keeps the 50 highest-importance features, yet the evaluation
# cells further down label their output "Top 100 RF Genes" — align the slice or the messages.
# importances = rf_selector.feature_importances_
# top_n_genes = np.argsort(importances)[-50:]

# X_train_selected = X_train_scaled[:, top_n_genes]
# X_test_selected = X_test_scaled[:, top_n_genes]

Logistic Regression Selection

In [None]:
# lr_classifier = LogisticRegression(max_iter=1500, random_state=42, class_weight='balanced')

# rfe_selector = RFE(estimator=lr_classifier, n_features_to_select=50, step=0.1, verbose=1)
# rfe_selector.fit(X_train_scaled, y_train)

# selected_indices_lr = np.where(rfe_selector.support_)[0]
# print(f"Selected features for LR:\n{selected_indices_lr}\n")

# NOTE(review): this overwrites `top_n_genes`, `X_train_selected` and `X_test_selected` set by
# the "Random Forest Selection" cell, so the classifiers below use whichever selection
# cell ran last — enable only one of the two selection cells at a time.
# top_n_genes = selected_indices_lr

# X_train_selected = X_train_scaled[:, top_n_genes]
# X_test_selected = X_test_scaled[:, top_n_genes]

List selected genes

In [None]:
# gene_ids = X.columns[top_n_genes]

# gene_ids = [gene.split('.')[0] for gene in gene_ids]

# mg = mygene.MyGeneInfo()
# mygene_info = mg.querymany(
#     list(gene_ids),
#     scopes='ensembl.gene',
#     fields='symbol',
#     species='human',
#     verbose=False
# )

# print("All genes retrieved from mygene:")
# for gene in mygene_info:
#     print(f"Gene ID: {gene['query']}, Symbol: {gene['symbol'] if 'symbol' in gene else 'N/A'}")

Logistic Regression

In [None]:
# logreg_classifier = LogisticRegression(max_iter=1500, random_state=42, class_weight='balanced')
# logreg_classifier.fit(X_train_selected, y_train)

# y_pred_lr = logreg_classifier.predict(X_test_selected)

# accuracy_lr = accuracy_score(y_test, y_pred_lr)

# print(f'Logistic Regression Classifier Accuracy with Top 100 RF Genes: {accuracy_lr:.6f}')

# # F1 Score to check for overfitting
# train_pred = logreg_classifier.predict(X_train_selected)
# train_f1 = f1_score(y_train, train_pred, average='macro')
# test_f1 = f1_score(y_test, y_pred_lr, average='macro')

# print(f'Training F1 Score for Logistic Regression with Top 100 RF Genes: {train_f1:.6f}')
# print(f'Test F1 Score for Logistic Regression with Top 100 RF Genes: {test_f1:.6f}')
# print(f'F1 Score Difference for Logistic Regression with Top 100 RF Genes: {train_f1 - test_f1:.6f}')

Random Forest

In [None]:
# random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# random_forest_classifier.fit(X_train_selected, y_train)

# y_pred_rf = random_forest_classifier.predict(X_test_selected)

# accuracy_rf = accuracy_score(y_test, y_pred_rf)

# print(f'Random Forest Classifier Accuracy with Top 100 RF Genes: {accuracy_rf:.6f}')

# # F1 Score to check for overfitting
# train_pred_rf = random_forest_classifier.predict(X_train_selected)
# train_f1_rf = f1_score(y_train, train_pred_rf, average='macro')
# test_f1_rf = f1_score(y_test, y_pred_rf, average='macro')

# print(f'Training F1 Score for Random Forest with Top 100 RF Genes: {train_f1_rf:.6f}')
# print(f'Test F1 Score for Random Forest with Top 100 RF Genes: {test_f1_rf:.6f}')
# print(f'F1 Score Difference for Random Forest with Top 100 RF Genes: {train_f1_rf - test_f1_rf:.6f}')

XGBoost

In [None]:
# encoder = LabelEncoder()

# y = encoder.fit_transform(df['Subtype'])
# X = df.drop(columns=['Subtype'])

# X = np.log2(X + 1)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# rf_selector = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=6)

# rf_selector.fit(X_train_scaled, y_train)

# NOTE(review): `top_100_genes` actually holds the 50 highest-importance indices ([-50:]),
# and the messages printed later in this cell say "Top 100 RF Genes" — align the slice,
# the variable name and the messages.
# importances = rf_selector.feature_importances_
# top_100_genes = np.argsort(importances)[-50:]

# X_train_selected = X_train_scaled[:, top_100_genes]
# X_test_selected = X_test_scaled[:, top_100_genes]

# xgb_classifier = XGBClassifier(
#     tree_method='auto',
#     n_estimators=100,
#     eval_metric='mlogloss', 
#     random_state=42,
#     max_depth=6,
# )
# xgb_classifier.fit(X_train_selected, y_train)

# y_pred_xgb = xgb_classifier.predict(X_test_selected)

# accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# print(f'XGBoost Classifier Accuracy with Top 100 RF Genes: {accuracy_xgb:.6f}')

# # F1 Score to check for overfitting
# train_pred_xgb = xgb_classifier.predict(X_train_selected)
# train_f1_xgb = f1_score(y_train, train_pred_xgb, average='macro')
# test_f1_xgb = f1_score(y_test, y_pred_xgb, average='macro')

# print(f'Training F1 Score for XGBoost with Top 100 RF Genes: {train_f1_xgb:.6f}')
# print(f'Test F1 Score for XGBoost with Top 100 RF Genes: {test_f1_xgb:.6f}')
# print(f'F1 Score Difference for XGBoost with Top 100 RF Genes: {train_f1_xgb - test_f1_xgb:.6f}')