In [1]:
import pandas as pd
from sklearn.utils import shuffle

df = pd.read_csv("Dataset/bladder+normal.csv")
df = shuffle(df, random_state=42).reset_index(drop=True)

df.head(5)

Unnamed: 0,cancer_type,type,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,normal,normal,10.154073,6.229753,4.726761,7.69461,3.036633,7.709174,4.876,5.498816,...,12.301864,11.774613,13.769954,13.445276,8.039906,7.07538,7.236685,3.213733,3.253917,3.30995
1,bladder,tumoral_urothelia,5.44855,2.855473,3.956391,6.990753,2.515888,4.532908,5.148095,3.420617,...,12.355006,12.073772,13.375017,13.129036,14.285014,12.222636,13.764253,2.635243,3.087296,2.749863
2,normal,normal,11.113545,5.940342,5.564016,8.278557,3.737454,8.904443,6.310405,5.029888,...,12.44036,11.836593,13.794074,13.652361,10.184499,5.065497,7.553789,3.308568,3.670986,3.624235
3,bladder,tumoral_urothelia,6.117643,2.963498,4.462724,7.637291,2.748184,4.799081,5.387159,3.796579,...,13.488239,13.297656,14.433989,14.138876,8.479708,3.234453,3.25535,2.217603,2.700261,2.564158
4,bladder,tumoral_urothelia,6.341556,2.83939,4.035876,6.989258,2.344312,5.291725,5.210843,3.260132,...,12.746157,12.766855,14.151366,13.834423,8.481049,3.886499,3.866914,2.422337,2.591235,2.733775


In [2]:
def preprocessing(df):
    
    # Drop type Columns
    if "type" in df.columns:
        df = df.drop(columns="type")

    # Convert label to binary type:
    if 'cancer_type' in df.columns and not df['cancer_type'].isin([0, 1]).all():
        df['cancer_type'] = df['cancer_type'].map({'bladder': 1, 'normal': 0})
    
    # Get X,y
    target = 'cancer_type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y 

X,y = preprocessing(df)
display(X.head(5))
display(y.head(5))

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,10.154073,6.229753,4.726761,7.69461,3.036633,7.709174,4.876,5.498816,7.18219,3.183595,...,12.301864,11.774613,13.769954,13.445276,8.039906,7.07538,7.236685,3.213733,3.253917,3.30995
1,5.44855,2.855473,3.956391,6.990753,2.515888,4.532908,5.148095,3.420617,3.378807,3.022539,...,12.355006,12.073772,13.375017,13.129036,14.285014,12.222636,13.764253,2.635243,3.087296,2.749863
2,11.113545,5.940342,5.564016,8.278557,3.737454,8.904443,6.310405,5.029888,6.55758,3.570486,...,12.44036,11.836593,13.794074,13.652361,10.184499,5.065497,7.553789,3.308568,3.670986,3.624235
3,6.117643,2.963498,4.462724,7.637291,2.748184,4.799081,5.387159,3.796579,2.914434,2.863601,...,13.488239,13.297656,14.433989,14.138876,8.479708,3.234453,3.25535,2.217603,2.700261,2.564158
4,6.341556,2.83939,4.035876,6.989258,2.344312,5.291725,5.210843,3.260132,4.015626,2.747104,...,12.746157,12.766855,14.151366,13.834423,8.481049,3.886499,3.866914,2.422337,2.591235,2.733775


0    0
1    1
2    0
3    1
4    1
Name: cancer_type, dtype: int64

In [3]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Perform feature_selection (ANOVA, Mutual Information, Reduce Overlap)
def feature_selection(X,y):
    # Perform ANOVA
    k_best_selector = SelectKBest(score_func=f_classif, k=500)
    X_anova = k_best_selector.fit_transform(X,y)
    X_anova = X.columns[k_best_selector.get_support()]
    
    # Perform Mutual Information
    mutual_info_selector = SelectKBest(score_func=mutual_info_classif, k=500)
    X_mut = mutual_info_selector.fit_transform(X,y)
    X_mut = X.columns[mutual_info_selector.get_support()]
    
    # Reduce overlap
    X_reduce = set(X_anova).difference(set(X_mut))
    X_reduce = X[list(X_reduce)]
    
    return X_reduce

X = feature_selection(X,y)
display(X.head(5))

Unnamed: 0,211980_at,212121_at,211964_at,216088_s_at,201840_at,225787_at,208631_s_at,202576_s_at,201264_at,223170_at,...,203189_s_at,207438_s_at,226241_s_at,213263_s_at,209471_s_at,221693_s_at,201821_s_at,212057_at,37943_at,208308_s_at
0,7.93735,8.790117,8.351469,7.075256,9.79114,8.44074,10.786379,8.39815,9.044983,10.417365,...,8.135397,7.546761,8.569265,8.816998,8.743963,8.72135,8.784204,8.313053,6.436263,10.3047
1,3.482097,3.831227,3.96156,3.126583,4.248021,2.798232,4.239028,4.315741,4.124428,4.473512,...,3.773622,4.644429,3.204769,3.958362,2.835044,3.965435,3.292908,3.658089,2.6602,2.927143
2,7.450394,7.816752,8.658588,7.371614,8.845517,7.314234,10.445133,8.576719,9.436113,9.850669,...,7.126642,7.675419,8.668856,8.541448,7.188475,8.169337,8.46877,7.633848,6.672042,10.381521
3,3.506123,3.904075,3.624559,2.547178,4.004707,2.6133,4.243757,4.16062,3.766338,3.362919,...,2.812185,5.479604,3.427183,3.960069,2.683343,3.589379,2.848358,4.049269,2.54548,3.211091
4,3.644899,3.644899,3.50284,2.761201,4.925606,2.730127,4.217308,4.21584,4.03817,3.591208,...,3.263552,4.866218,4.282971,3.749406,3.21425,3.523311,2.745396,2.926581,2.582636,3.619338


In [4]:
# Model train: Logistic Regression, L1 Regularization, 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report
import numpy as np

# Define Training Pipeline
pipeline = Pipeline([
    ('classifier', LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=42))
])

# Define Stratified K-Fold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store results
accuracy_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []

# Perform K-Fold Cross-Validation
fold = 1
for train_index, test_index in skf.split(X, y):
    # Split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    
    # Append scores
    accuracy_scores.append(accuracy)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)
    
    fold += 1

In [5]:
# Final Train accuracy

# Print average metrics across folds
print("\nCross-Validation Results:")
print(f"Mean Accuracy: {np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}")
print(f"Mean Recall: {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
print(f"Mean F1 Score: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")
print(f"Mean ROC-AUC: {np.mean(roc_auc_scores):.2f} ± {np.std(roc_auc_scores):.2f}")


Cross-Validation Results:
Mean Accuracy: 1.00 ± 0.00
Mean Recall: 1.00 ± 0.00
Mean F1 Score: 1.00 ± 0.00
Mean ROC-AUC: 1.00 ± 0.00


In [14]:
import pandas as pd
from sklearn.utils import shuffle

# Load and shuffle the dataset
test_bladder = pd.read_csv("Dataset/bladder_test.csv")
target = 'cancer_type'
X_test = test_bladder.drop(columns=target)
X_test = X_test[X.columns]
y_test = test_bladder[target]

In [15]:
# Predict using the final trained model
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Print results
print("\nFinal Model Evaluation on Test Dataset:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Final Model Evaluation on Test Dataset:
Accuracy: 0.93
Recall: 1.00
F1 Score: 0.71
ROC-AUC: 0.95

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        52
           1       0.56      1.00      0.71         5

    accuracy                           0.93        57
   macro avg       0.78      0.96      0.84        57
weighted avg       0.96      0.93      0.94        57



In [17]:
import pandas as pd

# Create the report dataframe
report_df = pd.DataFrame({
    'True Label': y_test,
    'Predicted Label': y_pred,
    'Probability (Bladder)': y_pred_prob
})

# Save the report to the Dataset directory
output_path = "Dataset/test_set_prediction_report.csv"
report_df.to_csv(output_path, index=False)

print(f"Prediction report saved to {output_path}")


Prediction report saved to Dataset/test_set_prediction_report.csv



Sample 1: Predicted Probability (Bladder): 0.9765
Top Contributing Genes:
         Feature  Contribution
233    217720_at      1.199741
189  208742_s_at      0.801423
106  201782_s_at      0.760590
107    218391_at      0.565685
376  224415_s_at      0.456880
218    205441_at      0.442617
100  207438_s_at      0.418623
345  218407_x_at      0.400843
396    223064_at      0.384167
147    200649_at      0.363771

Sample 4: Predicted Probability (Bladder): 0.9623
Top Contributing Genes:
         Feature  Contribution
233    217720_at      1.315752
189  208742_s_at      0.887549
106  201782_s_at      0.757272
107    218391_at      0.526747
376  224415_s_at      0.479815
100  207438_s_at      0.443113
218    205441_at      0.427262
345  218407_x_at      0.398276
396    223064_at      0.389278
68     204218_at      0.382206

Sample 5: Predicted Probability (Bladder): 0.9642
Top Contributing Genes:
         Feature  Contribution
233    217720_at      1.144736
189  208742_s_at      0.813840
