In [1]:
import pandas as pd

## Preparing the Data for Cancer Sub-Type Classification

In [2]:
df_brain = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/brain.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df, random_state):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=random_state, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [4]:
df_brain_cancer = create_combined_df(df_brain, df_normal, 2)
df_brain_cancer.shape

(278, 54677)

In [5]:
df_brain_cancer['type'].unique()

array(['glioblastoma', 'astrocytoma', 'oligodendrioglioma',
       'glioblastoma-cell-line', 'ependymoma', 'medulloblastoma',
       'pilocytic_astrocytoma', 'normal'], dtype=object)

In [6]:
def preprocessing_sub(df):
    
    # Drop cancer_type Columns
    if "cancer_type" in df.columns:
        df = df.drop(columns="cancer_type")

    # Get X,y
    target = 'type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data

Xs_b,ys_b = preprocessing_sub(df_brain_cancer)

display(Xs_b,ys_b)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,9.409264,6.486642,6.849118,5.553799,2.614667,6.465026,5.468213,3.088760,4.888455,2.627267,...,11.117198,10.995354,12.540305,12.393756,3.000946,2.229139,2.125375,1.849213,2.164977,2.247018
1,9.566747,7.012791,7.269160,7.016673,3.370558,7.696889,5.180409,3.299861,4.841996,2.534386,...,10.996753,10.947164,12.487712,12.363095,2.815493,2.552860,2.017627,1.806097,2.094159,2.291342
2,8.405845,5.699024,4.224252,6.538294,3.523430,6.086694,6.921622,3.058813,4.436407,3.320190,...,11.112277,11.034261,12.272104,12.084999,3.342329,2.236081,2.225670,2.030381,2.281967,2.808183
3,9.495506,7.388342,5.716298,5.858912,3.943877,6.649459,5.413839,3.554260,3.038899,2.216190,...,10.815664,10.689521,12.418079,12.157467,2.811632,2.331273,1.980123,1.952916,1.972212,2.145180
4,9.110781,5.543423,4.024291,5.249798,3.516682,5.777301,6.411266,3.139687,2.518323,3.050451,...,10.954677,10.900948,12.451990,12.209082,2.918800,2.175992,1.937505,1.895730,1.969136,2.271932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,10.266941,7.274360,7.481815,7.868121,3.057731,8.498494,4.848267,7.390349,9.902287,4.036274,...,12.452098,11.652207,13.878998,13.619784,3.703534,3.521350,3.113138,3.054296,3.542719,3.162886
274,10.854855,7.586964,7.974392,8.378973,3.524966,8.830916,5.535973,7.765696,10.167490,4.419998,...,12.640816,11.752741,13.928000,13.777568,8.867246,5.827812,7.381320,3.540077,3.986037,3.993587
275,7.539441,5.091317,4.969811,7.281619,2.676623,7.123846,6.975420,4.206157,6.865027,2.819222,...,12.502692,12.075645,13.626916,13.364571,7.908427,4.790640,5.784323,2.575718,2.793955,2.929937
276,9.829387,7.091643,6.346083,5.425531,2.317111,8.787408,5.194341,5.415858,5.222828,6.089014,...,13.501576,13.645847,14.534075,14.424066,2.888612,2.551908,2.590236,2.323836,2.556166,2.494261


0      glioblastoma
1      glioblastoma
2      glioblastoma
3      glioblastoma
4      glioblastoma
           ...     
273          normal
274          normal
275          normal
276          normal
277          normal
Name: type, Length: 278, dtype: object

## Sub-Type Classification using Logistic Regression

In [7]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np

def kfold_with_pca(X, y, n_components, n_splits=5):
    
    # Ensure consistent column ordering
    feature_names = X.columns.tolist()

    # Get the unique class labels
    unique_classes = np.unique(y)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    pca = PCA(n_components=n_components)
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Lists to store fold metrics
    accuracies, recalls, precisions, f1_scores = [], [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply PCA
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        # Train the model
        model.fit(X_train_pca, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test_pca)

        # Calculate metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred, average='macro', zero_division=0))
        precisions.append(precision_score(y_test, y_pred, average='macro', zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # Final PCA fitting and model training on full data
    X_pca_full = pca.fit_transform(X)
    model.fit(X_pca_full, y)

    # Aggregate metrics
    metrics = {
        "Mean Accuracy": np.mean(accuracies),
        "Mean Recall": np.mean(recalls),
        "Mean Precision": np.mean(precisions),
        "Mean F1 Score": np.mean(f1_scores),
    }

    # Print cross-validation results
    print(f"5-Fold CV Results with {n_components} PCA components:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.2f}")

    # Return the final model, PCA object, and metrics
    return {
        "model": model,
        "pca": pca,
        "feature_names": feature_names,
        "metrics": metrics,
    }

In [10]:
results = kfold_with_pca(Xs_b, ys_b, n_components=50)
results


5-Fold CV Results with 50 PCA components:
Mean Accuracy: 0.93
Mean Recall: 0.84
Mean Precision: 0.85
Mean F1 Score: 0.83


{'model': RandomForestClassifier(random_state=42),
 'pca': PCA(n_components=50),
 'feature_names': ['1007_s_at',
  '1053_at',
  '117_at',
  '121_at',
  '1255_g_at',
  '1294_at',
  '1316_at',
  '1320_at',
  '1405_i_at',
  '1431_at',
  '1438_at',
  '1487_at',
  '1494_f_at',
  '1552256_a_at',
  '1552257_a_at',
  '1552258_at',
  '1552261_at',
  '1552263_at',
  '1552264_a_at',
  '1552266_at',
  '1552269_at',
  '1552271_at',
  '1552272_a_at',
  '1552274_at',
  '1552275_s_at',
  '1552276_a_at',
  '1552277_a_at',
  '1552278_a_at',
  '1552279_a_at',
  '1552280_at',
  '1552281_at',
  '1552283_s_at',
  '1552286_at',
  '1552287_s_at',
  '1552288_at',
  '1552289_a_at',
  '1552291_at',
  '1552293_at',
  '1552295_a_at',
  '1552296_at',
  '1552299_at',
  '1552301_a_at',
  '1552302_at',
  '1552303_a_at',
  '1552304_at',
  '1552306_at',
  '1552307_a_at',
  '1552309_a_at',
  '1552310_at',
  '1552311_a_at',
  '1552312_a_at',
  '1552314_a_at',
  '1552315_at',
  '1552316_a_at',
  '1552318_at',
  '1552319_a_

In [11]:
model = results["model"]
pca = results["pca"]

In [12]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_brain_test = df_test[df_test['cancer_type'].isin(['normal', 'brain'])]

In [13]:
X_test,y_test = preprocessing_sub(df_brain_test)

In [14]:
from sklearn.metrics import classification_report

def evaluate_model(model, pca, X_test, y_test, unique_classes, feature_names):
    """
    Evaluate the final trained model on test data.
    """
    # Ensure test set columns match the training set
    X_test = X_test[feature_names]

    # Transform test data using PCA
    X_test_pca = pca.transform(X_test)

    # Predict using the final trained model
    y_pred = model.predict(X_test_pca)

    # Calculate overall accuracy
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average=None, labels=unique_classes, zero_division=0)

    # Print results
    print("\nFinal Model Evaluation on Test Dataset:")
    print(f"Accuracy: {accuracy:.2f}")
    print("\nRecall for each class:")
    for i, cls in enumerate(unique_classes):
        print(f"Class {cls}: Recall = {recall[i]:.2f}")

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=unique_classes))


In [15]:
evaluate_model(
    model=results["model"],
    pca=results["pca"],
    X_test=X_test,
    y_test=y_test,
    unique_classes=np.unique(ys_b),
    feature_names=results["feature_names"],
)


Final Model Evaluation on Test Dataset:
Accuracy: 0.99

Recall for each class:
Class astrocytoma: Recall = 0.50
Class ependymoma: Recall = 1.00
Class glioblastoma: Recall = 1.00
Class glioblastoma-cell-line: Recall = 1.00
Class medulloblastoma: Recall = 1.00
Class normal: Recall = 1.00
Class oligodendrioglioma: Recall = 0.00
Class pilocytic_astrocytoma: Recall = 0.00

Classification Report:
                        precision    recall  f1-score   support

           astrocytoma       1.00      0.50      0.67         2
            ependymoma       1.00      1.00      1.00         5
          glioblastoma       0.83      1.00      0.91         5
glioblastoma-cell-line       1.00      1.00      1.00         1
       medulloblastoma       1.00      1.00      1.00         2
                normal       1.00      1.00      1.00        52
    oligodendrioglioma       0.00      0.00      0.00         0
 pilocytic_astrocytoma       0.00      0.00      0.00         0

              accuracy     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Save model

import joblib
joblib.dump(model, 'brain_sub.joblib')
joblib.dump(pca, 'brain_sub_pca.joblib')

['brain_sub_pca.joblib']