In [1]:
import pandas as pd
import numpy as np

## Preparing the Data for Cancer Sub-Type Classification

In [2]:
df_colorectal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/colorectal.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df, random_state):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=random_state, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [None]:
df_colorectal_cancer = create_combined_df(df_colorectal, df_normal, 4)
df_colorectal_cancer.shape

(320, 54677)

In [5]:
df_colorectal_cancer['type'].unique()

array(['tumoral', 'adenoma', 'tumoral_LCM', 'tumoral_homogenized',
       'normal'], dtype=object)

In [6]:
def preprocessing_sub(df):
    
    # Drop cancer_type Columns
    if "cancer_type" in df.columns:
        df = df.drop(columns="cancer_type")

    # Get X,y
    target = 'type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data

Xs_c,ys_c = preprocessing_sub(df_colorectal_cancer)

display(Xs_c,ys_c)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,9.619385,7.046702,3.941195,6.938480,2.848975,6.857807,5.688482,3.638475,7.986668,2.860325,...,12.490817,11.811781,13.508058,13.506478,3.144123,2.937982,2.587996,2.350123,2.932034,2.605678
1,9.430742,8.206753,4.161144,7.343511,2.592307,6.727380,5.932399,3.985782,6.810169,2.659053,...,11.852443,11.076892,13.142011,13.129857,2.869188,2.946861,2.542567,2.320794,2.919703,2.414407
2,9.338931,8.124252,4.114290,6.570741,2.428728,5.945522,5.492001,3.776078,8.171787,2.796055,...,12.083467,11.281992,13.123608,13.068289,3.139030,3.094092,2.471298,2.239316,2.834338,2.491721
3,9.442320,7.638560,4.018402,6.892336,2.459235,6.202368,5.613744,3.965668,6.749412,2.662916,...,12.367512,11.596857,13.260558,13.341141,3.033355,3.253789,2.437978,2.318163,2.960950,2.513157
4,9.492616,6.945367,3.913129,6.672961,2.750156,6.967156,5.326716,4.006410,7.955763,2.483739,...,11.907987,11.488572,13.204390,13.319602,3.185108,2.978889,2.449346,2.377659,2.702139,2.547656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,10.313773,7.538086,7.245006,9.848509,4.970065,7.940631,6.844568,6.486222,6.475101,5.793909,...,13.697772,13.312410,14.482585,14.408494,5.629199,5.550626,4.643727,4.323855,4.810055,4.928338
316,10.140518,6.493956,5.293636,7.471767,2.450517,7.406938,7.147774,3.698845,8.310904,3.856742,...,12.171680,11.796772,13.539391,13.371401,7.692508,3.850374,5.455213,2.480886,2.911229,2.610331
317,10.456393,6.492040,6.147456,7.867307,3.705554,7.129797,5.843678,5.287788,5.400974,3.867667,...,12.011763,11.424249,13.652698,13.495931,4.033132,3.697196,3.484437,3.505660,3.900688,3.830304
318,10.854855,7.586964,7.974392,8.378973,3.524966,8.830916,5.535973,7.765696,10.167490,4.419998,...,12.640816,11.752741,13.928000,13.777568,8.867246,5.827812,7.381320,3.540077,3.986037,3.993587


0      tumoral
1      tumoral
2      tumoral
3      tumoral
4      tumoral
        ...   
315     normal
316     normal
317     normal
318     normal
319     normal
Name: type, Length: 320, dtype: object

## Sub-Type Classification using Logistic Regression

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np

def kfold_with_pca(X, y, n_components, n_splits=5):
    
    # Ensure consistent column ordering
    feature_names = X.columns.tolist()

    # Get the unique class labels
    unique_classes = np.unique(y)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    pca = PCA(n_components=n_components)
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Lists to store fold metrics
    accuracies, recalls, precisions, f1_scores = [], [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Apply PCA
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        # Train the model
        model.fit(X_train_pca, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test_pca)

        # Calculate metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred, average='macro', zero_division=0))
        precisions.append(precision_score(y_test, y_pred, average='macro', zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # Final PCA fitting and model training on full data
    X_pca_full = pca.fit_transform(X)
    model.fit(X_pca_full, y)

    # Aggregate metrics
    metrics = {
        "Mean Accuracy": np.mean(accuracies),
        "Mean Recall": np.mean(recalls),
        "Mean Precision": np.mean(precisions),
        "Mean F1 Score": np.mean(f1_scores),
    }

    # Print cross-validation results
    print(f"5-Fold CV Results with {n_components} PCA components:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.2f}")

    # Return the final model, PCA object, and metrics
    return {
        "model": model,
        "pca": pca,
        "feature_names": feature_names,
        "metrics": metrics,
    }


In [40]:
results = kfold_with_pca(Xs_c, ys_c, n_components=10)
results


5-Fold CV Results with 10 PCA components:
Mean Accuracy: 0.97
Std Accuracy: 0.01
Mean Recall: 0.91
Std Recall: 0.09
Mean Precision: 0.92
Std Precision: 0.07
Mean F1 Score: 0.91
Std F1 Score: 0.08


{'model': RandomForestClassifier(random_state=42),
 'pca': PCA(n_components=10),
 'feature_names': ['1007_s_at',
  '1053_at',
  '117_at',
  '121_at',
  '1255_g_at',
  '1294_at',
  '1316_at',
  '1320_at',
  '1405_i_at',
  '1431_at',
  '1438_at',
  '1487_at',
  '1494_f_at',
  '1552256_a_at',
  '1552257_a_at',
  '1552258_at',
  '1552261_at',
  '1552263_at',
  '1552264_a_at',
  '1552266_at',
  '1552269_at',
  '1552271_at',
  '1552272_a_at',
  '1552274_at',
  '1552275_s_at',
  '1552276_a_at',
  '1552277_a_at',
  '1552278_a_at',
  '1552279_a_at',
  '1552280_at',
  '1552281_at',
  '1552283_s_at',
  '1552286_at',
  '1552287_s_at',
  '1552288_at',
  '1552289_a_at',
  '1552291_at',
  '1552293_at',
  '1552295_a_at',
  '1552296_at',
  '1552299_at',
  '1552301_a_at',
  '1552302_at',
  '1552303_a_at',
  '1552304_at',
  '1552306_at',
  '1552307_a_at',
  '1552309_a_at',
  '1552310_at',
  '1552311_a_at',
  '1552312_a_at',
  '1552314_a_at',
  '1552315_at',
  '1552316_a_at',
  '1552318_at',
  '1552319_a_

In [41]:
model = results["model"]
pca = results["pca"]

In [42]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_colorectal_test = df_test[df_test['cancer_type'].isin(['normal', 'colorectal'])]

In [43]:
X_test,y_test = preprocessing_sub(df_colorectal_test)

In [44]:
from sklearn.metrics import classification_report

def evaluate_model(model, pca, X_test, y_test, unique_classes, feature_names):
    """
    Evaluate the final trained model on test data.
    """
    # Ensure test set columns match the training set
    X_test = X_test[feature_names]

    # Transform test data using PCA
    X_test_pca = pca.transform(X_test)

    # Predict using the final trained model
    y_pred = model.predict(X_test_pca)

    # Calculate overall accuracy
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average=None, labels=unique_classes, zero_division=0)

    # Print results
    print("\nFinal Model Evaluation on Test Dataset:")
    print(f"Accuracy: {accuracy:.2f}")
    print("\nRecall for each class:")
    for i, cls in enumerate(unique_classes):
        print(f"Class {cls}: Recall = {recall[i]:.2f}")

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=unique_classes))


In [None]:
evaluate_model(
    model=results["model"],
    pca=results["pca"],
    X_test=X_test,
    y_test=y_test,
    unique_classes=np.unique(ys_c),
    feature_names=results["feature_names"],
)

5-Fold CV Results with 10 PCA components:
Mean Accuracy: 0.97
Std Accuracy: 0.01
Mean Recall: 0.91
Std Recall: 0.09
Mean Precision: 0.92
Std Precision: 0.07
Mean F1 Score: 0.91
Std F1 Score: 0.08

Final Model Evaluation on Test Dataset:
Accuracy: 0.99

Recall for each class:
Class adenoma: Recall = 1.00
Class normal: Recall = 1.00
Class tumoral: Recall = 0.67
Class tumoral_LCM: Recall = 1.00
Class tumoral_homogenized: Recall = 1.00

Classification Report:
                     precision    recall  f1-score   support

            adenoma       1.00      1.00      1.00         3
             normal       0.98      1.00      0.99        52
            tumoral       1.00      0.67      0.80         3
        tumoral_LCM       1.00      1.00      1.00         9
tumoral_homogenized       1.00      1.00      1.00         2

           accuracy                           0.99        69
          macro avg       1.00      0.93      0.96        69
       weighted avg       0.99      0.99      0.98

In [46]:
# Save model

import joblib
joblib.dump(model, 'colorectal_sub.joblib')
joblib.dump(pca, 'colorectal_sub_pca.joblib')

['colorectal_sub_pca.joblib']