In [1]:
import pandas as pd

## Preparing the Data for Cancer Sub-Type Classification

In [2]:
# Load the gastric and normal sample tables from the `clean` dataset folder.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run on another machine without editing them.
df_gastric = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/gastric.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df, random_state):
    """Return the cancer rows followed by an equally sized random draw of normal rows.

    The normal rows are sampled without replacement using ``random_state``, so
    the result holds ``2 * len(cancer_df)`` rows with a fresh 0..n-1 index.
    """
    matched_normals = normal_df.sample(
        n=len(cancer_df),
        replace=False,
        random_state=random_state,
    )
    return pd.concat([cancer_df, matched_normals], ignore_index=True)

In [4]:
# Build the gastric working set: every gastric row plus an equally sized
# random draw of normal rows (sampling seed 5), then show its shape.
df_gastric_cancer = create_combined_df(df_gastric, df_normal, 5)
df_gastric_cancer.shape

(40, 54677)

In [5]:
# List the distinct values of the `type` column, which is used as the
# classification target further down.
df_gastric_cancer['type'].unique()

array(['tumoral', 'adenocarcinoma', 'normal'], dtype=object)

In [6]:
def preprocessing_sub(df):
    """Split a sample table into a feature frame and its ``type`` labels.

    A ``cancer_type`` column, when present, is removed first so it does not
    end up among the features. The input frame itself is not modified.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``type`` (and
        ``cancer_type``) columns and ``y`` is the ``type`` column.
    """
    label_col = 'type'

    # Only drop the coarse label when the table actually carries it.
    table = df.drop(columns="cancer_type") if "cancer_type" in df.columns else df

    labels = table[label_col]
    features = table.drop(columns=label_col)
    return features, labels

# Split the gastric working set into features / labels and display the processed data

Xs_g,ys_g = preprocessing_sub(df_gastric_cancer)

display(Xs_g,ys_g)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,8.729061,5.180322,4.608926,5.647848,2.042212,7.206777,4.422622,2.755938,8.812442,2.573335,...,11.164609,10.775514,12.857273,12.731003,9.234305,8.248071,8.344889,1.631211,1.978618,1.892722
1,9.001334,6.617095,4.395646,7.123623,3.241733,5.498223,4.2439,2.925823,4.534666,1.964338,...,10.788976,10.302022,12.440039,12.300295,8.163385,7.362267,7.501064,1.756396,1.977184,2.015272
2,8.159827,5.92643,5.122536,6.226437,2.019512,6.624482,4.5518,2.767274,7.986535,2.210271,...,11.187776,10.748935,12.66579,12.592801,8.167704,6.801099,6.979345,1.662133,2.070686,1.924248
3,9.559118,7.043322,4.971121,6.7686,2.671845,6.883452,4.032452,3.171697,6.465104,4.093294,...,10.731669,10.294698,12.534455,12.378446,7.68566,5.212969,6.233874,1.731197,1.949998,2.027056
4,8.25784,6.26811,5.317055,6.096988,2.095831,5.882465,4.526486,2.666876,7.004057,1.829208,...,10.663157,10.080286,12.349305,12.303505,7.658543,6.624243,6.80969,1.617277,2.042376,2.088563
5,8.268637,5.673706,4.041234,5.940841,2.092022,7.395003,5.180165,2.788542,8.33776,2.37271,...,11.630287,11.226991,12.952052,12.891272,9.438176,6.562677,7.899977,1.713044,2.031004,1.826066
6,8.072856,6.055937,5.0261,6.05028,1.739597,6.663004,4.890627,2.831725,8.391452,2.39175,...,11.234398,10.733645,12.696144,12.586035,7.879248,6.413369,6.656621,1.630604,1.974087,2.036339
7,7.680028,5.894405,5.199329,6.30754,1.90292,6.742952,4.502145,2.649148,7.964943,2.582655,...,10.878437,10.401695,12.39047,12.366916,7.831222,6.800263,6.979293,1.630654,1.879426,1.92526
8,8.451915,5.863011,4.382178,6.026346,1.947833,6.923885,4.499549,3.01664,7.658565,2.525693,...,10.78152,10.42831,12.621985,12.550523,8.586549,7.755926,7.762786,1.655331,1.823594,1.941182
9,8.123749,5.428503,4.366489,5.876885,1.968247,6.917033,4.814167,2.804359,8.31408,2.925554,...,11.17294,10.922962,12.603767,12.570259,8.182451,6.412731,6.597469,1.592001,2.002152,2.093622


0            tumoral
1            tumoral
2            tumoral
3            tumoral
4            tumoral
5            tumoral
6            tumoral
7            tumoral
8            tumoral
9            tumoral
10           tumoral
11    adenocarcinoma
12    adenocarcinoma
13    adenocarcinoma
14    adenocarcinoma
15    adenocarcinoma
16    adenocarcinoma
17    adenocarcinoma
18    adenocarcinoma
19    adenocarcinoma
20            normal
21            normal
22            normal
23            normal
24            normal
25            normal
26            normal
27            normal
28            normal
29            normal
30            normal
31            normal
32            normal
33            normal
34            normal
35            normal
36            normal
37            normal
38            normal
39            normal
Name: type, dtype: object

## Sub-Type Classification using PCA + Random Forest

In [15]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np

def kfold_with_pca(X, y, n_components, n_splits=5):
    """Cross-validate a PCA + random-forest pipeline, then refit it on all data.

    For every shuffled K-fold split, PCA is fitted on the training rows only
    and then applied to the held-out rows, so the projection never sees the
    test fold. After cross-validation the same PCA and classifier objects are
    refitted on the complete ``X`` / ``y`` and returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per sample (rows are selected with ``.iloc``).
    y : pandas.Series
        Class label for each row of ``X``.
    n_components : int
        Number of principal components to keep.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    dict
        ``"model"`` (classifier fitted on all data), ``"pca"`` (PCA fitted on
        all data), ``"feature_names"`` (column order of ``X``) and
        ``"metrics"`` (fold-averaged accuracy, macro recall, macro precision
        and macro F1).
    """
    # Remember the training column order so later data can be aligned to it.
    feature_names = X.columns.tolist()

    # NOTE(review): plain KFold does not stratify by class, so with small
    # classes a fold may miss a class entirely; consider StratifiedKFold.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    pca = PCA(n_components=n_components)
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Lists to store fold metrics
    accuracies, recalls, precisions, f1_scores = [], [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit PCA on the training fold only, then project the held-out fold.
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        # Train the model
        model.fit(X_train_pca, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test_pca)

        # Calculate metrics; zero_division=0 is passed to every macro score so
        # a class missing from a fold's predictions scores 0 without warning.
        accuracies.append(accuracy_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred, average='macro', zero_division=0))
        precisions.append(precision_score(y_test, y_pred, average='macro', zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, average='macro', zero_division=0))

    # Final PCA fitting and model training on full data
    X_pca_full = pca.fit_transform(X)
    model.fit(X_pca_full, y)

    # Aggregate metrics
    metrics = {
        "Mean Accuracy": np.mean(accuracies),
        "Mean Recall": np.mean(recalls),
        "Mean Precision": np.mean(precisions),
        "Mean F1 Score": np.mean(f1_scores),
    }

    # Print cross-validation results (the header reports the actual fold count)
    print(f"{n_splits}-Fold CV Results with {n_components} PCA components:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.2f}")

    # Return the final model, PCA object, and metrics
    return {
        "model": model,
        "pca": pca,
        "feature_names": feature_names,
        "metrics": metrics,
    }


In [16]:
# Cross-validate and fit the final PCA + classifier with 16 components.
# The trailing expression echoes the whole results dict, including every
# feature name, as the cell output.
results = kfold_with_pca(Xs_g, ys_g, n_components=16)
results


5-Fold CV Results with 16 PCA components:
Mean Accuracy: 1.00
Mean Recall: 1.00
Mean Precision: 1.00
Mean F1 Score: 1.00


{'model': RandomForestClassifier(random_state=42),
 'pca': PCA(n_components=16),
 'feature_names': ['1007_s_at',
  '1053_at',
  '117_at',
  '121_at',
  '1255_g_at',
  '1294_at',
  '1316_at',
  '1320_at',
  '1405_i_at',
  '1431_at',
  '1438_at',
  '1487_at',
  '1494_f_at',
  '1552256_a_at',
  '1552257_a_at',
  '1552258_at',
  '1552261_at',
  '1552263_at',
  '1552264_a_at',
  '1552266_at',
  '1552269_at',
  '1552271_at',
  '1552272_a_at',
  '1552274_at',
  '1552275_s_at',
  '1552276_a_at',
  '1552277_a_at',
  '1552278_a_at',
  '1552279_a_at',
  '1552280_at',
  '1552281_at',
  '1552283_s_at',
  '1552286_at',
  '1552287_s_at',
  '1552288_at',
  '1552289_a_at',
  '1552291_at',
  '1552293_at',
  '1552295_a_at',
  '1552296_at',
  '1552299_at',
  '1552301_a_at',
  '1552302_at',
  '1552303_a_at',
  '1552304_at',
  '1552306_at',
  '1552307_a_at',
  '1552309_a_at',
  '1552310_at',
  '1552311_a_at',
  '1552312_a_at',
  '1552314_a_at',
  '1552315_at',
  '1552316_a_at',
  '1552318_at',
  '1552319_a_

In [17]:
# Pull the fitted classifier and PCA object out of the results dict; these
# two names are what the save cell at the end of the notebook writes to disk.
model = results["model"]
pca = results["pca"]

In [18]:
# Load the test table and keep only the rows whose `cancer_type` is
# 'normal' or 'gastric'.
# NOTE(review): absolute, machine-specific path; not portable as written.
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_gastric_test = df_test[df_test['cancer_type'].isin(['normal', 'gastric'])]

In [19]:
# Split the gastric/normal test rows into features and `type` labels.
X_test,y_test = preprocessing_sub(df_gastric_test)

In [20]:
from sklearn.metrics import classification_report

def evaluate_model(model, pca, X_test, y_test, unique_classes, feature_names):
    """
    Evaluate the final trained model on test data.

    The test features are reordered/restricted to ``feature_names``, projected
    with the already-fitted ``pca`` and classified by ``model``. Accuracy,
    per-class recall and a classification report are printed; nothing is
    returned.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    pca : fitted transformer exposing ``transform``.
    X_test : pandas.DataFrame
        Test features; must contain every column in ``feature_names``.
    y_test : array-like
        True labels for the rows of ``X_test``.
    unique_classes : sequence
        Class labels to report on, in reporting order.
    feature_names : list
        Column names, in the order used when ``pca`` was fitted.

    Raises
    ------
    KeyError
        If ``X_test`` lacks any column listed in ``feature_names``.
    """
    # Ensure test set columns match the training set
    X_test = X_test[feature_names]

    # Transform test data using PCA
    X_test_pca = pca.transform(X_test)

    # Predict using the final trained model
    y_pred = model.predict(X_test_pca)

    # Overall accuracy and one recall value per entry of unique_classes
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average=None, labels=unique_classes, zero_division=0)

    # Print results
    print("\nFinal Model Evaluation on Test Dataset:")
    print(f"Accuracy: {accuracy:.2f}")
    print("\nRecall for each class:")
    for cls, cls_recall in zip(unique_classes, recall):
        print(f"Class {cls}: Recall = {cls_recall:.2f}")

    # Print classification report; zero_division=0 matches the recall call
    # above so classes that are never predicted do not trigger warnings.
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=unique_classes, zero_division=0))


In [None]:
# Score the final model on the gastric/normal test rows, reporting on the
# classes present in the training labels.
# NOTE(review): the output saved under this cell reports a CV run with 10 PCA
# components, while the kfold_with_pca call in this notebook uses 16; the
# saved output looks stale, so re-run the notebook to confirm.
evaluate_model(
    model=results["model"],
    pca=results["pca"],
    X_test=X_test,
    y_test=y_test,
    unique_classes=np.unique(ys_g),
    feature_names=results["feature_names"],
)

5-Fold CV Results with 10 PCA components:
Mean Accuracy: 1.00
Mean Recall: 1.00
Mean Precision: 1.00
Mean F1 Score: 1.00

Final Model Evaluation on Test Dataset:
Accuracy: 0.91

Recall for each class:
Class adenocarcinoma: Recall = 1.00
Class normal: Recall = 0.90
Class tumoral: Recall = 1.00

Classification Report:
                precision    recall  f1-score   support

adenocarcinoma       1.00      1.00      1.00         1
        normal       1.00      0.90      0.95        52
       tumoral       0.17      1.00      0.29         1

      accuracy                           0.91        54
     macro avg       0.72      0.97      0.75        54
  weighted avg       0.98      0.91      0.94        54



In [22]:
# Save the fitted classifier and its PCA transform. Both files are needed to
# score new samples, since inputs are projected with the PCA before
# prediction. The relative paths resolve against the current working directory.

import joblib
joblib.dump(model, 'gastric_sub.joblib')
joblib.dump(pca, 'gastric_sub_pca.joblib')

['gastric_sub_pca.joblib']