In [1]:
import pandas as pd

## Preparing the Data for Cancer Type Classification

In [2]:
df_pancreatic = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/pancreatic.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=9, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [4]:
df_pancreatic_cancer = create_combined_df(df_pancreatic, df_normal)

In [5]:
# Preprocessing data
# Input df, return X,y for training

def preprocessing(df):
    
    # Drop type Columns
    if "type" in df.columns:
        df = df.drop(columns="type")

    # Convert 'cancer_type' column to binary type: normal = 0, other = 1
    if 'cancer_type' in df.columns and not df['cancer_type'].isin([0, 1]).all():
        df['cancer_type'] = df['cancer_type'].map({'normal': 0}).fillna(1).astype(int)
    
    # Get X,y
    target = 'cancer_type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data
X_p,y_p = preprocessing(df_pancreatic_cancer)

display(X_p,y_p)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,10.190882,5.856793,5.504879,6.603939,2.589324,7.307193,4.505694,4.170773,6.811538,3.861369,...,12.410819,11.775152,13.730560,13.527400,10.939302,9.366024,10.029756,2.197022,2.549561,2.604475
1,10.397977,6.230170,5.870616,7.176309,2.447036,6.882623,4.827819,4.580666,7.868610,3.704774,...,12.250131,11.663606,13.807925,13.546156,10.223511,8.451554,9.192795,2.354645,2.411806,2.305854
2,9.697873,5.938232,5.303771,7.775848,2.396953,6.487590,4.965582,4.381057,6.377550,4.530656,...,12.542018,11.947975,13.688978,13.499380,10.443427,9.248596,9.575915,2.660606,2.822478,2.695719
3,9.493338,6.269669,5.874772,7.017075,2.444303,7.117745,4.463363,4.347028,9.268125,4.562947,...,12.096191,11.434712,13.689672,13.415672,9.411523,7.842530,8.400336,2.206494,2.397196,2.671649
4,10.360418,6.807520,9.385285,6.777342,2.821871,7.173666,4.515082,4.654198,6.902234,3.910507,...,12.133896,11.478597,13.636572,13.378688,9.378373,7.660518,8.366299,2.341204,2.608619,2.625820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,9.890654,5.992240,5.212611,10.059517,2.542138,7.321693,5.701188,4.616775,5.858608,3.949566,...,12.791088,12.125477,13.972744,13.892863,9.693684,7.364260,8.306983,2.468187,2.860115,2.826091
60,9.365680,5.965926,5.407865,8.050331,2.938916,6.472617,4.934719,4.186337,6.833240,5.257223,...,12.432145,11.833130,13.648058,13.391102,9.776251,8.534153,8.876821,2.412885,2.662891,2.668909
61,10.903770,7.245693,7.707139,9.959299,5.178917,8.770112,7.484030,6.332515,6.956309,5.454712,...,13.148153,12.787935,14.132452,13.957892,5.744669,5.596981,4.592644,4.328368,4.771192,4.834202
62,5.759644,7.821138,8.132814,7.781482,2.889190,8.167174,6.141597,4.088115,10.166836,3.780919,...,12.724064,12.197321,14.117278,14.055453,7.080774,4.763754,5.149368,2.764788,3.520711,3.320865


0     1
1     1
2     1
3     1
4     1
     ..
59    0
60    0
61    0
62    0
63    0
Name: cancer_type, Length: 64, dtype: int64

Feature Selection

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def feature_selection(X, y, k_anova=500, k_mutual=500, combine_features=True):
    
    # Perform ANOVA feature selection
    anova_selector = SelectKBest(score_func=f_classif, k=k_anova)
    anova_selector.fit(X, y)
    X_anova = set(X.columns[anova_selector.get_support()])

    # Perform Mutual Information feature selection
    mutual_info_selector = SelectKBest(score_func=mutual_info_classif, k=k_mutual)
    mutual_info_selector.fit(X, y)
    X_mut = set(X.columns[mutual_info_selector.get_support()])

    selected_features = X_anova.intersection(X_mut)

    # Subset data with selected features
    X_reduce = X[list(selected_features)]
    return X_reduce

X_p = feature_selection(X_p, y_p, k_anova=300, k_mutual=300, combine_features=True)

## Training the Models and Returning their LOOCV Score

In [7]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Function Definition
def perform_model(X, y):
    # Initialize the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    accuracy_scores, recall_scores, f1_scores = [], [], []

    # Perform 5-fold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model on the training data
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    # Train the final model on the entire dataset
    model.fit(X, y)

    # Print cross-validation results
    print("\nCross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}")
    print(f"Mean Recall: {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
    print(f"Mean F1 Score: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

    # Return the model and summary results
    return {
        "model": model
    }

In [8]:
result = perform_model(X_p, y_p)
result


Cross-Validation Results:
Mean Accuracy: 1.00 ± 0.00
Mean Recall: 1.00 ± 0.00
Mean F1 Score: 1.00 ± 0.00


{'model': RandomForestClassifier(random_state=42)}

In [9]:
model = result["model"]

In [10]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_pancreatic_test = df_test[df_test['cancer_type'].isin(['normal', 'pancreatic'])]

In [11]:
X_test,y_test = preprocessing(df_pancreatic_test)
X_test = X_test[X_p.columns]

In [12]:
from sklearn.metrics import classification_report

# Predict using the final trained model
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print("\nFinal Model Evaluation on Test Dataset:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Final Model Evaluation on Test Dataset:
Accuracy: 0.95
Recall: 1.00
F1 Score: 0.73

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        52
           1       0.57      1.00      0.73         4

    accuracy                           0.95        56
   macro avg       0.79      0.97      0.85        56
weighted avg       0.97      0.95      0.95        56



In [13]:
# Save model

import joblib
joblib.dump(model, 'pancreatic.joblib')

['pancreatic.joblib']