In [1]:
import pandas as pd

## Preparing the Data for Cancer Type Classification

In [2]:
df_colorectal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/colorectal.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=4, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [4]:
df_colorectal_cancer = create_combined_df(df_colorectal, df_normal)

In [5]:
# Preprocessing data
# Input df, return X,y for training

def preprocessing(df):
    
    # Drop type Columns
    if "type" in df.columns:
        df = df.drop(columns="type")

    # Convert 'cancer_type' column to binary type: normal = 0, other = 1
    if 'cancer_type' in df.columns and not df['cancer_type'].isin([0, 1]).all():
        df['cancer_type'] = df['cancer_type'].map({'normal': 0}).fillna(1).astype(int)
    
    # Get X,y
    target = 'cancer_type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data
X_c,y_c = preprocessing(df_colorectal_cancer)

display(X_c,y_c)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,9.619385,7.046702,3.941195,6.938480,2.848975,6.857807,5.688482,3.638475,7.986668,2.860325,...,12.490817,11.811781,13.508058,13.506478,3.144123,2.937982,2.587996,2.350123,2.932034,2.605678
1,9.430742,8.206753,4.161144,7.343511,2.592307,6.727380,5.932399,3.985782,6.810169,2.659053,...,11.852443,11.076892,13.142011,13.129857,2.869188,2.946861,2.542567,2.320794,2.919703,2.414407
2,9.338931,8.124252,4.114290,6.570741,2.428728,5.945522,5.492001,3.776078,8.171787,2.796055,...,12.083467,11.281992,13.123608,13.068289,3.139030,3.094092,2.471298,2.239316,2.834338,2.491721
3,9.442320,7.638560,4.018402,6.892336,2.459235,6.202368,5.613744,3.965668,6.749412,2.662916,...,12.367512,11.596857,13.260558,13.341141,3.033355,3.253789,2.437978,2.318163,2.960950,2.513157
4,9.492616,6.945367,3.913129,6.672961,2.750156,6.967156,5.326716,4.006410,7.955763,2.483739,...,11.907987,11.488572,13.204390,13.319602,3.185108,2.978889,2.449346,2.377659,2.702139,2.547656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,10.313773,7.538086,7.245006,9.848509,4.970065,7.940631,6.844568,6.486222,6.475101,5.793909,...,13.697772,13.312410,14.482585,14.408494,5.629199,5.550626,4.643727,4.323855,4.810055,4.928338
316,10.140518,6.493956,5.293636,7.471767,2.450517,7.406938,7.147774,3.698845,8.310904,3.856742,...,12.171680,11.796772,13.539391,13.371401,7.692508,3.850374,5.455213,2.480886,2.911229,2.610331
317,10.456393,6.492040,6.147456,7.867307,3.705554,7.129797,5.843678,5.287788,5.400974,3.867667,...,12.011763,11.424249,13.652698,13.495931,4.033132,3.697196,3.484437,3.505660,3.900688,3.830304
318,10.854855,7.586964,7.974392,8.378973,3.524966,8.830916,5.535973,7.765696,10.167490,4.419998,...,12.640816,11.752741,13.928000,13.777568,8.867246,5.827812,7.381320,3.540077,3.986037,3.993587


0      1
1      1
2      1
3      1
4      1
      ..
315    0
316    0
317    0
318    0
319    0
Name: cancer_type, Length: 320, dtype: int64

Feature Selection

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def feature_selection(X, y, k_anova=500, k_mutual=500, combine_features=True):
    
    # Perform ANOVA feature selection
    anova_selector = SelectKBest(score_func=f_classif, k=k_anova)
    anova_selector.fit(X, y)
    X_anova = set(X.columns[anova_selector.get_support()])

    # Perform Mutual Information feature selection
    mutual_info_selector = SelectKBest(score_func=mutual_info_classif, k=k_mutual)
    mutual_info_selector.fit(X, y)
    X_mut = set(X.columns[mutual_info_selector.get_support()])

    selected_features = X_anova.intersection(X_mut)  # Reduce overlap (intersection)

    # Subset data with selected features
    X_reduce = X[list(selected_features)]
    return X_reduce

X_c = feature_selection(X_c, y_c, k_anova=500, k_mutual=500, combine_features=True)

## Training the Models and Returning their LOOCV Score

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Function Definition
def perform_model(X, y, max_iter=1000):
    model = LogisticRegression(penalty='l1', solver='saga', max_iter=max_iter, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    accuracy_scores, recall_scores, f1_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    # Train the final model on the entire dataset
    model.fit(X, y)

    # Print cross-validation results
    print("\nCross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}")
    print(f"Mean Recall: {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
    print(f"Mean F1 Score: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

# Return the model and summary results
    return {
        "model": model
    }

In [8]:
result = perform_model(X_c, y_c, max_iter=5000)
result


Cross-Validation Results:
Mean Accuracy: 1.00 ± 0.00
Mean Recall: 1.00 ± 0.00
Mean F1 Score: 1.00 ± 0.00


{'model': LogisticRegression(max_iter=5000, penalty='l1', random_state=42, solver='saga')}

In [9]:
model = result["model"]

In [10]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_colorectal_test = df_test[df_test['cancer_type'].isin(['normal', 'colorectal'])]

In [11]:
X_test,y_test = preprocessing(df_colorectal_test)
X_test = X_test[X_c.columns]

In [12]:
from sklearn.metrics import classification_report

# Predict using the final trained model
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print("\nFinal Model Evaluation on Test Dataset:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Final Model Evaluation on Test Dataset:
Accuracy: 1.00
Recall: 1.00
F1 Score: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       1.00      1.00      1.00        17

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [13]:
# Save model

import joblib
joblib.dump(model, 'colorectal.joblib')

['colorectal.joblib']