In [1]:
import pandas as pd

## Preparing the Data for Cancer Type Classification

In [2]:
df_renal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/renal.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df, cancer_type):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=11, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [4]:
df_renal_cancer = create_combined_df(df_renal, df_normal, 'renal')

In [5]:
# Preprocessing data
# Input df, return X,y for training

def preprocessing(df):
    
    # Drop type Columns
    if "type" in df.columns:
        df = df.drop(columns="type")

    # Convert 'cancer_type' column to binary type: normal = 0, other = 1
    if 'cancer_type' in df.columns and not df['cancer_type'].isin([0, 1]).all():
        df['cancer_type'] = df['cancer_type'].map({'normal': 0}).fillna(1).astype(int)
    
    # Get X,y
    target = 'cancer_type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data
X_r,y_r = preprocessing(df_renal_cancer)

display(X_r,y_r)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,9.176833,8.096554,7.273344,8.181626,2.499844,7.881215,5.276175,4.867742,10.289692,3.285293,...,11.181759,10.762042,12.961899,12.688069,3.725097,3.438851,2.985189,2.687256,3.182581,3.151897
1,10.398563,7.705136,5.539738,9.639625,2.606360,7.527843,5.515707,5.380585,8.251765,3.226581,...,11.278896,10.922836,13.101919,12.879850,3.575279,3.394276,2.951872,2.595775,3.004451,3.046263
2,10.391154,7.844691,6.859150,9.706051,2.832692,8.091542,5.529405,5.072060,10.077566,3.227613,...,11.186446,10.775110,12.952916,12.706673,3.756748,3.593477,3.179602,2.670777,3.067572,3.205570
3,10.735776,8.011649,5.898992,9.988369,2.947561,7.219124,5.617897,5.050528,6.401838,3.129943,...,11.114506,10.843346,13.076192,12.767658,3.747672,3.292814,2.999690,2.707248,3.185152,3.118335
4,9.435494,8.015729,6.849532,8.852354,2.504070,7.337607,5.017304,5.118793,6.924959,3.555628,...,11.049942,10.619544,12.897566,12.632525,3.824783,3.472515,2.928189,2.604355,3.118041,3.041668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,11.532318,8.295377,8.032560,9.287000,5.237196,8.544502,6.221281,6.446083,7.601814,5.597363,...,13.002976,12.574812,14.100213,13.910309,5.495707,5.550453,4.587382,4.348775,4.871972,4.730194
154,10.601131,6.426123,5.493845,7.712389,3.569760,7.993786,5.969634,4.660047,7.245230,3.763940,...,13.509037,13.119966,14.622678,14.482744,8.804711,4.897431,6.028813,2.748260,3.739844,3.184603
155,10.456393,6.492040,6.147456,7.867307,3.705554,7.129797,5.843678,5.287788,5.400974,3.867667,...,12.011763,11.424249,13.652698,13.495931,4.033132,3.697196,3.484437,3.505660,3.900688,3.830304
156,9.736553,6.085462,5.575046,10.184738,2.502134,7.013732,5.174297,4.270660,6.164104,5.851820,...,12.588995,11.769854,13.926275,13.862556,10.171430,7.839900,9.014192,2.289525,2.987165,2.875362


0      1
1      1
2      1
3      1
4      1
      ..
153    0
154    0
155    0
156    0
157    0
Name: cancer_type, Length: 158, dtype: int64

Feature Selection

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def feature_selection(X, y, k_anova=500, k_mutual=500, combine_features=True):
    
    # Perform ANOVA feature selection
    anova_selector = SelectKBest(score_func=f_classif, k=k_anova)
    anova_selector.fit(X, y)
    X_anova = set(X.columns[anova_selector.get_support()])

    # Perform Mutual Information feature selection
    mutual_info_selector = SelectKBest(score_func=mutual_info_classif, k=k_mutual)
    mutual_info_selector.fit(X, y)
    X_mut = set(X.columns[mutual_info_selector.get_support()])

    selected_features = X_anova.union(X_mut)

    # Subset data with selected features
    X_reduce = X[list(selected_features)]
    return X_reduce

X_r = feature_selection(X_r, y_r, k_anova=100, k_mutual=100, combine_features=True)

## Training the Models and Returning their LOOCV Score

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Function Definition
def perform_model(X, y, max_iter=1000):
    model = LogisticRegression(penalty='l1', solver='saga', max_iter=max_iter, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    accuracy_scores, recall_scores, f1_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    # Train the final model on the entire dataset
    model.fit(X, y)

    # Print cross-validation results
    print("\nCross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}")
    print(f"Mean Recall: {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
    print(f"Mean F1 Score: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

# Return the model and summary results
    return {
        "model": model
    }

In [8]:
result = perform_model(X_r, y_r, max_iter=5000)
result




Cross-Validation Results:
Mean Accuracy: 0.92 ± 0.06
Mean Recall: 0.90 ± 0.09
Mean F1 Score: 0.92 ± 0.06


{'model': LogisticRegression(max_iter=5000, penalty='l1', random_state=42, solver='saga')}

In [9]:
model = result["model"]

In [10]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_renal_test = df_test[df_test['cancer_type'].isin(['normal', 'renal'])]

In [11]:
X_test,y_test = preprocessing(df_renal_test)
X_test = X_test[X_r.columns]

In [12]:
from sklearn.metrics import classification_report

# Predict using the final trained model
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print("\nFinal Model Evaluation on Test Dataset:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Final Model Evaluation on Test Dataset:
Accuracy: 0.97
Recall: 1.00
F1 Score: 0.86

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        52
           1       0.75      1.00      0.86         6

    accuracy                           0.97        58
   macro avg       0.88      0.98      0.92        58
weighted avg       0.97      0.97      0.97        58



In [13]:
# Save model

import joblib
joblib.dump(model, 'renal.joblib')

['renal.joblib']