In [14]:
import pandas as pd

## Preparing the Data for Cancer Type Classification

In [15]:
df_prostate = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/prostate.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [16]:
def create_combined_df(cancer_df, normal_df):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=10, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [17]:
df_prostate_cancer = create_combined_df(df_prostate, df_normal)

In [18]:
# Preprocessing data
# Input df, return X,y for training

def preprocessing(df):
    
    # Drop type Columns
    if "type" in df.columns:
        df = df.drop(columns="type")

    # Convert 'cancer_type' column to binary type: normal = 0, other = 1
    if 'cancer_type' in df.columns and not df['cancer_type'].isin([0, 1]).all():
        df['cancer_type'] = df['cancer_type'].map({'normal': 0}).fillna(1).astype(int)
    
    # Get X,y
    target = 'cancer_type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data
X_p,y_p = preprocessing(df_prostate_cancer)

display(X_p,y_p)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,10.018234,7.707583,5.989747,5.829840,2.494476,8.939803,4.962801,4.856077,6.140604,5.268231,...,13.521793,13.492930,14.515047,14.346766,2.987548,2.427671,2.238646,2.156191,2.562932,2.855860
1,9.917901,7.140408,5.269716,5.894167,2.493712,7.013469,5.037927,5.783995,5.592940,6.262347,...,13.508332,13.606260,14.558157,14.404023,3.116301,2.732446,2.535558,2.249507,2.575246,2.541969
2,10.030899,7.009126,4.838489,5.734018,2.712703,8.360789,5.013073,5.070055,4.252096,5.499370,...,13.896166,13.905449,14.623102,14.468804,3.231902,2.652717,2.598874,2.182475,2.677108,2.756087
3,10.267389,7.238755,6.202870,5.868314,2.468433,8.668240,5.093923,4.842425,6.505953,4.859145,...,13.553855,13.570597,14.498986,14.352804,3.234453,2.562404,2.614932,2.359522,2.391233,2.503232
4,10.274597,7.408846,5.791614,5.702898,2.335951,8.075089,4.653027,5.377589,4.611386,5.793055,...,13.371034,13.495509,14.518759,14.327416,3.053767,2.570960,2.588031,2.247985,2.536752,2.604973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,11.536450,7.809675,7.251338,9.862932,5.084927,7.886534,6.523860,6.414895,6.543505,5.556717,...,12.455653,11.963920,13.949450,13.593756,5.745762,5.364486,4.611587,4.363301,4.814841,4.889502
88,11.056594,6.488804,5.609028,9.254046,3.263491,8.372543,6.361757,4.780016,6.468237,3.681773,...,12.872034,12.394845,13.978138,13.893436,10.465070,5.721868,8.401308,3.300762,3.900741,3.563504
89,10.014989,5.972408,5.370305,10.086734,2.597021,7.272397,5.354960,4.343836,5.418884,4.479235,...,13.140487,12.511706,14.077116,14.041201,10.398669,8.205191,9.216556,2.444298,2.794763,2.634380
90,12.536421,6.627878,6.974907,9.754692,5.434262,7.320907,6.289890,6.117296,6.722050,6.522252,...,12.643840,11.573833,13.977533,13.885077,5.169850,5.011484,4.188489,3.563915,4.336722,4.455248


0     1
1     1
2     1
3     1
4     1
     ..
87    0
88    0
89    0
90    0
91    0
Name: cancer_type, Length: 92, dtype: int64

Feature Selection

In [19]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def feature_selection(X, y, k_anova=500, k_mutual=500, combine_features=True):
    
    # Perform ANOVA feature selection
    anova_selector = SelectKBest(score_func=f_classif, k=k_anova)
    anova_selector.fit(X, y)
    X_anova = set(X.columns[anova_selector.get_support()])

    # Perform Mutual Information feature selection
    mutual_info_selector = SelectKBest(score_func=mutual_info_classif, k=k_mutual)
    mutual_info_selector.fit(X, y)
    X_mut = set(X.columns[mutual_info_selector.get_support()])

    selected_features = X_anova.union(X_mut)

    # Subset data with selected features
    X_reduce = X[list(selected_features)]
    return X_reduce

X_p = feature_selection(X_p, y_p, k_anova=300, k_mutual=300, combine_features=True)

## Training the Models and Returning their LOOCV Score

In [20]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Function Definition
def perform_model(X, y, max_iter=1000):
    model = LogisticRegression(penalty='l1', solver='saga', max_iter=max_iter, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    accuracy_scores, recall_scores, f1_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    # Train the final model on the entire dataset
    model.fit(X, y)

    # Print cross-validation results
    print("\nCross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}")
    print(f"Mean Recall: {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
    print(f"Mean F1 Score: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

# Return the model and summary results
    return {
        "model": model
    }

In [21]:
result = perform_model(X_p, y_p, max_iter=5000)
result


Cross-Validation Results:
Mean Accuracy: 0.98 ± 0.03
Mean Recall: 1.00 ± 0.00
Mean F1 Score: 0.98 ± 0.02


{'model': LogisticRegression(max_iter=5000, penalty='l1', random_state=42, solver='saga')}

In [22]:
model = result["model"]

In [23]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_prostate_test = df_test[df_test['cancer_type'].isin(['normal', 'prostate'])]

In [24]:
X_test,y_test = preprocessing(df_prostate_test)
X_test = X_test[X_p.columns]

In [25]:
from sklearn.metrics import classification_report

# Predict using the final trained model
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print("\nFinal Model Evaluation on Test Dataset:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Final Model Evaluation on Test Dataset:
Accuracy: 0.98
Recall: 1.00
F1 Score: 0.91

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        52
           1       0.83      1.00      0.91         5

    accuracy                           0.98        57
   macro avg       0.92      0.99      0.95        57
weighted avg       0.99      0.98      0.98        57



In [26]:
# Save model

import joblib
joblib.dump(model, 'prostate.joblib')

['prostate.joblib']