In [1]:
import pandas as pd

## Preparing the Data for Cancer Type Classification

In [2]:
df_leukemia = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/leukemia.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=6, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [4]:
df_leukemia_cancer = create_combined_df(df_leukemia, df_normal)

In [5]:
# Preprocessing data
# Input df, return X,y for training

def preprocessing(df):
    
    # Drop type Columns
    if "type" in df.columns:
        df = df.drop(columns="type")

    # Convert 'cancer_type' column to binary type: normal = 0, other = 1
    if 'cancer_type' in df.columns and not df['cancer_type'].isin([0, 1]).all():
        df['cancer_type'] = df['cancer_type'].map({'normal': 0}).fillna(1).astype(int)
    
    # Get X,y
    target = 'cancer_type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data
X_l,y_l = preprocessing(df_leukemia_cancer)

display(X_l,y_l)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,6.581923,6.973938,6.241571,8.169811,3.101444,7.521992,5.572300,4.746571,9.879799,3.947547,...,12.777497,12.152497,14.064551,14.008521,4.705099,3.885165,3.543180,2.830503,3.617848,3.277917
1,5.666901,8.268696,7.488496,7.366530,2.789599,8.256894,5.762561,4.074388,8.943661,4.256240,...,12.998342,12.262868,14.199057,14.067155,9.681597,7.461119,8.625853,2.899173,3.513921,3.250303
2,5.222456,8.064864,6.692422,7.543151,2.897822,8.095859,6.153365,3.923472,10.201796,3.961529,...,12.875292,12.362886,14.129486,14.199905,8.237609,5.254592,6.918730,3.041982,3.572885,3.124638
3,5.909173,7.694048,6.483238,7.424104,2.738126,7.412941,6.292811,4.326070,7.573979,3.794474,...,12.859101,12.231378,14.215266,14.176641,8.254922,5.402718,6.812010,2.831513,3.685706,3.182729
4,4.978725,8.263890,5.997606,7.455048,2.778287,7.314969,7.728294,4.092225,8.123734,3.763639,...,12.993051,12.384470,14.309715,14.205636,8.992892,7.191203,7.989721,2.896852,3.453444,3.195164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,10.560966,7.507896,7.998389,8.725496,3.572918,9.601450,5.484624,6.081885,10.979798,4.271587,...,12.590997,11.782307,13.877738,13.760668,10.129098,8.006231,9.042986,3.515665,4.039623,3.896915
62,9.876242,6.182263,5.184490,10.655231,2.735838,7.261265,5.099712,4.353748,5.778927,3.838955,...,12.510997,11.831010,13.941601,13.779334,10.713814,8.523429,9.550451,2.466835,2.982146,2.718202
63,9.267593,7.220185,6.153186,8.067816,3.253578,8.362388,4.477390,4.829014,10.733040,3.677779,...,12.538624,12.023002,13.846520,13.326740,7.720637,5.685199,6.633519,3.111038,3.465941,3.260076
64,9.070428,6.177355,5.571560,9.813483,2.814383,7.148465,4.939439,4.540741,5.613820,4.128969,...,13.210043,12.622573,14.174579,14.075071,9.825359,7.763833,8.675025,2.336180,2.762794,2.704182


0     1
1     1
2     1
3     1
4     1
     ..
61    0
62    0
63    0
64    0
65    0
Name: cancer_type, Length: 66, dtype: int64

Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def feature_selection(X, y, k_anova=500, k_mutual=500, combine_features=True):
    
    # Perform ANOVA feature selection
    anova_selector = SelectKBest(score_func=f_classif, k=k_anova)
    anova_selector.fit(X, y)
    X_anova = set(X.columns[anova_selector.get_support()])

    # Perform Mutual Information feature selection
    mutual_info_selector = SelectKBest(score_func=mutual_info_classif, k=k_mutual)
    mutual_info_selector.fit(X, y)
    X_mut = set(X.columns[mutual_info_selector.get_support()])

    selected_features = X_anova.union(X_mut)

    # Subset data with selected features
    X_reduce = X[list(selected_features)]
    return X_reduce

X_l = feature_selection(X_l, y_l, k_anova=300, k_mutual=300, combine_features=True)


## Training the Models and Returning their LOOCV Score

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Function Definition
def perform_model(X, y, max_iter=1000):
    model = LogisticRegression(penalty='l1', solver='saga', max_iter=max_iter, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    accuracy_scores, recall_scores, f1_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    # Train the final model on the entire dataset
    model.fit(X, y)

    # Print cross-validation results
    print("\nCross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}")
    print(f"Mean Recall: {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
    print(f"Mean F1 Score: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

# Return the model and summary results
    return {
        "model": model
    }

In [8]:
result = perform_model(X_l, y_l, max_iter=5000)
result




Cross-Validation Results:
Mean Accuracy: 0.99 ± 0.03
Mean Recall: 1.00 ± 0.00
Mean F1 Score: 0.99 ± 0.03


{'model': LogisticRegression(max_iter=5000, penalty='l1', random_state=42, solver='saga')}

In [9]:
model = result["model"]

In [10]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_leukemia_test = df_test[df_test['cancer_type'].isin(['normal', 'leukemia'])]

In [11]:
X_test,y_test = preprocessing(df_leukemia_test)
X_test = X_test[X_l.columns]

In [12]:
from sklearn.metrics import classification_report

# Predict using the final trained model
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print("\nFinal Model Evaluation on Test Dataset:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Final Model Evaluation on Test Dataset:
Accuracy: 0.98
Recall: 1.00
F1 Score: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        52
           1       0.80      1.00      0.89         4

    accuracy                           0.98        56
   macro avg       0.90      0.99      0.94        56
weighted avg       0.99      0.98      0.98        56



In [13]:
# Save model

import joblib
joblib.dump(model, 'leukemia.joblib')

['leukemia.joblib']