In [1]:
import pandas as pd

## Preparing the Data for Cancer Type Classification

In [2]:
df_throat = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/throat.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df, cancer_type):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=12, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [4]:
df_throat_cancer = create_combined_df(df_throat, df_normal, 'throat')

In [5]:
# Preprocessing data
# Input df, return X,y for training

def preprocessing(df):
    
    # Drop type Columns
    if "type" in df.columns:
        df = df.drop(columns="type")

    # Convert 'cancer_type' column to binary type: normal = 0, other = 1
    if 'cancer_type' in df.columns and not df['cancer_type'].isin([0, 1]).all():
        df['cancer_type'] = df['cancer_type'].map({'normal': 0}).fillna(1).astype(int)
    
    # Get X,y
    target = 'cancer_type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data
X_t,y_t = preprocessing(df_throat_cancer)

display(X_t,y_t)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,10.599588,6.836289,6.538190,9.241232,4.408715,7.803230,6.132835,5.443252,7.305780,4.540688,...,11.983106,11.676178,14.021765,13.823926,5.105141,4.678033,3.534824,3.206588,3.750947,3.842781
1,10.716057,7.659747,6.696159,9.520182,4.550704,8.575769,6.265001,5.846746,9.891390,4.876797,...,12.363657,12.011395,13.940837,13.946973,5.583514,5.250945,3.807858,3.253745,4.092656,3.993107
2,12.228771,7.685318,6.331153,9.254318,4.474251,8.237696,6.324230,5.552469,7.976104,4.501409,...,11.804301,11.415460,13.937761,13.704787,5.050041,4.877941,3.626494,3.319216,4.008475,3.884906
3,10.780113,7.035396,7.074058,9.546853,4.515546,7.413569,6.215684,5.657805,7.339120,4.669660,...,11.704449,11.241365,13.680625,13.539149,5.344068,5.002905,3.627019,3.295974,3.960866,3.986542
4,10.077956,7.115040,6.403655,8.970021,4.205101,8.696256,5.946710,5.429696,10.357856,4.475399,...,11.173857,10.888381,13.471849,13.315576,4.696630,4.663674,3.453795,3.241790,3.833571,3.792176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,10.456012,5.855968,4.882649,7.939533,3.376531,8.273950,5.965015,4.404898,7.993778,3.474908,...,13.344561,13.026807,14.546300,14.394317,9.080891,4.582756,5.135689,2.941492,3.376226,3.124302
186,10.723164,7.663587,7.983010,8.408445,3.667550,9.407959,5.534188,6.130281,11.744277,4.297202,...,13.047108,12.204483,13.999465,13.904224,9.822662,7.073574,8.363416,3.638903,3.999306,3.939453
187,11.022610,6.666889,6.822420,8.014927,3.529340,8.714623,5.759964,4.911467,6.880929,3.633933,...,12.778428,12.178552,13.903772,13.836587,9.775054,5.620937,7.650794,3.365332,3.695016,3.658442
188,10.232274,6.205853,4.941341,10.578067,2.649587,7.463537,5.370651,4.558546,6.590297,6.007346,...,12.579453,11.818198,13.934934,13.798611,10.866856,8.538460,9.570063,2.360374,2.832576,2.758846


0      1
1      1
2      1
3      1
4      1
      ..
185    0
186    0
187    0
188    0
189    0
Name: cancer_type, Length: 190, dtype: int64

Feature Selection

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def feature_selection(X, y, k_anova=500, k_mutual=500, combine_features=True):
    
    # Perform ANOVA feature selection
    anova_selector = SelectKBest(score_func=f_classif, k=k_anova)
    anova_selector.fit(X, y)
    X_anova = set(X.columns[anova_selector.get_support()])

    # Perform Mutual Information feature selection
    mutual_info_selector = SelectKBest(score_func=mutual_info_classif, k=k_mutual)
    mutual_info_selector.fit(X, y)
    X_mut = set(X.columns[mutual_info_selector.get_support()])

    selected_features = X_anova.union(X_mut)

    # Subset data with selected features
    X_reduce = X[list(selected_features)]
    return X_reduce

X_t = feature_selection(X_t, y_t, k_anova=500, k_mutual=500, combine_features=True)

## Training the Models and Returning their LOOCV Score

In [10]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Function Definition
def perform_model(X, y):
    
    # Initialize Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Lists to store cross-validation results
    accuracy_scores, recall_scores, f1_scores = [], [], []

    # Perform K-Fold Cross-Validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model on the training set
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    # Train the final model on the entire dataset
    model.fit(X, y)

    # Print cross-validation results
    print("\nCross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}")
    print(f"Mean Recall: {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
    print(f"Mean F1 Score: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

    # Return the model and summary results
    return {
        "model": model
    }

In [11]:
result = perform_model(X_t, y_t)
result


Cross-Validation Results:
Mean Accuracy: 0.98 ± 0.03
Mean Recall: 1.00 ± 0.00
Mean F1 Score: 0.98 ± 0.03


{'model': RandomForestClassifier(random_state=42)}

In [12]:
model = result["model"]

In [13]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_throat_test = df_test[df_test['cancer_type'].isin(['normal', 'throat'])]

In [14]:
X_test,y_test = preprocessing(df_throat_test)
X_test = X_test[X_t.columns]

In [15]:
from sklearn.metrics import classification_report

# Predict using the final trained model
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print("\nFinal Model Evaluation on Test Dataset:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Final Model Evaluation on Test Dataset:
Accuracy: 0.94
Recall: 1.00
F1 Score: 0.83

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        52
           1       0.71      1.00      0.83        10

    accuracy                           0.94        62
   macro avg       0.86      0.96      0.90        62
weighted avg       0.95      0.94      0.94        62



In [16]:
# Save model

import joblib
joblib.dump(model, 'throat.joblib')

['throat.joblib']