In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

## Preparing the Data for Cancer Type Classification

In [2]:
df_gastric = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/gastric.csv')
df_normal = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/normal.csv')

In [3]:
def create_combined_df(cancer_df, normal_df, cancer_type):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=42, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [4]:
df_gastric_cancer = create_combined_df(df_gastric, df_normal, 'gastric')

In [5]:
# Preprocessing data
# Input df, return X,y for training

def preprocessing(df):
    
    # Drop type Columns
    if "type" in df.columns:
        df = df.drop(columns="type")

    # Convert 'cancer_type' column to binary type: normal = 0, other = 1
    if 'cancer_type' in df.columns and not df['cancer_type'].isin([0, 1]).all():
        df['cancer_type'] = df['cancer_type'].map({'normal': 0}).fillna(1).astype(int)
    
    # Get X,y
    target = 'cancer_type'
    X = df.drop(columns=target)
    y = df[target]
    
    return X,y

# Display proccesed data
X_g,y_g = preprocessing(df_gastric_cancer)

display(X_g,y_g)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,8.729061,5.180322,4.608926,5.647848,2.042212,7.206777,4.422622,2.755938,8.812442,2.573335,...,11.164609,10.775514,12.857273,12.731003,9.234305,8.248071,8.344889,1.631211,1.978618,1.892722
1,9.001334,6.617095,4.395646,7.123623,3.241733,5.498223,4.2439,2.925823,4.534666,1.964338,...,10.788976,10.302022,12.440039,12.300295,8.163385,7.362267,7.501064,1.756396,1.977184,2.015272
2,8.159827,5.92643,5.122536,6.226437,2.019512,6.624482,4.5518,2.767274,7.986535,2.210271,...,11.187776,10.748935,12.66579,12.592801,8.167704,6.801099,6.979345,1.662133,2.070686,1.924248
3,9.559118,7.043322,4.971121,6.7686,2.671845,6.883452,4.032452,3.171697,6.465104,4.093294,...,10.731669,10.294698,12.534455,12.378446,7.68566,5.212969,6.233874,1.731197,1.949998,2.027056
4,8.25784,6.26811,5.317055,6.096988,2.095831,5.882465,4.526486,2.666876,7.004057,1.829208,...,10.663157,10.080286,12.349305,12.303505,7.658543,6.624243,6.80969,1.617277,2.042376,2.088563
5,8.268637,5.673706,4.041234,5.940841,2.092022,7.395003,5.180165,2.788542,8.33776,2.37271,...,11.630287,11.226991,12.952052,12.891272,9.438176,6.562677,7.899977,1.713044,2.031004,1.826066
6,8.072856,6.055937,5.0261,6.05028,1.739597,6.663004,4.890627,2.831725,8.391452,2.39175,...,11.234398,10.733645,12.696144,12.586035,7.879248,6.413369,6.656621,1.630604,1.974087,2.036339
7,7.680028,5.894405,5.199329,6.30754,1.90292,6.742952,4.502145,2.649148,7.964943,2.582655,...,10.878437,10.401695,12.39047,12.366916,7.831222,6.800263,6.979293,1.630654,1.879426,1.92526
8,8.451915,5.863011,4.382178,6.026346,1.947833,6.923885,4.499549,3.01664,7.658565,2.525693,...,10.78152,10.42831,12.621985,12.550523,8.586549,7.755926,7.762786,1.655331,1.823594,1.941182
9,8.123749,5.428503,4.366489,5.876885,1.968247,6.917033,4.814167,2.804359,8.31408,2.925554,...,11.17294,10.922962,12.603767,12.570259,8.182451,6.412731,6.597469,1.592001,2.002152,2.093622


0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
Name: cancer_type, dtype: int64

Feature Selection

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def feature_selection(X, y, k_anova=500, k_mutual=500, combine_features=True):
    """
    Perform feature selection using ANOVA and Mutual Information.

    Parameters:
        X (pd.DataFrame): Input features.
        y (pd.Series): Target variable.
        k_anova (int): Number of top features to select using ANOVA F-statistic.
        k_mutual (int): Number of top features to select using Mutual Information.
        combine_features (bool): Whether to combine (union) or reduce overlap (intersection).

    Returns:
        pd.DataFrame: Subset of X with selected features.
    """
    # Perform ANOVA feature selection
    anova_selector = SelectKBest(score_func=f_classif, k=k_anova)
    anova_selector.fit(X, y)
    X_anova = set(X.columns[anova_selector.get_support()])

    # Perform Mutual Information feature selection
    mutual_info_selector = SelectKBest(score_func=mutual_info_classif, k=k_mutual)
    mutual_info_selector.fit(X, y)
    X_mut = set(X.columns[mutual_info_selector.get_support()])

    # Combine or reduce overlap between features
    if combine_features:
        selected_features = X_anova.union(X_mut)  # Combine features (union)
    else:
        selected_features = X_anova.intersection(X_mut)  # Reduce overlap (intersection)

    # Subset data with selected features
    X_reduce = X[list(selected_features)]
    return X_reduce

X_g = feature_selection(X_g, y_g, k_anova=500, k_mutual=500, combine_features=True)

## Training the Models and Returning their LOOCV Score

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Function Definition
def perform_model(X, y, max_iter=1000):
    model = LogisticRegression(penalty='l1', solver='saga', max_iter=max_iter, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    accuracy_scores, recall_scores, f1_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    # Train the final model on the entire dataset
    model.fit(X, y)

    # Print cross-validation results
    print("\nCross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}")
    print(f"Mean Recall: {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
    print(f"Mean F1 Score: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

# Return the model and summary results
    return {
        "model": model
    }

In [8]:
result = perform_model(X_g, y_g, max_iter=5000)
result


Cross-Validation Results:
Mean Accuracy: 1.00 ± 0.00
Mean Recall: 1.00 ± 0.00
Mean F1 Score: 1.00 ± 0.00


{'model': LogisticRegression(max_iter=5000, penalty='l1', random_state=42, solver='saga')}

In [9]:
model = result["model"]

In [10]:
df_test = pd.read_csv('/Users/ledamduyen/Desktop/CS 539/project/dataset/clean/test_data.csv')
df_gastric_test = df_test[df_test['cancer_type'].isin(['normal', 'gastric'])]

In [11]:
X_test,y_test = preprocessing(df_gastric_test)
X_test = X_test[X_g.columns]

In [12]:
from sklearn.metrics import classification_report

# Predict using the final trained model
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print("\nFinal Model Evaluation on Test Dataset:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Final Model Evaluation on Test Dataset:
Accuracy: 0.94
Recall: 1.00
F1 Score: 0.57

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        52
           1       0.40      1.00      0.57         2

    accuracy                           0.94        54
   macro avg       0.70      0.97      0.77        54
weighted avg       0.98      0.94      0.96        54



In [13]:
# Save model

import joblib
joblib.dump(model, 'gastric.joblib')

['gastric.joblib']

## Preparing the Data for Cancer Sub-Type Classification

## Sub-Type Classification using Logistic Regression and LOOCV Score

## Functions to Return Cancer Classification Probabilities for Logistic Regression

The following are basic functions that utilize the trained logistic regression models to return the class (whether it is the type of cancer or normal), as well as the probabilities