In [80]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

## Preparing the Data for Cancer Type Classification

In [82]:
df_colorectal = pd.read_csv('colorectal.csv')
df_gastric = pd.read_csv('gastric.csv')
df_leukemia = pd.read_csv('leukemia.csv')
df_normal = pd.read_csv('normal.csv')

In [5]:
def create_combined_df(cancer_df, normal_df, cancer_type):
    cancer_count = len(cancer_df)
    normal_sample = normal_df.sample(n=cancer_count, random_state=42, replace=False)

    combined_df = pd.concat([cancer_df, normal_sample], ignore_index=True)
    return combined_df

In [7]:
combined_colorectal = create_combined_df(df_colorectal, df_normal, 'colorectal')
combined_gastric = create_combined_df(df_gastric, df_normal, 'gastric')
combined_leukemia = create_combined_df(df_leukemia, df_normal, 'leukemia')

In [9]:
df_colorectal_cancer_classification = combined_colorectal.copy()
df_gastric_cancer_classification = combined_gastric.copy()
df_leukemia_cancer_classification = combined_leukemia.copy()

In [10]:
df_colorectal_cancer_classification.drop(columns = ['type'], inplace = True)
df_gastric_cancer_classification.drop(columns = ['type'], inplace = True)
df_leukemia_cancer_classification.drop(columns = ['type'], inplace = True)

## Training the Models and Returning their LOOCV Score

I tested both Logistic Regression and Random Forest for these models, but for the final model I recommend the Logistic Regression model only. For the subtype classification and final function I will only use the LR model

In [55]:
X_colorectal_type = df_colorectal_cancer_classification.drop(columns = ['cancer_type'])
y_colorectal_type = df_colorectal_cancer_classification['cancer_type']

loo_colorectal_classification_lr = LeaveOneOut()
colorectal_classification_lr = LogisticRegression(max_iter = 200)

y_true_colorectal_classification_lr = []
y_pred_colorectal_classification_lr = []

for train_index, test_index in loo_colorectal_classification_lr.split(X_colorectal_type):
    X_train, X_test = X_colorectal_type.iloc[train_index], X_colorectal_type.iloc[test_index]
    y_train, y_test = y_colorectal_type.iloc[train_index], y_colorectal_type.iloc[test_index]

    colorectal_classification_lr.fit(X_train, y_train)

    y_pred_colorectal_classification_lr.append(colorectal_classification_lr.predict(X_test)[0])
    y_true_colorectal_classification_lr.append(y_test.iloc[0])

# LOOCV accuracy is 99.38%

In [17]:
colorectal_classification_lr_accuracy = accuracy_score(y_true_colorectal_classification_lr, y_pred_colorectal_classification_lr)
print(f'LOOCV Score: {colorectal_classification_lr_accuracy * 100:.2f}%')

LOOCV Score: 99.38%


In [19]:
X_colorectal_type = df_colorectal_cancer_classification.drop(columns = ['cancer_type'])
y_colorectal_type = df_colorectal_cancer_classification['cancer_type']

loo_colorectal_classification_rf = LeaveOneOut()
colorectal_classification_rf = RandomForestClassifier(n_estimators = 100, random_state = 42)

y_true_colorectal_classification_rf = []
y_pred_colorectal_classification_rf = []

for train_index, test_index in loo_colorectal_classification_rf.split(X_colorectal_type):
    X_train, X_test = X_colorectal_type.iloc[train_index], X_colorectal_type.iloc[test_index]
    y_train, y_test = y_colorectal_type.iloc[train_index], y_colorectal_type.iloc[test_index]

    colorectal_classification_rf.fit(X_train, y_train)

    y_pred_colorectal_classification_rf.append(colorectal_classification_rf.predict(X_test)[0])
    y_true_colorectal_classification_rf.append(y_test.iloc[0])

# LOOCV Score: 99.06%

In [21]:
colorectal_classification_rf_accuracy = accuracy_score(y_true_colorectal_classification_rf, y_pred_colorectal_classification_rf)
print(f'LOOCV Score: {colorectal_classification_rf_accuracy * 100:.2f}%')

LOOCV Score: 99.06%


In [19]:
X_gastric_type = df_gastric_cancer_classification.drop(columns = ['cancer_type'])
y_gastric_type = df_gastric_cancer_classification['cancer_type']

loo_gastric_classification_lr = LeaveOneOut()
gastric_classification_lr = LogisticRegression(max_iter = 200)

y_true_gastric_classification_lr = []
y_pred_gastric_classification_lr = []

for train_index, test_index in loo_gastric_classification_lr.split(X_gastric_type):
    X_train, X_test = X_gastric_type.iloc[train_index], X_gastric_type.iloc[test_index]
    y_train, y_test = y_gastric_type.iloc[train_index], y_gastric_type.iloc[test_index]

    gastric_classification_lr.fit(X_train, y_train)

    y_pred_gastric_classification_lr.append(gastric_classification_lr.predict(X_test)[0])
    y_true_gastric_classification_lr.append(y_test.iloc[0])

# LOOCV accuracy is 100%

In [25]:
gastric_classification_lr_accuracy = accuracy_score(y_true_gastric_classification_lr, y_pred_gastric_classification_lr)
print(f'LOOCV Score: {gastric_classification_lr_accuracy * 100:.2f}%')

LOOCV Score: 100.00%


In [29]:
X_gastric_type = df_gastric_cancer_classification.drop(columns = ['cancer_type'])
y_gastric_type = df_gastric_cancer_classification['cancer_type']

loo_gastric_classification_rf = LeaveOneOut()
gastric_classification_rf = RandomForestClassifier(n_estimators = 100, random_state = 42)

y_true_gastric_classification_rf = []
y_pred_gastric_classification_rf = []

for train_index, test_index in loo_gastric_classification_rf.split(X_gastric_type):
    X_train, X_test = X_gastric_type.iloc[train_index], X_gastric_type.iloc[test_index]
    y_train, y_test = y_gastric_type.iloc[train_index], y_gastric_type.iloc[test_index]

    gastric_classification_rf.fit(X_train, y_train)

    y_pred_gastric_classification_rf.append(gastric_classification_rf.predict(X_test)[0])
    y_true_gastric_classification_rf.append(y_test.iloc[0])

# LOOCV Score: 97.50%

In [31]:
gastric_classification_rf_accuracy = accuracy_score(y_true_gastric_classification_rf, y_pred_gastric_classification_rf)
print(f'LOOCV Score: {gastric_classification_rf_accuracy * 100:.2f}%')

LOOCV Score: 97.50%


In [92]:
X_leukemia_type = df_leukemia_cancer_classification.drop(columns = ['cancer_type'])
y_leukemia_type = df_leukemia_cancer_classification['cancer_type']

loo_leukemia_classification_lr = LeaveOneOut()
leukemia_classification_lr = LogisticRegression(max_iter = 200)

y_true_leukemia_classification_lr = []
y_pred_leukemia_classification_lr = []

for train_index, test_index in loo_leukemia_classification_lr.split(X_leukemia_type):
    X_train, X_test = X_leukemia_type.iloc[train_index], X_leukemia_type.iloc[test_index]
    y_train, y_test = y_leukemia_type.iloc[train_index], y_leukemia_type.iloc[test_index]

    leukemia_classification_lr.fit(X_train, y_train)

    y_pred_leukemia_classification_lr.append(leukemia_classification_lr.predict(X_test)[0])
    y_true_leukemia_classification_lr.append(y_test.iloc[0])

# LOOCV accuracy is 100%

In [34]:
leukemia_classification_lr_accuracy = accuracy_score(y_true_leukemia_classification_lr, y_pred_leukemia_classification_lr)
print(f'LOOCV Score: {leukemia_classification_lr_accuracy * 100:.2f}%')

Accuracy: 100.00%


In [88]:
X_leukemia_type = df_leukemia_cancer_classification.drop(columns = ['cancer_type'])
y_leukemia_type = df_leukemia_cancer_classification['cancer_type']

loo_leukemia_classification_rf = LeaveOneOut()
leukemia_classification_rf = RandomForestClassifier(n_estimators = 100, random_state = 42)

y_true_leukemia_classification_rf = []
y_pred_leukemia_classification_rf = []

for train_index, test_index in loo_leukemia_classification_rf.split(X_leukemia_type):
    X_train, X_test = X_leukemia_type.iloc[train_index], X_leukemia_type.iloc[test_index]
    y_train, y_test = y_leukemia_type.iloc[train_index], y_leukemia_type.iloc[test_index]

    leukemia_classification_rf.fit(X_train, y_train)

    y_pred_leukemia_classification_rf.append(leukemia_classification_rf.predict(X_test)[0])
    y_true_leukemia_classification_rf.append(y_test.iloc[0])

# LOOCV accuracy is 100%

In [37]:
leukemia_classification_rf_accuracy = accuracy_score(y_true_leukemia_classification_rf, y_pred_leukemia_classification_rf)
print(f'LOOCV Score: {leukemia_classification_rf_accuracy * 100:.2f}%')

LOOCV Score: 100.00%


## Preparing the Data for Cancer Sub-Type Classification

In [84]:
df_colorectal_subtype_classification = combined_colorectal.copy()
df_gastric_subtype_classification = combined_gastric.copy()
df_leukemia_subtype_classification = combined_leukemia.copy()

In [86]:
df_colorectal_subtype_classification.drop(columns = ['cancer_type'], inplace = True)
df_gastric_subtype_classification.drop(columns = ['cancer_type'], inplace = True)
df_leukemia_subtype_classification.drop(columns = ['cancer_type'], inplace = True)

## Sub-Type Classification using Logistic Regression and LOOCV Score

In [46]:
X_colorectal_subtype = df_colorectal_subtype_classification.drop(columns = ['type'])
y_colorectal_subtype = df_colorectal_subtype_classification['type']

loo_colorectal_subtype_lr = LeaveOneOut()
colorectal_subtype_lr = LogisticRegression(max_iter = 200)

y_true_colorectal_subtype_lr = []
y_pred_colorectal_subtype_lr = []

for train_index, test_index in loo_colorectal_subtype_lr.split(X_colorectal_subtype):
    X_train, X_test = X_colorectal_subtype.iloc[train_index], X_colorectal_subtype.iloc[test_index]
    y_train, y_test = y_colorectal_subtype.iloc[train_index], y_colorectal_subtype.iloc[test_index]

    colorectal_subtype_lr.fit(X_train, y_train)

    y_pred_colorectal_subtype_lr.append(colorectal_subtype_lr.predict(X_test)[0])
    y_true_colorectal_subtype_lr.append(y_test.iloc[0])

# LOOCV accuracy is 99.69%

In [48]:
colorectal_subtype_lr_accuracy = accuracy_score(y_true_colorectal_subtype_lr, y_pred_colorectal_subtype_lr)
print(f'LOOCV Score: {colorectal_subtype_lr_accuracy * 100:.2f}%')

LOOCV Score: 99.69%


In [50]:
X_gastric_subtype = df_gastric_subtype_classification.drop(columns = ['type'])
y_gastric_subtype = df_gastric_subtype_classification['type']

loo_gastric_subtype_lr = LeaveOneOut()
gastric_subtype_lr = LogisticRegression(max_iter = 200)

y_true_gastric_subtype_lr = []
y_pred_gastric_subtype_lr = []

for train_index, test_index in loo_gastric_subtype_lr.split(X_gastric_subtype):
    X_train, X_test = X_gastric_subtype.iloc[train_index], X_gastric_subtype.iloc[test_index]
    y_train, y_test = y_gastric_subtype.iloc[train_index], y_gastric_subtype.iloc[test_index]

    gastric_subtype_lr.fit(X_train, y_train)

    y_pred_gastric_subtype_lr.append(gastric_subtype_lr.predict(X_test)[0])
    y_true_gastric_subtype_lr.append(y_test.iloc[0])

# LOOCV accuracy is 100%

In [52]:
gastric_subtype_lr_accuracy = accuracy_score(y_true_gastric_subtype_lr, y_pred_gastric_subtype_lr)
print(f'LOOCV Score: {gastric_subtype_lr_accuracy * 100:.2f}%')

LOOCV Score: 100.00%


In [54]:
X_leukemia_subtype = df_leukemia_subtype_classification.drop(columns = ['type'])
y_leukemia_subtype = df_leukemia_subtype_classification['type']

loo_leukemia_subtype_lr = LeaveOneOut()
leukemia_subtype_lr = LogisticRegression(max_iter = 200)

y_true_leukemia_subtype_lr = []
y_pred_leukemia_subtype_lr = []

for train_index, test_index in loo_leukemia_subtype_lr.split(X_leukemia_subtype):
    X_train, X_test = X_leukemia_subtype.iloc[train_index], X_leukemia_subtype.iloc[test_index]
    y_train, y_test = y_leukemia_subtype.iloc[train_index], y_leukemia_subtype.iloc[test_index]

    leukemia_subtype_lr.fit(X_train, y_train)

    y_pred_leukemia_subtype_lr.append(leukemia_subtype_lr.predict(X_test)[0])
    y_true_leukemia_subtype_lr.append(y_test.iloc[0])

# LOOCV accuracy is 100%

In [56]:
leukemia_subtype_lr_accuracy = accuracy_score(y_true_leukemia_subtype_lr, y_pred_leukemia_subtype_lr)
print(f'LOOCV Score: {leukemia_subtype_lr_accuracy * 100:.2f}%')

LOOCV Score: 100.00%


## Functions to Return Cancer Classification Probabilities for Logistic Regression

The following are basic functions that utilize the trained logistic regression models to return the class (whether it is the type of cancer or normal), as well as the probabilities

In [41]:
def colorectal_lr(X_features):
    model = colorectal_classification_lr
    y_pred = model.predict(X_features)
    y_pred_prob = model.predict_proba(X_features)

    return y_pred, y_pred_prob

In [57]:
classes, probs = colorectal_lr(X_colorectal_type)
for i in range(10):
    print(classes[i], probs[i])

colorectal [0.99846371 0.00153629]
colorectal [9.99987817e-01 1.21827471e-05]
colorectal [9.99978543e-01 2.14571929e-05]
colorectal [9.99999923e-01 7.65182901e-08]
colorectal [0.99890875 0.00109125]
colorectal [9.99993924e-01 6.07558017e-06]
colorectal [9.99992893e-01 7.10714267e-06]
colorectal [9.99999980e-01 2.02485687e-08]
colorectal [9.99998222e-01 1.77841676e-06]
colorectal [9.99999044e-01 9.56078693e-07]


In [60]:
def gastric_lr(X_features):
    model = gastric_classification_lr
    y_pred = model.predict(X_features)
    y_pred_prob = model.predict_proba(X_features)

    return y_pred, y_pred_prob

In [62]:
classes, probs = gastric_lr(X_gastric_type)
for i in range(10):
    print(classes[i], probs[i])

gastric [9.99984549e-01 1.54508684e-05]
gastric [9.99975461e-01 2.45390724e-05]
gastric [9.99954576e-01 4.54244250e-05]
gastric [9.99968345e-01 3.16551209e-05]
gastric [9.99962693e-01 3.73071462e-05]
gastric [9.99922916e-01 7.70843369e-05]
gastric [9.99991796e-01 8.20380857e-06]
gastric [9.99964757e-01 3.52430527e-05]
gastric [9.99994984e-01 5.01635003e-06]
gastric [9.99812126e-01 1.87874176e-04]


In [94]:
def leukemia_lr(X_features):
    model = leukemia_classification_lr
    y_pred = model.predict(X_features)
    y_pred_prob = model.predict_proba(X_features)

    return y_pred, y_pred_prob

In [96]:
classes, probs = leukemia_lr(X_leukemia_type)
for i in range(10):
    print(classes[i], probs[i])

leukemia [9.99999997e-01 2.67787218e-09]
leukemia [9.99999999e-01 1.22675544e-09]
leukemia [1.00000000e+00 1.68706056e-11]
leukemia [1.00000000e+00 1.84591998e-11]
leukemia [1.00000000e+00 3.13739624e-11]
leukemia [9.99999999e-01 1.37428056e-09]
leukemia [9.99999918e-01 8.22915531e-08]
leukemia [1.00000000e+00 1.10577366e-10]
leukemia [9.99999999e-01 1.18486184e-09]
leukemia [9.99999999e-01 7.74379038e-10]
