In [14]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, confusion_matrix

In [15]:
# Read the pre-split training and test tables from the parent directory.
train_df = pd.read_csv(filepath_or_buffer="../Train_Set.csv")
test_df = pd.read_csv(filepath_or_buffer="../Test_Set.csv")

In [16]:
# Report (rows, columns) for the train split, then the test split.
print("Dataset Distribution")
for split_df in (train_df, test_df):
    print(split_df.shape)

# Report how many rows each class_label value has, in the same split order.
print("Class Distribution")
for split_df in (train_df, test_df):
    print(split_df["class_label"].value_counts())

Dataset Distribution
(841, 351)
(409, 351)
Class Distribution
class_label
cancer     781
healthy     60
Name: count, dtype: int64
class_label
cancer     368
healthy     41
Name: count, dtype: int64


In [17]:
# For each split: every column except class_label becomes the model input (X),
# and the class_label column becomes the target (y).
X_train, y_train = train_df.drop(columns=["class_label"]), train_df["class_label"]
X_test, y_test = test_df.drop(columns=["class_label"]), test_df["class_label"]

In [24]:
# Baseline: a plain LogisticRegression with default settings on the features as loaded.
# The model is evaluated on recall for the "cancer" class, because a false negative
# (a missed cancer) is the error we most want to avoid in disease classification.
# This is only a starting point to improve on.
clf = LogisticRegression().fit(X_train, y_train)

y_train_pred, y_test_pred = clf.predict(X_train), clf.predict(X_test)

train_recall = recall_score(y_train, y_train_pred, pos_label="cancer")
test_recall = recall_score(y_test, y_test_pred, pos_label="cancer")
print(f"Recall Train: {train_recall}")
print(f"Recall Test: {test_recall}")

# Confusion matrix on the test split: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_test_pred, labels=["cancer", "healthy"])
cm_table = pd.DataFrame(
    cm,
    index=["actual_cancer", "actual_healthy"],
    columns=["pred_cancer", "pred_healthy"],
)
print(cm_table)
# The classes are highly imbalanced and every test sample is predicted as cancer,
# so the perfect recall is trivial: no healthy sample is recognised at all.

Recall Train: 1.0
Recall Test: 1.0
                pred_cancer  pred_healthy
actual_cancer           368             0
actual_healthy           41             0


In [26]:
# Second attempt: standardize the features before fitting the classifier.
# The scaler is fitted on the training split only and then applied to the test split.
# Scaling always starts from the raw frames (train_df / test_df) instead of from
# X_train / X_test, so re-running this cell never fits the scaler on data that
# has already been scaled. X_train / X_test are still rebound to the scaled
# values, exactly as before.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df.drop(columns=["class_label"]))
X_test  = scaler.transform(test_df.drop(columns=["class_label"]))

clf = LogisticRegression(solver="liblinear")

clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Recall on the "cancer" class for both splits.
print(f"Recall Train: {recall_score(y_train, y_train_pred, pos_label='cancer')}")
print(f"Recall Test: {recall_score(y_test, y_test_pred, pos_label='cancer')}")
# Confusion matrix on the test split: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_test_pred, labels=["cancer", "healthy"])
print(pd.DataFrame(cm,
                   index=["actual_cancer", "actual_healthy"],
                   columns=["pred_cancer", "pred_healthy"]))


Recall Train: 0.9961587708066582
Recall Test: 0.970108695652174
                pred_cancer  pred_healthy
actual_cancer           357            11
actual_healthy           27            14
