In [None]:
import os
import cv2
import numpy as np
import seaborn as sns
import tensorflow as tf
import pandas as pd
from tabulate import tabulate
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, roc_curve, auc

In [None]:
# Root directory holding one sub-folder of scans per category.
folder_path = "ct_scans/"

# Sub-folder names and the binary target assigned to each of them:
# only "malignant" is the positive class (1); "benign" and "normal" share class 0.
categories = ["benign", "malignant", "normal"]
labels = [0, 1, 0]

# (height, width) every image is brought to; the model's Input layer in this
# notebook is declared as (224, 224, 3).
IMAGE_SIZE = (224, 224)

X = []
y = []
# Index into `categories` for every loaded image. Kept separately from `y`
# because the binary label cannot tell "benign" and "normal" apart.
category_ids = []

for category_id, (category, label) in enumerate(zip(categories, labels)):
    category_path = os.path.join(folder_path, category)

    # Sort so the sample order (and therefore the seeded splits) does not
    # depend on the arbitrary order returned by os.listdir.
    for file_name in sorted(os.listdir(category_path)):
        file_path = os.path.join(category_path, file_name)

        image = cv2.imread(file_path, cv2.IMREAD_COLOR)

        # cv2.imread returns None (instead of raising) for unreadable or
        # non-image files; such entries would corrupt the array built below.
        if image is None:
            print(f"Skipping unreadable file: {file_path}")
            continue

        # A dense (N, H, W, 3) array requires all images to share one shape.
        # cv2.resize takes the target size as (width, height).
        if image.shape[:2] != IMAGE_SIZE:
            image = cv2.resize(image, (IMAGE_SIZE[1], IMAGE_SIZE[0]))

        X.append(image)
        y.append(label)
        category_ids.append(category_id)

X = np.array(X)
y = np.array(y)
category_ids = np.array(category_ids)

# Preview grid: one row per category, up to `samples_per_category` images each.
samples_per_category = 3
sample_fig = plt.figure()

for category_id, category in enumerate(categories):
    # Select by category index, not by binary label: with labels [0, 1, 0]
    # a lookup on `y` would leave the "normal" row empty.
    category_images = X[category_ids == category_id][:samples_per_category]

    for i, image in enumerate(category_images):
        ax = sample_fig.add_subplot(
            len(categories),
            samples_per_category,
            category_id * samples_per_category + i + 1,
        )

        # OpenCV loads images as BGR while matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        ax.axis("off")
        ax.set_title(category)

sample_fig.subplots_adjust(wspace=0.1, hspace=0.3)

plt.show()


print ('The shape of X is: ' + str(X.shape))
print ('The shape of y is: ' + str(y.shape))

In [None]:
# Hold out 20% of the data for testing. Stratifying on the labels keeps the
# class ratio the same in every split, so no split ends up with a single class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 25% of the remaining 80% becomes the validation set, i.e. a 60/20/20
# train/validation/test split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

print("Training set size:", len(X_train))
print("Test set size:", len(X_test))
print("Validation set size:", len(X_val))

In [None]:
def _count_per_class(label_values):
    """Count the samples of class 0 and class 1, reporting 0 for an absent class."""
    counts = pd.Series(label_values).value_counts()
    return counts.reindex([0, 1], fill_value=0)


train_counts = _count_per_class(y_train)
test_counts = _count_per_class(y_test)
val_counts = _count_per_class(y_val)

# Rows of the summary are listed in Training / Validation / Testing order.
split_names = ['Training', 'Validation', 'Testing']
split_features = [X_train, X_val, X_test]
split_counts = [train_counts, val_counts, test_counts]

data = {
    'Dataset Split': split_names,
    'Size': [len(features) for features in split_features],
    'Benign/Normal Count': [counts[0] for counts in split_counts],
    'Malignant Count': [counts[1] for counts in split_counts],
}

df = pd.DataFrame(data)

# Render the frame as a plain-text table.
table = tabulate(df, headers='keys', tablefmt='presto')

print(table)

In [None]:
# Show up to five training images with their binary label; the count is capped
# so a small training set does not raise an IndexError.
for i in range(min(5, len(X_train))):
    # The images were read with OpenCV (BGR channel order) while matplotlib
    # expects RGB, so convert before displaying.
    plt.imshow(cv2.cvtColor(X_train[i], cv2.COLOR_BGR2RGB))
    plt.title(f"Label: {y_train[i]}")
    plt.axis('off')
    plt.show()

In [None]:
def _scale_to_unit_range(images):
    """Return `images` as float32 scaled from [0, 255] to [0, 1].

    Only raw uint8 arrays are scaled. An array that has already been converted
    is returned unchanged, so re-running this cell does not divide by 255 a
    second time.
    """
    if images.dtype == np.uint8:
        return images.astype("float32") / 255.0
    return images


X_train = _scale_to_unit_range(X_train)
X_val = _scale_to_unit_range(X_val)
X_test = _scale_to_unit_range(X_test)

In [None]:
# Fix TensorFlow's global seed before any layer is created.
tf.random.set_seed(1234)

# Network input: 224x224 images with 3 channels.
model_layers = [tf.keras.Input(shape=(224, 224, 3))]

# Feature extractor: three 3x3 convolutions (16, 32, 64 filters), each
# followed by 2x2 max pooling.
for n_filters in (16, 32, 64):
    model_layers.append(layers.Conv2D(n_filters, (3, 3), activation="relu"))
    model_layers.append(layers.MaxPooling2D(pool_size=(2, 2)))

model_layers.append(layers.Flatten())

# Classifier head: dense layers of 64, 32 and 16 units, each followed by
# dropout with rate 0.5.
for n_units in (64, 32, 16):
    model_layers.append(layers.Dense(n_units, activation="relu"))
    model_layers.append(layers.Dropout(0.5))

# Two-way softmax output, one unit per class.
model_layers.append(layers.Dense(2, activation="softmax"))

model = Sequential(model_layers)

model.summary()

In [None]:
# Training configuration: Adam optimizer, sparse categorical cross-entropy
# (integer class labels against the 2-unit softmax output), accuracy tracked.
compile_options = {
    "optimizer": "adam",
    "loss": "sparse_categorical_crossentropy",
    "metrics": ["accuracy"],
}
model.compile(**compile_options)

In [None]:
# Train for 40 epochs in mini-batches of 32, evaluating on the validation
# split after every epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=40,
    validation_data=(X_val, y_val),
)

# Per-epoch metrics recorded by Keras during training.
recorded = history.history
train_acc = recorded['accuracy']
val_acc = recorded['val_accuracy']
train_loss = recorded['loss']
val_loss = recorded['val_loss']


def _plot_learning_curve(train_values, val_values, metric_name):
    """Plot the training and validation curve of one metric over the epochs."""
    plt.plot(train_values, label=f'Training {metric_name}')
    plt.plot(val_values, label=f'Validation {metric_name}')
    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()


_plot_learning_curve(train_acc, val_acc, 'Accuracy')
_plot_learning_curve(train_loss, val_loss, 'Loss')

In [None]:
# Evaluate on the held-out test split; the result unpacks into the loss
# followed by the compiled accuracy metric.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics

print("Accuracy on the testing set: {:.2f}%".format(accuracy * 100))

In [None]:
# Per-class scores from the softmax output; the predicted class is the index
# of the highest score.
y_pred_proba = model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

class_names = ['Benign or Healthy', 'Malignant']

# Pass the label set explicitly so the matrix is always 2x2, even when the
# test labels or the predictions contain only one class. Without it the
# DataFrame construction below fails on a 1x1 matrix.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

In [None]:
# Softmax scores for the test split; column 1 is the score of class 1.
y_pred = model.predict(X_test)
positive_scores = y_pred[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label=f'ROC curve (area = {roc_auc:0.2f})',
)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()