In [2]:
# Import libraries
import os
import cv2
import numpy as np
import seaborn as sns
import tensorflow as tf
import pandas as pd
from tabulate import tabulate
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, roc_curve, auc

2023-06-12 18:27:09.580240: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Set the path to the main folder containing the CT scan folders
folder_path = "ct_scans/"

# Define the categories and their corresponding binary labels.
# "benign" and "normal" share label 0; "malignant" is label 1.
categories = ["benign", "malignant", "normal"]
labels = [0, 1, 0]

# Spatial size (height, width) every image is brought to; this matches the
# (224, 224, 3) input shape declared for the CNN in this notebook.
IMAGE_SIZE = (224, 224)

# Initialize empty lists to store the images, their binary labels, and the
# index of the category folder each image came from. The category index is
# tracked separately because the binary labels cannot tell "benign" and
# "normal" apart.
X = []
y = []
category_ids = []

# Loop through the categories
for category_id, (category, label) in enumerate(zip(categories, labels)):
    # Set the path to the current category folder
    category_path = os.path.join(folder_path, category)

    # os.listdir returns entries in arbitrary order; sort so the dataset order
    # (and therefore the seeded train/val/test split) is reproducible.
    for file_name in sorted(os.listdir(category_path)):
        # Set the path to the current file
        file_path = os.path.join(category_path, file_name)

        # Load the image as a 3-channel colour image (OpenCV uses BGR order)
        image = cv2.imread(file_path, cv2.IMREAD_COLOR)

        # cv2.imread returns None instead of raising when a file cannot be
        # decoded (e.g. a stray non-image file); skip it rather than poison X.
        if image is None:
            print(f"Skipping unreadable file: {file_path}")
            continue

        # Bring every image to a common size so np.array(X) is a regular
        # 4-D array. cv2.resize takes the target size as (width, height).
        if image.shape[:2] != IMAGE_SIZE:
            image = cv2.resize(image, (IMAGE_SIZE[1], IMAGE_SIZE[0]))

        # Append the image and its corresponding label to the lists
        X.append(image)
        y.append(label)
        category_ids.append(category_id)

# Convert the lists to NumPy arrays for further processing
X = np.array(X)
y = np.array(y)
category_ids = np.array(category_ids)

# Create a figure to display a grid of sample images (one row per category)
table = plt.figure()

# Loop through the categories
for row, category in enumerate(categories):
    # Select the first three images loaded from the current category folder.
    # Filtering by category index (not by binary label) keeps "benign" and
    # "normal" rows distinct.
    category_images = X[category_ids == row][:3]

    # Loop through the images in the current category
    for col, image in enumerate(category_images):
        # Create a subplot for the current image
        subplots = table.add_subplot(len(categories), 3, row * 3 + col + 1)

        # Display the image in the subplot (convert BGR -> RGB for matplotlib)
        subplots.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        subplots.axis("off")

        # Set the title of the subplot as the category
        subplots.set_title(category)

# Adjust the spacing between subplots
table.subplots_adjust(wspace=0.1, hspace=0.3)

# Display the grid of images
plt.show()


print ('The shape of X is: ' + str(X.shape))
print ('The shape of y is: ' + str(y.shape))

In [None]:
# Split the data into training (60%), validation (20%), and testing (20%) sets.
# Stratify on the labels so every split keeps the same class ratio as the full
# dataset; a plain random split can leave a split with few or no malignant
# samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# 0.25 of the remaining 80% yields a validation set that is 20% of all data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

print("Training set size:", len(X_train))
print("Test set size:", len(X_test))
print("Validation set size:", len(X_val))

In [None]:
# Tally how many samples of class 0 and class 1 ended up in each split.
# reindex([0, 1], fill_value=0) guarantees both classes appear, with a count
# of zero when a class is missing from a split.
train_counts, val_counts, test_counts = (
    pd.Series(split_labels).value_counts().reindex([0, 1], fill_value=0)
    for split_labels in (y_train, y_val, y_test)
)

# Assemble the summary columns, one row per split (training, validation, testing)
split_counts = [train_counts, val_counts, test_counts]
data = {
    'Dataset Split': ['Training', 'Validation', 'Testing'],
    'Size': [len(X_train), len(X_val), len(X_test)],
    'Benign/Normal Count': [counts[0] for counts in split_counts],
    'Malignant Count': [counts[1] for counts in split_counts],
}
df = pd.DataFrame(data)

# Render the DataFrame as a plain-text table and print it
table = tabulate(df, headers='keys', tablefmt='presto')
print(table)

In [None]:
# Display the first 5 training images (fewer if the training set is smaller)
for i in range(min(5, len(X_train))):
    # The images were loaded with cv2.imread, which stores channels as BGR,
    # while matplotlib interprets the last axis as RGB. Reversing the channel
    # axis shows the true colours and works for both uint8 and float arrays.
    plt.imshow(X_train[i][..., ::-1])
    plt.title(f"Label: {y_train[i]}")
    plt.axis('off')
    plt.show()

In [None]:
# NOTE(review): a bare `print` does not call the function; as the last
# expression of a cell it only evaluates to the built-in itself. This looks
# like a leftover cell -- confirm and delete.
print

In [None]:
# Normalize the image data to a range of 0 to 1
def _scale_to_unit_range(images):
    """Return `images` as float32 divided by 255.

    Arrays that already have a floating-point dtype are returned unchanged,
    so re-running this cell does not divide the data by 255 a second time.
    """
    if np.issubdtype(images.dtype, np.floating):
        return images
    return images.astype("float32") / 255.0


X_train = _scale_to_unit_range(X_train)
X_val = _scale_to_unit_range(X_val)
X_test = _scale_to_unit_range(X_test)

In [3]:
# Build the CNN model
# Seed TensorFlow before any layer is created.
tf.random.set_seed(1234)

# Input: 224x224 images with 3 channels
cnn_layers = [tf.keras.Input(shape=(224, 224, 3))]

# Convolutional stages: a 3x3 ReLU convolution followed by 2x2 max pooling,
# with 16, 32, and 64 filters respectively.
for n_filters in (16, 32, 64):
    cnn_layers.append(layers.Conv2D(n_filters, (3, 3), activation="relu"))
    cnn_layers.append(layers.MaxPooling2D(pool_size=(2, 2)))

# Classifier head: flatten, then three ReLU dense layers (64, 32, 16 units),
# each followed by dropout with rate 0.5.
cnn_layers.append(layers.Flatten())
for n_units in (64, 32, 16):
    cnn_layers.append(layers.Dense(n_units, activation="relu"))
    cnn_layers.append(layers.Dropout(0.5))

# Output: 2-way softmax
cnn_layers.append(layers.Dense(2, activation="softmax"))

model = Sequential(cnn_layers)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 222, 222, 16)      448       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 111, 111, 16)     0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 109, 109, 32)      4640      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 54, 54, 32)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 52, 52, 64)        18496     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 26, 26, 64)       0

In [None]:
# Compile the model: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Fit the model for 40 epochs (batch size 32), validating on the validation
# split after each epoch, and keep the returned history object.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=40,
    batch_size=32,
)

# Pull the per-epoch curves out of the history
metrics_log = history.history
train_acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
train_loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

# One figure per metric: accuracy first, then loss.
# Each entry: (training curve, its legend label, validation curve, its legend
# label, figure title, y-axis label).
curve_specs = [
    (train_acc, 'Training Accuracy', val_acc, 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
    (train_loss, 'Training Loss', val_loss, 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
]
for train_curve, train_label, val_curve, val_label, title, y_label in curve_specs:
    plt.plot(train_curve, label=train_label)
    plt.plot(val_curve, label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

In [None]:
# Score the trained model on the testing set; the result unpacks into the
# loss followed by the accuracy metric.
test_scores = model.evaluate(X_test, y_test)
loss, accuracy = test_scores

# Report the accuracy as a percentage with two decimals
print(f"Accuracy on the testing set: {accuracy * 100:.2f}%")

In [None]:
# Make predictions on the test set (one row of softmax probabilities per image)
y_pred = model.predict(X_test)
# Convert the class probabilities to class labels by taking the most likely class
y_pred = np.argmax(y_pred, axis=1)

# Display names for class 0 and class 1, in that order
class_names = ['Benign or Healthy', 'Malignant']

# Create the confusion matrix. Pinning labels=[0, 1] guarantees a 2x2 matrix
# even when one class is absent from both y_test and the predictions;
# otherwise the matrix would shrink and no longer match the two class names
# used to build the DataFrame below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Convert the confusion matrix to a Pandas DataFrame for better visualization
# (rows = true labels, columns = predicted labels)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

# Plot the confusion matrix using a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

In [None]:
# Predict class probabilities for the test set and use the class-1 column
# as the score for the ROC analysis.
y_pred = model.predict(X_test)
positive_scores = y_pred[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) for reference
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")
plt.show()