In [None]:
# 1. Imports
# ----------
# All necessary libraries and modules are imported here.

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import os
import PIL.Image
from PIL import ImageOps
import PIL
import pathlib
import matplotlib.pyplot as plt
import datetime
import tensorboard
import IPython
import sklearn
import cv2
import subprocess
import sys

In [None]:
# 2. Dataset Loading
# --------------------
# Builds the training, validation and testing datasets from image folders.
# Labels are inferred from the sub-directory names of each folder.

# Locations of the training and testing images.
# IMPORTANT: These paths are specific to the original author's machine and
# must be edited before running the notebook anywhere else.
data_path = pathlib.Path('/home/samer/Documents/Programming/AI50xIraq/Cancerdetection/archivecopy/Training/')
data_path_test = pathlib.Path('/home/samer/Documents/Programming/AI50xIraq/Cancerdetection/archivecopy/Testing/')

# Loader settings shared by every split: batches of 5 grayscale images,
# resized to 180x180, shuffled.
shared_loader_options = dict(
    labels='inferred',
    batch_size=5,
    image_size=(180, 180),
    color_mode="grayscale",
    shuffle=True,
)

# Training directory -> (training, validation) pair. 20% of the images are
# held out for validation; the fixed seed keeps the split reproducible.
train_and_val_splits = tf.keras.utils.image_dataset_from_directory(
    data_path,
    validation_split=0.2,
    subset='both',
    seed=1,
    **shared_loader_options)
dataset_path, dataset_path_val = train_and_val_splits

# Testing directory -> a single dataset, used only for the final evaluation.
dataset_path_test = tf.keras.utils.image_dataset_from_directory(
    data_path_test,
    seed=3,
    **shared_loader_options)


In [None]:
# 3. Performance Optimization
# ---------------------------
# The training and validation datasets are cached and prefetched:
#   * .cache() keeps the images in memory once they have been read from disk
#     during the first epoch.
#   * .prefetch() overlaps data preprocessing with model execution.
# Note: this cell rebinds `dataset_path` / `dataset_path_val`, so re-running
# it wraps the already-wrapped datasets again.

AUTOTUNE = tf.data.AUTOTUNE
dataset_path, dataset_path_val = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (dataset_path, dataset_path_val)
)

In [None]:
# 4. Data Exploration and Visualization
# -------------------------------------
# Lists the class names, counts the JPEG files on disk and previews one
# training batch (two sample images plus the batch shapes).

# Resolve the class names. The dataset objects produced by .cache()/.prefetch()
# do not expose `class_names`, so when the attribute is missing the list is
# rebuilt from the class sub-directories of the training folder: sorted by
# name, hidden folders (e.g. `.ipynb_checkpoints`) skipped. This is intended to
# match the label order used by `labels='inferred'`.
class_names = getattr(dataset_path, "class_names", None)
if class_names is None:
    class_names = sorted(
        entry.name
        for entry in pathlib.Path(data_path).iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    )
print(f"Class names: {class_names}")

# Count the images on disk. Only files matching <class>/<name>.jpg are counted.
data_dir = pathlib.Path(data_path)
image_count = sum(1 for _ in data_dir.glob('*/*.jpg'))
print(f"Total images: {image_count}")

# Pull a single batch and use it for both the preview and the shape report,
# so the (cached) dataset is only started once.
for images, labels in dataset_path.take(1):
    # Show at most two images; a batch may hold fewer.
    preview_count = min(2, int(images.shape[0]))
    fig, axes = plt.subplots(1, preview_count, figsize=(10, 10), squeeze=False)
    for i in range(preview_count):
        pixels = images[i].numpy().astype("uint8")
        if pixels.shape[-1] == 1:
            # Single-channel (grayscale) image: drop the channel axis so it is
            # drawn as a 2-D intensity map.
            pixels = pixels[..., 0]
        ax = axes[0, i]
        # cmap/vmin/vmax only affect 2-D data: render it in true gray levels
        # instead of matplotlib's default colormap.
        ax.imshow(pixels, cmap="gray", vmin=0, vmax=255)
        ax.set_title(class_names[int(labels[i])])
        ax.axis("off")
    plt.show()

    # Print the shape of the image and label batches for debugging.
    print("--- Batch Shapes ---")
    print(f"Image batch shape: {images.shape}")
    print(f"Labels batch shape: {labels.shape}")


In [None]:
# 5. Model Definition and Training
# --------------------------------
# This cell defines, compiles, and trains the Convolutional Neural Network (CNN).

# Number of output classes (binary, e.g. 'meningioma' or 'notumor').
# NOTE(review): presumably this must equal the number of class folders in the
# training directory - confirm if more classes are added.
num_classes = 2

keras_layers = tf.keras.layers

# Preprocessing: shrink the images to 60x60 and rescale pixels to [0, 1].
model_layers = [
    keras_layers.Resizing(60, 60),
    keras_layers.Rescaling(1./255),
]

# Three convolutional blocks (16, 32, 64 filters), each one being
# Conv2D(3x3, ELU, L2) -> MaxPooling2D -> Dropout(0.2).
for filters in (16, 32, 64):
    model_layers.append(keras_layers.Conv2D(
        filters, 3, activation='ELU',
        kernel_regularizer=tf.keras.regularizers.l2(0.001)))
    model_layers.append(keras_layers.MaxPooling2D())
    model_layers.append(keras_layers.Dropout(0.2))  # Dropout for regularization.

# Classifier head: flatten the feature maps, then two Dense(128) + Dropout pairs.
model_layers.append(keras_layers.Flatten())
for _ in range(2):
    model_layers.append(keras_layers.Dense(
        128, activation='ELU',
        kernel_regularizer=tf.keras.regularizers.l2(0.001)))
    model_layers.append(keras_layers.Dropout(0.2))

# Output layer: one softmax probability per class.
model_layers.append(keras_layers.Dense(num_classes, activation='softmax'))

model = tf.keras.Sequential(model_layers)

# Compile the model.
# The output layer already applies softmax, so the loss receives
# probabilities, not logits: from_logits must be False.
# NOTE(review): 'mse' is computed between integer class ids and the softmax
# outputs, so it is of limited meaning; kept so logged metrics stay comparable.
model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy', 'mse'])

# Set up TensorBoard for logging.
# Note: The log directory is hardcoded.
# Each run logs into its own sub-directory named after the start time as
# YYYYmmdd-HHMMSS (%m = month, %d = day). The previous pattern used %M
# (minutes) in place of the month and %D (mm/dd/yy), whose slashes created
# nested directories.
log_dir = "/home/samer/Documents/Programming/AI50xIraq/Cancerdetection/TBLog/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Set up ModelCheckpoint to save weights during training.
# Note: The checkpoint path is hardcoded. `{epoch:04d}` is filled in with the
# zero-padded epoch number for every checkpoint that is written.
checkpoint_path = "/home/samer/Documents/Programming/AI50xIraq/Cancerdetection/Checkpoints/cp-{epoch:04d}.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1)

# Save the initial weights (as "epoch 0").
# A Sequential model without a declared input shape has no weights until it is
# built, and saving weights at that point is expected to fail, so build it
# first for the 180x180 single-channel images the datasets yield.
if not model.built:
    model.build((None, 180, 180, 1))
model.save_weights(checkpoint_path.format(epoch=0))

# Train the model for 15 epochs, validating after each epoch.
# The callbacks log to TensorBoard and write a weights checkpoint.
training_callbacks = [tensorboard_callback, cp_callback]
model.fit(
    dataset_path,
    validation_data=dataset_path_val,
    epochs=15,
    callbacks=training_callbacks)

# Evaluate the model on the test set.
# `batch_size` is not passed: `dataset_path_test` is a tf.data.Dataset that
# already yields batches (of 5 images), and Keras asks that `batch_size` be
# left unset for dataset inputs.
print("\nEvaluating model on the test dataset...")
model.evaluate(dataset_path_test, verbose=2)

# Print a summary of the model architecture.
model.summary()

# Export the whole model (architecture + weights) in TensorFlow SavedModel
# format, replacing any previous export at the same location.
# Note: The saving path is hardcoded.
saving_path = pathlib.Path('/home/samer/Documents/Programming/AI50xIraq/Cancerdetection/SavedModel/')
model.save(saving_path, overwrite=True, save_format='tf')

print(f"\nModel saved to {saving_path}")

# Reloading the exported model later:
# loaded_model = tf.keras.models.load_model('/home/samer/Documents/Programming/AI50xIraq/Cancerdetection/SavedModel/')


# Todo
- use the dataset to train, validate and test.               (done)
- use dropout to avoid overfitting.                          (done)
- save a copy to load when deploying.                        (done)
- deploy with TensorFlow.js or TFX.                          (predictionful)
- add some augmentation to expand the dataset.               (done)
- add more classes to work on the full dataset.              (done)

# Record
- ELU + sigmoid: train around 98, val around 97.
- ELU + softmax: train around 97, val around 98; validation dips at times, but this is the best result yet.