# Lab 10 - Vanilla CNN and Fine-Tune VGG16 - for Dogs and Cats Classification

### Goal: Take an existing model that performs a similar image-classification task and fine-tune it for the specific task at hand, which is classifying dogs and cats.

# Loading Necessary Modules

In [None]:
# Importing libraries for numerical operations and data manipulation
import numpy as np
import pandas as pd
import random
random.seed(42) # Setting random seed for reproducibility

# Importing libraries for file handling and system operations
import os, shutil, pathlib

# Importing libraries for data visualization
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
from PIL import Image

# Importing libraries for deep learning
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img
from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.keras.models import load_model
from tensorflow.python.keras.models import Sequential


# Importing libraries for machine learning model evaluation
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

# Importing specific components from Keras
from keras.layers import Dense, Conv2D, Flatten, Dropout
# Setting up offline mode for Plotly
plotly.offline.init_notebook_mode()

# Ignoring warnings
import warnings
warnings.filterwarnings("ignore")

# Loading Dataset

In [None]:
# Paths to the original dataset and to the reduced copy that is built from it
original_dir = pathlib.Path("./train/train/")
data_folder = pathlib.Path("./train/kaggle_dogs_vs_cats_small")


def make_subset(subset_name, start_index, end_index, source_dir=None, target_root=None):
    """
    Copy a contiguous index range of cat and dog images into a subset folder.

    For each category, the files named "<category>.<i>.jpg" with
    start_index <= i < end_index are copied to
    <target_root>/<subset_name>/<category>/ (created if missing).

    Args:
    - subset_name (str): Name of the subset folder, e.g. "train".
    - start_index (int): First file index to copy (inclusive).
    - end_index (int): Index at which copying stops (exclusive).
    - source_dir (path-like, optional): Folder holding the original images.
      Defaults to the module-level `original_dir`.
    - target_root (path-like, optional): Folder under which the subset is
      created. Defaults to the module-level `data_folder`.

    Raises:
    - FileNotFoundError: If an expected source image does not exist.
    """
    # Resolve the defaults at call time and coerce to Path: other cells rebind
    # `original_dir` to a plain string, which would break the "/" joins below.
    source_dir = pathlib.Path(original_dir if source_dir is None else source_dir)
    target_root = pathlib.Path(data_folder if target_root is None else target_root)

    for category in ("cat", "dog"):
        # Destination folder for the current category within the subset
        subset_dir = target_root / subset_name / category
        os.makedirs(subset_dir, exist_ok=True)

        for i in range(start_index, end_index):
            fname = f"{category}.{i}.jpg"
            shutil.copyfile(src=source_dir / fname, dst=subset_dir / fname)


# Number of images taken per category (make_subset copies this index range
# once for "cat" and once for "dog")
total_files = 4000

# 15% of the images go to validation and 15% to test; training gets the rest
validation_size = int(0.15 * total_files)
test_size = int(0.15 * total_files)
train_size = total_files - validation_size - test_size

# The three subsets are consecutive, non-overlapping index ranges
validation_end = train_size + validation_size
test_end = validation_end + test_size

make_subset("train", start_index=0, end_index=train_size)
make_subset("validation", start_index=train_size, end_index=validation_end)
make_subset("test", start_index=validation_end, end_index=test_end)


# Images Present in the Original Dataset

In [None]:
original_dir = "./train/train"  # Replace with the actual directory containing images

num_samples = 9  # Images to plot
grid_cols = 3    # Images per row

# List all image filenames
image_filenames = os.listdir(original_dir)

# Shuffle the list of image filenames randomly
random.shuffle(image_filenames)

# Take at most num_samples files so a small directory cannot cause an IndexError
sample_filenames = image_filenames[:num_samples]

# Size the grid to the number of images actually shown (ceil division); the
# previous fixed 6x3 grid left half of the figure empty
grid_rows = max(1, -(-len(sample_filenames) // grid_cols))

# Plotting the images
plt.figure(figsize=(4 * grid_cols, 4 * grid_rows))
for index, filename in enumerate(sample_filenames):
    # Determine whether the image is of a cat or a dog based on the filename
    category = "cat" if "cat" in filename else "dog"

    # Load the image with a target size of (150, 150) (adjust target_size as needed)
    img_path = os.path.join(original_dir, filename)
    img = load_img(img_path, target_size=(150, 150))

    # Create a subplot and plot the image
    plt.subplot(grid_rows, grid_cols, index + 1)
    plt.imshow(img)
    plt.xlabel(filename + ' (' + category + ')')

plt.tight_layout()
plt.show()


# Labeling Cats and Dogs

In [None]:
# Directory that holds the original image files
original_dir = "./train/train"

# Every entry found in that directory
filenames = os.listdir(original_dir)

# A file is labelled "dog" only when the text before its first dot is exactly
# "dog"; every other name is labelled "cat"
categories = ["dog" if name.split('.')[0] == 'dog' else "cat" for name in filenames]

# One row per file: its name and its category
df = pd.DataFrame({'filename': filenames, 'category': categories})

df.head()

In [None]:
# Paths to the data folder
data_folder = pathlib.Path("./train/kaggle_dogs_vs_cats_small")
train_dir = data_folder / "train"

# train_dir contains one sub-folder per class rather than the images
# themselves, so gather the files from inside each class folder. Listing
# train_dir directly would only return the folder names.
filenames = []
for class_dir in sorted(train_dir.iterdir()):
    if class_dir.is_dir():
        filenames.extend(sorted(os.listdir(class_dir)))

# Extract labels from filenames (assuming filenames are in the format 'cat.xxx.jpg' or 'dog.xxx.jpg')
labels = [str(x)[:3] for x in filenames]

# Create a DataFrame using filenames and labels
train_df = pd.DataFrame({'filename': filenames, 'label': labels})

train_df.head()


# Plotting : Data Subset Distribution

In [None]:
def count_files(data_folder, subset_names, categories):
    """
    Tally the directory entries found for every (subset, category) pair.

    Args:
    - data_folder (str): Root folder that contains the subset folders.
    - subset_names (list): Subset folder names, in output order.
    - categories (list): Category folder names inside each subset.

    Returns:
    - counts (list of lists): counts[s][c] is the number of entries in
      <data_folder>/<subset_names[s]>/<categories[c]>.
    """
    return [
        [
            len(os.listdir(os.path.join(data_folder, subset, label)))
            for label in categories
        ]
        for subset in subset_names
    ]

In [None]:
def create_plot(subset_names, categories, counts):
    """
    Show a grouped bar chart of file counts per subset, one series per category.

    Args:
    - subset_names (list): Subset names, used as x-axis positions.
    - categories (list): Category names; each becomes one bar series.
    - counts (list of lists): counts[s][c] is the count for subset s and
      category c.
    """
    # One bar series per category, reading that category's column out of counts
    series = [
        go.Bar(
            x=subset_names,
            y=[per_subset[idx] for per_subset in counts],
            name=label
        )
        for idx, label in enumerate(categories)
    ]

    figure = go.Figure(
        data=series,
        layout=go.Layout(
            title='Distribution of Data Subset',
            xaxis=dict(title='Subset'),
            yaxis=dict(title='Number of Files'),
            barmode='group'
        )
    )
    figure.show()

In [None]:
# Classes (one bar series each) and subsets (x-axis positions) to chart
categories = ["cat", "dog"]
subset_names = ["train", "validation", "test"]

# counts[s][c] holds the number of files for subset s and category c
counts = count_files(data_folder, subset_names, categories)

# Render the grouped bar chart
create_plot(subset_names, categories, counts)

In [None]:
import os
import plotly.graph_objs as go
from plotly.subplots import make_subplots

def create_bar_chart(subset_name, categories, counts):
    """
    Build the bar trace for a single data subset.

    Args:
    - subset_name (str): Subset name, used as the trace name.
    - categories (list): Category names, used as x values.
    - counts (list): Count for each category, used as bar heights.

    Returns:
    - go.Bar: The trace; the caller is responsible for adding it to a figure.
    """
    return go.Bar(x=categories, y=counts, name=subset_name)

# Example usage
subset_names = ["train", "validation", "test"]
categories = ["cat", "dog"]

# Count files
counts = count_files(data_folder, subset_names, categories)

# One subplot per subset. shared_yaxes=True puts all panels on the same
# y scale so bar heights can be compared across subsets; without it each
# panel is scaled independently.
fig = make_subplots(
    rows=1,
    cols=len(subset_names),
    subplot_titles=subset_names,
    shared_yaxes=True
)

# Add one bar chart per subset, left to right (plotly columns are 1-based)
for col, (subset_name, subset_counts) in enumerate(zip(subset_names, counts), start=1):
    bars = create_bar_chart(subset_name, categories, subset_counts)
    fig.add_trace(bars, row=1, col=col)

# Update layout
fig.update_layout(
    title='Distribution of Data Subsets (Bar Charts)',
    showlegend=True
)

# Display the plot
fig.show()


# Visualising Cat and Dog Images in Sub-Dataset

In [None]:
# One example image per class, taken from the training subset
cat_img_path = data_folder / "train" / "cat" / "cat.30.jpg"
dog_img_path = data_folder / "train" / "dog" / "dog.20.jpg"

# Open both images
cat_img = Image.open(cat_img_path)
dog_img = Image.open(dog_img_path)

# Two side-by-side panels: cat on the left, dog on the right
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for axis, picture, heading in zip(axes, (cat_img, dog_img), ('Cat Image', 'Dog Image')):
    axis.imshow(picture)
    axis.set_title(heading)

plt.show()

# Counting Cat and Dog Images in the Sub-Dataset

In [None]:
# Settings shared by all three splits: images are resized to 180x180 pixels
# and served in batches of 32
loader_options = dict(image_size=(180, 180), batch_size=32)

# Training split
train_dataset = image_dataset_from_directory(data_folder / "train", **loader_options)

# Validation split
validation_dataset = image_dataset_from_directory(data_folder / "validation", **loader_options)

# Test split
test_dataset = image_dataset_from_directory(data_folder / "test", **loader_options)


# Defining Model

In [None]:
import keras
from keras import layers

# Model input: 180x180 images with 3 channels
inputs = keras.Input(shape=(180, 180, 3))

# Scale pixel values by 1/255 inside the model itself
x = layers.Rescaling(1./255)(inputs)

# Convolutional blocks 1-4: a 3x3 ReLU convolution followed by 2x2 max pooling,
# with the filter count growing from block to block
for n_filters in (32, 64, 128, 256):
    x = layers.Conv2D(filters=n_filters, kernel_size=3, activation="relu")(x)
    x = layers.MaxPooling2D(pool_size=2)(x)

# Convolutional block 5: convolution only, no pooling afterwards
x = layers.Conv2D(filters=256, kernel_size=3, activation="relu")(x)

# Flatten the feature maps into a vector for the dense layers
x = layers.Flatten()(x)

# Dense head: 512 ReLU units, then batch normalization and 50% dropout
x = layers.Dense(512, activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)

# Output layer: a single sigmoid unit for binary classification
outputs = layers.Dense(1, activation="sigmoid")(x)

# Assemble the functional-API model from its input and output tensors
model = keras.Model(inputs=inputs, outputs=outputs)

model.summary()


In [None]:
# Compile the model with binary crossentropy loss, RMSprop optimizer, and accuracy metric
model.compile(loss="binary_crossentropy",
              optimizer="rmsprop",
              metrics=["accuracy"])

# Create ModelCheckpoint callback to save the best model based on validation loss
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath="./models/convnet_from_scratch.keras",
    save_best_only=True,  # Save only the best model
    monitor="val_loss"  # Monitor validation loss
)

# Train the model; the checkpoint callback is the only callback in use.
# `callbacks` is documented as a list of callbacks, so pass it as one.
history = model.fit(
    train_dataset,
    epochs=25,
    validation_data=validation_dataset,
    callbacks=[model_checkpoint_callback]
)

In [None]:
import os

# Directory where the saved models are stored
models_directory = "./models/"

# Saved model files (".keras"), ordered by modification time, oldest first
model_files = sorted(
    (file for file in os.listdir(models_directory) if file.endswith(".keras")),
    key=lambda file: os.path.getmtime(os.path.join(models_directory, file))
)

# Most recently written checkpoint file
best_model_file = model_files[-1]

# The checkpoint filenames do not contain an epoch number, so parsing one out
# of the name fails. The checkpoint is saved on the lowest validation loss, so
# read the best epoch from the training history instead. argmin returns the
# first minimum, and the +1 converts a 0-based index to a 1-based epoch.
best_epoch = int(np.argmin(history.history["val_loss"])) + 1

print("Best epoch level:", best_epoch)

# **Displaying curves of loss and accuracy during training**

In [None]:
def plot_model_history(model_history, acc='accuracy', val_acc='val_accuracy'):
    """
    Draw training-vs-validation accuracy and loss curves side by side.

    Parameters:
        model_history (History): History object returned by model.fit()
        acc (str): Key of the training accuracy series in model_history.history
        val_acc (str): Key of the validation accuracy series in model_history.history
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    records = model_history.history

    # (axis, training key, validation key, title, y label) for each panel
    panels = (
        (axs[0], acc, val_acc, 'Model Accuracy', 'Accuracy'),
        (axs[1], 'loss', 'val_loss', 'Model Loss', 'Loss'),
    )

    for axis, train_key, val_key, title, y_label in panels:
        train_values = records[train_key]
        val_values = records[val_key]

        # Epochs are numbered from 1 on the x axis
        axis.plot(range(1, len(train_values) + 1), train_values)
        axis.plot(range(1, len(val_values) + 1), val_values)
        axis.set_title(title)
        axis.set_ylabel(y_label)
        axis.set_xlabel('Epoch')
        axis.set_xticks(np.arange(1, len(train_values) + 1))
        axis.legend(['train', 'val'], loc='best')

    plt.show()

In [None]:
# Plot the accuracy and loss curves recorded in `history`, which at this point
# holds the result of the most recent fit call (the from-scratch CNN when the
# cells are run in order)
plot_model_history(history)

# Model 2 - Pre-trained VGG16 (Frozen Convolutional Base)

In [None]:
# VGG16 convolutional stack with ImageNet weights; include_top=False leaves out
# the original fully-connected classifier so a new head can be attached
conv_base = keras.applications.vgg16.VGG16(
    include_top=False,
    weights="imagenet"
)

# Freeze the pre-trained weights so that training does not update them
conv_base.trainable = False

# Print the layer-by-layer architecture of the convolutional base
conv_base.summary()


In [None]:
# Define data augmentation pipeline using Keras Sequential API
data_augmentation = keras.Sequential(
    [
        # Randomly flip images horizontally
        layers.RandomFlip("horizontal"),
        # Randomly rotate images. Per the Keras docs the factor is a fraction
        # of a full turn (2*pi), so 0.1 allows up to +/-10% of 360 degrees
        # (+/-36 degrees), not 0.1 radians
        layers.RandomRotation(0.1),
        # Randomly zoom images in or out by up to 20%
        layers.RandomZoom(0.2),
    ]
)

### Model Construction

In [None]:
# Model input: 180x180 images with 3 channels
inputs = keras.Input(shape=(180, 180, 3))

# Apply data augmentation to the input data
x = data_augmentation(inputs)

# Preprocess the input data using VGG16's preprocess_input function; no
# separate 1/255 rescaling is applied in this pipeline
x = keras.applications.vgg16.preprocess_input(x)

In [None]:
#  Feature Extraction

# Pass preprocessed data through the convolutional base (VGG16), whose weights
# were frozen with conv_base.trainable = False
x = conv_base(x)

# Flatten the output from the convolutional base into a vector for the dense head
x = layers.Flatten()(x)

In [None]:
# Classifier Head

# Add a dense layer with 256 units. No activation argument is passed, so the
# layer uses the Keras default (linear), not ReLU.
# NOTE(review): confirm whether activation="relu" was intended here.
x = layers.Dense(256)(x)

In [None]:
# Apply dropout with a rate of 0.5 to prevent overfitting
x = layers.Dropout(0.5)(x)

# Add a dense layer with a single unit and sigmoid activation for binary
# classification; this pairs with the binary_crossentropy loss used at compile time
outputs = layers.Dense(1, activation="sigmoid")(x)

In [None]:
# Define the model with inputs and outputs (augmentation, VGG16 preprocessing,
# the VGG16 base and the dense head are all part of this one graph)
model_vgg = keras.Model(inputs, outputs)

# Display a summary of the model architecture and parameters
model_vgg.summary()

In [None]:
# Compile the model with loss function, optimizer, and evaluation metrics
model_vgg.compile(loss="binary_crossentropy",
              optimizer="rmsprop",
              metrics=["accuracy"])

# Create ModelCheckpoint callback to save the best model based on validation loss
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./models/vgg16_best_epoch.keras",
    save_best_only=True,  # Save only the best model
    monitor="val_loss"    # Monitor validation loss for saving the best model
)

# Train the model; the checkpoint callback is the only callback in use
history = model_vgg.fit(
    train_dataset,
    epochs=25,  # Number of epochs for training
    validation_data=validation_dataset,
    callbacks=[model_checkpoint_callback]  # List of callbacks to be used during training
)

In [None]:
# Plot the accuracy and loss curves recorded in `history`, which at this point
# holds the result of the most recent fit call (the VGG16-based model when the
# cells are run in order)
plot_model_history(history)

In [None]:
# Evaluation settings: batch size, test image folder, and the checkpoint
# written for each of the two models
batch_size = 32
test_data_dir = './train/kaggle_dogs_vs_cats_small/test'
best_model_1_path = './models/convnet_from_scratch.keras'  # From-scratch CNN checkpoint
best_model_2_path = './models/vgg16_best_epoch.keras'      # VGG16-based model checkpoint

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model

# Define paths and batch size
test_data_dir = './train/kaggle_dogs_vs_cats_small/test'  # Directory containing test data
best_model_path = './models/convnet_from_scratch.keras'  # Path to the first best model

# Load the best model
best_model = load_model(best_model_path)

# Create a data generator for the test dataset.
# No rescale here: the saved model starts with its own Rescaling(1./255) layer
# and was trained on raw 0-255 pixel values, so rescaling in the generator as
# well would feed it inputs 255 times too small and distort the test metrics.
test_datagen = ImageDataGenerator()
batch_size = 32  # Define batch size
test_generator = test_datagen.flow_from_directory(
    test_data_dir,
    target_size=(180, 180),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False  # Disable shuffling for evaluation
)

# Evaluate the best model on the test dataset
test_loss, test_accuracy = best_model.evaluate(test_generator, verbose=1)

print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


In [None]:
# Load the test dataset using ImageDataGenerator.
# No rescale here: both models preprocess raw 0-255 pixels internally (a
# Rescaling(1./255) layer in the from-scratch CNN, VGG16 preprocess_input in
# the VGG16-based model), so rescaling in the generator would hand them inputs
# on a different scale from the one they were trained on.
test_datagen = ImageDataGenerator()
test_generator = test_datagen.flow_from_directory(
    test_data_dir,
    target_size=(180, 180),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False)  # Keep file order fixed so predictions line up with test_generator.classes

In [None]:
# Restore the saved checkpoint of each model, first model first
best_model_1, best_model_2 = map(load_model, (best_model_1_path, best_model_2_path))

In [None]:
# Evaluate model 1 on the test generator; index 0 of the result is printed as
# the loss and index 1 as the accuracy
model_1_evaluation = best_model_1.evaluate(test_generator, verbose=0)
model_1_loss, model_1_accuracy = model_1_evaluation[0], model_1_evaluation[1]

print("Model 1 Evaluation:")
print("Test Loss:", model_1_loss)
print("Test Accuracy:", model_1_accuracy)


In [None]:
# Evaluate model 2 on the test generator; index 0 of the result is printed as
# the loss and index 1 as the accuracy
model_2_evaluation = best_model_2.evaluate(test_generator, verbose=0)
model_2_loss, model_2_accuracy = model_2_evaluation[0], model_2_evaluation[1]

print("\nModel 2 Evaluation:")
print("Test Loss:", model_2_loss)
print("Test Accuracy:", model_2_accuracy)


In [None]:
# Ground-truth class indices, in the generator's file order
y_true = test_generator.classes

# Raw sigmoid outputs of each model, with size-1 axes squeezed away
raw_output_model_1 = best_model_1.predict(test_generator)
y_pred_model_1 = np.squeeze(raw_output_model_1)
raw_output_model_2 = best_model_2.predict(test_generator)
y_pred_model_2 = np.squeeze(raw_output_model_2)


In [None]:
# Turn the scores into boolean class decisions with a 0.5 cut-off, then build
# one confusion matrix per model
predicted_positive_1 = y_pred_model_1 > 0.5
predicted_positive_2 = y_pred_model_2 > 0.5

cm_model_1 = confusion_matrix(y_true, predicted_positive_1)
cm_model_2 = confusion_matrix(y_true, predicted_positive_2)


In [None]:
# Draw both confusion matrices side by side with identical formatting
plt.figure(figsize=(12, 6))

class_names = ["Cat", "Dog"]
matrix_panels = (
    (cm_model_1, "Confusion Matrix - Model 1"),
    (cm_model_2, "Confusion Matrix - Model 2"),
)

for position, (matrix, heading) in enumerate(matrix_panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(heading)
    plt.imshow(matrix, cmap=plt.cm.Blues, interpolation='nearest')
    plt.colorbar()
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.xticks([0, 1], class_names)
    plt.yticks([0, 1], class_names)

plt.show()


In [None]:
# Print precision, recall and F1-score per class for each model, using a 0.5
# cut-off on the predicted scores
report_inputs = (
    ("\nModel 1 Classification Report:", y_pred_model_1),
    ("\nModel 2 Classification Report:", y_pred_model_2),
)

for header, scores in report_inputs:
    print(header)
    print(classification_report(y_true, scores > 0.5, target_names=['Cat', 'Dog']))

In [None]:
# Precision/recall pairs over all score thresholds (the thresholds themselves
# are not needed and are discarded)
precision_model_1, recall_model_1, _ = precision_recall_curve(y_true, y_pred_model_1)
precision_model_2, recall_model_2, _ = precision_recall_curve(y_true, y_pred_model_2)

# Overlay both curves on a single set of axes
plt.figure(figsize=(8, 6))
curves = (
    (recall_model_1, precision_model_1, 'Model 1'),
    (recall_model_2, precision_model_2, 'Model 2'),
)
for recall_values, precision_values, curve_label in curves:
    plt.plot(recall_values, precision_values, label=curve_label)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()

# Prediction on Test Dataset

In [None]:
test_dir = "./train/kaggle_dogs_vs_cats_small/test"

# List to store the filenames of all test images
test_filenames = []

# Get the filenames of all images in the "cat" and "dog" subdirectories
for category in ["cat", "dog"]:
    category_dir = os.path.join(test_dir, category)
    filenames = os.listdir(category_dir)

    # Store each file as "<category>/<file>", relative to test_dir
    test_filenames.extend([os.path.join(category, fname) for fname in filenames])

# Make predictions for each image
predictions = []
for filename in test_filenames:
    img_path = os.path.join(test_dir, filename)

    # Load the image at the model's input size
    img = load_img(img_path, target_size=(180, 180))

    # Keep the raw 0-255 pixel values: `model` begins with a Rescaling(1./255)
    # layer, so dividing by 255 here as well would scale the input twice
    img_array = np.expand_dims(np.asarray(img, dtype="float32"), axis=0)

    # Predict with the in-memory model. verbose=0 avoids one progress bar per
    # image, and the single sigmoid output is stored as a plain float.
    prediction = model.predict(img_array, verbose=0)
    predictions.append(float(prediction[0][0]))

threshold = 0.5

# Convert predictions to binary categories based on the threshold
binary_predictions = [1 if pred > threshold else 0 for pred in predictions]


In [None]:
# Show the first nine test images in a 3x3 grid, each titled with its
# thresholded (0/1) prediction
plt.figure(figsize=(12, 12))

for position, relative_path in enumerate(test_filenames[:9], start=1):
    picture = load_img(os.path.join(test_dir, relative_path), target_size=(180, 180))

    plt.subplot(3, 3, position)
    plt.imshow(picture)

    # Subplot positions are 1-based, the prediction list is 0-based
    plt.title(f"Prediction: {binary_predictions[position - 1]}")
    plt.axis("off")

plt.tight_layout()
plt.show()

In [None]:
def get_binary_predictions(y_pred, threshold):
    """Return 1 where a score is strictly above the threshold, else 0.

    y_pred must support elementwise ">" and ".astype" (e.g. a NumPy array).
    """
    above_threshold = y_pred > threshold
    return above_threshold.astype(int)

def calculate_incorrect_predictions(predictions, y_true, filenames):
    """Collect the files whose predicted label differs from the actual label.

    A value equal to 1 is read as "dog" and any other value as "cat". Returns
    a list of (filename, actual_label, predicted_label) tuples in the order of
    `filenames`.
    """
    def as_label(value):
        return "dog" if value == 1 else "cat"

    mismatches = []
    for position, name in enumerate(filenames):
        truth = as_label(y_true[position])
        guess = as_label(predictions[position])
        if truth == guess:
            continue
        mismatches.append((name, truth, guess))
    return mismatches

# Get binary predictions for both models. The model 2 line must run as well:
# its result is used just below, and leaving it commented out raised a NameError.
binary_predictions_model_1 = get_binary_predictions(y_pred_model_1, threshold)
binary_predictions_model_2 = get_binary_predictions(y_pred_model_2, threshold)

# Calculate incorrect predictions for both models
incorrect_predictions_model_1 = calculate_incorrect_predictions(binary_predictions_model_1, y_true, test_generator.filenames)
incorrect_predictions_model_2 = calculate_incorrect_predictions(binary_predictions_model_2, y_true, test_generator.filenames)

# Display information about incorrect predictions for Model 1
print(f"{len(incorrect_predictions_model_1)} Incorrect Predictions for Model 1:")

for filename, actual_label, predicted_label in incorrect_predictions_model_1:
    print(f"Filename: {filename}, Actual Label: {actual_label}, Predicted Label: {predicted_label}")

In [None]:
# Print how many test images Model 2 misclassified, then one line per image
model_2_error_count = len(incorrect_predictions_model_2)
print(f"\n{model_2_error_count} Incorrect Predictions for Model 2:")

for filename, actual_label, predicted_label in incorrect_predictions_model_2:
    report_line = f"Filename: {filename}, Actual Label: {actual_label}, Predicted Label: {predicted_label}"
    print(report_line)