In [None]:
# Write your imports here
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import re

import os
from PIL import Image
import imagehash

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
import glob

import tensorflow as tf
import cv2


# Working with Images Lab
## Information retrieval, preprocessing, and feature extraction

In this lab, you'll work with images of felines (cats), which have been classified according to their taxonomy. Each subfolder contains images of a particular species. The dataset is located [here](https://www.kaggle.com/datasets/datahmifitb/felis-taxonomy-image-classification) but it's also provided to you in the `data/` folder.

### Problem 1. Some exploration (1 point)
How many types of cats are there? How many images do we have of each? What is a typical image size? Are there any outliers in size?

First, we are going to read the data and store the needed information in variables for species, number and size of the images. 

In [None]:
# Walk the dataset: every sub-folder of `data_folder` is one species. For each
# one, record its name, how many image files it holds, and the (width, height)
# of every image that PIL manages to open.
data_folder = 'data'

species, num_images, image_sizes = [], [], []

for subfolder in os.listdir(data_folder):
    subfolder_path = os.path.join(data_folder, subfolder)
    # Anything that is not a directory cannot be a species folder.
    if not os.path.isdir(subfolder_path):
        continue

    # Keep the entries whose lower-cased name ends like an image file.
    images = []
    for entry in os.listdir(subfolder_path):
        if entry.lower().endswith(('png', 'jpg', 'jpeg', 'tiff', 'bmp', 'gif')):
            images.append(entry)

    species.append(subfolder)
    num_images.append(len(images))

    # Image.open only reads the header, which is enough to obtain the size.
    for image_name in images:
        image_path = os.path.join(subfolder_path, image_name)
        try:
            with Image.open(image_path) as img:
                image_sizes.append(img.size)
        except Exception as e:
            print(f"Error opening image {image_path}: {e}")

Second, we are going to create a dataframe with image height and width and then calculate the typical image size using mean and median. In order to analyze the size and detect any potential outliers, we are going to use the $IQR$ method, which focuses on the middle 50% of the data. The Interquartile Range is the range between the first quartile ($Q1$) and the third quartile ($Q3$) by the formula $IQR = Q3 - Q1$. The outliers are typically defined as data points which fall below $Q1 - 1.5 * IQR$ or above $Q3 + 1.5 * IQR$.

In [None]:
# One row per readable image, holding its width and height in pixels.
sizes_df = pd.DataFrame(image_sizes, columns=['Width', 'Height'])

# Calculate typical image size (per dimension)
mean_size = sizes_df.mean()
median_size = sizes_df.median()

# Detect outliers using IQR method; the fences are computed per column
Q1 = sizes_df.quantile(0.25)
Q3 = sizes_df.quantile(0.75)
IQR = Q3 - Q1
outlier_mask = (sizes_df < (Q1 - 1.5 * IQR)) | (sizes_df > (Q3 + 1.5 * IQR))

# An image is an outlier as soon as EITHER dimension leaves its fences.
# Masking the frame and calling dropna() would only keep rows where both the
# width and the height are outliers, which under-counts.
outliers = sizes_df[outlier_mask.any(axis=1)]

# Output the results
species_images_count = dict(zip(species, num_images))
typical_size = (mean_size, median_size)
outliers_count = len(outliers)


In [None]:
# Per-species image counts, one line each.
for species_name, image_count in species_images_count.items():
    print(f'{species_name}: {image_count}')

# Typical size: mean with two decimals, median with one.
mean_width, mean_height = mean_size["Width"], mean_size["Height"]
median_width, median_height = median_size["Width"], median_size["Height"]
print(f'Mean image size: Width = {mean_width:.2f}, Height = {mean_height:.2f}')
print(f'Median image size: Width = {median_width:.1f}, Height = {median_height:.1f}')

print(f'Number of outliers: {outliers_count}.')

### Problem 2. Duplicat(e)s (1 point)
Find a way to filter out (remove) identical images. I would recommend using file hashes, but there are many approaches. Keep in mind that during file saving, recompression, etc., a lot of artifacts can change the file content (bytes), but not visually.

We are going to need a set to store the unique image hashes and traverse the data folder in order to go through all of the images to check for duplicates, which in fact is quite straight-forward using hashes.

In [None]:
species = []
num_images = []
image_hashes = set()
removed_images = 0
# Which files were kept and which were flagged as duplicates, so that later
# steps can actually exclude the duplicates instead of only counting them.
unique_image_paths = []
duplicate_image_paths = []

# Traverse the data folder. Both listings are sorted so that the copy that is
# kept (the first one encountered) does not depend on the OS listing order.
for subfolder in sorted(os.listdir(data_folder)):
    subfolder_path = os.path.join(data_folder, subfolder)
    if os.path.isdir(subfolder_path):
        images = sorted(
            img for img in os.listdir(subfolder_path)
            if img.lower().endswith(('png', 'jpg', 'jpeg', 'tiff', 'bmp', 'gif'))
        )

        species.append(subfolder)
        unique_images = 0

        # Analyze image hashes
        for image_name in images:
            image_path = os.path.join(subfolder_path, image_name)
            try:
                with Image.open(image_path) as img:
                    # Hash of the decoded picture rather than of the file bytes.
                    img_hash = imagehash.average_hash(img)
            except Exception as e:
                print(f"Error opening image {image_path}: {e}")
                continue

            # The hash set is shared across species, so a picture that shows
            # up in two folders is only counted for the first one.
            if img_hash not in image_hashes:
                image_hashes.add(img_hash)
                unique_image_paths.append(image_path)
                unique_images += 1
            else:
                duplicate_image_paths.append(image_path)
                removed_images += 1

        num_images.append(unique_images)

species_images_count = dict(zip(species, num_images))

for specie, count in species_images_count.items():
    print(f'Species: {specie}, Number of unique images: {count}')

print(f'Number of removed images: {removed_images}')

### Problem 3. Loading a model (2 points)
Find a suitable, trained convolutional neural network classifier. I recommend `ResNet50` as it's small enough to run well on any machine and powerful enough to make reasonable predictions. Most ready-made classifiers have been trained for 1000 classes.

You'll need to install libraries and possibly tinker with configurations for this task. When you're done, display the total number of layers and the total number of parameters. For ResNet50, you should expect around 50 layers and 25M parameters.

In [None]:
# Instantiate ResNet50 with its ImageNet weights and describe it.
model = ResNet50(weights='imagenet')
model.summary()

# Headline numbers requested by the task.
total_layers, total_params = len(model.layers), model.count_params()

print(f'Total number of layers: {total_layers}')
print(f'Total number of parameters: {total_params:,}')

### Problem 4. Prepare the images (1 point)
You'll need to prepare the images for passing to the model. To do so, they have to be resized to the same dimensions. Most available models have a specific requirement for sizes. You may need to do additional preprocessing, depending on the model requirements. These requirements should be easily available in the model documentation.

In [None]:
# Same ImageNet ResNet50 that the Problem 3 cell already bound to `model`;
# instantiated again here so this cell does not rely on that one having run.
model = ResNet50(weights='imagenet')

def preprocess_image(img_path):
    """Load one image file as a ResNet50-ready batch holding a single image.

    The image is resized to 224x224 while loading, converted to an array,
    given a leading batch axis and finally passed through `preprocess_input`.
    """
    resized = image.load_img(img_path, target_size=(224, 224))
    pixels = image.img_to_array(resized)
    single_image_batch = pixels[np.newaxis, ...]
    return preprocess_input(single_image_batch)

def preprocess_images_in_directory(directory_path):
    """Preprocess every ``*.jpg`` below `directory_path` into one stacked array.

    The search is recursive. Each file goes through `preprocess_image`, and the
    per-image results are stacked along the first axis. When nothing is found
    (or nothing survives preprocessing) a message is printed and an empty
    array is returned.
    """
    pattern = os.path.join(directory_path, '**', '*.jpg')
    img_paths = glob.glob(pattern, recursive=True)

    if len(img_paths) == 0:
        print("No images found in the directory.")
        return np.array([])

    candidates = (preprocess_image(path) for path in img_paths)
    preprocessed_images = [arr for arr in candidates if arr is not None]

    if not preprocessed_images:
        print("No valid images to preprocess.")
        return np.array([])

    return np.vstack(preprocessed_images)

# Preprocess every *.jpg found below the dataset folder and keep the stacked
# result in memory as `preprocessed_batch`.
directory_path = 'data'
preprocessed_batch = preprocess_images_in_directory(directory_path)


### Problem 5. Load the images efficiently (1 point)
Now that you've seen how to prepare the images for passing to the model... find a way to do it efficiently. Instead of loading the entire dataset in the RAM, read the images in batches (e.g. 4 images at a time). The goal is to read these, preprocess them, maybe save the preprocessed results in RAM.

If you've already done this in one of the previous problems, just skip this one. You'll get your point for it.

\* Even better, save the preprocessed image arrays (they will not be valid .jpg files) as separate files, so you can load them "lazily" in the following steps. This is a very common optimization to work with large datasets.

In [None]:
def preprocess_image(img_path):
    """Load one image as a single ResNet50-ready array, without a batch axis.

    NOTE: this redefines the `preprocess_image` of the Problem 4 cell, which
    additionally prepends a batch axis; callers below stack the results
    themselves.
    """
    pixels = image.img_to_array(image.load_img(img_path, target_size=(224, 224)))
    return preprocess_input(pixels)

def preprocess_images_in_batches(directory_path, batch_size=4, save_dir='preprocessed_batches'):
    """Preprocess the dataset `batch_size` images at a time and save each batch.

    Args:
        directory_path: root folder searched recursively for ``*.jpg`` and
            ``*.png`` files (all jpg paths first, then all png paths).
        batch_size: number of images stacked into one saved array.
        save_dir: output folder (created if missing); batch ``k`` is written
            to ``batch_<k>.npy`` inside it.

    Only one batch is held in memory at a time. The number of batches that
    were actually written is printed at the end.
    """
    img_paths = glob.glob(os.path.join(directory_path, '**', '*.jpg'), recursive=True) + \
                glob.glob(os.path.join(directory_path, '**', '*.png'), recursive=True)

    os.makedirs(save_dir, exist_ok=True)

    # Count what is written instead of deriving it as `len // batch_size + 1`,
    # which is one too many when the image count divides evenly (or is zero).
    saved_batches = 0
    for i in range(0, len(img_paths), batch_size):
        batch_paths = img_paths[i:i + batch_size]
        batch = np.array([preprocess_image(p) for p in batch_paths])
        batch_index = i // batch_size
        np.save(os.path.join(save_dir, f'batch_{batch_index}.npy'), batch)
        saved_batches += 1

    print(f"Processed and saved {saved_batches} batches.")

# Preprocess the images below 'data' in batches and write them to the
# function's default output folder, 'preprocessed_batches'.
preprocess_images_in_batches('data')


This is a good place to fine-tune our model in order to make better predictions based on our dataset and to serve us better for the further problems in this lab.

In [None]:
# Load ResNet50 with ImageNet weights, excluding the top layers
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers except the last few
for layer in base_model.layers[:-4]:
    layer.trainable = False

# Derive the class list and the per-image labels right here. The Problem 6
# cell further down defines `class_names` / `true_labels` too, but on a fresh
# top-to-bottom run those names do not exist yet at this point.
class_names = sorted(
    d for d in os.listdir(directory_path)
    if os.path.isdir(os.path.join(directory_path, d))
)
class_to_label = {class_name: idx for idx, class_name in enumerate(class_names)}

# `preprocessed_batch` was built by preprocess_images_in_directory from exactly
# this glob, so labelling the same path list keeps images and labels aligned
# (assumes the folder content did not change in between).
train_paths = glob.glob(os.path.join(directory_path, '**', '*.jpg'), recursive=True)
train_labels = [class_to_label.get(os.path.basename(os.path.dirname(p))) for p in train_paths]
if None in train_labels or len(train_labels) != len(preprocessed_batch):
    raise ValueError(
        f"Cannot align labels with images: {len(train_labels)} labelled paths "
        f"vs {len(preprocessed_batch)} preprocessed images."
    )

# Add custom classification layers
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.5)(x)  # Add dropout for regularization
x = Dense(1024, activation='relu')(x)
predictions = Dense(len(class_names), activation='softmax')(x)

# Combine the base model and the custom layers into a new model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Convert labels to categorical
labels_categorical = tf.keras.utils.to_categorical(train_labels, num_classes=len(class_names))

# Shuffle once with a fixed seed. `validation_split` takes the LAST 20% of the
# arrays as given, and the glob order groups images by species folder, so
# without shuffling the validation set would hold only the last species.
shuffle_order = np.random.default_rng(42).permutation(len(train_labels))
train_images = preprocessed_batch[shuffle_order]
labels_categorical = labels_categorical[shuffle_order]

# Train the model
history = model.fit(train_images, labels_categorical, epochs=10, validation_split=0.2)

# Optionally, unfreeze some more layers and fine-tune
for layer in base_model.layers[-10:]:  # Unfreeze the last 10 layers
    layer.trainable = True

# Recompile the model with a lower learning rate (10x smaller than before) so
# the newly unfrozen pretrained layers are only nudged.
model.compile(optimizer=Adam(learning_rate=0.00001), loss='categorical_crossentropy', metrics=['accuracy'])

# Continue training the model with the unfrozen layers
history_fine_tune = model.fit(train_images, labels_categorical, epochs=10, validation_split=0.2)

# Save the fine-tuned model for future use
model.save('fine_tuned_resnet50.keras')

### Problem 6. Predictions (1 point)
Finally, you're ready to get into the meat of the problem. Obtain predictions from your model and evaluate them. This will likely involve manual work to decide how the returned classes relate to the original ones.

Create a [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) to evaluate the classification.

In [None]:
def clip_labels(labels, num_classes):
    """Force every label into the valid index range ``[0, num_classes - 1]``.

    Values below zero become 0 and values above the last class index become
    that index; the result is a NumPy array.
    """
    highest_label = num_classes - 1
    return np.clip(labels, a_min=0, a_max=highest_label)

def load_preprocessed_batch(batch_index, save_dir='preprocessed_batches'):
    """Read the saved array ``batch_<batch_index>.npy`` back from `save_dir`."""
    file_name = f'batch_{batch_index}.npy'
    return np.load(os.path.join(save_dir, file_name))

def make_predictions(num_batches, model, save_dir='preprocessed_batches'):
    """Predict a class index for every image stored in the saved batches.

    Args:
        num_batches: number of consecutive batch files to read, starting with
            ``batch_0.npy``.
        model: object whose ``predict(batch)`` returns one score row per image.
        save_dir: folder holding the ``batch_<k>.npy`` files.

    Returns:
        1-D integer array with the arg-max class index of each image, in batch
        order; an empty array when `num_batches` is 0 (previously this case
        failed inside ``np.argmax``).
    """
    batch_scores = []
    for batch_index in range(num_batches):
        # Same file layout as load_preprocessed_batch.
        batch = np.load(os.path.join(save_dir, f'batch_{batch_index}.npy'))
        batch_scores.append(np.asarray(model.predict(batch)))

    if not batch_scores:
        return np.array([], dtype=np.intp)

    return np.argmax(np.concatenate(batch_scores, axis=0), axis=1)

def plot_confusion_matrix(true_labels, predicted_labels, num_classes):
    """Draw the confusion matrix of `true_labels` against `predicted_labels`.

    Both label sequences are first clipped into ``[0, num_classes - 1]`` and
    their value ranges are printed. The matrix is drawn with true labels on
    the rows and predicted labels on the columns, each cell annotated with its
    count.
    """
    # Ensure the labels are within the valid range
    true_labels = clip_labels(true_labels, num_classes)
    predicted_labels = clip_labels(predicted_labels, num_classes)

    # Value ranges, printed for debugging
    print(f"True labels range: {true_labels.min()} to {true_labels.max()}")
    print(f"Predicted labels range: {predicted_labels.min()} to {predicted_labels.max()}")

    cm_np = tf.math.confusion_matrix(true_labels, predicted_labels, num_classes=num_classes).numpy()

    plt.figure(figsize=(10, 8))
    plt.imshow(cm_np, cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.colorbar()

    class_positions = np.arange(num_classes)
    plt.xticks(class_positions)
    plt.yticks(class_positions)

    # Annotate every cell; switch to white text on the darker half of the scale.
    for row, col in np.ndindex(num_classes, num_classes):
        count = cm_np[row, col]
        text_color = 'white' if count > cm_np.max() / 2 else 'black'
        plt.text(col, row, int(count), ha='center', va='center', color=text_color)

    plt.show()

def get_true_labels(directory_path, class_to_label):
    """Return the label of every ``*.jpg`` / ``*.png`` below `directory_path`.

    The class of an image is the name of the folder that directly contains it.
    Paths are visited jpg-first, then png. Images whose folder has no entry in
    `class_to_label` are skipped with a printed warning.
    """
    img_paths = []
    for extension in ('*.jpg', '*.png'):
        img_paths.extend(glob.glob(os.path.join(directory_path, '**', extension), recursive=True))

    true_labels = []
    for img_path in img_paths:
        class_name = os.path.basename(os.path.dirname(img_path))
        label = class_to_label.get(class_name)
        if label is None:
            print(f"Warning: Class name '{class_name}' not found in class_to_label mapping.")
            continue
        true_labels.append(label)
    return true_labels

# Directory path and class names
directory_path = 'data'  # The path to your dataset
class_names = sorted([d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))])
class_to_label = {class_name: idx for idx, class_name in enumerate(class_names)}

# Get true labels
true_labels = get_true_labels(directory_path, class_to_label)

# Make predictions. Count only the saved batch files: os.listdir would also
# count stray entries (checkpoint folders, OS metadata files) and make
# make_predictions ask for a batch that does not exist.
num_batches = len(glob.glob(os.path.join('preprocessed_batches', 'batch_*.npy')))
predicted_labels = make_predictions(num_batches, model)

# The confusion matrix pairs labels and predictions position by position.
assert len(true_labels) == len(predicted_labels), (
    f"{len(true_labels)} true labels vs {len(predicted_labels)} predictions - "
    "re-run the batch preprocessing so both come from the same files."
)

# Plot confusion matrix
plot_confusion_matrix(true_labels, predicted_labels, num_classes=len(class_names))


### Problem 7. Grayscale (1 point)
Converting the images to grayscale should affect the classification negatively, as we lose some of the color information.

Find a way to preprocess the images to grayscale (using what you already have in Problem 4 and 5), pass them to the model, and compare the classification results to the previous ones.

In [None]:
def preprocess_image_grayscale(img_path):
    """Load an image, drop its colour, and return a ResNet50-ready array.

    The image is resized to 224x224, converted to a single grey channel with
    OpenCV, and that channel is repeated three times so the array still has
    the three channels the model input expects before `preprocess_input`.
    """
    rgb_pixels = image.img_to_array(image.load_img(img_path, target_size=(224, 224)))

    # cvtColor is fed 8-bit data here, hence the cast.
    gray = cv2.cvtColor(rgb_pixels.astype('uint8'), cv2.COLOR_RGB2GRAY)

    # Add a trailing channel axis, then copy the grey channel three times.
    gray_three_channel = np.repeat(gray[..., np.newaxis], 3, axis=-1)

    return preprocess_input(gray_three_channel)

In [None]:
def preprocess_images_in_batches_grayscale(directory_path, batch_size=4, save_dir='preprocessed_batches_grayscale'):
    """Grayscale counterpart of the batched preprocessing step.

    Args:
        directory_path: root folder searched recursively for ``*.jpg`` and
            ``*.png`` files (all jpg paths first, then all png paths).
        batch_size: number of images stacked into one saved array.
        save_dir: output folder (created if missing); batch ``k`` is written
            to ``batch_<k>.npy`` inside it.

    Every image goes through `preprocess_image_grayscale`. The number of
    batches that were actually written is printed at the end.
    """
    img_paths = glob.glob(os.path.join(directory_path, '**', '*.jpg'), recursive=True) + \
                glob.glob(os.path.join(directory_path, '**', '*.png'), recursive=True)

    os.makedirs(save_dir, exist_ok=True)

    # Count what is written instead of deriving it as `len // batch_size + 1`,
    # which is one too many when the image count divides evenly (or is zero).
    saved_batches = 0
    for i in range(0, len(img_paths), batch_size):
        batch_paths = img_paths[i:i + batch_size]
        batch = np.array([preprocess_image_grayscale(p) for p in batch_paths])
        batch_index = i // batch_size
        np.save(os.path.join(save_dir, f'batch_{batch_index}.npy'), batch)
        saved_batches += 1

    print(f"Processed and saved {saved_batches} grayscale batches.")

In [None]:
def load_preprocessed_batch_grayscale(batch_index, save_dir='preprocessed_batches_grayscale'):
    """Read the saved grayscale array ``batch_<batch_index>.npy`` from `save_dir`."""
    file_name = f'batch_{batch_index}.npy'
    return np.load(os.path.join(save_dir, file_name))

def make_predictions_grayscale(num_batches, model, save_dir='preprocessed_batches_grayscale'):
    """Predict a class index for every image stored in the grayscale batches.

    Args:
        num_batches: number of consecutive batch files to read, starting with
            ``batch_0.npy``.
        model: object whose ``predict(batch)`` returns one score row per image.
        save_dir: folder holding the ``batch_<k>.npy`` files.

    Returns:
        1-D integer array with the arg-max class index of each image, in batch
        order; an empty array when `num_batches` is 0 (previously this case
        failed inside ``np.argmax``).
    """
    batch_scores = []
    for batch_index in range(num_batches):
        # Same file layout as load_preprocessed_batch_grayscale.
        batch = np.load(os.path.join(save_dir, f'batch_{batch_index}.npy'))
        batch_scores.append(np.asarray(model.predict(batch)))

    if not batch_scores:
        return np.array([], dtype=np.intp)

    return np.argmax(np.concatenate(batch_scores, axis=0), axis=1)

In [None]:
def clip_labels(labels, num_classes):
    """Clamp labels into ``[0, num_classes - 1]`` and return them as an array.

    Same definition as the one in the Problem 6 cell; repeated so this cell
    can run without it.
    """
    lowest_label, highest_label = 0, num_classes - 1
    return np.clip(labels, lowest_label, highest_label)

def plot_confusion_matrix(true_labels, predicted_labels, num_classes):
    """Draw a confusion matrix whose ticks carry the species names.

    Labels are clipped into ``[0, num_classes - 1]`` first and their ranges are
    printed. Tick labels are taken from the notebook-level `class_names` list.
    Rows are true labels, columns are predicted labels, and every cell is
    annotated with its count.
    """
    true_labels = clip_labels(true_labels, num_classes)
    predicted_labels = clip_labels(predicted_labels, num_classes)

    print(f"True labels range: {true_labels.min()} to {true_labels.max()}")
    print(f"Predicted labels range: {predicted_labels.min()} to {predicted_labels.max()}")

    cm_np = tf.math.confusion_matrix(true_labels, predicted_labels, num_classes=num_classes).numpy()

    plt.figure(figsize=(10, 8))
    plt.imshow(cm_np, cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.colorbar()

    class_positions = np.arange(num_classes)
    plt.xticks(class_positions, class_names, rotation=45)
    plt.yticks(class_positions, class_names)

    # White text on the darker half of the colour scale, black otherwise.
    for row, col in np.ndindex(num_classes, num_classes):
        count = cm_np[row, col]
        text_color = 'white' if count > cm_np.max() / 2 else 'black'
        plt.text(col, row, int(count), ha='center', va='center', color=text_color)

    plt.show()

def get_true_labels(directory_path, class_to_label):
    """Map every ``*.jpg`` / ``*.png`` below `directory_path` to its class label.

    The containing folder's name is looked up in `class_to_label`; images in
    folders without an entry are reported with a warning and left out. The
    jpg files come first in the result, followed by the png files.
    """
    jpg_paths = glob.glob(os.path.join(directory_path, '**', '*.jpg'), recursive=True)
    png_paths = glob.glob(os.path.join(directory_path, '**', '*.png'), recursive=True)

    true_labels = []
    for img_path in jpg_paths + png_paths:
        class_name = os.path.basename(os.path.dirname(img_path))
        label = class_to_label.get(class_name)
        if label is None:
            print(f"Warning: Class name '{class_name}' not found in class_to_label mapping.")
        else:
            true_labels.append(label)
    return true_labels

# Directory path and class names
directory_path = 'data'
class_names = sorted([d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))])
class_to_label = {class_name: idx for idx, class_name in enumerate(class_names)}

# Get true labels
true_labels = get_true_labels(directory_path, class_to_label)

# Preprocess and predict for RGB images
preprocess_images_in_batches(directory_path, batch_size=4, save_dir='preprocessed_batches')
# Count only the saved batch files; os.listdir would also count stray entries
# (checkpoint folders, OS metadata files) as batches.
num_batches_rgb = len(glob.glob(os.path.join('preprocessed_batches', 'batch_*.npy')))
predicted_labels_rgb = make_predictions(num_batches_rgb, model, save_dir='preprocessed_batches')
assert len(predicted_labels_rgb) == len(true_labels), (
    f"{len(true_labels)} true labels vs {len(predicted_labels_rgb)} RGB predictions"
)
print("Evaluating RGB predictions...")
plot_confusion_matrix(true_labels, predicted_labels_rgb, num_classes=len(class_names))

# Preprocess and predict for grayscale images
preprocess_images_in_batches_grayscale(directory_path, batch_size=4, save_dir='preprocessed_batches_grayscale')
num_batches_gray = len(glob.glob(os.path.join('preprocessed_batches_grayscale', 'batch_*.npy')))
predicted_labels_gray = make_predictions_grayscale(num_batches_gray, model, save_dir='preprocessed_batches_grayscale')
assert len(predicted_labels_gray) == len(true_labels), (
    f"{len(true_labels)} true labels vs {len(predicted_labels_gray)} grayscale predictions"
)
print("Evaluating Grayscale predictions...")
plot_confusion_matrix(true_labels, predicted_labels_gray, num_classes=len(class_names))

### Problem 8. Deep image features (1 point)
Find a way to extract one-dimensional vectors (features) for each (non-grayscale) image, using your model. This is typically done by "short-circuiting" the model output to be an intermediate layer, while keeping the input the same. 

In case the outputs (also called feature maps) have different shapes, you can flatten them in different ways. Try to not create huge vectors; the goal is to have a relatively short sequence of numbers which describes each image.

You may find a tutorial like [this](https://towardsdatascience.com/exploring-feature-extraction-with-cnns-345125cefc9a) pretty useful but note your implementation will depend on what model (and framework) you've decided to use.

It's a good idea to save these as one or more files, so you'll spare yourself a ton of preprocessing.

Let's work on modifying the model first.

In [None]:
# Select the pooling layer by type rather than by its auto-generated name: a
# name such as 'global_average_pooling2d_4' carries a counter that depends on
# how many such layers were created earlier in the session, so it does not
# exist after a kernel restart.
pooling_layer = next(
    layer for layer in reversed(model.layers)
    if isinstance(layer, GlobalAveragePooling2D)
)

# Same input as the classifier, but the output is the pooled feature vector.
feature_extractor = Model(inputs=model.input, outputs=pooling_layer.output)

Next we are going to extract the features and save them.

In [None]:
def extract_features(img_batch, feature_extractor):
    """Run `img_batch` through `feature_extractor.predict` and return the result."""
    return feature_extractor.predict(img_batch)

def get_image_paths(directory_path):
    """List every ``*.jpg`` and ``*.png`` below `directory_path` (recursive).

    All jpg paths come first, followed by all png paths.
    """
    jpg_pattern = os.path.join(directory_path, '**', '*.jpg')
    png_pattern = os.path.join(directory_path, '**', '*.png')
    return glob.glob(jpg_pattern, recursive=True) + glob.glob(png_pattern, recursive=True)

def save_features(directory_path, feature_extractor, batch_size=4, save_dir='extracted_features'):
    """Extract features for every image and store them next to the path list.

    Args:
        directory_path: dataset root handed to `get_image_paths`.
        feature_extractor: model passed on to `extract_features`.
        batch_size: number of images preprocessed and predicted at once.
        save_dir: output folder (created if missing).

    Writes ``extracted_features.npy`` (one row per image) and ``img_paths.txt``
    (one path per line, same order as the rows) into `save_dir`. When no image
    is found, a message is printed and nothing is written; previously this
    case crashed in ``np.vstack``.
    """
    img_paths = get_image_paths(directory_path)
    os.makedirs(save_dir, exist_ok=True)

    if not img_paths:
        print(f"No images found under '{directory_path}'; nothing to extract.")
        return

    all_features = []
    for i in range(0, len(img_paths), batch_size):
        batch_paths = img_paths[i:i + batch_size]
        # Expects the preprocess_image variant that returns one un-batched
        # array per image, so np.array stacks them into a batch.
        batch = np.array([preprocess_image(p) for p in batch_paths])
        features = extract_features(batch, feature_extractor)
        all_features.append(features)

    all_features = np.vstack(all_features)
    np.save(os.path.join(save_dir, 'extracted_features.npy'), all_features)

    # Save image paths for reference; line k belongs to feature row k.
    with open(os.path.join(save_dir, 'img_paths.txt'), 'w') as f:
        for path in img_paths:
            f.write(path + '\n')

    print(f"Extracted and saved features for {len(img_paths)} images.")

In [None]:
# Extract features for the images below 'data', four at a time, and store the
# results in the 'extracted_features' folder.
directory_path = 'data'

save_features(directory_path, feature_extractor, batch_size=4, save_dir='extracted_features')

### Problem 9. Putting deep image features to use (1 point)
Try to find similar images, using a similarity metric on the features you got in the previous problem. Two good metrics are `mean squared error` and `cosine similarity`. How do they work? Can you spot images that look too similar? Can you explain why?

\* If we were to take Fourier features (in a similar manner, these should be a vector of about the same length), how do they compare to the deep features; i.e., which features are better to "catch" similar images?

In [None]:
def mse(x, y):
    """Mean squared error between two arrays: the mean of the squared element-wise differences."""
    difference = x - y
    return np.mean(np.square(difference))

def cosine_sim(x, y):
    """Cosine similarity of two vectors: dot product over the product of norms.

    Returns 0.0 when either vector has zero length. Such a vector has no
    direction, and dividing by its norm would otherwise yield NaN together
    with a runtime warning.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator

def find_similar_images(features, metric='cosine'):
    """Compare every image with every other one and rank the neighbours.

    Args:
        features: array with one feature vector per row.
        metric: 'cosine' (cosine similarity, higher means more similar) or
            'mse' (mean squared error, lower means more similar).

    Returns:
        ``(similarity_matrix, most_similar_pairs)``. Entry ``[i, j]`` of the
        matrix is the metric between images i and j. Row i of
        `most_similar_pairs` lists the image indices from most to least
        similar to image i.

    Raises:
        ValueError: for any other `metric` (this used to return an all-zero
            matrix without complaint).
    """
    if metric not in ('cosine', 'mse'):
        raise ValueError(f"Unknown metric {metric!r}; expected 'cosine' or 'mse'.")

    features = np.asarray(features, dtype=float)
    if features.ndim == 1:
        # Treat a flat array as one scalar feature per image.
        features = features[:, np.newaxis]

    num_images = features.shape[0]

    if metric == 'cosine':
        norms = np.linalg.norm(features, axis=1)
        # A zero vector has no direction: leave it unscaled so that all of its
        # similarities come out as 0 instead of NaN.
        safe_norms = np.where(norms == 0, 1.0, norms)
        unit_vectors = features / safe_norms[:, np.newaxis]
        similarity_matrix = unit_vectors @ unit_vectors.T
        # Negate so that the highest similarity sorts first.
        most_similar_pairs = np.argsort(-similarity_matrix, axis=1, kind='stable')
    else:
        # One row at a time keeps the memory use at O(images x features).
        similarity_matrix = np.zeros((num_images, num_images))
        for i in range(num_images):
            similarity_matrix[i] = np.mean((features - features[i]) ** 2, axis=1)
        most_similar_pairs = np.argsort(similarity_matrix, axis=1, kind='stable')

    return similarity_matrix, most_similar_pairs

def load_features_from_directory(directory_path):
    """Load every ``.npy`` file in `directory_path` and join them along axis 0.

    Files are taken in the order ``os.listdir`` reports them.
    """
    feature_list = []
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.npy'):
            feature_list.append(np.load(os.path.join(directory_path, file_name)))
    return np.concatenate(feature_list, axis=0)

def plot_similarity_matrix(similarity_matrix, title='Similarity Matrix'):
    """Show a pairwise image-by-image matrix as a 'viridis' heatmap with the given title."""
    plt.figure(figsize=(10, 8))
    heatmap_axes = sns.heatmap(similarity_matrix, cmap='viridis')
    heatmap_axes.set_title(title)
    heatmap_axes.set_xlabel('Image Index')
    heatmap_axes.set_ylabel('Image Index')
    plt.show()

def load_img_paths(file_path):
    """Read a text file of image paths, one per line, stripped of surrounding whitespace."""
    img_paths = []
    with open(file_path, 'r') as f:
        for line in f:
            img_paths.append(line.strip())
    return img_paths

def display_similar_images(img_paths, most_similar_pairs, top_n=5, max_images=5):
    """Show query images side by side with their `top_n` nearest neighbours.

    Args:
        img_paths: image file paths; position i belongs to row i of
            `most_similar_pairs`.
        most_similar_pairs: 2-D array of image indices; row i is ordered from
            most to least similar to image i.
        top_n: number of neighbours drawn next to each query image.
        max_images: number of query images to render. The default keeps the
            cell short; pass ``None`` to render one figure for every image,
            which is what made this cell appear to run forever.
    """
    num_images = len(img_paths)
    num_queries = num_images if max_images is None else min(max_images, num_images)

    for i in range(num_queries):
        # Walk the ranking and skip the query itself wherever it sits (it is
        # not guaranteed to be first when several images tie), as well as any
        # index that has no path. Then keep the best `top_n`.
        neighbours = [
            int(idx) for idx in most_similar_pairs[i]
            if idx != i and 0 <= idx < num_images
        ][:top_n]

        fig = plt.figure(figsize=(20, 5))

        # Display the original image
        plt.subplot(1, top_n + 1, 1)
        plt.imshow(plt.imread(img_paths[i]))
        plt.title(f'Original Image {i}')
        plt.axis('off')

        # Display similar images in consecutive slots, without gaps
        for slot, sim_idx in enumerate(neighbours, start=1):
            plt.subplot(1, top_n + 1, slot + 1)
            plt.imshow(plt.imread(img_paths[sim_idx]))
            plt.title(f'Similar {slot} (Idx {sim_idx})')
            plt.axis('off')

        plt.show()
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)

In [None]:
# Load the features and image paths
# NOTE: load_features_from_directory concatenates every .npy file it finds in
# the folder, so the printed row count should equal len(img_paths).
features = load_features_from_directory('extracted_features')
img_paths = load_img_paths('extracted_features/img_paths.txt')
print(features.shape)

In [None]:
def load_features_from_directory(directory_path):
    """Verbose loader: join all ``.npy`` files of a folder along axis 0.

    Prints how many files were found, the shape of each one as it is loaded,
    and the shape of the concatenated result.
    """
    npy_files = []
    for name in os.listdir(directory_path):
        if name.endswith('.npy'):
            npy_files.append(os.path.join(directory_path, name))
    print(f"Found {len(npy_files)} .npy files.")

    feature_list = []
    for file in npy_files:
        data = np.load(file)
        print(f"{file} shape: {data.shape}")
        feature_list.append(data)

    features = np.concatenate(feature_list, axis=0)
    print(f"Concatenated features shape: {features.shape}")
    return features

In [None]:
# Pairwise matrices and neighbour rankings, once per metric
similarity_matrix_cosine, most_similar_pairs_cosine = find_similar_images(features, metric='cosine')
similarity_matrix_mse, most_similar_pairs_mse = find_similar_images(features, metric='mse')

# One heatmap per metric
for matrix, heatmap_title in (
    (similarity_matrix_cosine, 'Cosine Similarity Matrix'),
    (similarity_matrix_mse, 'MSE Similarity Matrix'),
):
    plot_similarity_matrix(matrix, title=heatmap_title)

CAUTION BEFORE EXECUTING THE NEXT CODE CELL!

For some reason I see duplicates on each image and on top of that I cannot seem to make the code work to display only the 5 most similar pairs and break out of the loop, thus the following cell keeps producing figures (one per image in the dataset) and appears to run forever. I tried though.

In [None]:
# Display similar images: each query image next to its neighbours from the
# cosine ranking, with top_n=5 neighbours requested.
display_similar_images(img_paths, most_similar_pairs_cosine, top_n=5)

In [None]:
def mse(x, y):
    """Return the mean of the squared element-wise differences of `x` and `y`."""
    squared_error = np.square(x - y)
    return np.mean(squared_error)

def cosine_sim(x, y):
    """Cosine similarity of two vectors: dot product over the product of norms.

    Returns 0.0 when either vector has zero length. Such a vector has no
    direction, and dividing by its norm would otherwise yield NaN together
    with a runtime warning.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator

def find_similar_images(features, metric='cosine'):
    """Compare every image with every other one and rank the neighbours.

    Args:
        features: array with one feature vector per row.
        metric: 'cosine' (cosine similarity, higher means more similar) or
            'mse' (mean squared error, lower means more similar).

    Returns:
        ``(similarity_matrix, most_similar_pairs)``. Entry ``[i, j]`` of the
        matrix is the metric between images i and j. Row i of
        `most_similar_pairs` lists the image indices from most to least
        similar to image i.

    Raises:
        ValueError: for any other `metric` (this used to return an all-zero
            matrix without complaint).
    """
    if metric not in ('cosine', 'mse'):
        raise ValueError(f"Unknown metric {metric!r}; expected 'cosine' or 'mse'.")

    features = np.asarray(features, dtype=float)
    if features.ndim == 1:
        # Treat a flat array as one scalar feature per image.
        features = features[:, np.newaxis]

    num_images = features.shape[0]

    if metric == 'cosine':
        norms = np.linalg.norm(features, axis=1)
        # A zero vector has no direction: leave it unscaled so that all of its
        # similarities come out as 0 instead of NaN.
        safe_norms = np.where(norms == 0, 1.0, norms)
        unit_vectors = features / safe_norms[:, np.newaxis]
        similarity_matrix = unit_vectors @ unit_vectors.T
        # Negate so that the highest similarity sorts first.
        most_similar_pairs = np.argsort(-similarity_matrix, axis=1, kind='stable')
    else:
        # One row at a time keeps the memory use at O(images x features).
        similarity_matrix = np.zeros((num_images, num_images))
        for i in range(num_images):
            similarity_matrix[i] = np.mean((features - features[i]) ** 2, axis=1)
        most_similar_pairs = np.argsort(similarity_matrix, axis=1, kind='stable')

    return similarity_matrix, most_similar_pairs


def load_features_from_directory(directory_path):
    """Concatenate (axis 0) the arrays of all ``.npy`` files in `directory_path`.

    Files are read in the order ``os.listdir`` returns them.
    """
    file_paths = (
        os.path.join(directory_path, name)
        for name in os.listdir(directory_path)
        if name.endswith('.npy')
    )
    return np.concatenate([np.load(path) for path in file_paths], axis=0)

# Load the precomputed features from the folder holding the .npy files
directory_path = 'extracted_features'
features = load_features_from_directory(directory_path)

# Pairwise matrices and neighbour rankings, once per metric
similarity_matrix_cosine, most_similar_pairs_cosine = find_similar_images(features, metric='cosine')
similarity_matrix_mse, most_similar_pairs_mse = find_similar_images(features, metric='mse')

# Optional: print out the rankings (heading line, then the index matrix)
print("Most similar pairs (Cosine Similarity):", most_similar_pairs_cosine, sep="\n")
print("Most similar pairs (MSE):", most_similar_pairs_mse, sep="\n")

We can use heatmaps to better understand these similarity pairs.

In [None]:
def plot_similarity_matrix(similarity_matrix, title='Similarity Matrix'):
    """Draw `similarity_matrix` as a 'viridis' heatmap; both axes are image indices."""
    plt.figure(figsize=(10, 8))
    heatmap_axes = sns.heatmap(similarity_matrix, cmap='viridis')
    heatmap_axes.set(title=title, xlabel='Image Index', ylabel='Image Index')
    plt.show()

# Plot the cosine matrix first, then the MSE matrix
for matrix, heatmap_title in (
    (similarity_matrix_cosine, 'Cosine Similarity Matrix'),
    (similarity_matrix_mse, 'MSE Similarity Matrix'),
):
    plot_similarity_matrix(matrix, title=heatmap_title)

Or even display the most similar images in a grid.

In [None]:
def load_img_paths(file_path):
    """Return the lines of `file_path` as a list, each stripped of surrounding whitespace."""
    with open(file_path, 'r') as f:
        return list(map(str.strip, f))

# Load the features and image paths written by save_features: the feature
# array and the path list live in the same 'extracted_features' folder.
features = np.load('extracted_features/extracted_features.npy')
img_paths = load_img_paths('extracted_features/img_paths.txt')

# Report how many image paths were read.
print(f"Loaded {len(img_paths)} image paths and features.")


In [None]:
def display_similar_images(img_paths, most_similar_pairs, top_n=5, max_images=5):
    """Show query images side by side with their `top_n` nearest neighbours.

    Args:
        img_paths: image file paths; position i belongs to row i of
            `most_similar_pairs`.
        most_similar_pairs: 2-D array of image indices; row i is ordered from
            most to least similar to image i.
        top_n: number of neighbours drawn next to each query image.
        max_images: number of query images to render. The previous
            `break_counter` only advanced on out-of-range indices, which never
            occur, so every image got a figure. Pass ``None`` to get that
            behaviour on purpose.

    Images that cannot be read are reported and skipped.
    """
    num_images = len(img_paths)
    num_queries = num_images if max_images is None else min(max_images, num_images)

    for i in range(num_queries):
        # Display the original image; load it before opening a figure so a
        # broken file does not leave an empty figure behind.
        try:
            img = plt.imread(img_paths[i])
        except Exception as e:
            print(f"Error loading image {img_paths[i]}: {e}")
            continue

        fig = plt.figure(figsize=(20, 5))
        plt.subplot(1, top_n + 1, 1)
        plt.imshow(img)
        plt.title(f'Original Image {i}')
        plt.axis('off')

        # Walk the ranking and skip the query itself wherever it sits (it is
        # not guaranteed to be first when several images tie), as well as any
        # index that has no path. Then keep the best `top_n`.
        neighbours = [
            int(idx) for idx in most_similar_pairs[i]
            if idx != i and 0 <= idx < num_images
        ][:top_n]

        # Display similar images
        for slot, sim_idx in enumerate(neighbours, start=1):
            try:
                sim_img = plt.imread(img_paths[sim_idx])
            except Exception as e:
                print(f"Error loading similar image {img_paths[sim_idx]}: {e}")
                continue
            plt.subplot(1, top_n + 1, slot + 1)
            plt.imshow(sim_img)
            plt.title(f'Similar {slot} (Idx {sim_idx})')
            plt.axis('off')

        plt.show()
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)

# Show query images next to their neighbours from the cosine ranking, with
# top_n=5 neighbours requested per image.
display_similar_images(img_paths, most_similar_pairs_cosine, top_n=5)

### * Problem 10. Explore, predict, and evaluate further
You can do a ton of things here, at your desire. For example, how does masking different areas of the image affect classification - a method known as **saliency map** ([info](https://en.wikipedia.org/wiki/Saliency_map))? Can we detect objects? Can we significantly reduce the number of features (keeping the quality) that we get? Can we reliably train a model to predict our own classes? We'll look into these in detail in the future.