In [37]:
import os
from PIL import Image
import numpy as np
from collections import defaultdict
import imagehash
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from tensorflow.keras.preprocessing import image
from sklearn.metrics import confusion_matrix, classification_report

# Working with Images Lab
## Information retrieval, preprocessing, and feature extraction

In this lab, you'll work with images of felines (cats), which have been classified according to their taxonomy. Each subfolder contains images of a particular species. The dataset is located [here](https://www.kaggle.com/datasets/datahmifitb/felis-taxonomy-image-classification) but it's also provided to you in the `data/` folder.

### Problem 1. Some exploration (1 point)
How many types of cats are there? How many images do we have of each? What is a typical image size? Are there any outliers in size?

In [18]:
dataset_path = 'data/'

cat_types = [
    'african-wildcat', 'blackfoot-cat', 'chinese-mountain-cat', 
    'domestic-cat', 'european-wildcat', 'jungle-cat', 'sand-cat'
]

# Count the number of cat types
num_cat_types = len(cat_types)
print(f"Number of cat types: {num_cat_types}")

Number of cat types: 7


In [19]:
# Count the number of images for each cat type
image_counts = defaultdict(int)
for cat_type in cat_types:
    cat_folder = os.path.join(dataset_path, cat_type)
    if os.path.isdir(cat_folder):
        num_images = len([f for f in os.listdir(cat_folder) if os.path.isfile(os.path.join(cat_folder, f))])
        image_counts[cat_type] = num_images

print("\nNumber of images for each cat type:")
for cat_type, count in image_counts.items():
    print(f"{cat_type}: {count} images")


Number of images for each cat type:
african-wildcat: 91 images
blackfoot-cat: 79 images
chinese-mountain-cat: 42 images
domestic-cat: 64 images
european-wildcat: 85 images
jungle-cat: 86 images
sand-cat: 72 images


In [20]:
# Determine typical image size and check for outliers
image_sizes = []
for cat_type in cat_types:
    cat_folder = os.path.join(dataset_path, cat_type)
    for img_file in os.listdir(cat_folder):
        img_path = os.path.join(cat_folder, img_file)
        if os.path.isfile(img_path):
            try:
                with Image.open(img_path) as img:
                    image_sizes.append(img.size)
            except:
                print(f"Skipping file {img_path}, unable to open as an image")

In [21]:
image_sizes = np.array(image_sizes)

if len(image_sizes) > 0:
    mean_size = np.mean(image_sizes, axis=0)
    std_size = np.std(image_sizes, axis=0)

    print(f"\nTypical image size (mean): {mean_size}")
    print(f"Standard deviation of image size: {std_size}")

    size_threshold = mean_size + 2 * std_size
    outliers = image_sizes[np.any(image_sizes > size_threshold, axis=1)]

    print(f"Number of outliers: {len(outliers)}")
else:
    print("No images were processed, so size analysis cannot be performed.")


Typical image size (mean): [406.55298651 310.94990366]
Standard deviation of image size: [438.26754371 323.00468777]
Number of outliers: 30


### Problem 2. Duplicat(e)s (1 point)
Find a way to filter out (remove) identical images. I would recommnend using file hashes, but there are many approaches. Keep in mind that during file saving, recompression, etc., a lot of artifacts can change the file content (bytes), but not visually.

In [25]:
dataset_path = 'data/'

cat_types = [
    'african-wildcat', 'blackfoot-cat', 'chinese-mountain-cat', 
    'domestic-cat', 'european-wildcat', 'jungle-cat', 'sand-cat'
]

image_hashes = {}
duplicates = defaultdict(list)

# Function to compute the perceptual hash of an image
def compute_hash(image_path):
    try:
        with Image.open(image_path) as img:
            hash_value = imagehash.phash(img)
        return hash_value
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

In [26]:
# Process each image
for cat_type in cat_types:
    cat_folder = os.path.join(dataset_path, cat_type)
    if os.path.isdir(cat_folder):
        for img_file in os.listdir(cat_folder):
            img_path = os.path.join(cat_folder, img_file)
            if os.path.isfile(img_path):
                img_hash = compute_hash(img_path)
                if img_hash:
                    if img_hash in image_hashes:
                        duplicates[img_hash].append(img_path)
                    else:
                        image_hashes[img_hash] = img_path

In [27]:
print("Duplicate images:")
for img_hash, paths in duplicates.items():
    print(f"Hash: {img_hash}")
    for path in paths:
        print(f"  {path}")

for paths in duplicates.values():
    for path in paths[1:]:  # Keep the first occurrence and remove the rest
        os.remove(path)
        print(f"Removed: {path}")

print("Duplicate removal completed.")

Duplicate images:
Hash: 9133eeece0763318
  data/african-wildcat\af (32).jpg
Hash: cba4349b0cca75ab
  data/african-wildcat\af (37).jpg
Hash: a9d4960be960cb9e
  data/african-wildcat\af (61).jpg
Hash: 86421f393cc1c1ff
  data/african-wildcat\af (74).jpg
Hash: e3383b6bbc45906a
  data/blackfoot-cat\bc (63).jpg
Hash: d425191aa621ff75
  data/chinese-mountain-cat\ch (20).jpg
Hash: 89f469d57611be21
  data/chinese-mountain-cat\ch (32).jpg
Hash: 847713396d36934b
  data/chinese-mountain-cat\ch (39).jpg
Hash: c73f709bd22431d8
  data/chinese-mountain-cat\ch (42).jpg
Hash: afe0d882c0ba37b5
  data/chinese-mountain-cat\ch (9).jpg
Hash: aeec8684df769089
  data/domestic-cat\dc (27).jpg
  data/domestic-cat\dc (36).jpg
Hash: d4d482453fd067bc
  data/domestic-cat\dc (42).jpg
Hash: ea2e9591382c73c7
  data/domestic-cat\dc (5).jpg
Hash: e2b5984aadd69368
  data/domestic-cat\dc (52).jpg
Hash: e2959a4aad569369
  data/european-wildcat\eu (11).jpg
Hash: f9a007d79a68cd31
  data/european-wildcat\eu (3).jpg
Hash: 991b73

### Problem 3. Loading a model (2 points)
Find a suitable, trained convolutional neural network classifier. I recommend `ResNet50` as it's small enough to run well on any machine and powerful enough to make reasonable predictions. Most ready-made classifiers have been trained for 1000 classes.

You'll need to install libraries and possibly tinker with configurations for this task. When you're done, display the total number of layers and the total number of parameters. For ResNet50, you should expect around 50 layers and 25M parameters.

In [29]:
model = ResNet50(weights='imagenet')

num_layers = len(model.layers)
print(f"Total number of layers: {num_layers}")

num_params = model.count_params()
print(f"Total number of parameters: {num_params}")

model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5
[1m102967424/102967424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 0us/step
Total number of layers: 177
Total number of parameters: 25636712


### Problem 4. Prepare the images (1 point)
You'll need to prepare the images for passing to the model. To do so, they have to be resized to the same dimensions. Most available models have a specific requirement for sizes. You may need to do additional preprocessing, depending on the model requirements. These requirements should be easily available in the model documentation.

In [32]:
def load_and_preprocess_image(img_path, target_size=(224, 224)):
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    return img_array

data_folder = 'data/'

subfolders = ['african-wildcat', 'blackfoot-cat', 'chinese-mountain-cat', 
              'domestic-cat', 'european-wildcat', 'jungle-cat', 'sand-cat']

processed_images = []

for folder in subfolders:
    folder_path = os.path.join(data_folder, folder)
    for fname in os.listdir(folder_path):
        if fname.endswith('.jpg') or fname.endswith('.png'):
            img_path = os.path.join(folder_path, fname)
            processed_images.append(load_and_preprocess_image(img_path))

images_array = np.vstack(processed_images)

print("Shape of combined image array:", images_array.shape)

Shape of combined image array: (512, 224, 224, 3)


### Problem 5. Load the images efficiently (1 point)
Now that you've seen how to prepare the images for passing to the model... find a way to do it efficiently. Instead of loading the entire dataset in the RAM, read the images in batches (e.g. 4 images at a time). The goal is to read these, preprocess them, maybe save the preprocessed results in RAM.

If you've already done this in one of the previous problems, just skip this one. You'll get your point for it.

\* Even better, save the preprocessed image arrays (they will not be valid .jpg file) as separate files, so you can load them "lazily" in the following steps. This is a very common optimization to work with large datasets.

In [41]:
def load_image(img_path, target_size=(224, 224)):
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, target_size)
    img = img / 255.0
    return img

def preprocess_and_save_image(img_path, target_size=(224, 224), save_dir='preprocessed_images'):
    img = load_and_preprocess_image(img_path, target_size)
    img = tf.image.convert_image_dtype(img, dtype=tf.uint8)
    img_name = os.path.basename(img_path).replace('.jpg', '.npy')
    save_path = os.path.join(save_dir, img_name)
    os.makedirs(save_dir, exist_ok=True)
    np.save(save_path, img.numpy())

def process_images_in_batches(image_paths, batch_size=4, target_size=(224, 224), save_dir='preprocessed_images'):
    for i in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[i:i + batch_size]
        for path in batch_paths:
            preprocess_and_save_image(path, target_size, save_dir)
        print(f'Processed and saved batch {i // batch_size + 1}/{(len(image_paths) + batch_size - 1) // batch_size}')

data_folder = 'data/'

subfolders = ['african-wildcat', 'blackfoot-cat', 'chinese-mountain-cat', 
              'domestic-cat', 'european-wildcat', 'jungle-cat', 'sand-cat']

image_paths = []
for folder in subfolders:
    folder_path = os.path.join(data_folder, folder)
    for fname in os.listdir(folder_path):
        if fname.endswith('.jpg') or fname.endswith('.png'):
            image_paths.append(os.path.join(folder_path, fname))

process_images_in_batches(image_paths, batch_size=4)

Processed and saved batch 1/128
Processed and saved batch 2/128
Processed and saved batch 3/128
Processed and saved batch 4/128
Processed and saved batch 5/128
Processed and saved batch 6/128
Processed and saved batch 7/128
Processed and saved batch 8/128
Processed and saved batch 9/128
Processed and saved batch 10/128
Processed and saved batch 11/128
Processed and saved batch 12/128
Processed and saved batch 13/128
Processed and saved batch 14/128
Processed and saved batch 15/128
Processed and saved batch 16/128
Processed and saved batch 17/128
Processed and saved batch 18/128
Processed and saved batch 19/128
Processed and saved batch 20/128
Processed and saved batch 21/128
Processed and saved batch 22/128
Processed and saved batch 23/128
Processed and saved batch 24/128
Processed and saved batch 25/128
Processed and saved batch 26/128
Processed and saved batch 27/128
Processed and saved batch 28/128
Processed and saved batch 29/128
Processed and saved batch 30/128
Processed and saved

### Problem 6. Predictions (1 point)
Finally, you're ready to get into the meat of the problem. Obtain predictions from your model and evaluate them. This will likely involve manual work to decide how the returned classes relate to the original ones.

Create a [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix) to evaluate the classification.

In [42]:
def load_preprocessed_images(image_dir):
    image_files = [f for f in os.listdir(image_dir) if f.endswith('.npy')]
    images = [np.load(os.path.join(image_dir, f)) for f in image_files]
    return np.array(images), image_files

preprocessed_dir = 'preprocessed_images'
images, image_files = load_preprocessed_images(preprocessed_dir)

model = ResNet50(weights='imagenet')

model.summary()

### Problem 7. Grayscale (1 point)
Converting the images to grayscale should affect the classification negatively, as we lose some of the color information.

Find a way to preprocess the images to grayscale (using what you already have in Problem 4 and 5), pass them to the model, and compare the classification results to the previous ones.

### Problem 8. Deep image features (1 point)
Find a way to extract one-dimensional vectors (features) for each (non-grayscale) image, using your model. This is typically done by "short-circuiting" the model output to be an intermediate layer, while keeping the input the same. 

In case the outputs (also called feature maps) have different shapes, you can flatten them in different ways. Try to not create huge vectors; the goal is to have a relatively short sequence of numbers which describes each image.

You may find a tutorial like [this](https://towardsdatascience.com/exploring-feature-extraction-with-cnns-345125cefc9a) pretty useful but note your implementation will depend on what model (and framework) you've decided to use.

It's a good idea to save these as one or more files, so you'll spare yourself a ton of preprocessing.

### Problem 9. Putting deep image features to use (1 points)
Try to find similar images, using a similarity metric on the features you got in the previous problem. Two good metrics are `mean squared error` and `cosine similarity`. How do they work? Can you spot images that look too similar? Can you explain why?

\* If we were to take Fourier features (in a similar manner, these should be a vector of about the same length), how do they compare to the deep features; i.e., which features are better to "catch" similar images?

### * Problem 10. Explore, predict, and evaluate further
You can do a ton of things here, at your desire. For example, how does masking different areas of the image affect classification - a method known as **saliency map** ([info](https://en.wikipedia.org/wiki/Saliency_map))? Can we detect objects? Can we significantly reduce the number of features (keeping the quality) that we get? Can we reliably train a model to predict our own classes? We'll look into these in detail in the future.