In [23]:
import os
import glob
import numpy as np
import pandas as pd
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image

def get_image_embeddings(folder_list, 
                         model=None,
                         exts=('jpg','jpeg','png','bmp','tiff')):
    """
    Given a list of folder paths, returns a DataFrame of image embeddings with class labels.
    
    Parameters
    ----------
    folder_list : list of str
        Paths to directories to scan for images. The folder name itself will be used as the 'class'.
    model : keras.Model, optional
        Preloaded backbone. If None, a ResNet50(include_top=False, pooling='avg') is created.
    exts : tuple of str
        File extensions to include (without dot).
    
    Returns
    -------
    pd.DataFrame
        Columns:
          - 'filepath': full path to image
          - 'class': name of the folder the image came from
          - 'embedding': numpy.ndarray of shape (2048,)
    """
    # 1) Load model if not provided
    if model is None:
        model = ResNet50(weights='imagenet',
                         include_top=False,
                         pooling='avg')
    
    records = []
    
    # 2) For each folder, collect image paths and tag with its class
    for folder in folder_list:
        class_name = os.path.basename(os.path.normpath(folder))
        # build glob patterns
        for ext in exts:
            pattern = os.path.join(folder, f'**/*.{ext}')
            for fp in glob.glob(pattern, recursive=True):
                try:
                    # load & preprocess
                    img = image.load_img(fp)  # original size
                    x   = image.img_to_array(img)
                    x   = np.expand_dims(x, axis=0)
                    x   = preprocess_input(x)
                    # get embedding
                    emb = model.predict(x, verbose=0).reshape(-1)  # (2048,)
                    # record
                    records.append({
                        'filepath': fp,
                        'class': class_name,
                        'embedding': emb
                    })
                except Exception as e:
                    print(f"Warning: could not process {fp}: {e}")
    
    # 3) Pack into DataFrame
    df = pd.DataFrame(records, columns=['filepath','class','embedding'])
    return df



folders = ['data/Amylax_triacantha',
          'data/Aphanothece_paralleliformis',
           'data/Ciliata',
           'data/Cryptomonadales',
           'data/Katablepharis_remigera'
          ]

df = get_image_embeddings(folders)
print(df.head())
print(df.shape)
    # access the embedding of the first image:
    # emb0 = df.loc[0, 'embedding']
    
    


                                          filepath              class  \
0   data/Amylax_triacantha/Amylax_triacantha_6.png  Amylax_triacantha   
1   data/Amylax_triacantha/Amylax_triacantha_8.png  Amylax_triacantha   
2  data/Amylax_triacantha/Amylax_triacantha_12.png  Amylax_triacantha   
3  data/Amylax_triacantha/Amylax_triacantha_11.png  Amylax_triacantha   
4  data/Amylax_triacantha/Amylax_triacantha_10.png  Amylax_triacantha   

                                           embedding  
0  [0.13757417, 0.5561774, 0.0, 0.85218066, 0.058...  
1  [1.0831827, 0.43321905, 0.49506465, 1.6894163,...  
2  [0.54868233, 0.7074472, 0.0, 0.24827078, 0.036...  
3  [0.19746064, 0.6422223, 0.008550907, 1.3343827...  
4  [0.34589344, 1.245259, 0.0, 0.13843867, 0.0, 3...  
(289, 3)


## Task
Train a Random Forest classifier on the embeddings and evaluate it using stratified 80/20 split of the data. Plot a confusion matrix, compute precision, recall and F1 score per class and plot all of the misclassified images.