In [1]:
import pandas as pd
import numpy as np
import os
import sys
import h5py
import gc

from PIL import Image
from dotenv import load_dotenv

# Add the parent directory to the module search path so that modules located
# one level above this notebook can be imported.
sys.path.append("../")

# Load environment variables from a .env file, if one is present
# (the DPI setting read below is expected to come from there).
load_dotenv()

# Default DPI: read from the DPI environment variable (e.g. provided through
# the .env file) when it holds a valid integer, otherwise fall back to 100.

try:
    pc_dpi = int(os.getenv('DPI'))
except (TypeError, ValueError):
    # TypeError  -> DPI is not set, so os.getenv returned None.
    # ValueError -> DPI is set but is not an integer literal (e.g. "" or "96.0").
    pc_dpi = 100


# Loading and converting images to H5 files for use in tensorflow
- Loading the dataframe that references the images
- Removing the categories that have too few images
- Organising the image paths into one list per category
- Loading the images into a dictionary keyed by category, converting them and exporting to H5

In [2]:
# Load the dataframe that references the image files.
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
df_images = pd.read_pickle(filepath_or_buffer="../data/dataset_images_paths.pkl")


In [None]:
def get_image_count(df):
    """Count the images available for each ``mtype``.

    The ``images`` column is expanded with ``DataFrame.explode`` so that every
    element of a row's list gets its own row, then the occurrences of each
    ``mtype`` value are tallied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``mtype`` column and a list-like ``images`` column.

    Returns
    -------
    pandas.Series
        Number of exploded rows per ``mtype``, as returned by ``value_counts``.
    """
    return df.explode("images")["mtype"].value_counts()


# Image count per mtype, before the category filter below is applied
get_image_count(df=df_images)


In [4]:
# Categories (mtype values) retained for the rest of the notebook
category_keep = [
    "H6",
    "L6",
    "H5",
    "LL5",
    "H5-6",
    "LL6",
    "L5",
]

# Keep only the rows whose mtype is one of the retained categories
df_images = df_images.loc[df_images["mtype"].isin(category_keep)]


In [None]:
# Image count per mtype, restricted to the categories kept above
get_image_count(df=df_images)


In [6]:
# Image name lists, one per category.
# The relative path of an image is ../data/processed_images/{image_name}
# (the extension is part of the name).

df_exploded = df_images.explode("images")


def _image_names_for(mtype):
    """Return the image names recorded for ``mtype`` in ``df_exploded``.

    ``DataFrame.explode`` yields NaN for rows whose ``images`` list is empty;
    those entries are dropped because they are not file names and cannot be
    concatenated with a path prefix.
    """
    names = df_exploded.loc[df_exploded["mtype"] == mtype, "images"]
    return names.dropna().tolist()


h6_list = _image_names_for("H6")
l6_list = _image_names_for("L6")
h5_list = _image_names_for("H5")
ll5_list = _image_names_for("LL5")
h56_list = _image_names_for("H5-6")
ll6_list = _image_names_for("LL6")
l5_list = _image_names_for("L5")


In [7]:
prefix = "../data/processed_images/"


def _with_prefix(image_names):
    """Return ``image_names`` with ``prefix`` prepended to each entry.

    Entries that already start with ``prefix`` are returned unchanged, so
    running this cell more than once does not produce doubled prefixes
    (which would make every later file lookup fail).
    """
    return [
        name if name.startswith(prefix) else prefix + name
        for name in image_names
    ]


h6_list = _with_prefix(h6_list)
l6_list = _with_prefix(l6_list)
h5_list = _with_prefix(h5_list)
ll5_list = _with_prefix(ll5_list)
h56_list = _with_prefix(h56_list)
ll6_list = _with_prefix(ll6_list)
l5_list = _with_prefix(l5_list)


In [8]:
# Mapping from each mtype to the list of image paths belonging to it
# (insertion order follows the pairs below)
image_dict = dict(
    [
        ("H6", h6_list),
        ("L6", l6_list),
        ("H5", h5_list),
        ("LL5", ll5_list),
        ("H5-6", h56_list),
        ("LL6", ll6_list),
        ("L5", l5_list),
    ]
)


In [9]:
def load_and_process_image(image_path, target_size=None):
    """Load an image file and return its pixel data scaled by 1/255.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file, passed to ``PIL.Image.open``.
    target_size : tuple of (int, int), optional
        ``(width, height)`` handed unchanged to ``PIL.Image.Image.resize``.
        When ``None`` (the default) the image keeps its original dimensions.

    Returns
    -------
    numpy.ndarray
        Floating-point array holding the pixel values divided by 255.0
        (i.e. in the range [0, 1] for 8-bit images).

    Raises
    ------
    FileNotFoundError
        If ``image_path`` does not exist (raised by ``PIL.Image.open``).
    """
    # The context manager guarantees the underlying file is closed even if
    # decoding or resizing fails.
    with Image.open(image_path) as img:
        resized = img if target_size is None else img.resize(target_size)
        # Converting to an array forces the pixel data to be read while the
        # file is still open.
        img_array = np.array(resized)

    # No per-image gc.collect(): the PIL objects are released by reference
    # counting when this function returns, and a full collection for every
    # file only slows the loading loop down.
    return img_array / 255.0  # Normalize pixel values to range [0, 1]


In [None]:
# This is a very memory heavy process: the decoded images of every category
# are all kept in memory at the same time.
#
# NOTE(review): load_and_process_image must accept the `target_size` keyword.
# If it does not, every call raises TypeError, which the generic handler below
# only prints, leaving every array in processed_image_dict empty.

processed_image_dict = {}

for mtype, image_paths in image_dict.items():
    processed_images = []
    missing_count = 0  # files referenced by the dataframe but absent on disk

    for image_path in image_paths:
        try:
            processed_image = load_and_process_image(image_path, target_size=(400, 400))
        except FileNotFoundError:
            # Best effort: a missing file is skipped, but counted so that the
            # gap is visible instead of silently shrinking the dataset.
            missing_count += 1
        except Exception as e:
            print(f"Error loading {image_path}: {e}")
        else:
            processed_images.append(processed_image)

    if missing_count:
        print(f"{mtype}: skipped {missing_count} missing file(s) out of {len(image_paths)}")

    processed_image_dict[mtype] = np.array(processed_images)
