In [1]:
import pandas as pd
import numpy as np
import os
import sys
import h5py
import cv2

from sklearn.model_selection import train_test_split
from dotenv import load_dotenv

sys.path.append("../")

load_dotenv()

# Default DPI: read it from the DPI environment variable (which load_dotenv()
# may have populated) and fall back to DEFAULT_DPI when the variable is
# missing, empty, or not a valid integer.
DEFAULT_DPI = 100

try:
    # The default passed to getenv makes int() always receive a value, so a
    # missing variable no longer needs a separate TypeError / None branch.
    pc_dpi = int(os.getenv("DPI", DEFAULT_DPI))
except ValueError:
    # e.g. DPI="" or DPI="high" in the environment / .env file.
    pc_dpi = DEFAULT_DPI


# Loading and converting images to H5 files for use in tensorflow
- Loading the dataframe that references the images
- Removing the categories that have too few images
- Organising the images by category in a list
- Loading the images into a dictionary per category, converting them and exporting to H5

In [2]:
# Load the pickled DataFrame that references the images.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
images_pickle_path = "../data/dataset_images_paths.pkl"
df_images = pd.read_pickle(images_pickle_path)


In [3]:
def get_image_count(df):
    """Return the number of images per ``mtype`` value.

    The ``images`` column is exploded first, so every element of a list-like
    entry becomes its own row; the rows are then counted per ``mtype`` with
    ``value_counts`` (most frequent value first).
    """
    return df.explode("images")["mtype"].value_counts()


# Image count per mtype before any category is removed.
get_image_count(df_images)


mtype
H5       1270
H6       1143
LL5       850
L6        820
H5-6      544
LL6       510
L5        459
L4          4
L5-6        4
L3.8        3
LL5-6       3
CV3         2
LL4         2
Name: count, dtype: int64

In [4]:
# Categories to keep; every other mtype value is filtered out of df_images.
category_keep = ["H6", "L6", "H5", "LL5", "H5-6", "LL6", "L5"]

is_kept_category = df_images["mtype"].isin(category_keep)
df_images = df_images.loc[is_kept_category]


In [5]:
# Image count per mtype once only the kept categories remain.
get_image_count(df_images)


mtype
H5      1270
H6      1143
LL5      850
L6       820
H5-6     544
LL6      510
L5       459
Name: count, dtype: int64

In [6]:
# Image path list: expand list-like entries of the "images" column into one
# row per element.
# The relative path of an image is ../data/processed_images/{image_name}
# (the extension is part of the name).

df_exploded = df_images.explode(column="images")


In [7]:
# Folder prefix for the processed images; backslashes are used on Windows
# (os.name == "nt"), forward slashes everywhere else.
prefix = "..\\data\\processed_images\\" if os.name == "nt" else "../data/processed_images/"

# Work on a copy restricted to the two exported columns, then turn each image
# name into a relative path by prepending the folder prefix.
df_export = df_exploded[["mtype", "images"]].copy()

df_export.loc[:, "images"] = df_export["images"].apply(lambda image_name: prefix + image_name)


In [8]:
def filter_existing_paths(df, image_column="images"):
    """
    Filter DataFrame rows where image paths exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one image path per row in ``image_column``.
    image_column : str, default "images"
        Name of the column containing the paths to check.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows whose path exists on disk, re-indexed from 0.
        ``df`` itself is not modified. The number of removed rows is printed.
    """
    def _path_exists(path):
        # Missing values (None / NaN) are not paths: count them as
        # non-existent instead of letting os.path.exists raise TypeError.
        return isinstance(path, (str, bytes, os.PathLike)) and os.path.exists(path)

    # astype(bool) guarantees a boolean mask. On an empty frame the mapped
    # Series is an empty object-dtype Series, which pandas would otherwise
    # interpret as a list of column labels and drop every column.
    exists_mask = df[image_column].map(_path_exists).astype(bool)

    # Only keep rows where the image file exists
    df_filtered = df.loc[exists_mask].copy()

    # Log how many non-existent paths were culled
    culled_count = len(df) - len(df_filtered)
    print(f"Removed {culled_count} non-existent image paths.")

    return df_filtered.reset_index(drop=True)


# Drop the rows whose image file is not on disk, then show the remaining
# image count per mtype.
df_export = filter_existing_paths(df_export)

get_image_count(df_export)


Removed 684 non-existent image paths.


mtype
H6      1016
H5      1016
LL5      800
L6       656
H5-6     512
LL6      480
L5       432
Name: count, dtype: int64

In [9]:
# Fixed seed so the stratified 70/30 split is identical on every
# "Restart Kernel -> Run All"; without it each run writes different train/test
# sets.
SPLIT_SEED = 42

df_train, df_test = train_test_split(
    df_export,
    test_size=0.3,
    stratify=df_export["mtype"],  # keep the mtype proportions in both splits
    random_state=SPLIT_SEED,
)

# Renumber the rows from 0. As before, the former row labels are kept in an
# "index" column (no drop=True); reassigning instead of using inplace=True
# avoids mutating the split results in place.
df_test = df_test.reset_index()
df_train = df_train.reset_index()


In [10]:
height = 400
width = height
# cv2.imread (default flags) returns 3-channel images in BGR order, not RGB.
# NOTE(review): the pixels are stored as read (BGR); confirm whether the
# consumer of the H5 file expects RGB and convert there or here if needed.
channel = 3


def write_split_to_h5(h5_file, split_name, df_split, image_shape):
    """Write the images and labels of one split into an open H5 file.

    Creates two datasets: ``images_{split_name}`` (uint8, one image per row)
    and ``mtype_{split_name}`` (variable-length strings).

    Parameters
    ----------
    h5_file : h5py.File
        File opened in a writable mode.
    split_name : str
        Suffix of the dataset names, e.g. "train" or "test".
    df_split : pandas.DataFrame
        Frame with an "images" column (file paths) and an "mtype" column.
    image_shape : tuple of int
        Expected (height, width, channel) of every image.

    Raises
    ------
    ValueError
        If an image cannot be read or does not have ``image_shape``.
    """
    n_images = len(df_split)
    images = h5_file.create_dataset(
        f"images_{split_name}", (n_images, *image_shape), dtype="uint8"
    )
    categories = h5_file.create_dataset(
        f"mtype_{split_name}", (n_images,), dtype=h5py.special_dtype(vlen=str)
    )

    # Positions come from enumerate, so the write offsets are correct whatever
    # the DataFrame index looks like.
    for position, (path, mtype) in enumerate(zip(df_split["images"], df_split["mtype"])):
        image = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise ValueError(f"Could not read image: {path}")
        if image.shape != tuple(image_shape):
            raise ValueError(
                f"Unexpected shape {image.shape} for {path}, expected {tuple(image_shape)}"
            )
        images[position] = image
        categories[position] = mtype


with h5py.File("../data/mtype_images_tf.h5", "w") as h5_file:
    write_split_to_h5(h5_file, "train", df_train, (height, width, channel))
    write_split_to_h5(h5_file, "test", df_test, (height, width, channel))


In [11]:
# Print the image count per mtype for each split.
for split_label, split_frame in (("Train : ", df_train), ("Test : ", df_test)):
    print(split_label, get_image_count(df=split_frame))


Train :  mtype
H6      711
H5      711
LL5     560
L6      459
H5-6    358
LL6     336
L5      303
Name: count, dtype: int64
Test :  mtype
H6      305
H5      305
LL5     240
L6      197
H5-6    154
LL6     144
L5      129
Name: count, dtype: int64
