In [1]:
import json
import os
import random
import cv2
import numpy as np
# import polars as pl  # didn't use pl because i didn't know how to use it when normalizing
import pandas as pd

In [2]:
# Sizes of the generated features.
GRAYSCALE_SIZE = (8, 8)  # passed to cv2.resize as the thumbnail's target size
DESCRIPTION_WORDS_COUNT = 3  # number of words sampled into each text description

# Seed both random generators used in this notebook so runs are repeatable.
SEED = 462
np.random.seed(SEED)
random.seed(SEED)

In [3]:
# Images are read from data/image/<split>/<class>/<file>; the vocabulary is
# written to data/vocabulary.json.
data_path = "data"
vocabulary_path = os.path.join(data_path, "vocabulary.json")
image_data_path = os.path.join(data_path, "image")

# Splits to process. A "validation" split is not used at the moment; add it
# to this list to process it as well.
dataset_splits = ["train", "test"]

In [4]:
# Switches for the feature groups that extract_features() emits.
FEATURES = dict(image=True, numeric=False, text=False)

In [5]:
def enumerate_dataset(dataset_path):
    """Map each class sub-directory of ``dataset_path`` to its directory listing.

    Args:
        dataset_path: Directory whose immediate sub-directories are the classes.

    Returns:
        dict: ``{class_name: [entry_name, ...]}``. Entries directly under
        ``dataset_path`` that are not directories are ignored. Classes and
        their entries are both sorted by name.
    """
    listing = {}
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting keeps the row order (and therefore the seeded shuffle and any
    # RNG-drawn features generated per image) identical across machines.
    for cls in sorted(os.listdir(dataset_path)):
        class_dir = os.path.join(dataset_path, cls)
        if os.path.isdir(class_dir):
            listing[cls] = sorted(os.listdir(class_dir))
    return listing

In [6]:
# Directory listing of every configured split: {split: {class: [entry names]}}.
datasets = dict(
    (split_name, enumerate_dataset(os.path.join(image_data_path, split_name)))
    for split_name in dataset_splits
)

In [7]:
# Candidate description words per class. Word order inside each list is kept
# as-is because seeded random sampling depends on it.
feature_words = {
    "banana": (
        "tropical long sweet soft peel "
        "fruit fresh yellow curved food "
        "healthy vitamin market tasty snack"
    ).split(),
    "carrot": (
        "temperate long sweet crunchy skin "
        "vegetable fresh orange root food "
        "healthy vitamin market tasty salad"
    ).split(),
    "cucumber": (
        "temperate long bland crunchy seeds "
        "vegetable fresh green water food "
        "healthy vitamin market salad skin"
    ).split(),
    "mandarin": (
        "tropical spherical sweet sour soft peel "
        "fruit fresh orange citrus food "
        "healthy vitamin market tasty snack"
    ).split(),
    "tomato": (
        "warm spherical savory sour soft seeds "
        "vegetable fresh red juice food "
        "healthy vitamin market salad sauce"
    ).split(),
}

In [8]:
# Distinct description words across all classes, alphabetically ordered,
# saved as a JSON list.
vocabulary = sorted({word for words in feature_words.values() for word in words})
with open(vocabulary_path, "w") as vocabulary_file:
    json.dump(vocabulary, vocabulary_file)

In [9]:
# Per-class normal-distribution parameters, each given as (mean, std):
# "weight" is in grams and "size" is in centimetres.
dist_params = {
    "banana": dict(weight=(160, 30), size=(18, 5)),
    "carrot": dict(weight=(75, 20), size=(17, 4)),
    "cucumber": dict(weight=(140, 55), size=(20, 6)),
    "mandarin": dict(weight=(85, 12), size=(6, 1.5)),
    "tomato": dict(weight=(95, 20), size=(6.5, 1.5)),
}

In [10]:
def extract_features(image, image_class):
    """Build one tabular feature row for an image of a known class.

    The module-level ``FEATURES`` switches select which groups are produced:

    * ``"image"``: the image resized to ``GRAYSCALE_SIZE`` and converted to
      grayscale, flattened into one ``gray_NNN`` column per pixel, followed
      by the mean and standard deviation of each colour channel of the
      full-size image.
    * ``"numeric"``: ``weight`` and ``size`` drawn from the class's normal
      distributions in ``dist_params`` (advances NumPy's global RNG).
    * ``"text"``: ``description``, made of ``DESCRIPTION_WORDS_COUNT`` words
      sampled from ``feature_words[image_class]`` (advances ``random``'s
      global RNG).

    Args:
        image: Image array with three channels in OpenCV's BGR order.
        image_class: Class label; also the key into ``dist_params`` and
            ``feature_words``.

    Returns:
        dict: Column name -> value, with ``"class"`` as the last key.

    Raises:
        ValueError: If image features are enabled and ``image`` is ``None``,
            which is what ``cv2.imread`` returns for an unreadable file.
    """
    features = {}

    if FEATURES["image"]:
        if image is None:
            # Fail with a readable message instead of an opaque OpenCV
            # error raised from inside cv2.resize.
            raise ValueError(
                f"cannot extract image features for class {image_class!r}: image is None "
                "(cv2.imread returns None when a file is missing or not a readable image)"
            )

        # Shrink first, then convert, then flatten to a 1-D float vector.
        thumbnail = cv2.resize(image, GRAYSCALE_SIZE, interpolation=cv2.INTER_AREA)
        gray_pixels = cv2.cvtColor(thumbnail, cv2.COLOR_BGR2GRAY).reshape(-1).astype("float32")
        for i, val in enumerate(gray_pixels):
            features[f"gray_{i:03d}"] = float(val)

        # Channel statistics come from the full-size image; index order is B, G, R.
        for channel_index, channel_name in enumerate(("blue", "green", "red")):
            channel = image[:, :, channel_index]
            features[f"{channel_name}_mean"] = float(np.mean(channel))
            features[f"{channel_name}_std"] = float(np.std(channel))

    if FEATURES["numeric"]:
        params = dist_params[image_class]
        # Each entry is a (mean, std) pair unpacked into np.random.normal.
        features["weight"] = float(np.random.normal(*params["weight"]))
        features["size"] = float(np.random.normal(*params["size"]))

    if FEATURES["text"]:
        sampled_words = random.sample(feature_words[image_class], DESCRIPTION_WORDS_COUNT)
        features["description"] = " ".join(sampled_words)

    features["class"] = image_class
    return features

In [None]:
tabular_path = os.path.join(data_path, "tabular")
os.makedirs(tabular_path, exist_ok=True)

# Columns to standardize: the image-derived columns that extract_features()
# emits, so the list is empty when image features are switched off.
# The pixel count is width * height so a non-square GRAYSCALE_SIZE also works.
normalized_cols = []
if FEATURES["image"]:
    gray_pixel_count = GRAYSCALE_SIZE[0] * GRAYSCALE_SIZE[1]
    normalized_cols += [f"gray_{i:03d}" for i in range(gray_pixel_count)]
    normalized_cols += ["blue_mean", "blue_std", "green_mean", "green_std", "red_mean", "red_std"]

# Step 1: write the raw (un-normalized) features of every split to
# <split>_img_only.csv.
for split, dataset in datasets.items():
    destination_path = os.path.join(tabular_path, f"{split}_img_only.csv")
    rows = []
    for class_, images in dataset.items():
        for image in images:
            image_path = os.path.join(image_data_path, split, class_, image)
            pixels = cv2.imread(image_path)
            if pixels is None:
                # cv2.imread returns None (it does not raise) for files it
                # cannot decode, e.g. stray non-image files in a class folder.
                print(f"skipping unreadable image: {image_path}")
                continue
            features = extract_features(pixels, class_)
            features["file_name"] = image_path
            rows.append(features)
    df = pd.DataFrame(rows)
    df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)  # shuffle the rows
    df.to_csv(destination_path, index=False)

# Step 2: standardize every split with statistics computed on the training
# split only, since those are the values the model learns from; the test
# split must not influence them.
train_df = pd.read_csv(os.path.join(tabular_path, "train_img_only.csv"))
mean     = train_df[normalized_cols].mean()
std      = train_df[normalized_cols].std()
# A column that is constant across the training split has std == 0 (and std
# is NaN when there is a single training row). Dividing by that would fill
# the column with NaN/inf, so fall back to 1 and only centre such columns.
std      = std.where(std > 0, 1.0)

for split in dataset_splits:
    df = pd.read_csv(os.path.join(tabular_path, f"{split}_img_only.csv"))
    if normalized_cols:
        df[normalized_cols] = (df[normalized_cols] - mean) / std
    df.to_csv(os.path.join(tabular_path, f"{split}.csv"), index=False)


KeyboardInterrupt: 