In [26]:
import json
import os
import random

import numpy as np
import cv2
import polars as pl

In [27]:
# --- Tunable constants -------------------------------------------------------
# Target size handed to cv2.resize for the grayscale thumbnail features.
GRAYSCALE_SIZE = (8, 8)
# Number of words drawn from a class's feature words for each description.
DESCRIPTION_WORDS_COUNT = 2
# One seed shared by both random number generators used in this notebook.
SEED = 462

# --- Reproducibility ---------------------------------------------------------
# Seed the stdlib generator (used for random.sample) and NumPy's legacy global
# generator (used for np.random.normal).
random.seed(SEED)
np.random.seed(SEED)

In [28]:
# Names of the dataset splits; each is a sub-directory of the image directory.
dataset_splits = ["train", "validation", "test"]

# Everything read or written by this notebook lives below this directory.
data_path = "data"
# Input images and the output vocabulary file, both directly under data_path.
image_data_path, vocabulary_path = (
    os.path.join(data_path, leaf) for leaf in ("image", "vocabulary.json")
)

In [29]:
def enumerate_dataset(dataset_path):
    """List the entries of every class sub-directory of a dataset directory.

    Args:
        dataset_path: Directory whose immediate sub-directories are classes.

    Returns:
        A dict mapping each class (sub-directory) name to the sorted list of
        entry names found inside it. Keys are inserted in sorted order.
        Entries of ``dataset_path`` that are not directories are ignored.

    Raises:
        OSError: If ``dataset_path`` or a class directory cannot be listed.
    """
    listing = {}
    # os.listdir returns names in arbitrary, filesystem-dependent order.
    # Sorting both levels makes the iteration order (and therefore the order
    # in which seeded random numbers are consumed downstream) deterministic.
    for cls in sorted(os.listdir(dataset_path)):
        class_dir = os.path.join(dataset_path, cls)
        if os.path.isdir(class_dir):
            listing[cls] = sorted(os.listdir(class_dir))
    return listing

In [30]:
# For every split, build its {class name: [entry names]} listing from the
# matching sub-directory of the image directory.
datasets = dict(
    (split_name, enumerate_dataset(os.path.join(image_data_path, split_name)))
    for split_name in dataset_splits
)

In [31]:
# Descriptive words available for each class. Descriptions are sampled from
# these lists, so the order of the words within a list is significant for
# reproducibility under a fixed seed.
feature_words = {
    "banana": [
        "yellow", "tropical", "long", "sweet", "soft", "peel",
    ],
    "carrot": [
        "orange", "temperate", "long", "sweet", "crunchy", "skin",
    ],
    "cucumber": [
        "green", "temperate", "long", "bland", "crunchy", "seeds",
    ],
    "mandarin": [
        "orange", "tropical", "spherical", "sweet", "sour", "soft", "peel",
    ],
    "tomato": [
        "red", "warm", "spherical", "savory", "sour", "soft", "seeds",
    ],
}

In [32]:
# Distinct descriptive words across all classes. Built in one expression so
# `vocabulary` is only ever a sorted list (never rebound from a set) and no
# loop variables are left behind in the notebook namespace.
vocabulary = sorted({word for words in feature_words.values() for word in words})

# Persist the vocabulary as a JSON array; the encoding is pinned so the file
# does not depend on the platform's default text encoding.
with open(vocabulary_path, "w", encoding="utf-8") as fp:
    json.dump(vocabulary, fp)

In [33]:
# Normal-distribution parameters per class, each given as (mean, std):
#   weight in grams, size in cm.
dist_params = {
    "banana": dict(weight=(120, 16), size=(18, 2.5)),
    "carrot": dict(weight=(60, 11), size=(15, 3)),
    "cucumber": dict(weight=(300, 42), size=(20, 3.5)),
    "mandarin": dict(weight=(80, 13), size=(6.5, 1)),
    "tomato": dict(weight=(100, 16), size=(7, 1.2)),
}

In [34]:
def extract_features(image, image_class):
    """Build one tabular feature row for an image of a known class.

    The row contains, in this order:
      * ``gray_NNN``: pixel values of a grayscale thumbnail of the image,
        resized to ``GRAYSCALE_SIZE`` and flattened;
      * ``<channel>_mean`` / ``<channel>_std`` for blue, green and red;
      * ``weight`` and ``size``: synthetic values drawn from the normal
        distributions configured for the class in ``dist_params``;
      * ``description``: ``DESCRIPTION_WORDS_COUNT`` words sampled from the
        class's ``feature_words``, joined by spaces;
      * ``class``: the class name itself.

    Args:
        image: Image array indexed as ``[row, col, channel]`` with channels
            treated as blue, green, red (index 0, 1, 2).
        image_class: Key into ``dist_params`` and ``feature_words``.

    Returns:
        dict mapping feature names to floats / strings.

    Note:
        Consumes NumPy's global random state and the stdlib ``random`` state,
        so results depend on the seed and on the order of calls.
    """
    # Slice the colour planes up front (index 0, 1, 2 -> blue, green, red).
    channels = {
        "blue": image[:, :, 0],
        "green": image[:, :, 1],
        "red": image[:, :, 2],
    }

    # Shrink, convert to grayscale and flatten into a float32 vector.
    thumbnail = cv2.resize(image, GRAYSCALE_SIZE, interpolation=cv2.INTER_AREA)
    thumbnail_gray = cv2.cvtColor(thumbnail, cv2.COLOR_BGR2GRAY)
    pixels = thumbnail_gray.reshape(-1).astype("float32")
    features = {f"gray_{idx:03d}": float(pixel) for idx, pixel in enumerate(pixels)}

    # Per-channel statistics over the full-resolution image.
    for channel_name, plane in channels.items():
        features[f"{channel_name}_mean"] = float(np.mean(plane))
        features[f"{channel_name}_std"] = float(np.std(plane))

    # Synthetic measurements: weight is drawn before size to keep the
    # random stream consumption order fixed.
    class_params = dist_params[image_class]
    features["weight"] = float(np.random.normal(*class_params["weight"]))
    features["size"] = float(np.random.normal(*class_params["size"]))

    # Short textual description sampled without replacement.
    chosen_words = random.sample(feature_words[image_class], DESCRIPTION_WORDS_COUNT)
    features["description"] = " ".join(chosen_words)

    features["class"] = image_class
    return features

In [35]:
# Write one CSV of extracted features per dataset split into data/tabular.
tabular_path = os.path.join(data_path, "tabular")
os.makedirs(tabular_path, exist_ok=True)
for split, dataset in datasets.items():
    destination_path = os.path.join(tabular_path, f"{split}.csv")
    rows = []
    for class_, image_names in dataset.items():
        for image_name in image_names:
            image_path = os.path.join(image_data_path, split, class_, image_name)
            image = cv2.imread(image_path)
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than with an opaque
            # TypeError deep inside extract_features.
            if image is None:
                raise ValueError(f"Could not read image file: {image_path}")
            rows.append(extract_features(image, class_))
    df = pl.DataFrame(rows)
    df.write_csv(destination_path)