In [8]:
import os
import cv2
import numpy as np
import polars as pl
from IPython.display import display
import random

In [None]:
# PLEASE UPDATE THE PATH VARIABLES ACCORDINGLY
data_path = "../data"
image_data_path = os.path.join(data_path, "462-images")
# generated_image_data_path = os.path.join(image_data_path, "generated")

In [25]:
image_classes = os.listdir(image_data_path)
image_paths = {image_class: [] for image_class in image_classes}
for image_class in image_classes:
    image_path = os.path.join(image_data_path, image_class)
    image_paths[image_class] = sorted(os.listdir(image_path))

In [26]:
images = {image_class: [] for image_class in image_classes}
for image_class, image_list in image_paths.items():
    images[image_class] = [
        cv2.imread(
            os.path.join(
                image_data_path,
                image_class,
                image_name,
            )
        )
        for image_name in image_list
    ]

In [None]:
# list of words for each class
feature_words = {
    'banana'  : ["yellow", "tropical", "long", "sweet", "soft", "peel"],
    'carrot'  : ["orange", "temperate", "long", "sweet", "crunchy", "skin"],
    'cucumber': ["green", "temperate", "long", "bland", "crunchy", "seeds"],
    'mandarin': ["orange", "tropical", "spherical", "sweet", "sour", "soft", "peel",],
    'tomato'  : ["red", "warm", "spherical", "savory", "sour", "soft", "seeds"]
}
all_words = set()
for words in feature_words.values():
    all_words.update(words)

vocab = sorted(all_words)
print(vocab)

vocab_index = {word: i for i, word in enumerate(vocab)}
print(vocab_index)  # we can use this dictionary while using the method "bag of words (bow)"

['bland', 'crunchy', 'green', 'long', 'orange', 'peel', 'red', 'savory', 'seeds', 'skin', 'soft', 'sour', 'spherical', 'sweet', 'temperate', 'tropical', 'warm', 'yellow']
{'bland': 0, 'crunchy': 1, 'green': 2, 'long': 3, 'orange': 4, 'peel': 5, 'red': 6, 'savory': 7, 'seeds': 8, 'skin': 9, 'soft': 10, 'sour': 11, 'spherical': 12, 'sweet': 13, 'temperate': 14, 'tropical': 15, 'warm': 16, 'yellow': 17}


In [29]:
# noinspection PyDictCreation
def extract_features(image, image_class):
    blue = image[:, :, 0]
    green = image[:, :, 1]
    red = image[:, :, 2]

    features = {}

    features["blue_mean"] = float(np.mean(blue))
    features["blue_std"] = float(np.std(blue))
    features["green_mean"] = float(np.mean(green))
    features["green_std"] = float(np.std(green))
    features["red_mean"] = float(np.mean(red))
    features["red_std"] = float(np.std(red))

    small = cv2.resize(image, (8, 8), interpolation=cv2.INTER_AREA)
    gray_small = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    gray_flat = gray_small.reshape(-1).astype("float32")
    for i, val in enumerate(gray_flat):
        features[f"gray_{i:03d}"] = float(val)  # type: ignore

    # grams(mean, std), cm(mean, std)
    dist_params = {
        "banana": {"weight": (120, 15), "size": (18, 2)},
        "carrot": {"weight": (60, 10), "size": (15, 2.5)},
        "cucumber": {"weight": (300, 40), "size": (20, 3)},
        "mandarin": {"weight": (80, 12), "size": (6.5, 0.8)},
        "tomato": {"weight": (100, 15), "size": (7, 1)}
    }

    params = dist_params[image_class]
    features["weight"] = float(np.random.normal(params["weight"][0], params["weight"][1]))
    features["size"] = float(np.random.normal(params["size"][0], params["size"][1]))

    text_feature = ""
    for _ in range(3):
        text_feature += " " + random.choice(feature_words[image_class])
    features["text"] = text_feature
    
    features["class"] = image_class
    return features

In [None]:
rows = [
    extract_features(img, img_class) for img_class, img_matrices in images.items() for img in img_matrices
]
df = pl.DataFrame(rows)
df.write_csv(os.path.join(data_path, "feature_extraction.csv"))  # ALSO YOU MAY NEED TO UPDATE THIS
display(df)

blue_mean,blue_std,green_mean,green_std,red_mean,red_std,gray_000,gray_001,gray_002,gray_003,gray_004,gray_005,gray_006,gray_007,gray_008,gray_009,gray_010,gray_011,gray_012,gray_013,gray_014,gray_015,gray_016,gray_017,gray_018,gray_019,gray_020,gray_021,gray_022,gray_023,gray_024,gray_025,gray_026,gray_027,gray_028,gray_029,gray_030,gray_031,gray_032,gray_033,gray_034,gray_035,gray_036,gray_037,gray_038,gray_039,gray_040,gray_041,gray_042,gray_043,gray_044,gray_045,gray_046,gray_047,gray_048,gray_049,gray_050,gray_051,gray_052,gray_053,gray_054,gray_055,gray_056,gray_057,gray_058,gray_059,gray_060,gray_061,gray_062,gray_063,weight,size,text,class
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str
221.990681,66.734803,230.326927,37.883318,238.635288,19.818595,244.0,243.0,238.0,215.0,230.0,249.0,249.0,249.0,245.0,245.0,240.0,203.0,238.0,250.0,249.0,249.0,245.0,244.0,234.0,169.0,231.0,249.0,250.0,249.0,245.0,242.0,213.0,134.0,199.0,249.0,249.0,249.0,243.0,241.0,217.0,129.0,203.0,249.0,249.0,249.0,242.0,240.0,230.0,145.0,216.0,249.0,249.0,249.0,241.0,240.0,237.0,163.0,226.0,249.0,249.0,249.0,239.0,239.0,240.0,192.0,241.0,249.0,248.0,248.0,63.23355,17.52971,""" sweet temperate crunchy""","""carrot"""
101.912769,85.672357,126.5182,82.45209,159.507484,90.782262,8.0,8.0,7.0,7.0,6.0,5.0,5.0,4.0,37.0,37.0,37.0,36.0,36.0,37.0,38.0,40.0,191.0,195.0,198.0,201.0,204.0,207.0,205.0,207.0,196.0,201.0,196.0,183.0,174.0,173.0,177.0,190.0,196.0,147.0,162.0,159.0,145.0,141.0,139.0,178.0,152.0,85.0,96.0,80.0,86.0,129.0,178.0,217.0,89.0,56.0,96.0,159.0,203.0,213.0,213.0,212.0,187.0,190.0,195.0,197.0,199.0,200.0,201.0,203.0,53.85879,12.729241,""" long skin temperate""","""carrot"""
138.244175,103.919392,178.207298,53.074722,207.69912,27.102115,140.0,148.0,183.0,200.0,198.0,180.0,151.0,144.0,146.0,211.0,216.0,213.0,209.0,207.0,200.0,147.0,186.0,223.0,224.0,228.0,225.0,218.0,210.0,177.0,210.0,201.0,150.0,142.0,139.0,139.0,204.0,194.0,202.0,213.0,176.0,174.0,176.0,190.0,219.0,190.0,149.0,234.0,227.0,216.0,218.0,222.0,223.0,153.0,117.0,165.0,235.0,237.0,233.0,224.0,156.0,130.0,127.0,119.0,118.0,148.0,147.0,120.0,126.0,127.0,62.642504,13.181726,""" temperate long sweet""","""carrot"""
118.130939,65.120933,166.759361,51.899922,215.363323,48.376039,187.0,181.0,177.0,176.0,168.0,173.0,166.0,168.0,196.0,200.0,200.0,198.0,202.0,189.0,175.0,182.0,198.0,203.0,200.0,188.0,171.0,174.0,168.0,160.0,200.0,189.0,176.0,167.0,146.0,130.0,111.0,102.0,170.0,143.0,103.0,91.0,97.0,113.0,148.0,185.0,116.0,116.0,155.0,175.0,190.0,199.0,203.0,210.0,187.0,185.0,191.0,192.0,195.0,195.0,193.0,202.0,191.0,199.0,205.0,211.0,210.0,217.0,221.0,220.0,51.994575,13.801513,""" skin temperate crunchy""","""carrot"""
105.556702,80.683725,150.641182,48.948548,195.140034,61.765831,200.0,192.0,187.0,183.0,129.0,93.0,70.0,101.0,190.0,181.0,163.0,135.0,96.0,57.0,80.0,87.0,204.0,197.0,180.0,150.0,106.0,71.0,100.0,104.0,204.0,194.0,169.0,145.0,136.0,168.0,162.0,156.0,134.0,143.0,151.0,159.0,160.0,167.0,168.0,156.0,133.0,125.0,132.0,153.0,173.0,166.0,161.0,173.0,161.0,161.0,160.0,162.0,169.0,189.0,206.0,222.0,191.0,201.0,208.0,216.0,219.0,219.0,220.0,220.0,47.125152,11.183807,""" crunchy long long""","""carrot"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
162.02211,83.182319,194.391556,45.717801,227.50808,25.894003,231.0,234.0,234.0,234.0,234.0,231.0,228.0,224.0,232.0,234.0,235.0,234.0,234.0,234.0,230.0,225.0,232.0,234.0,234.0,221.0,217.0,229.0,230.0,225.0,233.0,234.0,187.0,162.0,173.0,181.0,220.0,224.0,232.0,234.0,123.0,137.0,162.0,170.0,199.0,220.0,217.0,215.0,134.0,123.0,144.0,154.0,202.0,213.0,198.0,193.0,173.0,95.0,120.0,166.0,207.0,206.0,193.0,196.0,185.0,183.0,183.0,175.0,175.0,169.0,95.636288,6.098675,""" sweet soft soft""","""mandarin"""
103.950916,90.725353,140.48592,79.33903,177.592381,89.231654,35.0,36.0,38.0,39.0,40.0,41.0,42.0,45.0,38.0,35.0,39.0,64.0,56.0,39.0,39.0,38.0,97.0,110.0,182.0,189.0,192.0,179.0,122.0,125.0,126.0,139.0,153.0,156.0,152.0,149.0,137.0,133.0,235.0,218.0,142.0,132.0,132.0,150.0,236.0,241.0,235.0,229.0,165.0,126.0,124.0,173.0,233.0,237.0,229.0,219.0,206.0,138.0,138.0,197.0,215.0,231.0,224.0,230.0,232.0,232.0,231.0,233.0,234.0,233.0,66.536463,6.113691,""" sour tropical sour""","""mandarin"""
143.194893,76.616754,165.916393,46.909785,187.266174,42.974,137.0,139.0,139.0,138.0,138.0,137.0,137.0,134.0,135.0,137.0,137.0,138.0,137.0,135.0,134.0,132.0,163.0,167.0,142.0,163.0,156.0,167.0,167.0,167.0,214.0,189.0,109.0,126.0,171.0,210.0,216.0,216.0,218.0,183.0,120.0,116.0,144.0,185.0,220.0,221.0,196.0,175.0,110.0,134.0,152.0,196.0,225.0,223.0,135.0,129.0,123.0,135.0,193.0,230.0,229.0,226.0,196.0,205.0,211.0,217.0,221.0,224.0,223.0,222.0,85.185032,6.672997,""" peel orange soft""","""mandarin"""
99.86026,98.497532,162.152992,47.195994,212.980568,40.470507,140.0,160.0,153.0,198.0,178.0,179.0,210.0,203.0,147.0,96.0,137.0,161.0,170.0,167.0,200.0,215.0,205.0,169.0,182.0,164.0,119.0,136.0,183.0,185.0,217.0,166.0,187.0,174.0,147.0,162.0,170.0,189.0,213.0,155.0,134.0,142.0,191.0,201.0,139.0,143.0,175.0,193.0,139.0,173.0,192.0,164.0,161.0,127.0,148.0,220.0,203.0,156.0,158.0,126.0,218.0,216.0,138.0,213.0,206.0,122.0,89.0,136.0,215.0,220.0,77.667093,6.382266,""" soft orange sweet""","""mandarin"""
