# Explorative Notebook

This notebook contains exploratory visualizations that were used for the report of the MSE semester project "Generative Diffusion Models for 2D Geometric Objects". Descriptions of the visualizations are primarily found in the report itself, which is why only the rough structure of the visualizations is given in the notebook. The explorative analysis is performed on the datasets `train256`, `val256` and `test256` which are present in the data folder. They have been generated using the source code in this repository in `shape_generator.py` and `shapes.py`.

In [None]:
import os
import numpy as np
import random
import textwrap
import webcolors
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.pyplot import imread

## Color descriptions
The textual color description is generated using `CSS3_HEX_TO_NAMES` from the [webcolors](https://webcolors.readthedocs.io/en/1.5/index.html) package. 

Description according to the documentation: 

"*A dictionary whose keys are the normalized hexadecimal values of the 147 names CSS 3 colors, and whose values are the corresponding normalized names.*"

In [None]:


def get_color_name(requested_colour):
    """Return the CSS3 colour name that is closest to an RGB triple.

    Closeness is the squared Euclidean distance in RGB space between
    ``requested_colour`` and every entry of ``webcolors.CSS3_HEX_TO_NAMES``.
    If several entries are equally close, the one that comes last in that
    dictionary's iteration order is returned.

    Args:
        requested_colour: Indexable holding the red, green and blue
            components at positions 0, 1 and 2.

    Returns:
        The name of the nearest CSS3 colour.
    """
    scored = []
    for hex_code, css_name in webcolors.CSS3_HEX_TO_NAMES.items():
        red, green, blue = webcolors.hex_to_rgb(hex_code)
        red_term = (red - requested_colour[0]) ** 2
        green_term = (green - requested_colour[1]) ** 2
        blue_term = (blue - requested_colour[2]) ** 2
        scored.append((red_term + green_term + blue_term, css_name))

    smallest = min(distance for distance, _ in scored)
    # Among equally distant colours the last one in iteration order is kept
    return [css_name for distance, css_name in scored if distance == smallest][-1]

def show_color_selection(requested_colour):
    """Draw the requested colour next to its closest named CSS3 colour.

    A figure with two panels is created: "input" is filled with
    ``requested_colour`` and "match" with the hex value of the CSS3 colour
    returned by ``get_color_name``. The matched name is used as figure title.

    Args:
        requested_colour: Colour to display. A list or tuple with exactly
            three entries is divided by 255 before being handed to
            matplotlib; any other value is passed through unchanged.
    """
    closest_name = get_color_name(requested_colour)
    matched_hex = webcolors.CSS3_NAMES_TO_HEX[closest_name]

    fig, ax = plt.subplots(1, 2, figsize=(4, 2))

    # 3-element lists/tuples are rescaled by 1/255 to give matplotlib values in [0, 1]
    swatch_colour = requested_colour
    if isinstance(requested_colour, (list, tuple)) and len(requested_colour) == 3:
        swatch_colour = [channel / 255 for channel in requested_colour]

    panels = zip(ax, (swatch_colour, matched_hex), ("input", "match"))
    for panel, colour, caption in panels:
        panel.add_patch(patches.Rectangle((0, 0), 1, 1, facecolor=colour))
        panel.axis('off')
        panel.set_title(caption)

    plt.suptitle(closest_name)
 
print("Examples of sampled RGB colors and the closest CSS3 color name that is used to describe the color:")

# One figure per example: the sampled RGB colour next to its closest CSS3 match
for requested_colour in [(244, 37, 20), (104, 53, 140), (133, 133, 0), (200, 250, 0)]:
    show_color_selection(requested_colour)

## Explorative data analysis of generated data

The visualisations that are used to describe the generated datasets `train256`, `val256` and `test256` are created below.

In [None]:
# Dataset locations; each folder holds a labels.csv and an images/ directory
path_train = "data/train256/"
path_val = "data/val256/"
path_test = "data/test256/"

df_train = pd.read_csv(os.path.join(path_train, "labels.csv"))

image_dir = os.path.join(path_train, "images")
filenames = os.listdir(image_dir)

# Preview five random training samples, one per row: file name | image | prompt
fig, axs = plt.subplots(5, 3, figsize=(6,8))
for i in range(5):
    # random.choice always yields a valid entry; random.randint(0, len(filenames))
    # includes the upper bound and could index one past the end of the list
    filename = random.choice(filenames)
    # Look up the label row whose "file" entry ends with the chosen file name
    sub_df = df_train.loc[df_train["file"].str.endswith(filename)].reset_index(drop=True)
    prompt = sub_df.at[0, "prompt"]
    image = imread(os.path.join(image_dir, filename))

    fname = textwrap.fill(filename, 18)
    axs[i, 0].text(0.5, 0.5, fname, size=9, ha='center', va='center')
    axs[i, 0].axis('off')  # Hide axes on the text subplot

    axs[i, 1].imshow(image)
    axs[i, 1].axis('off')  # Hide axes on the image subplot

    prompt = textwrap.fill(prompt, 27)
    axs[i, 2].text(0.6, 0.5, prompt, size=9, ha='center', va='center')
    axs[i, 2].axis('off')  # Hide axes on the text subplot

# Column headers only need to be set once, on the first row
axs[0, 0].set_title("label")
axs[0, 1].set_title("image")
axs[0, 2].set_title("prompt")

print("Sampled values from the train set")
plt.show()

### Number of samples per shape

In [None]:
# Read the label table of every split (train, validation, test)
labels_train, labels_val, labels_test = (
    pd.read_csv(os.path.join(split_dir, "labels.csv"))
    for split_dir in (path_train, path_val, path_test)
)

# File names of the training images
image_dir = os.path.join(path_train, "images")
filenames = os.listdir(image_dir)

# Number of training samples per shape class
print("Training set", labels_train["shape_name"].value_counts(), sep="\n")

In [None]:
# Number of validation samples per shape class
print("\nValidation set", labels_val["shape_name"].value_counts(), sep="\n")


In [None]:
# Number of test samples per shape class
print("\nTest set", labels_test["shape_name"].value_counts(), sep="\n")

### Value distributions

#### Shape diameter

In [None]:
titles = ["train dataset", "validation dataset", "test dataset"]
labels = [labels_train, labels_val, labels_test]

# Diameter (twice the radius) expressed as a fraction of the image resolution
for split_labels in labels:
    split_labels["proportional_diameter"] = (split_labels["radius"] / split_labels["im_res"]) * 2

# One histogram per split, side by side
fig, axs = plt.subplots(1,3, figsize=(15,3))
for ax, title, split_labels in zip(axs, titles, labels):
    ax.hist(split_labels["proportional_diameter"])
    ax.set(xlabel="Shape diameter / image width", ylabel="count", title=title)

plt.suptitle("Distribution of shape diameters proportional to image width", y=1.1)
plt.show()

#### Position 

In [None]:
# Per split: an x-histogram on top, an x/y scatter below it and a horizontal
# y-histogram to the right of the scatter (hence the 2 x 6 grid with narrow odd columns)
fig, axs = plt.subplots(2,6,figsize=(15,5), gridspec_kw={'height_ratios': [1, 4], 'width_ratios': [4, 1, 4, 1, 4, 1]})

titles = ["train dataset", "validation dataset", "test dataset"]
labels = [labels_train, labels_val, labels_test]

for k, (title, split_labels) in enumerate(zip(titles, labels)):
    # Normalise by the resolution stored with each sample instead of a hard-coded 256,
    # consistent with the shape-diameter computation.
    # NOTE(review): assumes square images, i.e. "im_res" is both width and height - confirm
    split_labels["x_prop"] = split_labels["x"] / split_labels["im_res"]
    split_labels["y_prop"] = split_labels["y"] / split_labels["im_res"]

    col = 2 * k
    axs[0, col].hist(split_labels["x_prop"])
    axs[0, col].set_title(title)

    axs[1, col].scatter(split_labels["x_prop"], split_labels["y_prop"], marker=".")
    axs[1, col].set_xlabel("x")
    axs[1, col].set_ylabel("y")

    axs[1, col + 1].hist(split_labels["y_prop"], orientation="horizontal")

    # The cell above the y-histogram stays empty
    axs[0, col + 1].axis("off")

plt.suptitle("Position of shapes in relation to image width and height")
plt.tight_layout()
plt.show()

#### Rotation

In [None]:
titles = ["train dataset", "validation dataset", "test dataset"]
labels = [labels_train, labels_val, labels_test]
fig, axs = plt.subplots(1,3, figsize=(15,3))

# One histogram per split; rotation values are scaled by 180/pi to plot them in degrees
for ax, title, split_labels in zip(axs, titles, labels):
    rotation_deg = split_labels["rotation"] * 180 / np.pi
    ax.hist(rotation_deg)
    ax.set(xlabel="rotation in degrees", ylabel="count", title=title)

plt.suptitle("Distribution of shape rotation in degrees", y=1.1)
plt.show()

#### Aspect Ratio

In [None]:
titles = ["train dataset", "validation dataset", "test dataset"]
labels = [labels_train, labels_val, labels_test]
fig, axs = plt.subplots(1,3, figsize=(15,3))

# One five-bin histogram of the aspect ratio per split
for ax, title, split_labels in zip(axs, titles, labels):
    ax.hist(split_labels["aspect_ratio"], bins=5)
    ax.set(xlabel="aspect ratio", ylabel="count", title=title)

plt.suptitle("Distribution of the aspect ratio of shapes", y=1.1)
plt.show()

