In [None]:
import hashlib
import io
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML
from PIL import Image
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Numeric class id -> human-readable category name.
# The position of each name in the list is its class id.
labels = dict(enumerate([
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]))

def load_image(image):
    """Open an image record as a PIL image.

    `image` must support `image['bytes']`; those bytes are wrapped in an
    in-memory buffer and handed to `Image.open`.
    """
    buffer = io.BytesIO(image['bytes'])
    return Image.open(buffer)

def visualize_image(image, size=6):
    """Show `image` in a square `size` x `size` matplotlib figure with the axes hidden."""
    _, ax = plt.subplots(figsize=(size, size))
    ax.imshow(image)
    ax.axis('off')
    plt.show()

def build_heatmap(pixels_dict, func = np.median, title="", size=28):
    """Build a grayscale plotly heatmap from per-pixel value collections.

    Parameters:
        pixels_dict: mapping indexed as `pixels_dict[x, y]`; each entry is
            passed to `func`.
        func: reducer applied to every entry (defaults to `np.median`).
        title: figure title.
        size: number of rows and columns read from `pixels_dict`.

    Returns:
        The plotly figure; the caller decides when to show it.
    """
    # Outer index is y (rows), inner index is x (columns).
    grid = [
        [func(pixels_dict[col, row]) for col in range(size)]
        for row in range(size)
    ]
    heatmap = px.imshow(
        grid,
        zmin=0,
        zmax=1,
        color_continuous_scale='gray',
        title=title,
    )
    heatmap.update_layout(
        width=500,
        height=500,
        margin={"l": 5, "r": 5, "b": 5, "t": 40, "pad": 4},
        paper_bgcolor="LightSteelBlue",
    )
    return heatmap

def hheader(text, size=1):
    """Display `text` as an HTML heading of level `size` (an `<h{size}>` element).

    Returns the result of the `display` call.
    """
    tag = f"h{size}"
    markup = f"<{tag}>{text}</{tag}>"
    return display(HTML(markup))

In [None]:
# Parquet file of each split, relative to the dataset root used below.
splits = {
    'train': 'fashion_mnist/train-00000-of-00001.parquet',
    'test': 'fashion_mnist/test-00000-of-00001.parquet',
}
# Read the train split first, then the test split.
train_df, test_df = (
    pd.read_parquet("hf://datasets/zalando-datasets/fashion_mnist/" + splits[split_name])
    for split_name in ("train", "test")
)

In [None]:
# Derived columns on the training frame:
#   named_label  - category name looked up in `labels` (unknown ids raise KeyError)
#   loaded_image - decoded PIL image
#   image_hash   - digest of the encoded image bytes, used to count distinct pictures
train_df['named_label'] = train_df['label'].map(lambda label_id: labels[label_id])
train_df['loaded_image'] = train_df['image'].map(load_image)
# The built-in hash() of bytes is salted per interpreter process, so its values
# change on every kernel restart. A SHA-256 hex digest is stable across runs.
train_df['image_hash'] = train_df['image'].map(
    lambda record: hashlib.sha256(record['bytes']).hexdigest()
)


# test_df['named_label'] = test_df['label'].map(lambda x: labels[x])
train_df

In [None]:
hheader("Number of pictures in each category")

# Raw rows: let the histogram do the counting per category.
fig = px.histogram(train_df, x="named_label", text_auto=True)
fig.update_layout(
    bargap=0.2,
    title="Number of pictures in each category",  # fixed typo ("pf" -> "of")
    xaxis_title="Category",
    yaxis_title="Count",
)
fig.show()

# The unique counts are already aggregated (one row per category), so draw them
# as plain bars instead of relying on a histogram's implicit sum.
unique_per_category = train_df.groupby('named_label')['image_hash'].nunique().reset_index()
fig = px.bar(unique_per_category, x="named_label", y="image_hash", text_auto=True)
fig.update_layout(
    bargap=0.2,
    title="Number of unique pictures in each category",
    xaxis_title="Category",
    yaxis_title="Count",
)
fig.show()


In [None]:
# Tally the "WIDTHxHEIGHT" string of every training image.
# Only one column is needed, so iterate it directly; DataFrame.iterrows()
# would build a full Series for every row just to read that one field.
sizes = [
    f"{width}x{height}"
    for width, height in (image.size for image in train_df['loaded_image'])
]
size_count = Counter(sizes)

# Counts are already aggregated: plot them as bars, passing concrete lists
# rather than dict views.
fig = px.bar(x=list(size_count.keys()), y=list(size_count.values()), text_auto=True)
fig.update_layout(
    title="Pictures size distribution",
    xaxis_title="Picture size, pixels",
    yaxis_title="Count",
)
fig

In [None]:
# Collect, for every (x, y) coordinate, the pixel value divided by 255 from each
# training image: once over the whole set and once grouped by label.
pixel_dict = defaultdict(list)
category_pixel_dict = defaultdict(lambda: defaultdict(list))
for label, image in zip(train_df['label'], train_df['loaded_image']):
    width, height = image.size
    pixels = image.load()
    per_label = category_pixel_dict[label]
    # Row-major walk: y is the outer loop, x the inner one.
    for y in range(height):
        for x in range(width):
            value = pixels[x, y] / 255
            pixel_dict[x, y].append(value)
            per_label[x, y].append(value)

In [None]:
hheader("Overall dataset heatmaps")
# Same pixel collection, reduced first with the median and then with the mean.
for reducer, heading in (
    (np.median, "Overall heatmap with median values"),
    (np.mean, "Overall heatmap with mean values"),
):
    build_heatmap(pixel_dict, reducer, title=heading).show()

In [None]:
hheader("Label specific heatmaps")
# For each category, show a median heatmap followed by a mean heatmap.
for label_id in labels:
    hheader(f"Heatmaps for {labels[label_id]}", 2)
    median_fig = build_heatmap(category_pixel_dict[label_id], title="Median heatmaps")
    median_fig.show()
    mean_fig = build_heatmap(category_pixel_dict[label_id], np.mean, title="Mean heatmaps")
    mean_fig.show()

In [None]:
hheader("Clustering")
# One row per image: flattened pixel values divided by 255. Stacking once gives
# scikit-learn a single 2-D array instead of a list it would convert on each call.
images = np.stack([np.asarray(image).ravel() for image in train_df["loaded_image"]]) / 255

kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(images)
sk_labels = kmeans.labels_

# Project to 2-D for plotting only; clustering was done in the full pixel space.
# Depending on the scikit-learn version, PCA may select a randomized solver for
# input of this size, so it is seeded to keep the scatter plot reproducible.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(images)

clusted_df = pd.DataFrame(X_pca, columns=["PCA 1", "PCA 2"])
clusted_df['Cluster'] = sk_labels
fig = px.scatter(clusted_df, x="PCA 1", y="PCA 2", color="Cluster", title="K-means Clustering of Fashion MNIST",
                 color_continuous_scale='Viridis',
                 labels={"Cluster": "Cluster ID"}, width=2000, height=2000)
fig.update_traces(
    marker=dict(size=5, opacity=0.6),
)
fig.show()

hheader("Clusters research", 2)
# Group the original labels by the k-means cluster each image was assigned to.
cluster_to_orig = defaultdict(list)
for cluster_label, orig_label in zip(sk_labels, train_df["label"]):
    cluster_to_orig[int(cluster_label)].append(orig_label)

# One bar chart per cluster, in ascending cluster id order. Every category
# appears on the x axis, with 0 when the cluster holds none of its images.
for cluster_id in sorted(cluster_to_orig):
    per_category = dict.fromkeys(labels.values(), 0)
    for orig_label, count in Counter(cluster_to_orig[cluster_id]).items():
        per_category[labels[orig_label]] += count
    fig = px.bar(x=per_category.keys(), y=per_category.values(), text_auto=True)
    fig.update_layout(
        bargap=0.2,
        title=f"Cluster {cluster_id}",
        xaxis_title="Category",
        yaxis_title="Count",
    )
    fig.show()