# Dataset Notebook

In [None]:
from datasets import load_dataset

def _label_is(target):
    """Build a predicate that keeps examples whose `labels` field equals `target`."""
    def _predicate(example):
        return example['labels'] == target
    return _predicate


hf_dataset = load_dataset('cats_vs_dogs')
# Split the full dataset by class id: 0 selects the cats, 1 selects the dogs.
cat_dataset = hf_dataset.filter(_label_is(0))
dog_dataset = hf_dataset.filter(_label_is(1))

## Notes
- The original dataset contains two features: `image` and `labels`
    - `image`: Consists of cats and dogs from PetFinder.com
        - Images in the dataset have varying heights and widths. There are 412 unique heights, 429 unique widths, and 3,635 unique (width, height) combinations. This means that our images will have varying levels of detail.
        - Min height & width: 4, 4. Max height & width: 500, 500.
    - `labels`: 0 = cat, 1 = dog
- Number of cats: 11,741; Number of dogs: 11,669
    - This indicates a nearly balanced dataset between cats and dogs. It is, however, unclear whether the individual breeds within each class are balanced.
- Initial clustering over the cat dataset did not produce separable clusters.
    - The model used to construct image embeddings is Google's SigLIP (`siglip-base-patch16-256`).
    - The number of clusters was varied from 3 to 50.
    - Dimensionality reduction for visualization was done using t-SNE.

In [None]:
# Collect per-image sizes for the cat training split.
# `heights[i]` / `widths[i]` line up with the i-th training instance, and
# `dimensions` holds every distinct (width, height) pair that occurs.
heights = []
widths = []
dimensions = set()

for inst in cat_dataset['train']:
    # PIL reports `Image.size` as (width, height); unpack in that order so
    # each value lands in the correctly named list.
    width, height = inst['image'].size
    heights.append(height)
    widths.append(width)
    dimensions.add((width, height))

In [None]:
# Materialize the cat training split as a DataFrame and attach the
# previously collected per-image sizes as `height` and `width` columns.
cat_dataframe = cat_dataset['train'].to_pandas().assign(
    height=heights,
    width=widths,
)

In [None]:
# Histogram of the `height` column of the cat images.
cat_dataframe['height'].hist()

In [None]:
# Histogram of the `width` column of the cat images.
cat_dataframe['width'].hist()

In [None]:
# cat_dataset.filter(lambda example: example['image'].size[0] > 100 and example['image'].size[1] > 100)

In [None]:
from transformers import SiglipVisionModel
from transformers import AutoProcessor

import torchvision

import torch
from torch.utils.data import Dataset

from tqdm import tqdm

# Single checkpoint name shared by the vision model and its processor.
SIGLIP_CHECKPOINT = "google/siglip-base-patch16-256"

# Inference is pinned to the CPU. To pick an accelerator automatically, use:
# DEVICE = torch.device(
#     'cuda' if torch.cuda.is_available() \
#         else 'mps' if torch.backends.mps.is_available() else 'cpu')
DEVICE = torch.device('cpu')

PROCESSOR = AutoProcessor.from_pretrained(SIGLIP_CHECKPOINT)
MODEL = SiglipVisionModel.from_pretrained(SIGLIP_CHECKPOINT).to(DEVICE)

In [None]:
PIL_TO_TENSOR = torchvision.transforms.PILToTensor()


def transform(examples):
    """
    Convert a batch of PIL images into RGB tensors.

    Every entry of `examples['image']` is converted to RGB mode and then
    passed through `PIL_TO_TENSOR`. The returned batch contains only the
    `image` key.
    """
    tensors = []
    for pil_image in examples['image']:
        rgb_image = pil_image.convert('RGB')
        tensors.append(PIL_TO_TENSOR(rgb_image))
    return {'image': tensors}


# Apply the conversion on the fly whenever items of the cat dataset are accessed.
cat_dataset.set_transform(transform)

In [None]:
@torch.no_grad()
def get_embeddings(model: torch.nn.Module, images: Dataset) -> torch.Tensor:
    """
    Compute one pooled embedding per instance in `images`.

    Each instance's `image` entry is preprocessed with the module-level
    `PROCESSOR`, moved to the device the model lives on, and passed through
    `model`; the resulting `pooler_output` rows are stacked along dim 0.

    Args:
        model: Vision model whose output exposes `pooler_output`.
        images: Sized iterable of instances, each with an `image` entry.

    Returns:
        CPU tensor holding the concatenated `pooler_output` of every instance.

    Raises:
        ValueError: If `images` yields no instances.
    """

    # Run the inputs on whatever device the model was placed on; fall back to
    # the CPU for a parameter-less module.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None \
        else torch.device('cpu')

    # Collect per-instance outputs and concatenate once at the end; calling
    # torch.cat inside the loop re-copies the growing tensor on every step.
    pooled_outputs = []
    for inst in tqdm(images, total=len(images)):
        inputs = PROCESSOR(images=inst['image'], return_tensors="pt").to(device)
        output = model(**inputs)
        # Keep results on the CPU so accelerator memory is not held by the
        # accumulated embeddings (a no-op when the model already runs on CPU).
        pooled_outputs.append(output.pooler_output.cpu())

    if not pooled_outputs:
        raise ValueError("Cannot compute embeddings: `images` is empty")

    embeddings = torch.cat(pooled_outputs, dim=0)
    print(f"Embeddings: {embeddings.shape}")
    return embeddings

In [None]:
from argparse import Namespace

# Seed shared by the configuration below.
SEED = 1

# Experiment settings: the seed plus the inclusive bounds of the
# cluster-count sweep.
CONFIG = Namespace(**{
    'seed': SEED,
    'min_num_clusters': 3,
    'max_num_clusters': 50,
})

In [None]:
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score

# Embed every cat training image, then scale each embedding row to unit
# L2 norm.
img_embeddings = get_embeddings(model=MODEL, images=cat_dataset['train'])
normalized_img_embeddings = normalize(img_embeddings, norm='l2', axis=1)

In [None]:
from sklearn_extra.cluster import KMedoids

def run_kmedoids(embeddings, num_clusters=8):
    """
    Fit a cosine-metric KMedoids model and label the same embeddings.

    Returns the fitted estimator together with the cluster index predicted
    for every row of `embeddings`.
    """

    model = KMedoids(metric='cosine', n_clusters=num_clusters).fit(embeddings)
    assignments = model.predict(embeddings)
    return model, assignments

In [None]:
# Sweep the number of clusters over the configured inclusive range and record
# the mean cosine silhouette score obtained for each setting.
scores = []
cluster_counts = range(CONFIG.min_num_clusters, CONFIG.max_num_clusters + 1)

for n_clusters in cluster_counts:
    labels = run_kmedoids(normalized_img_embeddings, num_clusters=n_clusters)[1]
    avg_score = silhouette_score(
        normalized_img_embeddings,
        labels,
        metric='cosine',
        random_state=CONFIG.seed,
    )
    scores.append((n_clusters, avg_score))

print(f"Score: {scores}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

sns.set(font_scale=2.5)
# Set the default colormap through rcParams instead of plt.set_cmap():
# set_cmap() looks up the current image via the current figure, which can
# open a stray empty figure when none exists yet.
plt.rcParams['image.cmap'] = 'tab20'

kmedoids_obj, labels = run_kmedoids(normalized_img_embeddings, num_clusters=5)

color_map = plt.get_cmap('tab20')

fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111)

# Project the embeddings to 2-D with t-SNE for plotting only. The projection
# is seeded so the layout is the same every time the cell is re-run.
tsne = TSNE(n_components=2, random_state=CONFIG.seed)
tsne_embedding = tsne.fit_transform(normalized_img_embeddings)
df = pd.DataFrame(tsne_embedding, columns=['tsne1', 'tsne2'])
df['cluster'] = labels

# Iterate the cluster ids in sorted order so legend entries and colors are
# stable across runs; wrap the color index so it never runs past the palette.
for i, label in enumerate(sorted(df['cluster'].unique())):
    label_df = df[df.cluster == label]
    ax.scatter(label_df.tsne1, label_df.tsne2, label=str(label),
               color=color_map(i % color_map.N))

ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")

# Shrink the axes horizontally to leave room for the legend on the right.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.7, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1.15, 0.5))