# Constants

In [1]:
# Set to False if you have already created and saved a .pth file to PTH_SAVE_PATH
CREATE_NEW_DATASET = True

# Fractions of the data assigned to the train, test and val sets. Must sum to 1.
SET_SIZES = {
    "train": 0.8,
    "test": 0.1,
    "val": 0.1,
}
# Fail fast on a bad split instead of silently producing mis-sized subsets later.
# A small tolerance is used because the fractions are floats.
assert abs(sum(SET_SIZES.values()) - 1.0) < 1e-9, "SET_SIZES must sum to 1"

# samples per class in uniform dataset
N_SAMPLES = 250

# path to dataset (do not change)
HM_DATA_PATH = "../dataset/"

# path to pth saves (do not change)
PTH_SAVE_PATH = "../pth/"

# Imports

In [2]:
import os
import sys
import random
import numpy as np
from tqdm import tqdm
import gdown
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import transformers

# Project-local modules. They are imported by bare name, so they must be
# importable from the working directory printed below.
print(os.getcwd())
# sys.path.append('./src/')  # presumably only needed when the notebook is not started from the folder holding these modules
import model_functions
import utils
import training
import datasets  # NOTE(review): shares its name with the Hugging Face `datasets` package; confirm the local module is the one that gets imported

/home/jupyter/DD2430_Project/src


In [3]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple-silicon
# MPS, and finally the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

print("Using device:", device)

Using device: cuda


# Download data
Only run this once.

In [8]:
if not os.path.exists(HM_DATA_PATH):
    gdown.download("https://drive.google.com/uc?id=1EZ3AfRp-RMj70qZzIAC-BR0sHsrfjOWx")
    !unzip -q dataset.zip -d ../
    !rm -fr dataset.zip
else:
    print("Data already downloaded.")

Downloading...
From (original): https://drive.google.com/uc?id=1EZ3AfRp-RMj70qZzIAC-BR0sHsrfjOWx
From (redirected): https://drive.google.com/uc?id=1EZ3AfRp-RMj70qZzIAC-BR0sHsrfjOWx&confirm=t&uuid=5842bef9-9163-4b29-996a-153b02db3a69
To: /home/jupyter/DD2430_Project/src/dataset.zip
100%|██████████| 431M/431M [00:02<00:00, 197MB/s] 


# Dataset

## Create new dataset
This will create a new dataset and save it as a .pth file under `PTH_SAVE_PATH`. If you get an error, it is most likely because you cannot make a dataset that large.

In [9]:
if CREATE_NEW_DATASET:
    # Number of samples requested for every class.
    n_samples = N_SAMPLES

    # The dataset object is handed the article table, the image folder, the
    # column used as class label and a CLIP model/processor pair.
    dataset = datasets.HMDataset2(
        articles_csv=os.path.join(HM_DATA_PATH, 'articles.csv'),
        image_dir=os.path.join(HM_DATA_PATH, 'images'),
        main_class='garment_group_name',
        model=transformers.CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device),
        processor=transformers.CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
    )

    # A balanced set is only attempted when even the rarest class has more
    # than n_samples articles.
    smallest_class_size = dataset.articles[dataset.main_class].value_counts().min()
    assert smallest_class_size > n_samples, 'Can not make balanced set'

    # Pre-fill the per-class counter of the classes listed here with n_samples.
    # NOTE(review): presumably this marks them as already complete so that
    # get_n_of_each skips them - confirm in datasets.HMDataset2.
    excluded_classes = ['Unknown', 'Special Offers', 'some other']
    for exclude_subclass in excluded_classes:
        dataset.counts[exclude_subclass] = n_samples

    # Draw the uniform dataset.
    image_emb, labels, images = dataset.get_n_of_each(n_samples)

    data_to_save = dict(
        image_embedding=image_emb,
        class_text=labels,
        images=images,
    )

    # Persist everything as a single .pth file inside PTH_SAVE_PATH.
    os.makedirs(PTH_SAVE_PATH, exist_ok=True)
    save_file = os.path.join(PTH_SAVE_PATH, f"HM_data_{n_samples}.pth")
    torch.save(data_to_save, save_file)

Max uniform size: 908




Image for article 179208001 not found. Takes next
Image for article 212629004 not found. Takes next
Image for article 215324023 not found. Takes next
Image for article 216961011 not found. Takes next
Image for article 272591001 not found. Takes next
Image for article 348657006 not found. Takes next
Image for article 369423002 not found. Takes next
Image for article 388916001 not found. Takes next
Image for article 397376010 not found. Takes next
Image for article 398947001 not found. Takes next
Image for article 408875001 not found. Takes next
Image for article 420264002 not found. Takes next
Image for article 425683012 not found. Takes next
Image for article 442786001 not found. Takes next
Image for article 468666002 not found. Takes next
Image for article 475827007 not found. Takes next
Image for article 480076004 not found. Takes next
Image for article 481777003 not found. Takes next
Image for article 481797022 not found. Takes next
Image for article 484864002 not found. Takes next


KeyboardInterrupt: 

## Load dataset

In [None]:
# Read back the file written by the "Create new dataset" cell for the current
# N_SAMPLES and unpack its three entries.
# NOTE(review): torch.load is called without map_location, so tensors are
# restored to the device they were saved from - confirm this is fine when the
# file is loaded on a machine with a different device.
file_to_load = f"HM_data_{N_SAMPLES}.pth"
loaded_data = torch.load(os.path.join(PTH_SAVE_PATH, file_to_load))

image_emb, labels, images = (
    loaded_data[key] for key in ('image_embedding', 'class_text', 'images')
)

# Evaluate

**Finetuning**

Performance decreases if we add untuned soft prompts, so we now fine-tune them.

**Split datasets**

In [None]:
# Mini-batch size shared by all three loaders.
batch_size = 128

# Split the loaded samples according to the fractions in SET_SIZES.
dataset, dataset_train, dataset_test, dataset_val = datasets.split(
    labels, image_emb, images, N_SAMPLES, SET_SIZES
)

# Only the training loader shuffles; validation and test keep a fixed order.
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val, dataloader_test = (
    DataLoader(subset, batch_size=batch_size, shuffle=False)
    for subset in (dataset_val, dataset_test)
)

In [None]:
# Pretrained CLIP weights (moved to the selected device) and the matching
# processor that are used by the experiments below.
model = transformers.CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = transformers.CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# `processor.feature_extractor` is a deprecated alias of `processor.image_processor`
# in recent transformers releases. Prefer the new attribute and fall back to the
# old one so the cell keeps working across versions.
image_processor = getattr(processor, "image_processor", None)
if image_processor is None:
    image_processor = processor.feature_extractor
# do_rescale=False: image values are expected in [0, 1] already;
# True would expect values in [0, 255].
image_processor.do_rescale = False

## Baseline

The performance of the untuned CLIP

In [None]:
# Bundle the three loaders and the CLIP model/processor pair in the dictionary
# layout passed to training.FinetuneCLIP.
dataloaders = dict(train=dataloader_train, val=dataloader_val, test=dataloader_test)
clip = dict(m=model, p=processor)

ft = training.FinetuneCLIP(dataloaders, clip)
# Baseline: soft prompts and LoRA are both switched off.
ft.tt['soft'] = 0
ft.tt['LoRA'] = 0

ft.initialize({'add': ''})  # do not add anything

# Evaluate, show the confusion matrix over the test-set class names and
# report the accuracy.
all_predictions, all_labels, acc = ft.eval(False)
class_names = list(dataset_test.class_to_id.keys())
utils.confussion_matrix(all_labels, all_predictions, class_names, F1=False)
print(f"Accuracy of baseline is {acc:.2f} %")

The baseline predicts many samples as Under-nightwear.

## Hard prompt tuning

The easiest way to fine-tune: just change the text that accompanies the labels.

In [None]:
dataloaders = {'train':dataloader_train, 'val':dataloader_val, 'test':dataloader_test}
clip = {'m':model, 'p':processor}

# Candidate hard prompts passed to the model through train_p['add']
# ('' means nothing is added). The earlier "cloathing" misspelling is fixed so
# that CLIP sees the real word "clothing".
texts = ['', 'An image of', 'The clothing type is', 'An image showing clothing of type']
perf = []
for added_text in texts:
    # Fresh wrapper per prompt so no state carries over between runs.
    ft = training.FinetuneCLIP(dataloaders, clip)
    ft.tt['soft'], ft.tt['LoRA'] = 0, 0  # no soft prompts, no LoRA: only the text changes
    # NOTE(review): the baseline cell passes 'add' through ft.initialize();
    # confirm that assigning train_p['add'] directly is equivalent.
    ft.train_p['add'] = added_text
    _, _, acc = ft.eval(False)
    perf.append(np.round(acc,2))
# Accuracies in the same order as `texts`.
print(perf)
print(f"Best accuracy of hard-prompt tune is {max(perf):.2f} %")

## Soft prompt

Add a tunable tensor to the text embedding, with a hyperparameter search over the number of soft prompts.

In [None]:
dataloaders = {'train':dataloader_train, 'val':dataloader_val, 'test':dataloader_test}
clip = {'m':model, 'p':processor}

# Search over the number of soft prompts. For every candidate, the trained soft
# prompt parameters and the best validation loss are recorded.
hp = {'hyperp':[1,2,3,4,5,6], 'best_losses':[], 'params':[]}
for num_soft in hp['hyperp']:
    ft = training.FinetuneCLIP(dataloaders, clip)
    ft.initialize({'num_soft':num_soft, 'add':''})
    ft.tt['soft'], ft.tt['LoRA'] = 1, 0  # soft prompts on, LoRA off
    _, train_p = ft.train()
    # TODO load the best of these later to evaluate on the test set
    # (not done yet; the run was repeated for [5] instead).
    hp['params'].append(train_p['soft'])
    # Take the minimum validation loss directly. Indexing a fixed offset from
    # the end (-patience) is only right when early stopping fired exactly that
    # many epochs after the best one, and is wrong if training ran to the end.
    hp['best_losses'].append(min(ft.loss['val']))

In [None]:
# Validation loss recorded for each number of soft prompts tried in the search.
fig, ax = plt.subplots()
ax.plot(hp['hyperp'], hp['best_losses'], marker='o', linestyle='-')
ax.set_xlabel('Number of soft prompts')
ax.set_ylabel('Validation loss')
plt.show()

In [None]:
# NOTE(review): `ft` is whatever object the search loop created last, i.e. the
# final entry of hp['hyperp'], not necessarily the best configuration - confirm
# this is the run that should be reported.
ft.plot_loss()
#utils.print_images(dataloader_train, processor,3)

# Evaluate, show the confusion matrix over the test-set class names and
# report the accuracy.
all_predictions, all_labels, acc = ft.eval(False)
class_names = list(dataset_test.class_to_id.keys())
utils.confussion_matrix(all_labels, all_predictions, class_names, F1=False)
print(f"Accuracy of soft prompt is {acc:.2f} %")

## LoRA

In [None]:
# Same loader / CLIP bundles as in the previous experiments.
dataloaders = dict(train=dataloader_train, val=dataloader_val, test=dataloader_test)
clip = dict(m=model, p=processor)

ft = training.FinetuneCLIP(dataloaders, clip)
#ft.initialize({'num_soft':3, 'add':''}) #add rank here

# LoRA experiment: soft prompts off, LoRA on.
ft.tt['soft'] = 0
ft.tt['LoRA'] = 1
# Training is still disabled until initialization and forward are fixed.
#loss, train_p = ft.train() # Fix initialization and forward

In [None]:
# Restore the best stored parameters before plotting and evaluating.
ft.load_p()
ft.plot_loss()
#print_images(dataset_test,2, dataloader_train)

# Evaluate and show the confusion matrix over the test-set class names.
all_predictions, all_labels, acc = ft.eval(True)
class_names = list(dataset_test.class_to_id.keys())
utils.confussion_matrix(all_labels, all_predictions, class_names, F1=False)


## Dag anything

In [None]:
# Fresh FinetuneCLIP wrapper built from the shared CLIP pair and data loaders.
clip = dict(m=model, p=processor)
dataloaders = dict(train=dataloader_train, val=dataloader_val, test=dataloader_test)

ft = training.FinetuneCLIP(dataloaders, clip)