In [118]:
import os
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from torch.utils.data import DataLoader
from torch import nn
from torchvision.io import read_image
from torchvision.transforms import v2
from torchvision import transforms
import torch
import torchvision.transforms as T
import json
import urllib
import requests
from PIL import Image
import tqdm
from io import BytesIO
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [97]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Seeds

In [98]:
def fix_random_seeds(seed=12345):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)


fix_random_seeds()

# Configuration

In [99]:
model_config = {
    'init_lr': 0.01,
    'batch_size': 32,
    'epochs': 25,
    'empty_image_representation': 'zero_matrix',  #  zero_matrix, torch_empty
    'dino_architecture': 'small',
    'dataset': 'polyvore_63eb50dc58d97415384467bef7b3c9e1bd6c96e06ad19571b6bc15e9dd5af262.parquet',
    # 'output_logistics_thresholds': [0.8, 0.7, 0.6, 0.5, 0.4, 0.3],
    'model_forward_version': 'fast',  # slow, fast
    'hidden_layer_neuron_count': 64
}

dataset_folder_root_path = '../datasets'
dataset_path = f'{dataset_folder_root_path}/imageBasedModel/polyvore/{model_config["dataset"]}'

# Load the dataset and DINO model

In [100]:
Train_df = pd.read_parquet('../datasets/imageBasedModel/polyvore/polyvore_train_eb26e630100b98397deda54fa4a0bb95929479bc30e83cbfa72424b7c1e6e178.parquet')
Test_df = pd.read_parquet('../datasets/imageBasedModel/polyvore/polyvore_test_eb26e630100b98397deda54fa4a0bb95929479bc30e83cbfa72424b7c1e6e178.parquet')

In [101]:
df = pd.concat([Train_df, Test_df])

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6723 entries, 213424314 to 209490988
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Innerwear_imagePath           6723 non-null   object
 1   Bottomwear_imagePath          6723 non-null   object
 2   Accessoire_imagePath          5631 non-null   object
 3   Shoes_imagePath               3984 non-null   object
 4   Outerwear_imagePath           2179 non-null   object
 5   Innerwear_imagePath 256x256   6723 non-null   object
 6   Bottomwear_imagePath 256x256  6723 non-null   object
 7   Accessoire_imagePath 256x256  5631 non-null   object
 8   Shoes_imagePath 256x256       3984 non-null   object
 9   Outerwear_imagePath 256x256   2179 non-null   object
 10  valid_outfit                  6723 non-null   int64 
dtypes: int64(1), object(10)
memory usage: 630.3+ KB


In [103]:
# DINOv2
dinov2 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
dinov2 = dinov2.eval().to(device)
#dinov2_vitb14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')
#dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
#dinov2_vitg14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14')

Using cache found in /Users/luca/.cache/torch/hub/facebookresearch_dinov2_main


In [104]:
dinov2 = dinov2.eval().to(device)

# Load a test image

In [105]:
image_path = '../datasets/images_256x256/100119331/1.jpg'
input_image = Image.open(image_path)

transform = T.Compose([
    T.Resize(256, interpolation=T.InterpolationMode.BICUBIC),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

image = transform(input_image).to(device)

In [106]:
with torch.no_grad():
    features = dinov2(image.unsqueeze(0))

print(features)
print(features.shape)

tensor([[ 3.4772,  1.7773,  1.4635,  2.2478, -3.1532, -1.9697,  0.0358, -1.5149,
          3.5743, -2.1269,  1.0779, -0.3922, -1.1458, -1.3456,  5.0471,  3.7748,
          2.0318, -2.5745,  0.3614,  0.1399,  0.0674,  0.7749, -1.2847,  2.1297,
          1.2446, -2.6839,  1.5353,  2.9477,  0.8563,  2.3525, -2.8953,  0.0990,
         -0.4088, -2.4031, -0.5959,  3.5995,  2.6785, -2.3534, -1.0936, -2.2574,
          1.4675, -1.6633,  3.9569,  1.5211, -1.5145, -0.2431, -1.5367, -1.5053,
         -2.2562, -1.8115,  4.4599,  2.6838, -3.3050, -1.4802,  2.6929, -3.9444,
          2.6213, -1.5817, -4.5009, -1.3996,  0.9181,  2.2265, -1.0144, -2.1989,
         -0.7569,  5.8527,  3.9435, -4.7644,  0.4165,  1.5716,  0.8196,  1.1729,
          0.3606, -1.8431, -4.0229,  0.3466, -2.0093,  3.5627, -1.9622, -0.2825,
         -1.1377, -1.6338, -0.5705,  3.9196, -0.2342, -4.9130, -0.5169, -0.4783,
         -1.2212,  2.1530,  1.5316,  3.2916,  1.2449, -0.1067,  0.0386,  1.4902,
         -1.3760, -0.9201, -

In [107]:
def dinov2_classifier(img_url):
    
    if img_url is None:
        if model_config["empty_image_representation"] == "zero_matrix":
            return torch.zeros(3, 224, 224)
        elif model_config["empty_image_representation"] == "torch_empty":
            return torch.empty(3, 224, 224)
    
    image = Image.open(img_url)

    # Preprocess the image
    transform = T.Compose([
        T.Resize(256, interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])
    image = transform(image)

    # Move the image to the GPU if available
    image = image.to(device)

    # Extract the features
    with torch.no_grad():
        features = dinov2(image.unsqueeze(0)) #
        #features = torch.squeeze(feature_extraction_model(image.unsqueeze(0)))

    # Print the features
    return features

In [108]:
dinov2_classifier(image_path)

tensor([[ 3.4772,  1.7773,  1.4635,  2.2478, -3.1532, -1.9697,  0.0358, -1.5149,
          3.5743, -2.1269,  1.0779, -0.3922, -1.1458, -1.3456,  5.0471,  3.7748,
          2.0318, -2.5745,  0.3614,  0.1399,  0.0674,  0.7749, -1.2847,  2.1297,
          1.2446, -2.6839,  1.5353,  2.9477,  0.8563,  2.3525, -2.8953,  0.0990,
         -0.4088, -2.4031, -0.5959,  3.5995,  2.6785, -2.3534, -1.0936, -2.2574,
          1.4675, -1.6633,  3.9569,  1.5211, -1.5145, -0.2431, -1.5367, -1.5053,
         -2.2562, -1.8115,  4.4599,  2.6838, -3.3050, -1.4802,  2.6929, -3.9444,
          2.6213, -1.5817, -4.5009, -1.3996,  0.9181,  2.2265, -1.0144, -2.1989,
         -0.7569,  5.8527,  3.9435, -4.7644,  0.4165,  1.5716,  0.8196,  1.1729,
          0.3606, -1.8431, -4.0229,  0.3466, -2.0093,  3.5627, -1.9622, -0.2825,
         -1.1377, -1.6338, -0.5705,  3.9196, -0.2342, -4.9130, -0.5169, -0.4783,
         -1.2212,  2.1530,  1.5316,  3.2916,  1.2449, -0.1067,  0.0386,  1.4902,
         -1.3760, -0.9201, -

In [109]:
Columns_to_embed = ['Innerwear_imagePath 256x256', 'Bottomwear_imagePath 256x256', 'Accessoire_imagePath 256x256',
       'Shoes_imagePath 256x256', 'Outerwear_imagePath 256x256']

In [110]:
df.columns

Index(['Innerwear_imagePath', 'Bottomwear_imagePath', 'Accessoire_imagePath',
       'Shoes_imagePath', 'Outerwear_imagePath', 'Innerwear_imagePath 256x256',
       'Bottomwear_imagePath 256x256', 'Accessoire_imagePath 256x256',
       'Shoes_imagePath 256x256', 'Outerwear_imagePath 256x256',
       'valid_outfit'],
      dtype='object')

In [None]:
for col in Columns_to_embed:
    df[f'{col}_Embedded'] = df[col].apply(lambda x: dinov2_classifier(f'../{x}'))

In [116]:
result = df.loc['../None']
result

KeyError: '../None'