In [26]:
import sys
import os

import json
from pathlib import Path

import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import Resize
from datasets import Dataset, Features, Value
from datasets import Image as HFImage
from transformers import AutoImageProcessor, AutoModel

#### to import local funtions

ATENTION: You may have to change the path so that this can run on your device

In [2]:
# Checks on which folder the notebook is initially running
os.getcwd()

'/Users/mar-sangineto/Documents/sorbonne/cours/deepL/projet/OADino'

In [None]:
# Change according to where are you running so that the final path contains oadino

PROJECT_FOLDER_PATH = os.getcwd() # initial path + relative path to OADino
PROJECT_FOLDER_PATH

'/Users/mar-sangineto/Documents/sorbonne/cours/deepL/projet/OADino'

In [32]:
sys.path.append(PROJECT_FOLDER_PATH)

Internal imports

In [5]:
# from oadino.models import OADinoModel, OADinoPreProcessor, ConvVAE16
from oadino.models import OADinoModel, OADinoPreProcessor, ConvVAE16
from oadino.training import get_preprocessed_data

#### Settup device

In [6]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Device:", device)

Device: mps


### Dataset loading

In [None]:
# # loading CLEVR dataset avaliable on https://cs.stanford.edu/people/jcjohns/clevr/


# # loading CLEVRtex dataset avaliable on https://www.robots.ox.ac.uk/~vgg/data/clevrtex/#downloads

# # loading Stanford dataset avaliable on https://huggingface.co/datasets/tanganke/stanford_cars
# # Load the dataset in a tabular format with image URLs and metadata
# cars_dataset = load_dataset("tanganke/stanford_cars")

# # Access the training set directly
# cars_train_set = dataset["train"]

In [7]:
#from training_loop_initial_testing import create_hf_dataset

def create_hf_dataset(image_dir, maxsize=-1):
    image_paths = sorted(Path(image_dir).glob("*.png"))

    # Create dataset dict
    data_dict = {
        "image": [str(p) for p in image_paths[:maxsize]],
        "filename": [p.name for p in image_paths[:maxsize]],
    }

    dataset = Dataset.from_dict(
        data_dict,
        features=Features(
            {
                "image": HFImage(),
                "filename": Value("string"),
            }
        ),
    )

    return dataset

def transform_batch(batch):
    batch["image"] = [transform(img.convert("RGB")) for img in batch["image"]]
    return batch



In [8]:
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

CLEVR

In [9]:
DATA_PATH = "../../data/"

train_dataset = create_hf_dataset(DATA_PATH + "/CLEVR_v1.0/images/train", maxsize=4096)
test_dataset = create_hf_dataset(DATA_PATH + "CLEVR_v1.0/images/test", maxsize=4096)

train_dataset = train_dataset.with_transform(transform_batch)
train_dataset_name = "CLEVR_train_4K_224"
test_dataset = test_dataset.with_transform(transform_batch)

In [25]:
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Data visualization

## Models

In [21]:
## Loading Backbone Models
hf_cache = Path("../data/")

dino_processor = AutoImageProcessor.from_pretrained(
    "facebook/dinov2-small", cache_dir=hf_cache
)
dino_model = AutoModel.from_pretrained("facebook/dinov2-base", cache_dir=hf_cache)

Loading weights: 100%|██████████| 223/223 [00:00<00:00, 1031.67it/s, Materializing param=layernorm.weight]                                 


In [11]:
pre_processor = OADinoPreProcessor(dino_processor, dino_model)
vae = ConvVAE16()
model = OADinoModel(vae)

## Preprocessing OADino

visualizing the segmentations and patches

In [22]:
preprocessed_train_dataset = get_preprocessed_data(
    dataset = train_dataset,
    dataset_name = train_dataset_name,
    image_size=224,
    preprocessor=pre_processor,
    base_dir=hf_cache,
    batch_size=64
)

Processing 0 samples...
Saving to: ../data/CLEVR_train_4K_224/facebook_dinov2-base


Batches: 0it [00:00, ?it/s]

Finalizing dataset...
Dataset saved to ../data/CLEVR_train_4K_224/facebook_dinov2-base
Total samples processed: 0





### From the training...

#### Training configutations

In [14]:
config_path = PROJECT_FOLDER_PATH + "/runs/CLEVR_train_4K_224_20260202_210355/config.json"
with open(config_path, "r") as f:
    config = json.load(f)
    
for configuration in config:
    print(configuration, "=", config[configuration])

dataset_name = CLEVR_train_4K_224
model_name = OADinoModel
num_epochs = 10
learning_rate = 0.001
train_batch_size = 64
test_batch_size = 64
loss_beta = 0.0001
image_size = 224
device = cpu


### loading model

visualizing the final trained VAE

In [15]:
checkpoints_dir = "./runs/CLEVR_train_4K_224_20260202_210355/checkpoints/"
checkpoints_dir = Path(checkpoints_dir)
best_checkpoint_path = checkpoints_dir / "best_model.pt"
final_checkpoint_path = checkpoints_dir / "final_model.pt"

In [16]:
best_checkpoint = torch.load(best_checkpoint_path, map_location=device)
final_checkpoint = torch.load(final_checkpoint_path, map_location=device)

best_model = model
final_model = model


best_model.load_state_dict(best_checkpoint["model_state_dict"])
final_model.load_state_dict(final_checkpoint["model_state_dict"])

<All keys matched successfully>

In [17]:
final_model.eval()

OADinoModel(
  (vae): ConvVAE16(
    (encoder): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU()
      (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (8): ReLU()
      (9): Flatten(start_dim=1, end_dim=-1)
    )
    (mean_layer): Sequential(
      (0): Linear(in_features=1024, out_features=32, bias=True)
    )
    (logvar_layer): Sequential(
      (0): Linear(in_features=1024, out_features=32, bias=True)
    )
    (decoder): Sequential(
      (0): Linear(in_features=32, out_features=1024, bias=True)
      (1): ReLU(inplace=True)
     