# Late Fusion Concatenation Model
Inspired by Homework 2

In [6]:
import gc
gc.collect()
# we want to load in a pretrained resnet model.
# we want to use the ImageFolder format specified by PyTorch
# we freeze the resnet parameters and train on our new dataset.
# train and evaluate
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import json
import glob
import itertools
from PIL import Image
from PIL.Image import BILINEAR
from torchinfo import summary
from transformers import (
    AutoImageProcessor,
    TrainingArguments,
    Trainer,
    ResNetForImageClassification,
    Owlv2VisionModel,
    ResNetModel,
    AutoProcessor
)
import evaluate
import accelerate


cudnn.benchmark = True
plt.ion()   # interactive mode

<contextlib.ExitStack at 0x298c42340>

## Setting Up Dataset

In [2]:
# For straightforward datasets, sometimes you can make do with built-in PyTorch dataset objects.
# We want to apply automated data augmentations, which will be different for the training
# and eval scenarios

data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
    ]),
    'val': transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
    ]),
}

In [3]:
def collate_fn(batch):
    return {
        "pixel_values": torch.stack([x[0] for x in batch]),
        "labels": torch.LongTensor([int(x[1]) for x in batch]),
    }

In [4]:
data_dir = "../../data/dataset/"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}

class_names = image_datasets['train'].classes

print(class_names)

print(image_datasets.keys())
print("label", image_datasets['train'][0][1])
# we will use this test image to do all our preliminary testing to make sure stuff works.
test_image = image_datasets['train'][0][0]
test_image = test_image.unsqueeze(0)
test_image.mean(), test_image.std(), test_image.shape

['1', '2', '3', '4', '5']
dict_keys(['train', 'val'])
label 0


(tensor(0.3929), tensor(0.2438), torch.Size([1, 3, 224, 224]))

## Baseline Concatenation Model

In [5]:

## OK
## We need to just make this take in the pooled output of a OwLViT and a YOLO model, along with a resnet.
## Early fusion model.
#device = "cpu"  
class BaseLineModel(nn.Module):
  def __init__(self,
               vit,
               resnet,
               tokenizer,
               device):
    super().__init__()

    self.vit = vit
    self.vit.eval()
    self.vit.to(device)
    
    self.resnet = resnet
    self.resnet.to(device)
    self.resnet.eval()

    self.tokenizer = tokenizer
    self.device = device
    
    self.concatenatedLayerSize = vit.config.hidden_size + 1000
    self.clf = nn.Linear(self.concatenatedLayerSize, 5)
    print(self.concatenatedLayerSize)

  def forward(self, pixel_values):
      # Computing image embeddings
      pixel_values = pixel_values.to(self.device)
      image_embeddings = self.resnet(pixel_values).logits
      print("image embeddings shape: ", image_embeddings.shape)
      
      # Computing caption embeddings
      # tokenize all captions
      inputs = self.tokenizer(images = pixel_values, return_tensors="pt", do_rescale=False).to(self.device)
      #Pass the tokenized captions through the BERT model
      vit_output = self.vit(**inputs)

      #get the pooler output from the BERT model's output
      pooled_output = vit_output.pooler_output

      # Concatenate image and caption embeddings along the batch dimension
      full_embeddings = torch.cat((image_embeddings, pooled_output), dim=1)

      print(full_embeddings.shape)
      return self.clf(full_embeddings)
   

## Load in pretrained Resnet and OwLViT Models

In [11]:
resnet = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")
vit = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16")
processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")

#print(processor)
processed_image = processor(images = test_image, return_tensors='pt')

print(processed_image.pixel_values.shape)
with torch.no_grad():
    output = resnet(test_image)
    print(output.logits.shape)




tensor([[[[-1.7903, -1.7901, -1.7900,  ..., -1.7889, -1.7889, -1.7889],
          [-1.7903, -1.7901, -1.7901,  ..., -1.7889, -1.7889, -1.7889],
          [-1.7903, -1.7901, -1.7901,  ..., -1.7889, -1.7889, -1.7889],
          ...,
          [-1.7859, -1.7860, -1.7860,  ..., -1.7915, -1.7915, -1.7915],
          [-1.7859, -1.7860, -1.7860,  ..., -1.7915, -1.7915, -1.7915],
          [-1.7859, -1.7860, -1.7860,  ..., -1.7914, -1.7915, -1.7915]],

         [[-1.7508, -1.7508, -1.7507,  ..., -1.7503, -1.7503, -1.7502],
          [-1.7508, -1.7508, -1.7508,  ..., -1.7503, -1.7502, -1.7502],
          [-1.7509, -1.7508, -1.7508,  ..., -1.7503, -1.7502, -1.7502],
          ...,
          [-1.7469, -1.7470, -1.7471,  ..., -1.7512, -1.7512, -1.7512],
          [-1.7469, -1.7470, -1.7471,  ..., -1.7512, -1.7512, -1.7512],
          [-1.7469, -1.7470, -1.7470,  ..., -1.7512, -1.7512, -1.7512]],

         [[-1.4795, -1.4795, -1.4795,  ..., -1.4793, -1.4793, -1.4793],
          [-1.4795, -1.4795, -

In [26]:
print(test_image.shape)

torch.Size([1, 3, 224, 224])


In [29]:
concatModel = BaseLineModel(vit = vit, resnet = resnet, tokenizer = processor, device = "cpu")

concatModel(test_image)

1768
image embeddings shape:  torch.Size([1, 1000])
torch.Size([1, 1768])


tensor([[-7.4388,  5.8468, -8.7255,  5.7741, -0.1693]],
       grad_fn=<AddmmBackward0>)

In [30]:
# Setup the training arguments
output_dir = "./concatModel"

# we should edit things like number of training epochs and the batch size
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=100,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    load_best_model_at_end=True,
    dataloader_num_workers=0,
    report_to="wandb",
#     gradient_accumulation_steps=8,
)

In [31]:
# Compute absolute learning rate
base_learning_rate = 1e-3
total_train_batch_size = (
    training_args.train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size
)

training_args.learning_rate = base_learning_rate * total_train_batch_size / 256
print("Set learning rate to:", training_args.learning_rate)

Set learning rate to: 1.5625e-05


In [32]:
# Setup a function to compute accuracy metrics
metric = evaluate.load("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

In [33]:
os.environ["WANDB_PROJECT"] = "<concatentation!>"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints


In [34]:
# Create the trainer
trainer = Trainer(
    model=resnet,
    args=training_args,
    train_dataset=image_datasets['train'],
    eval_dataset=image_datasets['val'],
    #tokenizer=processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [35]:
train_results = trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrd3054[0m ([33maustin-reiter-goons[0m). Use [1m`wandb login --relogin`[0m to force relogin


  1%|          | 10/1900 [00:24<54:23,  1.73s/it] 

{'loss': 8.1355, 'grad_norm': 113.16458129882812, 'learning_rate': 1.5623932070253664e-05, 'epoch': 0.53}


  1%|          | 13/1900 [00:28<49:24,  1.57s/it]

KeyboardInterrupt: 