In [1]:
import torch
import torchinfo
import clip  # Import CLIP library
from torch.utils.data import DataLoader

from level_1_dataloader import image_dataloader
from level_1_classifier_1 import level_1_classifier, level_1_model, level_1_output_layer
from level_2_classifier import level_2_classifier, level_2_output_layer, level_2_pre_model_concate, level_2_post_model
from Trainer_1 import Trainer_level_1, Trainer_level_2
from Tester_1 import Tester_level_1, Tester_level_2

import copy

In [2]:
train_set = image_dataloader(csv_file='train_set.xls', root_dir='coverpage/coverpage/')
val_set = image_dataloader(csv_file='val_set.xls', root_dir='coverpage/coverpage/')
test_set = image_dataloader(csv_file='test_set.xls', root_dir='coverpage/coverpage/')


In [3]:
print('no. of training sample', len(train_set))
print('no. of validation sample', len(val_set))
print('no. of testing sample', len(test_set))

no. of training sample 22340
no. of validation sample 2804
no. of testing sample 2809


In [4]:
train_dataloader = DataLoader(train_set, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=32, shuffle=True)

In [5]:
from PIL import Image
import requests
from transformers import Blip2Processor, Blip2Model
import torch

In [6]:
import torch
from transformers import Blip2Model
from torchinfo import summary

class VisionOnlyModel(torch.nn.Module):
    def __init__(self, vision_model):
        super(VisionOnlyModel, self).__init__()
        self.vision_model = vision_model

    def forward(self, pixel_values):
        # Extract the class token (first token) from the last hidden state
        outputs = self.vision_model(pixel_values=pixel_values).last_hidden_state
        class_token_output = outputs[:, 0, :]  # Take the first token
        return class_token_output

def fine_tune_load_image_model():
    # Load the pre-trained BLIP-2 model
    blip2_model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")

     # Extract the vision model
    vision_model = blip2_model.vision_model

    # Wrap it in a model class
    vision_only_model = VisionOnlyModel(vision_model)

    # Enable gradients for the image model's parameters
    for p in vision_only_model.parameters():
        p.requires_grad = True

    return vision_only_model


In [7]:
FE_model = fine_tune_load_image_model()
print(FE_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

VisionOnlyModel(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((1408,), eps=1e

In [9]:
vision_model = fine_tune_load_image_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
summary(
    vision_model, 
    input_size=(1, 3, 224, 224), 
    col_names=('input_size', 'output_size', 'num_params', 'kernel_size', 'mult_adds'), 
    verbose=1
)

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
VisionOnlyModel                                         [1, 3, 224, 224]          [1, 1408]                 --                        --                        --
├─Blip2VisionModel: 1-1                                 --                        [1, 1408]                 --                        --                        --
│    └─Blip2VisionEmbeddings: 2-1                       [1, 3, 224, 224]          [1, 257, 1408]            363,264                   --                        --
│    │    └─Conv2d: 3-1                                 [1, 3, 224, 224]          [1, 1408, 16, 16]         829,312                   [14, 14]                  212,303,872
│    └─Blip2Encoder: 2-2                                --                        [1, 257, 1408]            --                        --                        --
│    │

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
VisionOnlyModel                                         [1, 3, 224, 224]          [1, 1408]                 --                        --                        --
├─Blip2VisionModel: 1-1                                 --                        [1, 1408]                 --                        --                        --
│    └─Blip2VisionEmbeddings: 2-1                       [1, 3, 224, 224]          [1, 257, 1408]            363,264                   --                        --
│    │    └─Conv2d: 3-1                                 [1, 3, 224, 224]          [1, 1408, 16, 16]         829,312                   [14, 14]                  212,303,872
│    └─Blip2Encoder: 2-2                                --                        [1, 257, 1408]            --                        --                        --
│    │

In [11]:
import os

used_model = 'BLIP-2'
used_model_feature_size = 1408
level_1_checkpoint_dir = './checkpoints/' + used_model + '/level_1/'
level_1_model_file = 'model.pth'
level_1_csvlogger_file = 'log.csv'
level_1_weights_path = os.path.join(level_1_checkpoint_dir, level_1_model_file)
feature_size_extract_from_level_1 = 64
no_epoch = 10
lr = 0.00001
optimizer = 'AdamW'

# Create the directory if it doesn't exist
os.makedirs(level_1_checkpoint_dir, exist_ok=True)

# Print a message to confirm the directory creation (optional)
print(f"Directory {level_1_checkpoint_dir} is ready.")

Directory ./checkpoints/BLIP-2/level_1/ is ready.


In [12]:
used_model_feature_size = 1408  # Example value, adjust based on your model
feature_size_extract_from_level_1 = 1408 

In [13]:
# Define and prepare model components
classifier_level_1 = level_1_classifier(feature_size=used_model_feature_size, feature_size_extract_from_level_1=feature_size_extract_from_level_1)
output_layer_level_1 = level_1_output_layer(feature_size_extract_from_level_1=feature_size_extract_from_level_1, no_class=2)
model_level_1 = level_1_model(FE_model, classifier_level_1, output_layer_level_1)

In [14]:
torchinfo.summary(model_level_1, (1, 3, 224, 224), col_names=('input_size', 'output_size', 'num_params', 'kernel_size', 'mult_adds'), verbose=1)


Layer (type:depth-idx)                                       Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
level_1_model                                                [1, 3, 224, 224]          [1, 2]                    --                        --                        --
├─VisionOnlyModel: 1-1                                       [1, 3, 224, 224]          [1, 1408]                 --                        --                        --
│    └─Blip2VisionModel: 2-1                                 --                        [1, 1408]                 --                        --                        --
│    │    └─Blip2VisionEmbeddings: 3-1                       [1, 3, 224, 224]          [1, 257, 1408]            1,192,576                 --                        212,303,872
│    │    └─Blip2Encoder: 3-2                                --                        [1, 257, 1408]            984,756,864               --   

  return self._call_impl(*args, **kwargs)


Layer (type:depth-idx)                                       Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
level_1_model                                                [1, 3, 224, 224]          [1, 2]                    --                        --                        --
├─VisionOnlyModel: 1-1                                       [1, 3, 224, 224]          [1, 1408]                 --                        --                        --
│    └─Blip2VisionModel: 2-1                                 --                        [1, 1408]                 --                        --                        --
│    │    └─Blip2VisionEmbeddings: 3-1                       [1, 3, 224, 224]          [1, 257, 1408]            1,192,576                 --                        212,303,872
│    │    └─Blip2Encoder: 3-2                                --                        [1, 257, 1408]            984,756,864               --   

In [15]:
trainer = Trainer_level_1(model = model_level_1, level=1, training_dataloader = train_dataloader, validation_dataloader = val_dataloader, epoch = no_epoch, learning_rate = lr, use_gpu = True, opt_method = optimizer, checkpoint_dir = level_1_checkpoint_dir, checkpoint_filename=level_1_model_file, csv_logger = level_1_csvlogger_file  )

In [17]:
trainer.run()

Finish initializing...
2024-06-11 14:41:18.942663
EPOCH 1:


 33%|██████▋             | 234/699 [7:47:28<15:28:56, 119.86s/batch, loss=0.353]


KeyboardInterrupt: 