In [13]:
#pip install peft

Defaulting to user installation because normal site-packages is not writeable
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import random
import numpy as np
from tqdm import tqdm
import json

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
# class VQADataset(Dataset):
#     def __init__(self, annotations_file, image_dir, processor, max_length=32):
#         """
#         Dataset for VQA fine-tuning
        
#         Args:
#             image_dir (str): Directory containing the images
#             annotations_file (str): Path to annotations file (should contain image_id, question, answer)
#             processor (Blip2Processor): BLIP-2 processor
#             max_length (int): Maximum length for answer generation
#         """
#         self.image_dir = image_dir
#         self.processor = processor
#         self.max_length = max_length
        
#         # Load annotations
#         self.samples = []
#         with open(annotations_file, 'r') as f:
#             data = json.load(f)  # Load the entire JSON file as a dictionary

#         for key, item in data.items():  # Iterate over key-value pairs
#             self.samples.append({
#                 'image_path': item['path'],
#                 'question': item['query'],
#                 'answer': item['answer']
#             })
    
#     def __len__(self):
#         return len(self.samples)
    
#     def __getitem__(self, idx):
#         #idx = idx + 1000
#         item = self.samples[idx]
#         #print(idx)
#         image_path = os.path.join(self.image_dir, f"{item['image_path']}")
#         image = Image.open(image_path).convert('RGB')
        
#         # Process inputs
#         inputs = self.processor(
#             images=image,
#             text=item['question'],
#             padding="max_length",
#             return_tensors="pt"
#         )
        
#         # Process targets
#         target = self.processor(
#             text=item['answer'],
#             padding="max_length",
#             max_length=self.max_length,
#             return_tensors="pt"
#         )
        
#         # Remove batch dimension
#         for k, v in inputs.items():
#             inputs[k] = v.squeeze(0)
        
#         labels = target.input_ids.squeeze(0)
#         labels[labels == self.processor.tokenizer.pad_token_id] = -100  # Set padding tokens to -100 to ignore them in loss
        
#         return {
#             "pixel_values": inputs.pixel_values,
#             "input_ids": inputs.input_ids,
#             "attention_mask": inputs.attention_mask,
#             "labels": labels
#         }
#     def __getitem__(self, idx):
#         item = self.samples[idx]
        
#         image_path = item['image_path']
#         image = Image.open('datasets/' + image_path).convert('RGB')
#         text = item['question']
#         answer = item['answer']
        
#         encoding = self.processor(image, text, padding="max_length", truncation=True, return_tensors="pt")
#         labels = self.processor.tokenizer.encode(
#             answer, max_length= 32, pad_to_max_length=True, return_tensors='pt'
#         )
#         encoding["labels"] = labels
#         # remove batch dimension
#         for k,v in encoding.items():
#             encoding[k] = v.squeeze()
#         return encoding

class VQADataset(Dataset):
    def __init__(self, annotations_file, image_dir, processor, max_length=32):
        """
        Dataset for VQA fine-tuning
        
        Args:
            image_dir (str): Directory containing the images
            annotations_file (str): Path to annotations file (should contain image_id, question, answer)
            processor (Blip2Processor): BLIP-2 processor
            max_length (int): Maximum length for answer generation
        """
        self.image_dir = image_dir
        self.processor = processor
        self.max_length = max_length
        
        # Load annotations
        self.samples = []
        with open(annotations_file, 'r') as f:
            data = json.load(f)  # Load the entire JSON file as a dictionary
        for key, item in data.items():  # Iterate over key-value pairs
            self.samples.append({
                'image_path': item['path'],
                'question': item['query'],
                'answer': item['answer']
            })
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        item = self.samples[idx]
        image_path = os.path.join(self.image_dir, f"{item['image_path']}")
        image = Image.open(image_path).convert('RGB')
        
        # Process inputs
        inputs = self.processor(
            images=image,
            text=item['question'],
            padding="max_length",  # This ensures consistent tensor sizes for input_ids
            truncation=True,       # Add truncation to handle long inputs
            return_tensors="pt"
        )
        
        # Process targets with fixed max_length
        target = self.processor(
            text=item['answer'],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,       # Add truncation to handle long answers
            return_tensors="pt"
        )
        
        # Remove batch dimension
        for k, v in inputs.items():
            inputs[k] = v.squeeze(0)
        
        labels = target.input_ids.squeeze(0)
        
        # Ensure consistent label size by padding or truncating
        if labels.size(0) < self.max_length:
            # Pad with -100 (ignore index) to max_length
            padding = torch.full((self.max_length - labels.size(0),), -100, dtype=labels.dtype)
            labels = torch.cat([labels, padding])
        elif labels.size(0) > self.max_length:
            # Truncate to max_length
            labels = labels[:self.max_length]
        
        # Set padding tokens to -100 to ignore them in loss calculation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        
        return {
            "pixel_values": inputs.pixel_values,
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": labels
        }

In [4]:
local_model_path = os.path.expanduser("~/.cache/huggingface/hub/models--Salesforce--blip2-flan-t5-xl/snapshots/0eb0d3b46c14c1f8c7680bca2693baafdb90bb28/")

# Load Processor (Tokenizer + Image Processor)
processor = Blip2Processor.from_pretrained(local_model_path)

# Load Model
model = Blip2ForConditionalGeneration.from_pretrained(
    local_model_path)

print("Model and Processor loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Processor loaded successfully!


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((

In [6]:
for name, param in model.named_parameters():
    if "lm_head" not in name: 
        
        param.requires_grad = False
    else:
        print(name, param.shape)
        param.requires_grad = True

language_model.lm_head.weight torch.Size([32128, 2048])


In [7]:
# from peft import LoraConfig, get_peft_model

# # Let's define the LoraConfig
# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     #target_modules=["language_model.decoder.block.23.layer.1.EncDecAttention.q", "language_model.decoder.block.23.layer.1.EncDecAttention.k"]
# )

In [8]:
# for name, module in model.named_modules():
#     print(name)

In [9]:
# model = get_peft_model(model, config)
# model.print_trainable_parameters()

In [10]:
trainable_params = filter(lambda p: p.requires_grad, model.parameters())
num_trainable_params = sum(p.numel() for p in trainable_params)

print(f"Number of trainable parameters: {num_trainable_params}")

Number of trainable parameters: 65798144


In [6]:
def vqa_collate_fn(batch):
    """
    Custom collate function for VQA dataset to ensure consistent tensor sizes.
    
    Args:
        batch: List of samples from the dataset
        
    Returns:
        Dict with batched pixel_values, input_ids, attention_mask, and labels
    """
    # Initialize lists for each key
    pixel_values = []
    input_ids = []
    attention_mask = []
    labels = []
    
    # Extract and collect values for each key
    for item in batch:
        pixel_values.append(item["pixel_values"])
        input_ids.append(item["input_ids"])
        attention_mask.append(item["attention_mask"])
        labels.append(item["labels"])
    
    # Stack tensors
    # Print shapes for debugging (optional)
    # print(f"Pixel shapes: {[pv.shape for pv in pixel_values]}")
    # print(f"Input ID shapes: {[ids.shape for ids in input_ids]}")
    # print(f"Attention mask shapes: {[mask.shape for mask in attention_mask]}")
    # print(f"Label shapes: {[lbl.shape for lbl in labels]}")
    
    # Stack tensors with consistent sizes
    return {
        "pixel_values": torch.stack(pixel_values),
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels)
    }

In [12]:
def train(args):
    seed_everything(args.seed)
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load model and processor
#     processor = Blip2Processor.from_pretrained(args.model_name)
#     model = Blip2ForConditionalGeneration.from_pretrained(args.model_name)
#     model.to(device)
    
    # Prepare datasets
    train_dataset = VQADataset(
        image_dir=args.train_image_dir,
        annotations_file=args.train_annotations,
        processor=processor,
        max_length=args.max_length
    )
    
    val_dataset = VQADataset(
        image_dir=args.val_image_dir,
        annotations_file=args.val_annotations,
        processor=processor,
        max_length=args.max_length
    )
    
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        collate_fn=vqa_collate_fn,
        num_workers=args.num_workers
    )
    
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        collate_fn=vqa_collate_fn,
        num_workers=args.num_workers
    )
    
    # Optimizer and scheduler
    optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    total_steps = len(train_dataloader) * args.num_epochs
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)
    
    # Training loop
    best_val_loss = float('inf')
    
    for epoch in range(args.num_epochs):
        # Training
        model.train()
        train_loss = 0.0
        
        train_pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{args.num_epochs} [Train]")
#         for batch in train_pbar:
#             # Move batch to device
#             batch = {k: v.to(device) for k, v in batch.items()}
            
#             # Forward pass
#             outputs = model(
#                 pixel_values=batch["pixel_values"],
#                 input_ids=batch["input_ids"],
#                 attention_mask=batch["attention_mask"],
#                 labels=batch["labels"]
#             )
        for idx, batch in zip(tqdm(range(len(train_dataloader)), desc='Training batch: ...'), train_dataloader):
            input_ids = batch.pop('input_ids').to(device)
            pixel_values = batch.pop('pixel_values').to(device)
            attention_masked = batch.pop('attention_mask').to(device)
            labels = batch.pop('labels').to(device)
            outputs = model(input_ids=input_ids,
                        pixel_values=pixel_values,
                        # attention_mask=attention_masked,
                        labels=labels)
            
            loss = outputs.loss
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            
            # Gradient clipping
            if args.clip_grad_norm > 0:
                nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm)
            
            optimizer.step()
            scheduler.step()
            
            train_loss += loss.item()
            train_pbar.set_postfix({"loss": loss.item()})
        
        train_loss /= len(train_dataloader)
        
        # Validation
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            val_pbar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{args.num_epochs} [Val]")
            for batch in val_pbar:
                # Move batch to device
                batch = {k: v.to(device) for k, v in batch.items()}
                
                # Forward pass
                outputs = model(
                    pixel_values=batch["pixel_values"],
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"]
                )
                
                loss = outputs.loss
                val_loss += loss.item()
                val_pbar.set_postfix({"loss": loss.item()})
        
        val_loss /= len(val_dataloader)
        
        print(f"Epoch {epoch+1}/{args.num_epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print(f"Saving best model with validation loss: {val_loss:.4f}")
            model.save_pretrained(os.path.join(args.output_dir, f"best_model"))
            processor.save_pretrained(os.path.join(args.output_dir, f"best_model"))
        
        # Save checkpoint
        if (epoch + 1) % args.save_every == 0:
            model.save_pretrained(os.path.join(args.output_dir, f"checkpoint-{epoch+1}"))
            processor.save_pretrained(os.path.join(args.output_dir, f"checkpoint-{epoch+1}"))


In [13]:
# processor = Blip2Processor.from_pretrained(local_model_path)

# # Load Model
# model = Blip2ForConditionalGeneration.from_pretrained(local_model_path)
# model.to(device)
# model.eval()

In [7]:
def inference_example(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     local_model_path = os.path.expanduser("~/.cache/huggingface/hub/models--Salesforce--blip2-flan-t5-xl/snapshots/0eb0d3b46c14c1f8c7680bca2693baafdb90bb28/")
#     # Load model and processor
#     # Load Processor (Tokenizer + Image Processor)
    
    def __init__(self, model_path):
        self.model_path = model_path
        local_model_path = self.model_path
        
#     processor = Blip2Processor.from_pretrained(local_model_path)

#     # Load Model
    model = Blip2ForConditionalGeneration.from_pretrained(local_model_path)
    model.to(device)
    model.eval()
    
    # Load image and ask question
    image = Image.open(args.test_image).convert('RGB')
    question = args.test_question
    
    # Process inputs
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    
    # Generate answer
    outputs = model.generate(
        **inputs,
        max_length=args.max_length,
        num_beams=args.num_beams,
        min_length=1,
        do_sample=args.do_sample,
        top_p=args.top_p,
        temperature=args.temperature,
        repetition_penalty=args.repetition_penalty
    )
    
    # Decode answer
    #answer = processor.decode(outputs[0], skip_special_tokens=True)
    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    print(f"Question: {question}")
    print(f"Answer: {answer}")

In [14]:
class Args:
    def __init__(self):
        # General parameters
        self.seed = 42
        #self.model_name = "Salesforce/blip2-opt-2.7b"
        self.output_dir = "test_blip2flant5_vqa_finetuned"
        
        # Data parameters
        self.train_image_dir = ""
        self.train_annotations = "converted_vizwiz_train.json"
        self.val_image_dir = ""
        self.val_annotations = "converted_vizwiz_val.json"
        self.max_length = 32
        
        # Training parameters
        self.num_epochs = 20
        self.batch_size = 32
        self.learning_rate = 5e-5
        self.weight_decay = 0.01
        self.clip_grad_norm = 1.0
        self.num_workers = 1
        self.save_every = 1
        
        # Inference parameters
        self.model_path = "test_blip2flant5_vqa_finetuned/best_model"
        self.test_image = "train/VizWiz_train_00000005.jpg"
        self.test_question = "Question: What's this? Answer: "
        self.num_beams = 5
        self.do_sample = False
        self.top_p = 0.9
        self.temperature = 1.0
        self.repetition_penalty = 1.0

# Create output directory
os.makedirs("test_blip2flant5_vqa_finetuned", exist_ok=True)

# For training
# train_args = Args()
# train(train_args)

# For inference
inference_args = Args()
inference_example(inference_args)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Question: Question: What's this? Answer: 
Answer: a pair of shoes in a bathroom


In [16]:
import torch
print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Cached memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
print(f"Available memory: {torch.cuda.mem_get_info()[0] / 1024**2:.2f} MB")

Allocated memory: 17941.98 MB
Cached memory: 38174.00 MB
Available memory: 42336.75 MB


In [18]:
ls

augmented_train_label.json   miscelleneous.ipynb
augmented_val_label.json     [0m[01;34mModel[0m/
[01;34mblip2flant5_vqa_finetuned[0m/   Peft_Train_FlanT5XL.ipynb
BLIP2Inference.ipynb         [01;34mtest_blip2flant5_vqa_finetuned[0m/
[01;34mblip2opt_vqa_finetuned[0m/      [01;34mtrain[0m/
BLIP2Train.ipynb             Train_FlanT5XL.ipynb
converted_vizwiz_train.json  train_label.json
converted_vizwiz_val.json    TrainOPT.ipynb
dataset.py                   Untitled.ipynb
[01;34mdatasets[0m/                    val_label.json
