In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install --force-reinstall transformers==4.50.3


In [None]:
import transformers
print(transformers.__version__)  # Expected: 4.50.3 (the version pinned in the install cell)


In [None]:
from transformers import TrainingArguments
import inspect

# Print the constructor signature of TrainingArguments for the installed transformers build.
training_args_signature = inspect.signature(TrainingArguments.__init__)
print(training_args_signature)

In [None]:
import kagglehub
import pandas as pd
import os

# Fetch the image dataset through kagglehub; the returned value is used as a
# local directory path below.
dataset_dir = kagglehub.dataset_download("shaikmdirfan/images")

# Show the top-level entries of the downloaded directory.
downloaded_entries = os.listdir(dataset_dir)
print(downloaded_entries)

In [None]:
import pandas as pd
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import os
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# ✅ Load CSV and define image path
df = pd.read_csv('/kaggle/input/actual-dataset/merged_data_actual.csv')
dataset_dir = '/kaggle/input/images/abo-images-small/images/small'

# ✅ Take a reproducible random sample of 1,000 rows
df = df.sample(n=1000, random_state=42).reset_index(drop=True)

# ✅ Load BLIP VQA base model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

# ✅ Evaluation setup
results = []
smoothie = SmoothingFunction().method1
bleu_scores = []
skipped_images = 0  # rows dropped because the image could not be opened

for i in tqdm(range(len(df))):
    row = df.iloc[i]
    img_path = os.path.join(dataset_dir, row['path'])

    try:
        image = Image.open(img_path).convert("RGB")
    except Exception:
        # Best-effort: skip unreadable images, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare `except:` would.
        skipped_images += 1
        print(f"Could not open image {img_path}")
        continue

    # Missing questions/answers are treated as empty strings.
    question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
    true_answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""

    # Prompt suffix nudging the model towards short answers.
    question += " (answer in one word)"

    inputs = processor(image, question, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    pred = processor.decode(output[0], skip_special_tokens=True).strip()

    # Unigram BLEU on lower-cased, whitespace-split tokens.
    reference = [true_answer.lower().split()]
    candidate = pred.lower().split()
    bleu = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu_scores.append(bleu)

    results.append({
        'Index': i,
        'Question': question,
        'GroundTruth': true_answer,
        'Prediction': pred,
        'BLEU-1': bleu,
        # Case-insensitive exact match, stored per row like the later evaluation loops do.
        'Match': true_answer.strip().lower() == pred.strip().lower()
    })

# ✅ Convert to DataFrame and evaluate
# Explicit columns keep the frame well-formed even when every image failed to load.
res_df = pd.DataFrame(
    results,
    columns=['Index', 'Question', 'GroundTruth', 'Prediction', 'BLEU-1', 'Match']
)

# Report NaN instead of raising ZeroDivisionError when nothing could be evaluated.
accuracy = res_df['Match'].mean() if len(res_df) else float("nan")
average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else float("nan")

# ✅ Save and show
# NOTE: the "30k" in the file name does not match the 1,000-row sample taken above;
# the name is kept unchanged so existing consumers of the output still find it.
res_df.to_csv('/kaggle/working/blip_vqa_base_30k_sample_results.csv', index=False)
if skipped_images:
    print(f"Skipped {skipped_images} row(s) whose image could not be opened")
print(f"BLIP-VQA-Base Exact Match Accuracy: {accuracy:.4f}")
print(f"BLIP-VQA-Base Average BLEU-1 Score: {average_bleu:.4f}")
res_df.head(10)


Inference is done. Next, we fine-tune the blip-vqa-base model using LoRA, with approximately 400 parameters, on a subset of images.

**Description (LoRA):**
**r = 8**
This is the rank of the LoRA adapters. It defines the dimensionality of the low-rank decomposition matrices inserted into the model. A higher rank allows the model to capture more adaptation capacity but also increases the number of trainable parameters.

**lora_alpha = 32**
This is a scaling factor. The output of the LoRA layers is scaled by alpha / r to control the impact of the LoRA weights during training. It ensures the modified model does not deviate too much from the pretrained weights early on.

**lora_dropout = 0.05**
This applies dropout to the LoRA layers during training to prevent overfitting. A small value like 0.05 is common for stable training while still adding regularization.

**bias = "none"**
Specifies that bias terms are not trainable in LoRA layers. This reduces the number of parameters and avoids overfitting.

**target_modules = ["q", "v", "k", "mlp", "vision_layers", "nlp_layers"]**
These are the parts of the model where LoRA adapters are injected:

"q", "k", "v": Query, Key, and Value projection matrices in attention layers.

"mlp": Feed-forward layers.

"vision_layers": Layers in the vision encoder.

"nlp_layers": Layers in the language model (e.g., transformer decoder).

**task_type = "VQA"**
Indicates that the task is Visual Question Answering, which helps define the context in which LoRA modules operate.

In [None]:
# List the qualified name of every submodule of the loaded model.
for module_name, _submodule in model.named_modules():
    print(module_name)


In [None]:
pip install -U bitsandbytes

In [None]:
pip install bitsandbytes accelerate transformers peft


In [None]:
import pandas as pd
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from PIL import Image
import os
from torch.utils.data import Dataset
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm

# Download the NLTK 'punkt' data (if not already present).
# NOTE(review): the BLEU inputs in this cell are built with str.split(), so punkt
# does not appear to be needed here — confirm before removing the download.
nltk.download('punkt')

# Custom Dataset Class
class VQADataset(Dataset):
    """Map-style dataset turning one dataframe row into BLIP VQA inputs.

    Each item is a dict of processor outputs for the image/question pair
    (batch dimension removed) plus a ``labels`` entry holding the tokenized
    answer ids. Rows whose image cannot be opened yield ``None``.

    Args:
        df: DataFrame providing the ``path``, ``Question`` and ``Answer`` columns.
        processor: Processor called with ``images=``/``text=`` keyword arguments.
        dataset_dir: Directory that the ``path`` column is joined onto.
        max_length: Fixed token length (padding and truncation) used for both
            the question and the answer. Defaults to 32.
    """

    def __init__(self, df, processor, dataset_dir, max_length=32):
        self.df = df
        self.processor = processor
        self.dataset_dir = dataset_dir
        self.max_length = max_length

    def __len__(self):
        """Return the number of dataframe rows, including ones that may fail to load."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the model inputs for row ``idx``; return ``None`` if the image fails to load."""
        # Positional lookup, so the dataframe's index labels do not matter.
        row = self.df.iloc[idx]
        img_path = os.path.join(self.dataset_dir, row['path'])

        try:
            image = Image.open(img_path).convert("RGB")
        except Exception:
            # Best-effort: signal an unreadable image with None, without swallowing
            # KeyboardInterrupt/SystemExit the way a bare `except:` would.
            return None

        # Missing questions/answers are treated as empty strings.
        question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
        answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""

        # Format the question to encourage short answers
        question += " (answer in one word)"

        # Process the image and text
        inputs = self.processor(
            images=image,
            text=question,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True
        )

        # Process the answer
        answer_inputs = self.processor(
            text=answer,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True
        )

        # Remove the leading batch dimension added by return_tensors="pt"
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = answer_inputs["input_ids"].squeeze(0)

        return inputs

# Read the merged question/answer table and set the image root directory.
df = pd.read_csv('/kaggle/input/actual-dataset/merged_data_actual.csv')
dataset_dir = '/kaggle/input/images/abo-images-small/images/small'

# Work on a reproducible 10k-row subset with a fresh 0..N-1 index.
df = df.sample(n=10000, random_state=42)
df = df.reset_index(drop=True)

# 90% of the subset is sampled for training; the rows left over form the eval set.
train_df = df.sample(frac=0.9, random_state=42)
eval_df = df.drop(index=train_df.index)

# Processor and base model, moved to the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
model = model.to(device)

# Define LoRA configuration
# Entries of a `target_modules` list are matched against module names exactly or
# by name suffix; they are not glob/regex patterns. The previous entry
# "vision_encoder.encoder.layer.*.attention.attention" therefore matched nothing
# and the vision encoder was never adapted.
# - "query" / "value": attention projections on the text side
# - "qkv": fused attention projection of the vision encoder
# NOTE(review): names follow the Hugging Face BLIP implementation — verify them
# against the `model.named_modules()` listing printed earlier in the notebook.
lora_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=32,
    target_modules=["query", "value", "qkv"],  # Key layers to adapt
    lora_dropout=0.05,
    bias="none",
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Wrap both splits in the dataset class.
train_dataset = VQADataset(train_df, processor, dataset_dir)
eval_dataset = VQADataset(eval_df, processor, dataset_dir)

# Materialise every item up front and discard the ones whose image failed to load (None).
train_dataset = list(filter(lambda item: item is not None, train_dataset))
eval_dataset = list(filter(lambda item: item is not None, eval_dataset))

# Hyper-parameters and bookkeeping options for the Trainer.
training_kwargs = dict(
    output_dir="./blip-vqa-lora",
    # batch sizes / schedule
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=True,
    # checkpointing, evaluation and logging cadence
    save_steps=1000,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    # keep all dataset keys and disable external reporting
    remove_unused_columns=False,
    report_to="none",
)
training_args = TrainingArguments(**training_kwargs)

# Custom compute_metrics function for evaluation
def compute_metrics(p):
    """Decode an evaluation batch and score it with BLEU-1 and exact match.

    Args:
        p: Pair ``(predictions, labels)`` as handed over by the ``Trainer``.

    Returns:
        dict with the mean unigram BLEU (``"bleu"``) and the mean exact-match
        rate (``"accuracy"``); both comparisons are case-insensitive.

    NOTE(review): ``predictions`` is passed straight to
    ``processor.batch_decode``, which assumes it holds token ids — confirm
    that this is what the Trainer supplies for this model.
    """
    predictions, labels = p
    pred_texts = processor.batch_decode(predictions, skip_special_tokens=True)

    # -100 entries cannot be decoded, so swap them for the pad token id first.
    labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
    label_texts = processor.batch_decode(labels, skip_special_tokens=True)

    smoothing = SmoothingFunction().method1
    pairs = list(zip(pred_texts, label_texts))

    # Unigram BLEU on lower-cased, whitespace-split tokens.
    bleu_scores = [
        sentence_bleu(
            [gold.lower().split()],
            guess.lower().split(),
            weights=(1, 0, 0, 0),
            smoothing_function=smoothing,
        )
        for guess, gold in pairs
    ]
    exact_matches = [gold.strip().lower() == guess.strip().lower() for guess, gold in pairs]

    return {
        "bleu": np.mean(bleu_scores),
        "accuracy": np.mean(exact_matches),
    }

# Assemble the Trainer from the pieces defined above.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning.
trainer.train()

# Persist the adapted model and the processor side by side.
final_model_dir = "./blip-vqa-lora-final"
model.save_pretrained(final_model_dir)
processor.save_pretrained(final_model_dir)

# One more evaluation pass with the trained weights.
final_results = trainer.evaluate()
print("Final evaluation results:", final_results)

# Generate predictions on the held-out eval split for comparison with the baseline
test_results = []
bleu_scores = []
skipped_images = 0  # rows dropped because the image could not be opened
smoothie = SmoothingFunction().method1
model.eval()
for i in tqdm(range(len(eval_df))):
    row = eval_df.iloc[i]
    img_path = os.path.join(dataset_dir, row['path'])

    try:
        image = Image.open(img_path).convert("RGB")
    except Exception:
        # Best-effort: skip unreadable images, but count them and do not swallow
        # KeyboardInterrupt/SystemExit the way a bare `except:` would.
        skipped_images += 1
        continue

    # Missing questions/answers are treated as empty strings.
    question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
    true_answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""

    # Same prompt suffix as used when building the training items.
    question += " (answer in one word)"

    inputs = processor(image, question, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    pred = processor.decode(output[0], skip_special_tokens=True).strip()

    # Unigram BLEU on lower-cased, whitespace-split tokens.
    reference = [true_answer.lower().split()]
    candidate = pred.lower().split()
    bleu = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu_scores.append(bleu)

    test_results.append({
        'Question': question,
        'GroundTruth': true_answer,
        'Prediction': pred,
        'BLEU-1': bleu,
        'Match': true_answer.strip().lower() == pred.strip().lower()
    })

# Convert to DataFrame and evaluate
# Explicit columns keep the frame well-formed even when every image failed to load.
test_res_df = pd.DataFrame(
    test_results,
    columns=['Question', 'GroundTruth', 'Prediction', 'BLEU-1', 'Match']
)

# Report NaN instead of raising when nothing could be evaluated.
accuracy = test_res_df['Match'].mean() if len(test_res_df) else float("nan")
average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else float("nan")

# Save and show results
test_res_df.to_csv('/kaggle/working/blip_vqa_lora_finetuned_results.csv', index=False)
if skipped_images:
    print(f"Skipped {skipped_images} row(s) whose image could not be opened")
print(f"LoRA Fine-tuned Model Exact Match Accuracy: {accuracy:.4f}")
print(f"LoRA Fine-tuned Model Average BLEU-1 Score: {average_bleu:.4f}")
test_res_df.head(10)

**With quantisation**

In [None]:
# Run the bitsandbytes package as a module in a subshell.
# NOTE(review): presumably its installation/CUDA self-check — confirm the output is what you expect.
!python -m bitsandbytes


In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import BlipProcessor, BlipForQuestionAnswering, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from PIL import Image
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
from transformers import BitsAndBytesConfig

# Fetch the NLTK 'punkt' data.
nltk.download('punkt')

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Define dataset class
class VQADataset(Dataset):
    """Map-style dataset turning one dataframe row into BLIP VQA inputs.

    Each item is a dict of processor outputs for the image/question pair
    (batch dimension removed) plus a ``labels`` entry holding the tokenized
    answer ids. Rows whose image cannot be opened yield ``None``.

    Args:
        df: DataFrame providing the ``path``, ``Question`` and ``Answer``
            columns; its index is reset on construction.
        processor: Processor called with ``images=``/``text=`` keyword arguments.
        dataset_dir: Directory that the ``path`` column is joined onto.
        max_length: Fixed token length (padding and truncation) used for both
            the question and the answer. Defaults to 32.
    """

    def __init__(self, df, processor, dataset_dir, max_length=32):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.dataset_dir = dataset_dir
        self.max_length = max_length

    def __len__(self):
        """Return the number of dataframe rows, including ones that may fail to load."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the model inputs for row ``idx``; return ``None`` if the image fails to load."""
        row = self.df.iloc[idx]
        img_path = os.path.join(self.dataset_dir, row['path'])

        try:
            image = Image.open(img_path).convert("RGB")
        except Exception:
            # Best-effort: signal an unreadable image with None, without swallowing
            # KeyboardInterrupt/SystemExit the way a bare `except:` would.
            return None

        # Missing questions/answers are treated as empty strings.
        question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
        answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""

        # Prompt suffix nudging the model towards short answers.
        question += " (answer in one word)"

        inputs = self.processor(
            images=image,
            text=question,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True
        )

        answer_inputs = self.processor(
            text=answer,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True
        )

        # Remove the leading batch dimension added by return_tensors="pt".
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = answer_inputs["input_ids"].squeeze(0)

        return inputs

# Read the merged question/answer table and set the image root directory.
df = pd.read_csv('/kaggle/input/actual-dataset/merged_data_actual.csv')
dataset_dir = '/kaggle/input/images/abo-images-small/images/small'

# Work on a reproducible 10k-row subset with a fresh 0..N-1 index.
df = df.sample(n=10000, random_state=42)
df = df.reset_index(drop=True)

# 90% of the subset is sampled for training; the rows left over form the eval set.
train_df = df.sample(frac=0.9, random_state=42)
eval_df = df.drop(index=train_df.index)

# Processor for the BLIP VQA base checkpoint.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Load the base model with 8-bit bitsandbytes quantization and automatic device placement.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_load_kwargs = dict(
    quantization_config=quantization_config,
    device_map="auto",
)
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base", **model_load_kwargs)

# Define LoRA configuration
# Entries of a `target_modules` list are matched against module names exactly or
# by name suffix; they are not glob/regex patterns. The previous entry
# "vision_encoder.encoder.layer.*.attention.attention" therefore matched nothing
# and the vision encoder was never adapted.
# - "query" / "value": attention projections on the text side
# - "qkv": fused attention projection of the vision encoder
# NOTE(review): names follow the Hugging Face BLIP implementation — verify them
# against the `model.named_modules()` listing printed earlier in the notebook.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value", "qkv"],
    lora_dropout=0.05,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Create datasets
train_dataset = VQADataset(train_df, processor, dataset_dir)
eval_dataset = VQADataset(eval_df, processor, dataset_dir)

# Filter out None values (items whose image failed to load). This materialises
# every item in memory up front.
train_dataset = [x for x in train_dataset if x is not None]
eval_dataset = [x for x in eval_dataset if x is not None]

# Training arguments
# The checkpoint directory is specific to the 8-bit run: the non-quantized cell
# above writes to "./blip-vqa-lora", and sharing it would overwrite those checkpoints.
training_args = TrainingArguments(
    output_dir="./blip-vqa-lora-8bit",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    fp16=True,
    save_steps=1000,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=500,
    remove_unused_columns=False,  # keep every key produced by the dataset items
    report_to="none"
)

# Define compute_metrics function
def compute_metrics(p):
    """Decode an evaluation batch and score it with BLEU-1 and exact match.

    Args:
        p: Pair ``(predictions, labels)`` as handed over by the ``Trainer``.

    Returns:
        dict with the mean unigram BLEU (``"bleu"``) and the mean exact-match
        rate (``"accuracy"``); both comparisons are case-insensitive.

    NOTE(review): ``predictions`` is passed straight to
    ``processor.batch_decode``, which assumes it holds token ids — confirm
    that this is what the Trainer supplies for this model.
    """
    predictions, labels = p
    pred_texts = processor.batch_decode(predictions, skip_special_tokens=True)

    # -100 entries cannot be decoded, so swap them for the pad token id first.
    labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
    label_texts = processor.batch_decode(labels, skip_special_tokens=True)

    smoothing = SmoothingFunction().method1
    pairs = list(zip(pred_texts, label_texts))

    # Unigram BLEU on lower-cased, whitespace-split tokens.
    bleu_scores = [
        sentence_bleu(
            [gold.lower().split()],
            guess.lower().split(),
            weights=(1, 0, 0, 0),
            smoothing_function=smoothing,
        )
        for guess, gold in pairs
    ]
    exact_matches = [gold.strip().lower() == guess.strip().lower() for guess, gold in pairs]

    return {
        "bleu": np.mean(bleu_scores),
        "accuracy": np.mean(exact_matches),
    }

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model
# Use a directory specific to the 8-bit run: the non-quantized cell above saves
# its adapter to "./blip-vqa-lora-final", which this cell would otherwise overwrite.
model.save_pretrained("./blip-vqa-lora-8bit-final")
processor.save_pretrained("./blip-vqa-lora-8bit-final")

# Evaluate after training
final_results = trainer.evaluate()
print("Final evaluation results:", final_results)

# Generate predictions on the held-out eval split
test_results = []
bleu_scores = []
skipped_images = 0  # rows dropped because the image could not be opened
smoothie = SmoothingFunction().method1
model.eval()

for i in tqdm(range(len(eval_df))):
    row = eval_df.iloc[i]
    img_path = os.path.join(dataset_dir, row['path'])

    try:
        image = Image.open(img_path).convert("RGB")
    except Exception:
        # Best-effort: skip unreadable images, but count them and do not swallow
        # KeyboardInterrupt/SystemExit the way a bare `except:` would.
        skipped_images += 1
        continue

    # Missing questions/answers are treated as empty strings.
    question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
    true_answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""

    # Same prompt suffix as used when building the training items.
    question += " (answer in one word)"

    inputs = processor(image, question, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    pred = processor.decode(output[0], skip_special_tokens=True).strip()

    # Unigram BLEU on lower-cased, whitespace-split tokens.
    reference = [true_answer.lower().split()]
    candidate = pred.lower().split()
    bleu = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu_scores.append(bleu)

    test_results.append({
        'Question': question,
        'GroundTruth': true_answer,
        'Prediction': pred,
        'BLEU-1': bleu,
        'Match': true_answer.strip().lower() == pred.strip().lower()
    })

# Convert to DataFrame and evaluate
# Explicit columns keep the frame well-formed even when every image failed to load.
test_res_df = pd.DataFrame(
    test_results,
    columns=['Question', 'GroundTruth', 'Prediction', 'BLEU-1', 'Match']
)

# Report NaN instead of raising when nothing could be evaluated.
accuracy = test_res_df['Match'].mean() if len(test_res_df) else float("nan")
average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else float("nan")

# Save and show results
# The file name and labels are specific to the 8-bit run so that the results of the
# non-quantized LoRA cell (blip_vqa_lora_finetuned_results.csv) are not overwritten.
test_res_df.to_csv('/kaggle/working/blip_vqa_lora_8bit_finetuned_results.csv', index=False)
if skipped_images:
    print(f"Skipped {skipped_images} row(s) whose image could not be opened")
print(f"8-bit LoRA Fine-tuned Model Exact Match Accuracy: {accuracy:.4f}")
print(f"8-bit LoRA Fine-tuned Model Average BLEU-1 Score: {average_bleu:.4f}")
print(test_res_df.head(10))


In [None]:
pip install -U bitsandbytes

**Trying another, larger model with a higher number of parameters**

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    Blip2Processor,
    Blip2ForConditionalGeneration,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from peft import LoraConfig, get_peft_model
from PIL import Image
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import warnings

# Global setup: silence all warnings, fetch NLTK data, choose the device.
warnings.filterwarnings("ignore")
nltk.download('punkt')
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Read the question/answer table and keep a reproducible 10k-row subset.
df = pd.read_csv('/kaggle/input/actual-dataset/merged_data_actual.csv')
dataset_dir = '/kaggle/input/images/abo-images-small/images/small'
df = df.sample(n=10000, random_state=42)
df = df.reset_index(drop=True)

# Splits:
# - initial_eval_df: 100 rows used for the before/after comparison, never trained on
# - train_df: 90% of the remaining rows
# - eval_df: every row not in train_df (so it also contains the initial_eval_df rows)
initial_eval_df = df.sample(n=100, random_state=0)
training_pool = df.drop(index=initial_eval_df.index)
train_df = training_pool.sample(frac=0.9, random_state=1)
eval_df = df.drop(index=train_df.index)

# Processor for the BLIP-2 Flan-T5-XL checkpoint.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# 4-bit NF4 quantization with double quantization and float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized model without a device map, then move it to the chosen device.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map=None,
)
model = model.to(device)

# Dataset class
class VQADataset(Dataset):
    """Eagerly pre-processed VQA dataset for BLIP-2 fine-tuning.

    Every row is converted to model inputs in ``__init__`` and kept in
    ``self.samples``; rows whose image cannot be opened are skipped. Each
    sample is a dict of processor outputs (batch dimension removed) plus a
    ``labels`` tensor with the tokenized answer, where padding positions are
    replaced by -100 so that they are ignored by the loss.

    Args:
        df: DataFrame providing the ``path``, ``Question`` and ``Answer`` columns.
        processor: Processor called with ``images=``/``text=``; its
            ``tokenizer`` attribute is used to encode the answer.
        dataset_dir: Directory that the ``path`` column is joined onto.
        max_length: Fixed token length (padding and truncation) used for both
            the question and the answer. Defaults to 32.
    """

    def __init__(self, df, processor, dataset_dir, max_length=32):
        self.samples = []
        self.processor = processor
        self.dataset_dir = dataset_dir
        self.max_length = max_length

        for _, row in df.iterrows():
            img_path = os.path.join(dataset_dir, row['path'])
            try:
                image = Image.open(img_path).convert("RGB")
            except Exception:
                # Best-effort: skip unreadable images without swallowing
                # KeyboardInterrupt/SystemExit the way a bare `except:` would.
                continue

            # Treat missing values as empty strings (as the BLIP cells above do)
            # instead of letting str(NaN) turn them into the literal text "nan".
            question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
            answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""
            question += " (answer in one word)"

            inputs = self.processor(
                images=image,
                text=question,
                return_tensors="pt",
                padding="max_length",
                max_length=self.max_length,
                truncation=True
            )
            labels = self.processor.tokenizer(
                answer,
                return_tensors="pt",
                max_length=self.max_length,
                padding="max_length",
                truncation=True
            ).input_ids

            # Mask padding so it does not contribute to the loss: -100 is the
            # ignore index, and compute_metrics already maps -100 back to the
            # pad token id before decoding.
            pad_token_id = self.processor.tokenizer.pad_token_id
            if pad_token_id is not None:
                labels[labels == pad_token_id] = -100

            # Remove the leading batch dimension added by return_tensors="pt".
            input_dict = {k: v.squeeze(0) for k, v in inputs.items()}
            input_dict['labels'] = labels.squeeze(0)
            self.samples.append(input_dict)

    def __len__(self):
        """Return the number of successfully pre-processed samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the pre-computed sample at position ``idx``."""
        return self.samples[idx]

# Inference function
def run_inference(model, processor, eval_df, dataset_dir):
    """Generate an answer for every row of ``eval_df`` and score it.

    Args:
        model: Model exposing ``eval()`` and ``generate(**inputs, max_new_tokens=...)``.
        processor: Processor called with ``images=``/``text=``; its ``tokenizer``
            is used to decode the generated ids.
        eval_df: DataFrame providing the ``path``, ``Question`` and ``Answer`` columns.
        dataset_dir: Directory that the ``path`` column is joined onto.

    Returns:
        DataFrame with the columns ``Question``, ``GroundTruth``, ``Prediction``,
        ``BLEU-1`` and ``Match`` (case-insensitive exact match). Rows whose image
        cannot be opened are skipped; the columns are present even when no row
        could be evaluated.

    Note:
        Inputs are moved to the module-level ``device``. The model is switched
        to eval mode and left there, so dropout is inactive while generating.
    """
    smoothie = SmoothingFunction().method1
    results = []

    # After trainer.train() the model can still be in training mode, which keeps
    # dropout (including LoRA dropout) active and makes predictions noisy.
    model.eval()

    for _, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
        img_path = os.path.join(dataset_dir, row['path'])
        try:
            image = Image.open(img_path).convert("RGB")
        except Exception:
            # Best-effort: skip unreadable images without swallowing
            # KeyboardInterrupt/SystemExit the way a bare `except:` would.
            continue

        # Treat missing values as empty strings (as the BLIP cells above do)
        # instead of letting str(NaN) turn them into the literal text "nan".
        question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
        answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""
        question += " (answer in one word)"

        inputs = processor(images=image, text=question, return_tensors="pt").to(device)

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=32)

        pred = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()

        # Unigram BLEU on lower-cased, whitespace-split tokens.
        ref = [answer.lower().split()]
        cand = pred.lower().split()
        bleu = sentence_bleu(ref, cand, weights=(1, 0, 0, 0), smoothing_function=smoothie)

        results.append({
            "Question": question,
            "GroundTruth": answer,
            "Prediction": pred,
            "BLEU-1": bleu,
            "Match": answer.strip().lower() == pred.strip().lower()
        })

    # Explicit columns so callers can index 'Match' / 'BLEU-1' even on an empty result.
    return pd.DataFrame(
        results,
        columns=["Question", "GroundTruth", "Prediction", "BLEU-1", "Match"]
    )

# Score the untouched model on the 100-row comparison split and keep the raw rows.
print("Running initial inference...")
initial_df = run_inference(model, processor, initial_eval_df, dataset_dir)
initial_df.to_csv("/kaggle/working/blip2_before_finetune.csv", index=False)

# Summary numbers and a small preview of the predictions.
initial_accuracy = initial_df['Match'].mean()
initial_bleu = initial_df['BLEU-1'].mean()
preview_columns = ['Question', 'GroundTruth', 'Prediction']
preview_text = initial_df[preview_columns].head(10).to_string(index=False)

print("\n✅ First Inference Results (Before Fine-tuning):")
print(f"Total Samples Evaluated: {len(initial_df)}")
print(f"Accuracy: {initial_accuracy:.4f}")
print(f"Average BLEU-1 Score: {initial_bleu:.4f}")
print("\n🔍 Sample Predictions:")
print(preview_text)

# Wrap the model with LoRA adapters on modules whose names match "q" / "v".
lora_settings = dict(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
)
lora_config = LoraConfig(**lora_settings)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Pre-process both splits (done eagerly inside the dataset constructor).
train_dataset = VQADataset(train_df, processor, dataset_dir)
eval_dataset = VQADataset(eval_df, processor, dataset_dir)

# Hyper-parameters and bookkeeping options for the Trainer.
training_kwargs = dict(
    output_dir="/kaggle/working/blip2-flan-vqa-lora",
    # batch sizes / schedule
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=True,
    # checkpointing, evaluation and logging cadence
    save_steps=1000,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    # keep all dataset keys and disable external reporting
    remove_unused_columns=False,
    report_to="none",
)
training_args = TrainingArguments(**training_kwargs)

# Compute metrics
def compute_metrics(p):
    """Decode an evaluation batch and score it with BLEU-1 and exact match.

    Args:
        p: Pair ``(preds, labels)`` as handed over by the ``Trainer``.

    Returns:
        dict with the mean unigram BLEU (``"bleu"``) and the mean exact-match
        rate (``"accuracy"``); both comparisons are case-insensitive.

    NOTE(review): ``preds`` is passed straight to the tokenizer's
    ``batch_decode``, which assumes it holds token ids — confirm that this is
    what the Trainer supplies for this model.
    """
    preds, labels = p
    tokenizer = processor.tokenizer
    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100 entries cannot be decoded, so swap them for the pad token id first.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    smoothing = SmoothingFunction().method1
    bleu_scores = []
    matches = []
    for guess, gold in zip(pred_texts, label_texts):
        # Unigram BLEU on lower-cased, whitespace-split tokens.
        score = sentence_bleu(
            [gold.lower().split()],
            guess.lower().split(),
            weights=(1, 0, 0, 0),
            smoothing_function=smoothing,
        )
        bleu_scores.append(score)
        matches.append(gold.strip().lower() == guess.strip().lower())

    return {"bleu": np.mean(bleu_scores), "accuracy": np.mean(matches)}

# Assemble the Trainer from the pieces defined above.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning.
print("\n🚀 Starting training...")
trainer.train()

# Persist the adapted model and the processor side by side.
final_model_dir = "/kaggle/working/blip2-flan-vqa-lora-final"
model.save_pretrained(final_model_dir)
processor.save_pretrained(final_model_dir)

# Re-score the same 100-row split that was used before fine-tuning.
print("\n📈 Running inference after fine-tuning...")
post_df = run_inference(model, processor, initial_eval_df, dataset_dir)
post_df.to_csv("/kaggle/working/blip2_after_finetune.csv", index=False)

# Side-by-side before/after numbers.
accuracy_before = initial_df['Match'].mean()
accuracy_after = post_df['Match'].mean()
bleu_before = initial_df['BLEU-1'].mean()
bleu_after = post_df['BLEU-1'].mean()

print("\n🎯 Final Comparison:")
print(f"Before Fine-tuning Accuracy: {accuracy_before:.4f}")
print(f"After Fine-tuning Accuracy:  {accuracy_after:.4f}")
print(f"Before BLEU-1: {bleu_before:.4f}")
print(f"After BLEU-1:  {bleu_after:.4f}")


In [None]:
pip install -U bitsandbytes