In [None]:
import pandas as pd
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import os
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Evaluate the pretrained BLIP VQA base checkpoint on a random sample of the
# question/answer CSV, scoring every prediction with exact match and BLEU-1,
# then save the per-row results and print the aggregate metrics.

# Load CSV and define image path
df = pd.read_csv('/kaggle/input/actual-dataset/final.csv')
dataset_dir = '/kaggle/input/images/abo-images-small/images/small'

# Take a random sample of up to 30,000 rows. Capping at len(df) keeps
# DataFrame.sample from raising when the CSV holds fewer rows than requested.
df = df.sample(n=min(30000, len(df)), random_state=42).reset_index(drop=True)

# Load BLIP VQA base model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

# Evaluation setup
results = []
smoothie = SmoothingFunction().method1
bleu_scores = []

# After reset_index(drop=True) the index label equals the row position, so
# `i` is the same value the positional loop would have produced.
for i, row in tqdm(df.iterrows(), total=len(df)):
    img_path = os.path.join(dataset_dir, row['path'])

    # Skip unreadable images instead of aborting the run. Only I/O and decode
    # failures are caught, so Ctrl-C still interrupts the loop.
    try:
        with Image.open(img_path) as img:
            image = img.convert("RGB")
    except (OSError, ValueError) as exc:
        print(f"Could not open image {img_path}: {exc}")
        continue

    # Missing cells become empty strings so the string handling below is safe.
    question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
    true_answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""

    # Prompt hint appended to every question; it is stored with the result too.
    question += " (answer in one word)"

    inputs = processor(image, question, return_tensors="pt").to(device)
    # No `temperature` argument: generate() decodes greedily unless
    # do_sample=True is passed, and temperature is ignored in that mode.
    output = model.generate(**inputs)
    pred = processor.decode(output[0], skip_special_tokens=True).strip()

    # Unigram-only BLEU on lowercased, whitespace-split tokens.
    reference = [true_answer.lower().split()]
    candidate = pred.lower().split()
    bleu = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu_scores.append(bleu)

    results.append({
        'Index': i,
        'Question': question,
        'GroundTruth': true_answer,
        'Prediction': pred,
        'BLEU-1': bleu,
        # Case-insensitive exact match; both strings are already stripped.
        'Match': true_answer.lower() == pred.lower(),
    })

# Convert to DataFrame and evaluate. Explicit columns keep the schema stable
# even when no image could be opened and `results` is empty.
res_df = pd.DataFrame(
    results,
    columns=['Index', 'Question', 'GroundTruth', 'Prediction', 'BLEU-1', 'Match'],
)

if results:
    accuracy = res_df['Match'].mean()
    average_bleu = sum(bleu_scores) / len(bleu_scores)
else:
    # Nothing was evaluated: report NaN rather than dividing by zero.
    accuracy = average_bleu = float('nan')

# Save and show
res_df.to_csv('/kaggle/working/blip_vqa_base_30k_sample_results.csv', index=False)
print(f"BLIP-VQA-Base Exact Match Accuracy: {accuracy:.4f}")
print(f"BLIP-VQA-Base Average BLEU-1 Score: {average_bleu:.4f}")
res_df.head(10)

100%|██████████| 30000/30000 [49:42<00:00, 10.06it/s]


BLIP-VQA-Base Exact Match Accuracy: 0.2536
BLIP-VQA-Base Average BLEU-1 Score: 0.2592


Unnamed: 0,Index,Question,GroundTruth,Prediction,BLEU-1,Match
0,0,What is the pattern of the shoe sole? (answer ...,Ridged,stripes,0.0,False
1,1,"Judging by the image, what is the rust-resista...",Brown,rust,0.0,False
2,2,What color is the lid of the 365 Everyday Valu...,Green,green,1.0,True
3,3,"Judging from the image, what color is the ""Who...",White,brown,0.0,False
4,4,Considering the Rivet Theresa chair's upholste...,Dotted,tweed,0.0,False
5,5,"Given the ""Boy and Girl"" design, what shape ar...",Hearts,heart,0.0,False
6,6,Based on the AmazonBasics mat's double-dot tex...,Non-slip,high,0.0,False
7,7,What color is the handle of the garden tool? (...,Blue,blue,1.0,True
8,8,What design is printed on the cups? (answer in...,Leaves,leaf,0.0,False
9,9,What color is the plumbing fixture shown in th...,Black,silver,0.0,False


In [None]:
import pandas as pd
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import os
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Evaluate the pretrained BLIP VQA base checkpoint on a random sample of the
# question/answer CSV, scoring every prediction with exact match and BLEU-1,
# then save the per-row results and print the aggregate metrics.

# Load CSV and define image path
df = pd.read_csv('/kaggle/input/actual-dataset/final.csv')
dataset_dir = '/kaggle/input/images/abo-images-small/images/small'

# Take a random sample of up to 20,000 rows. Capping at len(df) keeps
# DataFrame.sample from raising when the CSV holds fewer rows than requested.
df = df.sample(n=min(20000, len(df)), random_state=42).reset_index(drop=True)

# Load BLIP VQA base model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

# Evaluation setup
results = []
smoothie = SmoothingFunction().method1
bleu_scores = []

# After reset_index(drop=True) the index label equals the row position, so
# `i` is the same value the positional loop would have produced.
for i, row in tqdm(df.iterrows(), total=len(df)):
    img_path = os.path.join(dataset_dir, row['path'])

    # Skip unreadable images instead of aborting the run. Only I/O and decode
    # failures are caught, so Ctrl-C still interrupts the loop.
    try:
        with Image.open(img_path) as img:
            image = img.convert("RGB")
    except (OSError, ValueError) as exc:
        print(f"Could not open image {img_path}: {exc}")
        continue

    # Missing cells become empty strings so the string handling below is safe.
    question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
    true_answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""

    # Prompt hint appended to every question; it is stored with the result too.
    question += " (answer in one word)"

    inputs = processor(image, question, return_tensors="pt").to(device)
    # No `temperature` argument: generate() decodes greedily unless
    # do_sample=True is passed, and temperature is ignored in that mode.
    # To actually study temperature, pass do_sample=True together with it.
    output = model.generate(**inputs)
    pred = processor.decode(output[0], skip_special_tokens=True).strip()

    # Unigram-only BLEU on lowercased, whitespace-split tokens.
    reference = [true_answer.lower().split()]
    candidate = pred.lower().split()
    bleu = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu_scores.append(bleu)

    results.append({
        'Index': i,
        'Question': question,
        'GroundTruth': true_answer,
        'Prediction': pred,
        'BLEU-1': bleu,
        # Case-insensitive exact match; both strings are already stripped.
        'Match': true_answer.lower() == pred.lower(),
    })

# Convert to DataFrame and evaluate. Explicit columns keep the schema stable
# even when no image could be opened and `results` is empty.
res_df = pd.DataFrame(
    results,
    columns=['Index', 'Question', 'GroundTruth', 'Prediction', 'BLEU-1', 'Match'],
)

if results:
    accuracy = res_df['Match'].mean()
    average_bleu = sum(bleu_scores) / len(bleu_scores)
else:
    # Nothing was evaluated: report NaN rather than dividing by zero.
    accuracy = average_bleu = float('nan')

# Save and show. The file name reflects this run's 20k sample so it neither
# mislabels the data nor overwrites the results of a 30k run.
res_df.to_csv('/kaggle/working/blip_vqa_base_20k_sample_results.csv', index=False)
print(f"BLIP-VQA-Base Exact Match Accuracy: {accuracy:.4f}")
print(f"BLIP-VQA-Base Average BLEU-1 Score: {average_bleu:.4f}")
res_df.head(10)

100%|██████████| 20000/20000 [31:54<00:00, 10.44it/s]


BLIP-VQA-Base Exact Match Accuracy: 0.2565
BLIP-VQA-Base Average BLEU-1 Score: 0.2620


Unnamed: 0,Index,Question,GroundTruth,Prediction,BLEU-1,Match
0,0,What is the pattern of the shoe sole? (answer ...,Ridged,stripes,0.0,False
1,1,"Judging by the image, what is the rust-resista...",Brown,rust,0.0,False
2,2,What color is the lid of the 365 Everyday Valu...,Green,green,1.0,True
3,3,"Judging from the image, what color is the ""Who...",White,brown,0.0,False
4,4,Considering the Rivet Theresa chair's upholste...,Dotted,tweed,0.0,False
5,5,"Given the ""Boy and Girl"" design, what shape ar...",Hearts,heart,0.0,False
6,6,Based on the AmazonBasics mat's double-dot tex...,Non-slip,high,0.0,False
7,7,What color is the handle of the garden tool? (...,Blue,blue,1.0,True
8,8,What design is printed on the cups? (answer in...,Leaves,leaf,0.0,False
9,9,What color is the plumbing fixture shown in th...,Black,silver,0.0,False


In [None]:
# Write the model weights and the processor files into the same directory so
# the pair can be reloaded together from one path.
# NOTE(review): the visible cells load the stock "Salesforce/blip-vqa-base"
# checkpoint and attach no LoRA adapters, so the "lora" in this directory name
# may be misleading - confirm what is actually being saved here.
output_dir = "./blip-vqa-lora-final"
for artifact in (model, processor):
    artifact.save_pretrained(output_dir)
