In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/facad-test-dataset/test_dataset.parquet


In [2]:
!pip install bert-score python-Levenshtein evaluate rouge_score --upgrade nltk

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.0-py3-none-any.whl.metadata (3.7 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting Levenshtein==0.26.0 (from python-Levenshtein)
  Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.0->python-Levenshtein)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m


In [3]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import AutoProcessor, AutoModelForCausalLM
from bert_score import score as bertscore
from datasets import load_dataset
from tqdm import tqdm
import evaluate 
import Levenshtein

# Read the test parquet file through the `datasets` library. A bare data file
# is exposed under the 'train' split, so that split is selected explicitly.
data_path = '/kaggle/input/facad-test-dataset/test_dataset.parquet'
dataset_splits = load_dataset('parquet', data_files=data_path)
test_dataset = dataset_splits['train']

# Pull the two columns used downstream: the images to caption and the
# reference captions to score against.
test_images, test_captions = test_dataset['image'], test_dataset['text']

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Fetch the metric implementations from the `evaluate` hub, in the same order
# as before (bleu, rouge, meteor).
# NOTE(review): `meteor` is loaded here but is not referenced by the
# evaluation code visible in this notebook.
bleu, rouge, meteor = (
    evaluate.load(metric_name) for metric_name in ('bleu', 'rouge', 'meteor')
)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [5]:
# Empty placeholders for each model's generated captions (three distinct lists).
blip_captions = []
blip_finetuned_captions = []
git_finetuned_captions = []

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Helper function to generate captions using a given model and processor
def generate_captions(model, processor, test_images, max_length=50):
    """Generate one caption per image with the given model/processor pair.

    The model is moved to the module-level ``device`` and switched to eval
    mode; images are then processed one at a time with gradients disabled.

    Args:
        model: Captioning model exposing ``to``, ``eval`` and ``generate``
            (``generate`` is called with a ``pixel_values`` keyword).
        processor: Matching processor; called with ``images=...`` to build the
            model inputs and used via ``batch_decode`` to turn ids into text.
        test_images: Iterable of images accepted by ``processor``.
        max_length: Value forwarded to ``model.generate(max_length=...)``.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        list[str]: The decoded captions, in the same order as ``test_images``.
    """
    captions = []
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for image in tqdm(test_images):
            inputs = processor(images=image, return_tensors="pt").to(device)
            generated_ids = model.generate(
                pixel_values=inputs.pixel_values,
                max_length=max_length,
            )
            # A single image is passed per call, so only the first decoded
            # sequence is kept.
            decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
            captions.append(decoded[0])
    return captions

In [6]:
# Checkpoint identifiers for the three captioners being compared.
BLIP_BASE_CHECKPOINT = "Salesforce/blip-image-captioning-base"
BLIP_FINETUNED_CHECKPOINT = "sagniksengupta/blip-finetuned-facad"
GIT_FINETUNED_CHECKPOINT = "sagniksengupta/git-finetuned-facad"

# Each model is loaded together with the processor from the same checkpoint.
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_BASE_CHECKPOINT)
blip_processor = BlipProcessor.from_pretrained(BLIP_BASE_CHECKPOINT)

blip_finetuned_model = BlipForConditionalGeneration.from_pretrained(BLIP_FINETUNED_CHECKPOINT)
blip_finetuned_processor = BlipProcessor.from_pretrained(BLIP_FINETUNED_CHECKPOINT)

git_finetuned_model = AutoModelForCausalLM.from_pretrained(GIT_FINETUNED_CHECKPOINT)
git_finetuned_processor = AutoProcessor.from_pretrained(GIT_FINETUNED_CHECKPOINT)

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/869 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [7]:
# Caption every test image with each of the three models in turn
# (base BLIP, then the two checkpoints named "*-finetuned-facad").
blip_captions = generate_captions(
    model=blip_model,
    processor=blip_processor,
    test_images=test_images,
)
blip_finetuned_captions = generate_captions(
    model=blip_finetuned_model,
    processor=blip_finetuned_processor,
    test_images=test_images,
)
git_finetuned_captions = generate_captions(
    model=git_finetuned_model,
    processor=git_finetuned_processor,
    test_images=test_images,
)

100%|██████████| 1000/1000 [03:37<00:00,  4.59it/s]
100%|██████████| 1000/1000 [07:46<00:00,  2.14it/s]
100%|██████████| 1000/1000 [10:23<00:00,  1.60it/s]


In [20]:
def evaluate_generated_captions(generated_captions, test_captions):
    """Score generated captions against their reference captions.

    Uses the module-level ``bleu`` and ``rouge`` metrics, ``bertscore`` and
    the ``Levenshtein`` package.

    Args:
        generated_captions: Iterable of predicted caption strings.
        test_captions: Iterable of reference caption strings, aligned
            index-for-index with ``generated_captions`` (one reference per
            prediction).

    Returns:
        dict with the keys:
            'bleu': the ``'bleu'`` value returned by ``bleu.compute``.
            'rouge': the ``'rougeL'`` value returned by ``rouge.compute``
                (note: ROUGE-L only, despite the generic key name).
            'bertscore_f1': mean of the BERTScore F1 tensor (``lang='en'``).
            'avg_levenshtein': mean Levenshtein distance between each
                prediction/reference string pair.

    Raises:
        ValueError: If the inputs are empty or differ in length. Previously
            an empty input ended in a ZeroDivisionError and a length mismatch
            was silently truncated by ``zip`` in the Levenshtein average.
    """
    predictions = list(generated_captions)
    references = list(test_captions)

    if len(predictions) != len(references):
        raise ValueError(
            "generated_captions and test_captions must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )
    if not predictions:
        raise ValueError("cannot evaluate an empty list of captions")

    # BLEU takes a list of references per prediction, so wrap each one.
    bleu_score = bleu.compute(
        predictions=predictions,
        references=[[ref] for ref in references],
    )['bleu']

    rouge_scores = rouge.compute(predictions=predictions, references=references)

    # Only the F1 tensor is reported; precision and recall are discarded.
    _, _, f1 = bertscore(predictions, references, lang='en')
    bertscore_f1 = f1.mean().item()

    # Levenshtein distance calculation
    levenshtein_distances = [
        Levenshtein.distance(gen_caption, ref_caption)
        for gen_caption, ref_caption in zip(predictions, references)
    ]
    avg_levenshtein = sum(levenshtein_distances) / len(levenshtein_distances)

    return {
        'bleu': bleu_score,
        'rouge': rouge_scores['rougeL'],
        'bertscore_f1': bertscore_f1,
        'avg_levenshtein': avg_levenshtein
    }


In [21]:
# Score each model's generated captions against the same reference captions.
blip_metrics = evaluate_generated_captions(
    generated_captions=blip_captions,
    test_captions=test_captions,
)
blip_finetuned_metrics = evaluate_generated_captions(
    generated_captions=blip_finetuned_captions,
    test_captions=test_captions,
)
git_finetuned_metrics = evaluate_generated_captions(
    generated_captions=git_finetuned_captions,
    test_captions=test_captions,
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Print one labelled line per model, in the same order as they were evaluated.
for label, metrics in (
    ("BLIP Base Metrics: ", blip_metrics),
    ("BLIP Fine-tuned Metrics: ", blip_finetuned_metrics),
    ("GIT Fine-tuned Metrics: ", git_finetuned_metrics),
):
    print(label, metrics)

BLIP Base Metrics:  {'bleu': 0.0, 'rouge': 0.1295297393210773, 'bertscore_f1': 0.8311129212379456, 'avg_levenshtein': 97.622}
BLIP Fine-tuned Metrics:  {'bleu': 0.02984166260245386, 'rouge': 0.19304251481728948, 'bertscore_f1': 0.8568896055221558, 'avg_levenshtein': 93.806}
GIT Fine-tuned Metrics:  {'bleu': 0.009336050760516194, 'rouge': 0.1502710551309211, 'bertscore_f1': 0.8445741534233093, 'avg_levenshtein': 92.806}


In [23]:
import pandas as pd

# Display column name -> key in each metrics dict (order fixes column order).
metric_columns = {
    'BLEU': 'bleu',
    'ROUGE': 'rouge',
    'BERTScore F1': 'bertscore_f1',
    'Avg. Levenshtein': 'avg_levenshtein',
}

# One row per model: the model label followed by its four metric values.
data = [
    {'Model': model_label, **{column: metrics[key] for column, key in metric_columns.items()}}
    for model_label, metrics in (
        ('BLIP Base', blip_metrics),
        ('BLIP Fine-tuned', blip_finetuned_metrics),
        ('GIT Fine-tuned', git_finetuned_metrics),
    )
]

df = pd.DataFrame(data)

df

Unnamed: 0,Model,BLEU,ROUGE,BERTScore F1,Avg. Levenshtein
0,BLIP Base,0.0,0.12953,0.831113,97.622
1,BLIP Fine-tuned,0.029842,0.193043,0.85689,93.806
2,GIT Fine-tuned,0.009336,0.150271,0.844574,92.806
