In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.layers import Embedding, LSTM, add, Concatenate, Reshape, concatenate, Bidirectional
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap

plt.rcParams['font.size'] = 12
sns.set_style("dark")
warnings.filterwarnings('ignore')



# **nlpconnect/vit-gpt2-image-captioning**

In [2]:

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds


predict_step(['/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000366164.jpg']) # ['a woman in a hospital bed with a woman in a hospital bed']


Downloading config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


['a man standing in a kitchen with a plate of food']

In [3]:
predict_step(['/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000366164.jpg']) 

['a man standing in a kitchen with a plate of food']

In [4]:

from transformers import pipeline

image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

a = image_to_text("/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000366164.jpg") 

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [5]:
a[0]['generated_text']

'a man in a kitchen with a bowl of food '

In [6]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

In [7]:
path = '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/'

In [8]:
import pandas as pd

In [9]:
df = pd.read_csv('/kaggle/input/flickr-image-dataset/flickr30k_images/results.csv', delimiter='|')


In [10]:
(len(df))*0.3

47674.5

In [11]:
test_df = df[0:45000]

In [12]:
len(test_df)

45000

In [13]:
# Group captions by image names
grouped_data = test_df.groupby('image_name')[' comment'].apply(list).to_dict()

print(len(grouped_data))

9000


In [14]:
path = '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/'

In [15]:
import time
import tqdm
def evaluate_model(df):
    start_time = time.time()

    grouped_data = df.groupby('image_name')[' comment'].apply(list).to_dict()
    print(len(grouped_data))
    scores = []
    for img_name, reference_captions in tqdm.tqdm(grouped_data.items(), desc="Evaluating"):
        img_name = path+img_name
        predicted_caption = predict_step([img_name])
        try:
            predicted_tokens = predicted_caption[0].split()
            reference_tokens = [ref.split() if isinstance(ref, str) else [] for ref in reference_captions]
            score = sentence_bleu(reference_tokens, predicted_tokens)
            scores.append(score)
        except AttributeError as e:
            # Handle the exception as needed, e.g., log an error message, skip, etc.
            print(f"Error for image {img_name}: {e}")
            continue  # Skip this image-caption pair and move to the nex
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('time is',elapsed_time)
    print(sum(scores) / len(scores))
    return sum(scores) / len(scores)

In [16]:
value =evaluate_model(test_df)

9000


Evaluating: 100%|██████████| 9000/9000 [46:44<00:00,  3.21it/s]

time is 2804.6616089344025
0.5027036717976299





# **BLIP large unconditional**

In [17]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

inputs = processor(raw_image, return_tensors="pt").to("cuda")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))


Downloading (…)rocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

woman sitting on the beach with her dog and a cell phone


In [18]:
raw_image = Image.open('/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000092795.jpg').convert('RGB')

inputs = processor(raw_image, return_tensors="pt").to("cuda")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))

they are standing in the garden looking at a cell phone


In [19]:
#  try:
#             # Process the image and generate the caption
#             raw_image = Image.open(img_name).convert('RGB')
#             inputs = processor(raw_image, return_tensors="pt").to("cuda")
#             out = model.generate(**inputs)
#             predicted_caption = processor.decode(out[0], skip_special_tokens=True)
#             predicted_tokens = predicted_caption.split()

#             # Check if reference captions are strings and split them, handle non-strings gracefully
#             reference_tokens = [ref.split() if isinstance(ref, str) else [''] for ref in reference_captions]
            
#             # Compute BLEU score
#             score = sentence_bleu(reference_tokens, predicted_tokens)
#             scores.append(score)
#         except AttributeError as e:
#             print(f"Error processing image {img_name}: {e}")
#         except Exception as e:
#             print(f"An unexpected error occurred with image {img_name}: {e}")


In [20]:
import time
import tqdm
def evaluate_model_BLIP_L(df):
    start_time = time.time()

    grouped_data = df.groupby('image_name')[' comment'].apply(list).to_dict()
    print(len(grouped_data))
    scores = []
    for img_name, reference_captions in tqdm.tqdm(grouped_data.items(), desc="Evaluating"):
        img_name = path+img_name
        try:
            raw_image = Image.open(img_name).convert('RGB')
            inputs = processor(raw_image, return_tensors="pt").to("cuda")
            out = model.generate(**inputs)
            predicted_caption = processor.decode(out[0], skip_special_tokens=True)
            predicted_tokens = predicted_caption.split()
            reference_tokens = [ref.split() if isinstance(ref, str) else [''] for ref in reference_captions]
            score = sentence_bleu(reference_tokens, predicted_tokens)
            scores.append(score)
        except AttributeError as e:
            print(f"Error processing image {img_name}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred with image {img_name}: {e}")
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('time taken is',elapsed_time)
    return sum(scores) / len(scores)

In [21]:
# test_df = df[0:1000]

In [22]:
evaluate_model_BLIP_L(test_df)

9000


Evaluating: 100%|██████████| 9000/9000 [48:42<00:00,  3.08it/s]

time taken is 2922.6077933311462





0.46022497414146135

# **blip-image-captioning-base unconditional**

In [23]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))



Downloading (…)rocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

a woman sitting on the beach with her dog


In [24]:
import time
import tqdm
def evaluate_model_BLIP_B(df):
    start_time = time.time()

    grouped_data = df.groupby('image_name')[' comment'].apply(list).to_dict()
    print(len(grouped_data))
    scores = []
    for img_name, reference_captions in tqdm.tqdm(grouped_data.items(), desc="Evaluating"):
        try:
            img_name = path + img_name
            raw_image = Image.open(img_name).convert('RGB')
            inputs = processor(raw_image, return_tensors="pt")
            out = model.generate(**inputs)
            
            predicted_caption = processor.decode(out[0], skip_special_tokens=True)
            predicted_tokens = predicted_caption.split()

            # Ensure reference captions are strings
            reference_tokens = []
            for ref in reference_captions:
                if isinstance(ref, str):
                    reference_tokens.append(ref.split())
                else:
                    print(f"Non-string caption found for image {img_name}: {ref}")
                    reference_tokens.append([''])  # Use an empty list for non-string captions

            score = sentence_bleu(reference_tokens, predicted_tokens)
            scores.append(score)
        except AttributeError as e:
            print(f"AttributeError processing image {img_name}: {e}")
        except Exception as e:
            print(f"Unexpected error occurred with image {img_name}: {e}")
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('time taken is',elapsed_time)
    return sum(scores) / len(scores)

In [25]:
evaluate_model_BLIP_B(test_df)

9000


Evaluating:  44%|████▍     | 4000/9000 [1:37:21<2:06:10,  1.51s/it]

Non-string caption found for image /kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/2199200615.jpg: nan


Evaluating: 100%|██████████| 9000/9000 [3:37:42<00:00,  1.45s/it]

time taken is 13062.551885128021





0.42982701999882583

# Memory exceeds****# **Salesforce/blip2-opt-2.7b**

In [26]:
# import requests
# from PIL import Image
# from transformers import Blip2Processor, Blip2ForConditionalGeneration

# processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# # img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
# raw_image = Image.open('/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000092795.jpg').convert('RGB')

# question = "Write a caption for this photo"
# inputs = processor(raw_image, question, return_tensors="pt")

# out = model.generate(**inputs)
# print(processor.decode(out[0], skip_special_tokens=True).strip())

In [27]:
# import time
# import tqdm
# def evaluate_model_large_blip2(df):
#     start_time = time.time()

#     grouped_data = df.groupby('image_name')[' comment'].apply(list).to_dict()
#     print(len(grouped_data))
#     scores = []
#     for img_name, reference_captions in tqdm.tqdm(grouped_data.items(), desc="Evaluating"):
#         img_name = path+img_name
#         raw_image = Image.open(img_name)
#         pixel_values = processor(images=image, return_tensors="pt").pixel_values
#         generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
#         predicted_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#         predicted_tokens = predicted_caption.split()
#         reference_tokens = [ref.split() for ref in reference_captions]
#         score = sentence_bleu(reference_tokens, predicted_tokens)
#         scores.append(score)
#     end_time = time.time()
#     elapsed_time = end_time - start_time
#     print('time taken is',elapsed_time)
#     return sum(scores) / len(scores)

# **microsoft/git-large-coco**

In [28]:
from transformers import AutoProcessor, AutoModelForCausalLM
import requests
from PIL import Image

processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

image = Image.open('/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000092795.jpg')

pixel_values = processor(images=image, return_tensors="pt").pixel_values

generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_caption)


Downloading (…)rocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

a man standing in a garden looking at a cell phone.


In [29]:
import time
import tqdm
def evaluate_model_large_coco(df):
    start_time = time.time()

    grouped_data = df.groupby('image_name')[' comment'].apply(list).to_dict()
    print(len(grouped_data))
    scores = []
    for img_name, reference_captions in tqdm.tqdm(grouped_data.items(), desc="Evaluating"):
        img_name = path+img_name
        raw_image = Image.open(img_name)
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
        predicted_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        predicted_tokens = predicted_caption.split()
        reference_tokens = [ref.split() for ref in reference_captions]
        score = sentence_bleu(reference_tokens, predicted_tokens)
        scores.append(score)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('time taken is',elapsed_time)
    return sum(scores) / len(scores)

In [30]:
test_df2 = test_df[0:4500]

In [31]:
evaluate_model_large_coco(test_df2)

900


Evaluating: 100%|██████████| 900/900 [1:15:59<00:00,  5.07s/it]

time taken is 4559.279318571091





0.5105195908262962