In [1]:
!pip install nltk rouge
!git clone https://github.com/salaniz/pycocoevalcap

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Cloning into 'pycocoevalcap'...
remote: Enumerating objects: 821, done.
remote: Counting objects: 100% (24/24), done.
remote: Compressing objects: 100% (20/20), done.
remote: Total 821 (delta 5), reused 15 (delta 4), pack-reused 797
Receiving objects: 100% (821/821), 130.06 MiB | 10.96 MiB/s, done.
Resolving deltas: 100% (424/424), done.
Updating files: 100% (40/40), done.


In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the captions JSON loaded later
# in this notebook is read from beneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import json
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Function to load JSON data
def load_json(file_path):
    """Parse the JSON file at ``file_path`` and return the decoded object.

    The file is always decoded as UTF-8 rather than with the platform's
    locale-dependent default encoding, so non-ASCII text loads the same
    way on every machine.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Read the combined captions file from the mounted Drive folder
CAPTIONS_JSON_PATH = '/content/drive/MyDrive/MasterThesis/captions_data_combined_all.json'
data = load_json(CAPTIONS_JSON_PATH)

# Build the zero-shot classification pipeline used to categorize captions
ZERO_SHOT_MODEL = "facebook/bart-large-mnli"
classifier = pipeline("zero-shot-classification", model=ZERO_SHOT_MODEL)

# Assign a caption to one of six fixed thematic categories
def categorize_caption(caption):
    """Return the first label the zero-shot classifier reports for ``caption``.

    The caption is passed to the module-level ``classifier`` together with
    six fixed candidate labels, and entry 0 of the result's ``'labels'``
    list is returned.
    NOTE(review): this is the best match only if the pipeline orders
    ``'labels'`` by score, as the transformers documentation describes --
    confirm for the installed version.

    Args:
        caption: Caption text to classify.

    Returns:
        One of the candidate label strings.
    """
    candidate_labels = [
        "people and daily activities",
        "animals and nature",
        "urban and rural settings",
        "objects and interiors",
        "vehicles and transportation",
        "food and beverages",
    ]
    prediction = classifier(caption, candidate_labels=candidate_labels)
    top_label = prediction['labels'][0]
    return top_label

# Models whose generated captions are scored; any other model is ignored.
MODELS_TO_SCORE = ('blip', 'gpt2')

# Cap on how many items are processed. The previous hand-rolled counter broke
# out of the loop on reaching 1098, i.e. after 1097 items, so that limit is
# kept here. Set to None to process the whole dataset.
MAX_ITEMS = 1097

# Prepare to accumulate scores for averaging, organized by model and category
model_category_scores = {model_name: {} for model_name in MODELS_TO_SCORE}

# Initialize TF-IDF Vectorizer (it is re-fitted on each item's own captions)
vectorizer = TfidfVectorizer()


def mean_reference_similarity(generated_caption, reference_captions):
    """Average TF-IDF cosine similarity of a generated caption to its references.

    The shared ``vectorizer`` is re-fitted on just the captions passed in, so
    the vocabulary and IDF weights are local to this one item.

    Args:
        generated_caption: Caption produced by a model.
        reference_captions: Non-empty list of reference captions.

    Returns:
        Mean of the cosine similarities between the generated caption and
        each reference caption.
    """
    # Combine all captions into a single list for vectorization; the
    # generated caption goes last so it can be addressed as row -1.
    captions = list(reference_captions) + [generated_caption]
    tfidf_matrix = vectorizer.fit_transform(captions)

    # Calculate cosine similarity between the generated caption and each original caption
    cos_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    return cos_similarities.mean()


# Process data for cosine similarity calculation
for item in list(data.values())[:MAX_ITEMS]:
    original_captions = item['original_coco_captions']
    if not original_captions:
        # Without references there is nothing to compare against, and
        # cosine_similarity would be handed a matrix with zero rows.
        # Skip before running the classifier.
        continue

    for model_name, generated_caption in item['generated_captions'].items():
        if model_name not in model_category_scores:
            continue  # Only process the models listed in MODELS_TO_SCORE

        # Categorize based on the generated caption
        category = categorize_caption(generated_caption)

        # Store the average cosine similarity for this instance; the
        # per-category list is created on first use.
        avg_cos_sim = mean_reference_similarity(generated_caption, original_captions)
        model_category_scores[model_name].setdefault(category, []).append(avg_cos_sim)

# Print average cosine similarity scores for each model and category
for model_name, categories in model_category_scores.items():
    print(f"Model: {model_name}")
    for category, similarities in categories.items():
        avg_cos_sim = sum(similarities) / len(similarities) if similarities else 0
        print(f"  Category: {category}")
        print(f"  Average Cosine Similarity: {avg_cos_sim:.3f}\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Model: blip
  Category: urban and rural settings
  Average Cosine Similarity: 0.119

  Category: food and beverages
  Average Cosine Similarity: 0.242

  Category: vehicles and transportation
  Average Cosine Similarity: 0.245

  Category: objects and interiors
  Average Cosine Similarity: 0.244

  Category: animals and nature
  Average Cosine Similarity: 0.238

  Category: people and daily activities
  Average Cosine Similarity: 0.181

Model: gpt2
  Category: food and beverages
  Average Cosine Similarity: 0.238

  Category: vehicles and transportation
  Average Cosine Similarity: 0.241

  Category: animals and nature
  Average Cosine Similarity: 0.237

  Category: objects and interiors
  Average Cosine Similarity: 0.245

  Category: people and daily activities
  Average Cosine Similarity: 0.258

  Category: urban and rural settings
  Average Cosine Similarity: 0.290

