In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset files referenced by later cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import json

# Path to the combined captions JSON file on the mounted Google Drive
file_path = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/combined_captions_data_flickr_final.json'

# Load the JSON data. The encoding is passed explicitly: JSON text is UTF-8,
# whereas open() without an encoding falls back to the platform locale and
# could mis-decode non-ASCII captions on a differently configured machine.
with open(file_path, 'r', encoding='utf-8') as file:
    captions_data = json.load(file)


In [None]:
!pip install transformers torch evaluate

import torch
import json
from transformers import CLIPProcessor, CLIPModel, pipeline

# Pretrained checkpoints used by this notebook, named once so that the CLIP
# processor and the CLIP model are guaranteed to come from the same checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
SUMMARIZER_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"

# CLIP supplies the text embeddings; the summarization pipeline condenses captions.
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
summarizer = pipeline("summarization", model=SUMMARIZER_CHECKPOINT)

# Define the function to summarize captions
def summarize_captions(captions, max_length_ratio=0.75, min_viable_length=20):
    """Condense several captions into one summary using the module-level `summarizer`.

    The captions are joined with single spaces and passed to the summarization
    pipeline with length bounds derived from the whitespace-delimited word
    count of the joined text.

    Args:
        captions: Iterable of caption strings.
        max_length_ratio: Fraction of the input word count used as the target
            upper bound for the summary length.
        min_viable_length: Floor applied to the upper bound. Because of this
            floor, short inputs get a `max_length` that is larger than
            `max_length_ratio` of their length.

    Returns:
        The summary text, or an empty string when the captions contain no words.
    """
    combined_text = ' '.join(captions)
    input_length = len(combined_text.split())  # word count, not model tokens

    # Nothing to summarize: do not run the model on an empty prompt.
    if input_length == 0:
        return ""

    # Upper bound scales with the input but never drops below the floor.
    max_length = max(min_viable_length, int(input_length * max_length_ratio))

    # Lower bound is half the upper bound (at least 10). The clamp only matters
    # when a caller passes a floor below 10; with the defaults it is a no-op.
    min_length = min(max(10, max_length // 2), max_length)

    # truncation=True asks the pipeline to cut inputs that exceed the model's
    # maximum input size instead of failing on them.
    result = summarizer(
        combined_text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return result[0]['summary_text']

# Define function to compute cosine similarities for each model vs original captions
def compute_similarities_for_each_model(generated_caption, original_captions):
    """Cosine similarity between one caption and each reference caption.

    All texts are embedded in a single batch with the module-level
    `clip_processor` / `clip_model`; the first embedding (the generated
    caption) is then compared against every remaining one.

    Args:
        generated_caption: The caption to score.
        original_captions: Sequence of reference captions.

    Returns:
        A list of floats, one per reference caption and in the same order.
        Empty when there are no reference captions.
    """
    original_captions = list(original_captions)
    # No references means nothing to compare against; skip the model call.
    if not original_captions:
        return []

    inputs = clip_processor(
        text=[generated_caption] + original_captions,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    # Inference only: disabling autograd avoids building a graph for every call.
    with torch.no_grad():
        embeddings = clip_model.get_text_features(**inputs)
        similarities = torch.cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1:])
    return similarities.tolist()

# For every entry: summarize its generated captions, score each model's caption
# against the reference captions, pick the summary sentence closest to any
# reference caption, and record the results on the entry itself.
counter = 0
for key, item in captions_data.items():
    generated_captions = item['generated_captions']
    original_captions = item['original_coco_captions']
    summarized_caption = summarize_captions(list(generated_captions.values()))

    # Calculate and store similarities for each model vs. original captions
    for model_name in ["blip", "gpt2"]:
        model_caption = generated_captions[model_name]
        similarities = compute_similarities_for_each_model(model_caption, original_captions)
        item[f'{model_name}_vs_Orig_similarities'] = [float(sim) for sim in similarities]

    # Split the summary into sentences. Fragments are stripped and blank ones
    # dropped so that an empty string is never scored or stored as the winner.
    summarized_sentences = [
        sentence.strip()
        for sentence in summarized_caption.split(". ")
        if sentence.strip()
    ]
    highest_similarity = -1
    most_similar_sentence = ""
    most_similar_caption_from_original = ""

    for sentence in summarized_sentences:
        # Reuse the shared helper instead of repeating the embedding code here.
        similarities = compute_similarities_for_each_model(sentence, original_captions)
        if not similarities:
            # No reference captions for this entry: nothing to compare against.
            continue
        # Index of the best-matching reference caption (first one on ties).
        most_similar_index = max(range(len(similarities)), key=similarities.__getitem__)
        if similarities[most_similar_index] > highest_similarity:
            highest_similarity = similarities[most_similar_index]
            most_similar_sentence = sentence
            most_similar_caption_from_original = original_captions[most_similar_index]

    # Update the JSON structure with the most similar sentence info
    # (`item` is the same object as captions_data[key]).
    item['most_similar_summarized_sentence'] = most_similar_sentence
    item['most_similar_original_caption'] = most_similar_caption_from_original
    item['cosine_similarity_score'] = float(highest_similarity)

    # Progress trace: winning sentence, matched reference, score, entry index
    print(most_similar_sentence)
    print(most_similar_caption_from_original)
    print(float(highest_similarity))
    print(counter)
    counter += 1

# Save the modified JSON data
output_path = '/content/drive/MyDrive/MasterThesis/Scenario2_distilbart_with_all_similarities_only_2_flickr.json'
with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(captions_data, file, indent=4)

print("JSON file has been updated and saved.")

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m43.7 MB/s[0m eta [36m0:00:0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

 There is a little girl that is standing on the steps outside of a house 
A little girl climbing the stairs to her playhouse .
0.8158478736877441
0
Two dogs that
A black dog and a spotted dog are fighting
0.852564811706543
1
 There is a little girl that is sitting on the ground in front of a rainbow a little boy sitting on a grass covered field 
A little girl is sitting in front of a large painted rainbow .
0.785959005355835
2
 There is a man laying on a bench with a dog next to him a man holding a book titled "The Book
A man lays on a bench while his dog sits by him .
0.8325022459030151
3
 There is a man wearing a hat that has a lot of stickers on it a man with a hat and
The man with pierced ears is wearing glasses and an orange hat .
0.687355101108551
4
 There is a little girl climbing on a rope net in a playground 
The small child climbs on a red ropes on a playground .
0.8477234840393066
5
 There is a dog that is running in the grass with a frisbee in it's mouth a dog and a cat pla