In [None]:
# Mount Google Drive at /content/drive so the dataset files referenced in the
# following cells can be read from (and results written to) the Drive folder.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json

# Location of the combined captions JSON on the mounted Google Drive.
file_path = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/combined_captions_data_flickr_final.json'

# Load the JSON data. The encoding is given explicitly because JSON is UTF-8 by
# specification, whereas open() otherwise falls back to the platform default
# and could mis-decode non-ASCII caption text.
with open(file_path, 'r', encoding='utf-8') as file:
    captions_data = json.load(file)

In [None]:
!pip install transformers torch evaluate

import torch
import json
from transformers import CLIPProcessor, CLIPModel, pipeline

# Load the pretrained components used below: the CLIP processor/model pair
# (both from the same checkpoint) and the DistilBART summarization pipeline.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
summarizer = pipeline(task="summarization", model="sshleifer/distilbart-cnn-12-6")

# Define the function to summarize captions
def summarize_captions(captions, max_length_ratio=0.75, min_viable_length=20):
    """Condense several captions into one summary via the global ``summarizer``.

    The captions are joined with single spaces and passed to the summarization
    pipeline. The length bounds scale with the size of the joined text:

    * ``max_length = max(min_viable_length, int(word_count * max_length_ratio))``
    * ``min_length = max(10, max_length // 2)``

    Args:
        captions: Iterable of caption strings to merge and summarize.
        max_length_ratio: Fraction of the input word count used as the upper
            length bound (default 0.75, the value previously hard-coded).
        min_viable_length: Floor for ``max_length`` so that very short inputs
            still leave the summary enough room (default 20, as before).

    Returns:
        The ``summary_text`` string of the first pipeline result, or an empty
        string when the captions contain no text at all.
    """
    combined_text = ' '.join(captions)

    # Nothing to summarize: return early instead of sending an empty prompt to
    # the model.
    if not combined_text.strip():
        return ""

    # NOTE(review): the length is counted in whitespace-separated words, while
    # the generation API interprets max_length/min_length in tokens, so this is
    # only an approximation of the intended ratio.
    input_length = len(combined_text.split())

    # Upper bound grows with the input but never drops below the viable floor.
    max_length = max(min_viable_length, int(input_length * max_length_ratio))

    # Lower bound is half the upper bound (at least 10), which keeps it
    # strictly below max_length for the default floor.
    min_length = max(10, max_length // 2)

    summarized_text = summarizer(combined_text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
    return summarized_text

# Function to clean captions
def clean_caption(caption):
    """Return ``caption`` with every backslash and double-quote character removed."""
    # Mapping a code point to None makes str.translate drop that character.
    unwanted = {ord("\\"): None, ord('"'): None}
    return caption.translate(unwanted)

# Compute cosine similarities for generated caption vs. original captions
def compute_cosine_similarities(generated_caption, original_captions, clip_processor, clip_model):
    """Score one generated caption against each reference caption.

    Both sides are embedded with ``clip_model.get_text_features`` and compared
    with cosine similarity.

    Args:
        generated_caption: The caption string to evaluate.
        original_captions: Iterable of reference caption strings.
        clip_processor: Callable that tokenizes ``text=[...]`` into model
            inputs (called with ``return_tensors="pt"``, truncation and
            padding enabled).
        clip_model: Object exposing ``get_text_features(**inputs)``.

    Returns:
        A list of Python floats, one per reference caption and in the same
        order; an empty list when there are no reference captions.
    """
    original_captions = list(original_captions)
    if not original_captions:
        # No references: skip the model entirely.
        return []

    # Inference only: without no_grad every forward pass would record an
    # autograd graph, costing memory and time for gradients never used.
    with torch.no_grad():
        inputs_generated = clip_processor(text=[generated_caption], return_tensors="pt", truncation=True, padding=True)
        embeddings_generated = clip_model.get_text_features(**inputs_generated)

        # Encode all references in one padded batch rather than one forward
        # pass per caption.
        inputs_orig = clip_processor(text=original_captions, return_tensors="pt", truncation=True, padding=True)
        embeddings_orig = clip_model.get_text_features(**inputs_orig)

        # (1, d) against (n, d) broadcasts to one similarity per reference.
        similarity_scores = torch.cosine_similarity(embeddings_generated, embeddings_orig, dim=1)

    return similarity_scores.cpu().tolist()

# Process each item in the JSON. enumerate() supplies the progress counter, so
# no module-level variable named `iter` shadows the built-in for later cells.
for index, (key, item) in enumerate(captions_data.items()):

    # Clean and summarize generated captions
    generated_captions = [clean_caption(caption) for caption in item['generated_captions'].values()]
    original_captions = item['original_coco_captions']
    summarized_caption = summarize_captions(generated_captions)

    # Compute and store similarities for each model vs. original captions
    # NOTE(review): these use the raw (uncleaned) generated captions, whereas
    # the summary above is built from the cleaned ones - confirm this is intended.
    for model_name in ['blip', 'gpt2']:
        item[f"{model_name}_vs_Orig"] = compute_cosine_similarities(item['generated_captions'][model_name], original_captions, clip_processor, clip_model)

    # Compute similarities between summarized caption and original captions
    summarized_vs_orig_similarities = compute_cosine_similarities(summarized_caption, original_captions, clip_processor, clip_model)
    if summarized_vs_orig_similarities:
        most_similar_index = summarized_vs_orig_similarities.index(max(summarized_vs_orig_similarities))
        most_similar_caption = original_captions[most_similar_index]
        cosine_similarity_score = summarized_vs_orig_similarities[most_similar_index]
    else:
        # An item without reference captions has no best match; record that
        # explicitly instead of letting max() on an empty list abort the run.
        most_similar_caption = None
        cosine_similarity_score = None

    # Update the JSON structure (`item` is the same object as captions_data[key])
    item['summarized_caption'] = summarized_caption
    item['most_similar_caption'] = most_similar_caption
    item['cosine_similarity_score'] = cosine_similarity_score

    # Optional: Print for verification
    print(f"Summarized Caption: {summarized_caption}")
    print(f"Most Similar Original Caption: {most_similar_caption}")
    print(f"Cosine Similarity Score: {cosine_similarity_score}\n" + "="*50)
    print(index)

# Save the modified JSON data to a new file
output_path = '/content/drive/MyDrive/MasterThesis/Scenario1_distilbart_with_all_similarities_only_2_flickr.json'  # Update this path
with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(captions_data, file, indent=4)

print("JSON file has been updated and saved.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
266
 There is a baby that is laying down on a slide in the water and a young
Summarized Caption:  There is a baby that is laying down on a slide in the water and a young
Most Similar Original Caption: The small boy is sliding down a pool slide and is about to hit the water .
Cosine Similarity Score: 0.8863576054573059
267
 there is a woman that is standing on the train tracks . there is  a woman
Summarized Caption:  there is a woman that is standing on the train tracks . there is  a woman
Most Similar Original Caption: A girl poses on the train tracks near a station
Cosine Similarity Score: 0.7888343930244446
268
 There is a woman that is standing on the train tracks a woman standing on a platform
Summarized Caption:  There is a woman that is standing on the train tracks a woman standing on a platform
Most Similar Original Caption: Girl is standing out on the train tracks .
Cosine Similarity Score: 0.9123722314834595
269
