In [16]:
!pip install google-generativeai
!pip install transformers
!pip install torch
!pip install datasets
!pip install python-dotenv
!pip install rouge-score
!pip install bert-score
!pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[

In [17]:
import math
import os
import ssl
import time

import google.generativeai as genai
import nltk
import pandas as pd
from bert_score import score
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer


In [3]:
# Load credentials from the repo-level .env file and configure the Gemini client.
load_dotenv("../.env")
GEMINI_API_KEY =  os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY and not os.getenv("GOOGLE_API_KEY"):
    # Fail fast. Without a key every later API call errors out, and because
    # generate_with_backoff() swallows exceptions and evaluate_summary()
    # scores missing metrics as 1, the run would silently produce garbage.
    # GOOGLE_API_KEY is also accepted because the client falls back to it
    # when api_key is None.
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to ../.env or export it before running this notebook."
    )
genai.configure(api_key=GEMINI_API_KEY)

# NOTE(review): `dataset` is not referenced by any other cell visible in this
# notebook; confirm it is still needed before paying for the download.
dataset = load_dataset("jylins/videoxum")

In [18]:
# Smoke test: one round trip to Gemini to confirm the client is configured.
# `model` is kept at module level because generate_with_backoff() calls it.
model = genai.GenerativeModel("gemini-1.0-pro")
smoke_test_prompt = "What is Natural Language Processing? Explain it to a five year old."
response = model.generate_content(smoke_test_prompt)
print(response.text)

Let's imagine your favorite toy talks to you. When you talk, your toy understands what you say, and it answers you back. That's like talking to a computer that can understand our words.

Natural Language Processing (NLP) makes computers understand our language. It's like giving special tools to your computer to help it "speak" with us. NLP helps computers read our words, listen to our voices, and even understand our jokes!

Just like your toy, NLP helps computers talk to us in a way we can understand. It's like giving computers superpowers to communicate with us just like our friends.


In [19]:
def generate_with_backoff(prompt: str, max_retries: int = 5, initial_delay: float = 2.0):
    """Call the module-level `model` with `prompt`, retrying with exponential backoff.

    Args:
        prompt: Text sent to `model.generate_content`.
        max_retries: Total number of attempts made before giving up.
        initial_delay: Base wait in seconds; the wait before the next attempt
            is `initial_delay * 2 ** attempt` (attempt counted from 0).

    Returns:
        The response text of the first successful attempt, or "" when every
        attempt raised (or when `max_retries` is 0). Exceptions are printed,
        never propagated.
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            return model.generate_content(prompt).text
        except Exception as e:
            if attempt == final_attempt:
                # Out of attempts: report and fall through to the empty result.
                print(f"All attempts failed for prompt after {max_retries} retries")
                break
            sleep_time = initial_delay * (2 ** attempt)
            print(f"API error: {str(e)}. Retrying in {sleep_time:.1f} seconds...")
            time.sleep(sleep_time)
        attempt += 1
    return ""


In [20]:
def evaluate_rouge(reference, summary):
    """Score `summary` against `reference` with ROUGE-1, ROUGE-2 and ROUGE-L.

    A stemming `RougeScorer` is built on every call and its `score` result
    (keyed by 'rouge1', 'rouge2', 'rougeL') is returned unchanged.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(reference, summary)

def evaluate_bleu(reference, summary):
    """Return the sentence-level BLEU of `summary` against a single `reference`.

    Both strings are tokenized by whitespace, and `SmoothingFunction().method4`
    is passed as the smoothing function.
    """
    return sentence_bleu(
        [reference.split()],
        summary.split(),
        smoothing_function=SmoothingFunction().method4,
    )

def evaluate_bertscore(reference, summary, model_type="roberta-large"):
    """Compute baseline-rescaled BERTScore of `summary` against `reference`.

    Args:
        reference: Ground-truth text.
        summary: Candidate text being scored.
        model_type: Name of the encoder handed to `bert_score.score`. This
            parameter used to be accepted but silently ignored; it is now
            forwarded so callers can actually select the model.

    Returns:
        A `(precision, recall, f1)` tuple of Python floats.
    """
    P, R, F1 = score(
        [summary],
        [reference],
        model_type=model_type,
        lang="en",
        rescale_with_baseline=True,
    )
    # One candidate/reference pair is scored, so each tensor's mean is that
    # pair's value.
    return P.mean().item(), R.mean().item(), F1.mean().item()

In [21]:
def extract_score(text: str) -> float:
    """Return the first finite number found among the whitespace-separated words of `text`.

    Each word has `*`, `[` and `]` stripped before parsing, so tokens such as
    `**4**` or `[4]` are recognized.

    Args:
        text: Free-form text expected to start with a numeric score.

    Returns:
        The first word that parses as a finite float, or 3.0 (the neutral
        fallback) when no word does.
    """
    default_score = 3.0
    for word in text.split():
        token = word.replace('*', '').replace('[', '').replace(']', '')
        try:
            value = float(token)
        except ValueError:
            continue
        # float() also accepts "nan", "inf" and "infinity"; such a word is not
        # a score and would poison every average computed downstream.
        if math.isfinite(value):
            return value
    return default_score


# LLM As a Judge
_JUDGE_METRICS = ("Descriptiveness", "Coherence", "Completeness", "Fluency", "Conciseness")


def _build_judge_prompt(summary):
    """Return the rubric prompt asking the LLM to grade `summary` on the five judge metrics."""
    return f"""Evaluate the following video's visual summary based on five criteria:

        1. Descriptiveness: How well does the summary provide a rich and vivid description of the video's content? How clear is the picture it paints of what happens in the video?
        2. Coherence: How logically structured and easy to follow is the summary?
        3. Completeness: Does the summary cover all important aspects of the video?
        4. Fluency: How grammatically correct and well-written is the language in the summary?
        5. Conciseness: How well does the summary avoid unnecessary details while covering the essentials?

        Use the following scoring scale:
        - 5: Excellent
        - 4: Good
        - 3: Average
        - 2: Below average
        - 1: Poor
        
        ---
        The following are Example Scoring:

        1. Descriptiveness:
        5:
        "The summary captures detailed visuals and events from the video, such as 'the athlete’s grueling uphill training runs under the rain' or 'her heartfelt conversation with her coach.' It brings the scenes to life with vivid language, matching the richness of the video."
        4:
        "The summary describes the key moments but lacks some vivid details. For example, it mentions 'training' but doesn’t describe the intense conditions shown in the video."
        3:
        "The summary touches on important moments (e.g., training, challenges, victory) but is generic and lacks descriptive depth."
        2:
        "The summary barely describes the events and uses vague terms like 'The athlete worked hard and succeeded,' offering little visual or emotional detail."
        1:
        "The summary is extremely vague or devoid of descriptions, e.g., 'The video is about a runner.'”
        
        2. Coherence:
        5:
        "The summary presents events in a clear sequence that mirrors the video’s progression: starting with training, moving through challenges, and culminating in the victory. Transitions are smooth and logical."
        4:
        "The summary mostly follows a logical structure but includes minor jumps or skips (e.g., it moves abruptly from training to victory without mentioning the injury)."
        3:
        "The summary contains some disjointed transitions or a slightly unclear order of events, making it harder to follow."
        2:
        "The summary is confusing and jumps between events with no logical sequence (e.g., it starts with victory, then mentions training)."
        1:
        "The summary is completely incoherent, with events presented in a random or contradictory order."
       
        3. Completeness:
        5:
        "The summary covers all key events and details essential to understanding the video. It does not omit any significant aspects."
        4:
        "The summary captures most key events but leaves out a minor detail or two that slightly reduce its completeness."
        3:
        "The summary includes major events but omits at least one significant aspect, leaving an incomplete understanding of the video."
        2:
        "The summary misses multiple important details, making it difficult to fully grasp the video’s core narrative."
        1:
        "The summary barely mentions any aspects of the video, leaving out most critical events."
        
        4. Fluency:
        5:
        "The summary is entirely free of grammatical errors and uses clear, polished, and natural language."
        4:
        "The summary is mostly fluent, with minor grammatical errors or slightly awkward phrasing that doesn’t hinder comprehension."
        3:
        "The summary is understandable but contains noticeable grammatical mistakes or unnatural phrasing that slightly disrupts reading."
        2:
        "The summary has frequent grammatical errors or poor sentence structure, making it hard to read."
        1:
        "The summary is riddled with grammatical errors and is incomprehensible."
        
        5. Conciseness:
        5:
        "The summary is concise and covers all critical information without including irrelevant details or repetition."
        4:
        "The summary is mostly concise but includes minor redundancies or slightly extraneous details."
        3:
        "The summary conveys the main points but is overly wordy or contains unnecessary information."
        2:
        "The summary is verbose, with excessive repetition or irrelevant tangents, detracting from its conciseness."
        1:
        "The summary is excessively repetitive and includes irrelevant content that overwhelms the key points."
        
        ---

        Now let's score this one. If the provided summary is describing an image instead of a video, score it as you would a video.
        Summary: {summary}

        Output in this format:
        Descriptiveness: [Score] (Reason: [Explanation])
        Coherence: [Score] (Reason: [Explanation])
        Completeness: [Score] (Reason: [Explanation])
        Fluency: [Score] (Reason: [Explanation])
        Conciseness: [Score] (Reason: [Explanation])"""


def _find_metric_line(response, metric):
    """Return the first line of `response` containing "<metric>:", or None if there is none."""
    label = f"{metric}:"
    for line in response.split('\n'):
        if label in line:
            return line
    return None


def evaluate_summary(summary, reference, num_evaluations=1):
    """Grade `summary` with an LLM judge and with reference-based overlap metrics.

    The judge prompt contains only `summary`; `reference` is used solely for
    ROUGE, BLEU and BERTScore. The judge is queried `num_evaluations` times
    and its per-metric scores are averaged.

    Args:
        summary: Candidate summary text.
        reference: Ground-truth description the candidate is compared with.
        num_evaluations: Number of judge calls to average; must be >= 1.

    Returns:
        A dict with the five averaged judge metrics (Descriptiveness,
        Coherence, Completeness, Fluency, Conciseness), 'Overall_Average',
        'ROUGE-1', 'ROUGE-2', 'ROUGE-L' (F-measures), 'BLEU', and
        'BERTScore_P' / 'BERTScore_R' / 'BERTScore_F1'.

    Raises:
        ValueError: If `num_evaluations` is less than 1 (previously this
            surfaced as a ZeroDivisionError while averaging).
    """
    if num_evaluations < 1:
        raise ValueError("num_evaluations must be at least 1")

    prompt = _build_judge_prompt(summary)
    aggregated_scores = {metric: [] for metric in _JUDGE_METRICS}

    for _ in range(num_evaluations):
        # Markdown emphasis is stripped so "**Fluency:** 4" parses like "Fluency: 4".
        response = generate_with_backoff(prompt).replace('*','')
        print(response)
        for metric in _JUDGE_METRICS:
            # Match on "<metric>:" rather than the bare name: another metric's
            # reason may mention this metric, and picking that line would
            # attribute the wrong score.
            score_line = _find_metric_line(response, metric)
            if score_line is None:
                # A metric missing from the reply (including an empty reply)
                # is scored as the minimum.
                print(f"{metric} not found")
                aggregated_scores[metric].append(1)
                continue
            metric_score = extract_score(score_line.split(f"{metric}:")[-1])
            aggregated_scores[metric].append(metric_score)

    # Calculate average scores
    average_scores = {metric: sum(scores) / len(scores) for metric, scores in aggregated_scores.items()}
    overall_average_score = sum(average_scores.values()) / len(average_scores)
    average_scores['Overall_Average'] = overall_average_score

    # Calculate ROUGE, BLEU and BERTScore against the reference
    rouge_scores = evaluate_rouge(reference, summary)
    bleu_score = evaluate_bleu(reference, summary)
    P, R, F1 = evaluate_bertscore(reference, summary)
    average_scores['ROUGE-1'] = rouge_scores['rouge1'].fmeasure
    average_scores['ROUGE-2'] = rouge_scores['rouge2'].fmeasure
    average_scores['ROUGE-L'] = rouge_scores['rougeL'].fmeasure
    average_scores['BLEU'] = bleu_score
    average_scores['BERTScore_P'] = P
    average_scores['BERTScore_R'] = R
    average_scores['BERTScore_F1'] = F1

    return average_scores

In [22]:
# Sanity check on one hand-picked example: run the full evaluation (three judge
# passes plus the overlap metrics) and print the resulting score dict.
reference = "A young girl is seen sitting in a chair with a person standing next to her.  The person next to her then piercing one ear followed by the other.  The person rubs lotion on the piercings afterwards."
example_summary = "a close-up of a young girl with blonde hair. She appears to be around 6-7 years old and has blonde hair. She is wearing a white shirt with a pink bow on the collar. The girl is looking off to the side with a serious expression on her face. A person's hand is visible on the left side of the image, holding a blue spray bottle and applying a white substance to the girl's ear. The background is blurred, but it seems like the focus is on the girl and the person applying the substance. The girl appears to be receiving a dental procedure, as she is looking up at the dentist with a concerned expression on her face. a"
example_scores = evaluate_summary(example_summary, reference, num_evaluations=3)
print(example_scores)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'ROUGE-1': 0.23602484472049692, 'ROUGE-2': 0.0880503144654088, 'ROUGE-L': 0.1987577639751553, 'BLEU': 0.013888089110663132, 'BERTScore_P': 0.06938610225915909, 'BERTScore_R': 0.18434451520442963, 'BERTScore_F1': 0.12763485312461853}


In [None]:
# Batch evaluation: score every model's summary for every row of the CSV and
# write the original columns plus all prefixed score columns to a new CSV.
df = pd.read_csv("./summarized_dense_descriptions_1000_final (2).csv")

# Number of LLM-judge passes averaged per summary
num_evaluations = 5

# Per-model score dicts, one entry per row of `df`
baseline_results = []
bart_results = []
t5_movie_results = []
t5_synthetic_results = []

# (CSV column holding the summary, list that collects its score dicts)
summary_sources = [
    ('bart_summarized_video_captions', bart_results),
    ('t5_finetuned_on_movie_summarized_video_captions', t5_movie_results),
    ('t5_finetuned_on_synthetic_summarized_video_captions', t5_synthetic_results),
]

for _, row in df.iterrows():
    reference = row['description']
    # Read every summary up front so a missing column fails before any API call.
    row_summaries = [row[column] for column, _ in summary_sources]

    for candidate, (_, collected) in zip(row_summaries, summary_sources):
        print(candidate)
        collected.append(evaluate_summary(candidate, reference, num_evaluations=num_evaluations))

# One score DataFrame per model, columns prefixed with the model name
bart_results_df = pd.DataFrame(bart_results).add_prefix('bart_')
t5_movie_results_df = pd.DataFrame(t5_movie_results).add_prefix('t5_movie_')
t5_synthetic_results_df = pd.DataFrame(t5_synthetic_results).add_prefix('t5_synthetic_')

# Place the score columns next to the original data (aligned on the row index)
score_frames = [bart_results_df, t5_movie_results_df, t5_synthetic_results_df]
final_df = pd.concat([df] + score_frames, axis=1)

# Persist the combined table
final_df.to_csv('evaluated_summaries.csv', index=False)