In [1]:
# Clear pip cache to avoid stale packages
!pip cache purge

# Uninstall pre-installed packages to avoid conflicts
!pip uninstall -y transformers huggingface_hub tokenizers datasets accelerate torch torchvision torchaudio

# Install compatible versions of required libraries
!pip install transformers==4.35.0
!pip install huggingface_hub==0.23.0  # Ensure compatibility with transformers==4.35.0
!pip install pandas==2.0.3
!pip install rouge-score==0.1.2
!pip install nltk==3.8.1
!pip install scipy==1.10.1
!pip install matplotlib==3.7.1
!pip install seaborn==0.12.2
!pip install torch==2.0.1  # Compatible PyTorch version

# Verify installed versions
import transformers
import huggingface_hub
import torch
print(f"Transformers version: {transformers.__version__}")
print(f"Huggingface_hub version: {huggingface_hub.__version__}")
print(f"Torch version: {torch.__version__}")

# Download NLTK resources
import nltk
nltk.download('punkt')

print("Installation complete. Please restart the runtime (Runtime > Restart runtime) and proceed to the next cell.")

Files removed: 84
Found existing installation: transformers 4.35.0
Uninstalling transformers-4.35.0:
  Successfully uninstalled transformers-4.35.0
Found existing installation: huggingface-hub 0.23.0
Uninstalling huggingface-hub-0.23.0:
  Successfully uninstalled huggingface-hub-0.23.0
Found existing installation: tokenizers 0.14.1
Uninstalling tokenizers-0.14.1:
  Successfully uninstalled tokenizers-0.14.1
[0mCollecting transformers==4.35.0
  Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.1/123.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.35.0)
  Downloading huggingface_hub-0.32.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.15,>=0.14 (from transformers==4.35.0)
  Downloading tokenizers-0.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Import libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
import pandas as pd
from rouge_score import rouge_scorer
import nltk
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Checkpoint identifiers for the two summarisation models
t5_model_name = "t5-small"
bart_model_name = "facebook/bart-large-cnn"

# T5-small: tokenizer first, then the seq2seq weights
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

# BART-large-cnn: same two-step load
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)

# Plot/summary demonstration pairs used to build the few-shot prompt
few_shot_examples = [
    dict(
        plot="A detective investigates a series of murders in a small town, uncovering a conspiracy.",
        summary="A detective solves a murder mystery in a small town, revealing a hidden conspiracy.",
    ),
    dict(
        plot="A young girl discovers a magical world where she must save her new friends from an evil sorcerer.",
        summary="A girl enters a magical world to defeat an evil sorcerer and save her friends.",
    ),
    dict(
        plot="A robot befriends a human child in a futuristic city, learning about emotions.",
        summary="A robot and a child form a bond in a futuristic city, exploring human emotions.",
    ),
    dict(
        plot="A struggling comedian finds love and success after a series of hilarious mishaps.",
        summary="A comedian overcomes obstacles to find love and achieve success.",
    ),
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [5]:
import pandas as pd
import json
import os

# Function to preprocess CMU dataset
def preprocess_cmu_data(metadata_path, summaries_path, output_path='movie_data.csv', sample_size=100,
                        summary_words=50, random_state=42):
    """Preprocess CMU Movie Summary Corpus to create a CSV with movie_id, plot, reference_summary, and genre.

    Args:
        metadata_path: Path to the tab-separated metadata file (9 columns, no header).
        summaries_path: Path to the tab-separated plot file (movie_id, plot; no header).
        output_path: Where the resulting CSV is written.
        sample_size: Maximum number of movies to keep; fewer are kept when the
            merged data is smaller.
        summary_words: Number of leading plot words used as the proxy
            ``reference_summary``.
        random_state: Seed passed to ``DataFrame.sample`` so the subset is reproducible.

    Returns:
        The sampled DataFrame with columns ``movie_id``, ``plot``,
        ``reference_summary`` and ``genre``, or ``None`` when any step raises
        (the error is printed instead of propagated).
    """
    def parse_genres(genre_str):
        """Return the first genre name from a JSON object string, or 'Unknown'."""
        try:
            genres = json.loads(genre_str)
        except (TypeError, ValueError):
            # TypeError: missing value (NaN) or other non-string input.
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            return 'Unknown'
        if isinstance(genres, dict) and genres:
            return next(iter(genres.values()))
        return 'Unknown'

    def create_summary(plot):
        """Use the first `summary_words` whitespace-separated words as a proxy summary."""
        return ' '.join(plot.split()[:summary_words])

    try:
        # Read plot summaries (tab-separated)
        summaries_df = pd.read_csv(summaries_path, sep='\t', names=['movie_id', 'plot'], on_bad_lines='skip')

        # Read metadata (tab-separated)
        metadata_df = pd.read_csv(metadata_path, sep='\t', header=None, on_bad_lines='skip')
        # Assign column names based on CMU documentation
        metadata_df.columns = ['movie_id', 'freebase_id', 'title', 'release_date', 'revenue',
                               'runtime', 'languages', 'countries', 'genres']

        # Parse genres (stored as JSON strings)
        metadata_df['genre'] = metadata_df['genres'].apply(parse_genres)

        # Merge summaries and metadata on movie_id
        merged_df = pd.merge(summaries_df, metadata_df[['movie_id', 'title', 'genre']],
                             on='movie_id', how='inner')

        # A row with an empty plot is parsed as NaN; calling .split() on it would
        # raise and abort the whole run, so such rows are dropped first.
        merged_df = merged_df.dropna(subset=['plot'])

        # .assign builds a fresh frame, avoiding a chained-assignment warning on
        # the filtered result.
        merged_df = merged_df.assign(reference_summary=merged_df['plot'].apply(create_summary))

        # Select required columns
        final_df = merged_df[['movie_id', 'plot', 'reference_summary', 'genre']]

        # Sample a subset to avoid memory issues
        final_df = final_df.sample(n=min(sample_size, len(final_df)), random_state=random_state)

        # Save to CSV
        final_df.to_csv(output_path, index=False)
        print(f"Preprocessed data saved to '{output_path}' with {len(final_df)} movies.")
        return final_df

    except Exception as e:
        # Best-effort by design: callers receive None rather than an exception.
        print(f"Error preprocessing data: {e}")
        return None

# Locate the two raw CMU files; fall back to an interactive Colab upload if either is absent
metadata_path = 'movie.metadata.tsv'
summaries_path = 'plot_summaries.txt'

inputs_present = os.path.exists(metadata_path) and os.path.exists(summaries_path)

if not inputs_present:
    print("One or both files not found. Please ensure 'movie.metadata.tsv' and 'plot_summaries.txt' are uploaded.")
    from google.colab import files
    uploaded = files.upload()
    both_uploaded = 'movie.metadata.tsv' in uploaded and 'plot_summaries.txt' in uploaded
    if not both_uploaded:
        print("Required files not uploaded. Please upload both 'movie.metadata.tsv' and 'plot_summaries.txt'.")
    else:
        df = preprocess_cmu_data('movie.metadata.tsv', 'plot_summaries.txt')
else:
    df = preprocess_cmu_data(metadata_path, summaries_path)

Preprocessed data saved to 'movie_data.csv' with 100 movies.


In [6]:
# Import libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
import pandas as pd
from rouge_score import rouge_scorer
import nltk
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import seaborn as sns
import os

# NOTE(review): this cell repeats the model setup of an earlier cell; running it
# again reloads both checkpoints and rebinds the same names.

# Checkpoint identifiers for the two summarisation models
t5_model_name = "t5-small"
bart_model_name = "facebook/bart-large-cnn"

# T5-small: tokenizer first, then the seq2seq weights
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

# BART-large-cnn: same two-step load
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)

# Plot/summary demonstration pairs used to build the few-shot prompt
few_shot_examples = [
    dict(
        plot="A detective investigates a series of murders in a small town, uncovering a conspiracy.",
        summary="A detective solves a murder mystery in a small town, revealing a hidden conspiracy.",
    ),
    dict(
        plot="A young girl discovers a magical world where she must save her new friends from an evil sorcerer.",
        summary="A girl enters a magical world to defeat an evil sorcerer and save her friends.",
    ),
    dict(
        plot="A robot befriends a human child in a futuristic city, learning about emotions.",
        summary="A robot and a child form a bond in a futuristic city, exploring human emotions.",
    ),
    dict(
        plot="A struggling comedian finds love and success after a series of hilarious mishaps.",
        summary="A comedian overcomes obstacles to find love and achieve success.",
    ),
]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Load data
def load_data(file_path):
    """Read the movie plot CSV at ``file_path`` and hand it back as a DataFrame."""
    movie_frame = pd.read_csv(file_path)
    return movie_frame

# Analyze genre distribution
def analyze_genres(df):
    """Chart how many movies fall under each genre and save it as 'genre_distribution.png'.

    Prints a notice and does nothing else when ``df`` has no 'genre' column.
    """
    # Guard: nothing to chart without the genre column
    if 'genre' not in df.columns:
        print("No 'genre' column found in dataset.")
        return

    counts_by_genre = df['genre'].value_counts()

    plt.figure(figsize=(10, 6))
    sns.barplot(x=counts_by_genre.values, y=counts_by_genre.index, palette='viridis')
    plt.title('Distribution of Movie Genres in Sample')
    plt.xlabel('Number of Movies')
    plt.ylabel('Genre')

    # Persist the figure, then close it so it is not kept open
    plt.savefig('genre_distribution.png')
    plt.close()
    print("Genre distribution saved to 'genre_distribution.png'")

# Load preprocessed data
# Reads 'movie_data.csv' into `df` (rebinding any earlier `df`) and saves the genre chart.
# Only FileNotFoundError is handled here; any other error propagates.
try:
    df = load_data('movie_data.csv')
    print("Data loaded successfully.")
    analyze_genres(df)
except FileNotFoundError:
    # Point the reader back at the preprocessing step that writes the CSV
    print("Error: 'movie_data.csv' not found. Ensure the preprocessing cell was run successfully.")
    print("Please run the preprocessing cell to create 'movie_data.csv' from 'movie.metadata.tsv' and 'plot_summaries.txt'.")

Data loaded successfully.
Genre distribution saved to 'genre_distribution.png'


In [8]:
# Create Few-shot Prompt
def create_few_shot_prompt(plot, examples=None):
    """Create a Few-shot prompt with diverse examples.

    Args:
        plot: The movie plot to be summarised; placed last in the prompt.
        examples: Optional iterable of dicts with 'plot' and 'summary' keys used
            as demonstrations. Defaults to the module-level ``few_shot_examples``,
            which keeps existing one-argument calls unchanged.

    Returns:
        The instruction line, one "Plot/Summary" pair per example, and a final
        "Plot: <plot>" followed by an open "Summary:" for the model to complete.
    """
    if examples is None:
        # Resolved at call time so one-argument callers still use the notebook's global list
        examples = few_shot_examples

    header = "Summarize the following movie plot in 50 words or less:\n\n"
    demonstrations = "".join(
        f"Plot: {example['plot']}\nSummary: {example['summary']}\n\n"
        for example in examples
    )
    return f"{header}{demonstrations}Plot: {plot}\nSummary:"

# Create Zero-shot Prompt
def create_zero_shot_prompt(plot):
    """Build the instruction-only prompt: a fixed instruction followed by the plot text."""
    instruction = "Summarize the following movie plot in 50 words, focusing on main characters and conflict: "
    # format() applies the same conversion an f-string placeholder would
    return instruction + format(plot)

# Generate summary with T5
def generate_t5_summary(plot, prompt_type="few-shot", max_length=100):
    """Summarise ``plot`` with the T5 model using the requested prompting strategy.

    Args:
        plot: Plot text to summarise.
        prompt_type: "few-shot" or "zero-shot"; any other value falls back to
            the plain "summarize: " prefix (the baseline).
        max_length: Forwarded to ``generate`` as its ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Choose the prompt wording; anything unrecognised is treated as the baseline
    prompt = (
        create_few_shot_prompt(plot) if prompt_type == "few-shot"
        else create_zero_shot_prompt(plot) if prompt_type == "zero-shot"
        else f"summarize: {plot}"
    )

    # The whole prompt (instructions included) is truncated to 512 tokens
    encoded = t5_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    generation_options = dict(max_length=max_length, num_beams=4, early_stopping=True)
    generated_ids = t5_model.generate(encoded["input_ids"], **generation_options)

    return t5_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Generate summary with BART
def generate_bart_summary(plot, max_length=100):
    """Summarise ``plot`` with the BART model; the raw plot is the only input (no prompt).

    ``max_length`` is forwarded to ``generate``. Returns the first generated
    sequence, decoded with special tokens skipped.
    """
    # Input is truncated to 512 tokens before generation
    encoded = bart_tokenizer(plot, return_tensors="pt", max_length=512, truncation=True)

    generation_options = dict(max_length=max_length, num_beams=4, early_stopping=True)
    generated_ids = bart_model.generate(encoded["input_ids"], **generation_options)

    return bart_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Evaluate summary
def evaluate_summary(reference, generated):
    """Evaluate summary using ROUGE and BLEU metrics.

    Args:
        reference: Reference summary text.
        generated: Model-produced summary text.

    Returns:
        Dict with keys 'rouge1', 'rouge2', 'rougeL' (F-measures, computed with
        stemming enabled) and 'bleu' (sentence-level BLEU over whitespace
        tokens; no smoothing function is passed). 'bleu' is 0.0 when the BLEU
        computation raises.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    rouge_scores = scorer.score(reference, generated)

    try:
        bleu_score = nltk.translate.bleu_score.sentence_bleu([reference.split()], generated.split())
    except Exception:
        # Deliberately best-effort (e.g. an empty summary), but limited to Exception
        # so KeyboardInterrupt / SystemExit still stop a long-running experiment.
        bleu_score = 0.0

    scores = {rouge_type: rouge_scores[rouge_type].fmeasure for rouge_type in rouge_types}
    scores['bleu'] = bleu_score
    return scores

In [11]:
# Main experiment
# For every generation length and every sampled movie, produce four summaries
# (T5 baseline / few-shot / zero-shot and BART), score each against the
# reference, and collect one flat record per (movie, max_length).
max_lengths = [60, 100, 150]
results = []

# Use a smaller sample for testing. Capped at the frame size because
# DataFrame.sample raises when asked for more rows than exist.
sample_df = df.sample(n=min(5, len(df)), random_state=42)

# (output-column prefix, prompt_type) for each T5 strategy, in output-column order
t5_variants = [
    ('baseline', 'baseline'),
    ('few_shot', 'few-shot'),
    ('zero_shot', 'zero-shot'),
]
metric_names = ['rouge1', 'rouge2', 'rougeL', 'bleu']

for max_len in max_lengths:
    print(f"Processing max_length = {max_len}")
    for index, row in sample_df.iterrows():
        plot = row['plot']
        reference = row['reference_summary']
        movie_id = row.get('movie_id', index)

        try:
            record = {
                'movie_id': movie_id,
                'max_length': max_len,
                'plot': plot,
                'reference_summary': reference,
            }

            # T5 variants: a failure in any of them skips the whole movie (outer except)
            for prefix, prompt_type in t5_variants:
                summary = generate_t5_summary(plot, prompt_type=prompt_type, max_length=max_len)
                scores = evaluate_summary(reference, summary)
                record[f'{prefix}_summary'] = summary
                for metric in metric_names:
                    record[f'{prefix}_{metric}'] = scores[metric]

            # BART (with error handling): a failure is recorded as "Failed" with
            # zero scores instead of dropping the movie.
            try:
                bart_summary = generate_bart_summary(plot, max_length=max_len)
                bart_scores = evaluate_summary(reference, bart_summary)
            except Exception as e:
                print(f"BART failed for movie {movie_id}, max_length={max_len}: {e}")
                bart_summary = "Failed"
                bart_scores = dict.fromkeys(metric_names, 0.0)

            record['bart_summary'] = bart_summary
            for metric in metric_names:
                record[f'bart_{metric}'] = bart_scores[metric]

            results.append(record)
            print(f"Completed movie {movie_id} for max_length={max_len}")

        except Exception as e:
            # Report and move on to the next movie
            print(f"Error processing movie {movie_id}, max_length={max_len}: {e}")

# Save results
results_df = pd.DataFrame(results)
results_df.to_csv('summary_results_extended.csv', index=False)
print("Results saved to 'summary_results_extended.csv'")

Processing max_length = 60
Completed movie 18760565 for max_length=60
Completed movie 7363831 for max_length=60
Completed movie 35965448 for max_length=60
Completed movie 4969625 for max_length=60
Completed movie 31323048 for max_length=60
Processing max_length = 100
Completed movie 18760565 for max_length=100
Completed movie 7363831 for max_length=100
Completed movie 35965448 for max_length=100
Completed movie 4969625 for max_length=100
Completed movie 31323048 for max_length=100
Processing max_length = 150
Completed movie 18760565 for max_length=150
Completed movie 7363831 for max_length=150
Completed movie 35965448 for max_length=150
Completed movie 4969625 for max_length=150
Completed movie 31323048 for max_length=150
Results saved to 'summary_results_extended.csv'


In [12]:
# Statistical analysis
def statistical_analysis(results_df):
    """Print paired t-tests of the baseline ROUGE-1 against every other approach.

    One group of tests is printed for each value in the module-level
    ``max_lengths``, using only the rows of ``results_df`` with that ``max_length``.
    """
    # (label shown in the output, score column) for each approach compared to the baseline
    challengers = [
        ('Few-shot', 'few_shot_rouge1'),
        ('Zero-shot', 'zero_shot_rouge1'),
        ('BART', 'bart_rouge1'),
    ]

    for max_len in max_lengths:
        print(f"\nStatistical Analysis for max_length = {max_len}")
        rows = results_df[results_df['max_length'] == max_len]

        # Pull every score column up front, before running any test
        baseline_scores = rows['baseline_rouge1']
        challenger_scores = [(label, rows[column]) for label, column in challengers]

        for label, scores in challenger_scores:
            t_stat, p_value = ttest_rel(baseline_scores, scores)
            print(f"Baseline vs {label} (ROUGE-1): T={t_stat:.4f}, P={p_value:.4f}")

# Plot metrics
def plot_metrics(results_df):
    """Plot comparison of ROUGE-1 scores for models.

    Draws one line per value in the module-level ``max_lengths``; each line
    connects the mean ROUGE-1 of the four approaches computed over the rows of
    ``results_df`` with that ``max_length``. The figure is written to
    'scores_comparison.png' and then closed.
    """
    metrics = ['baseline_rouge1', 'few_shot_rouge1', 'zero_shot_rouge1', 'bart_rouge1']
    labels = ['Baseline', 'Few-shot', 'Zero-shot', 'BART']

    plt.figure(figsize=(12, 6))
    for max_len in max_lengths:
        # Filter once per length instead of once per metric
        rows = results_df[results_df['max_length'] == max_len]
        means = [rows[metric].mean() for metric in metrics]
        plt.plot(labels, means, marker='o', label=f'Max Length {max_len}')
    plt.title('Average ROUGE-1 Scores by Model and Max Length')
    plt.ylabel('ROUGE-1 Score')
    plt.legend()
    plt.savefig('scores_comparison.png')
    plt.close()
    print("Scores comparison saved to 'scores_comparison.png'")

# Run analysis and visualization
# Prints the paired t-test results per max_length, then writes 'scores_comparison.png'.
# Both calls read `results_df` built by the experiment cell.
statistical_analysis(results_df)
plot_metrics(results_df)


Statistical Analysis for max_length = 60
Baseline vs Few-shot (ROUGE-1): T=13.5921, P=0.0002
Baseline vs Zero-shot (ROUGE-1): T=1.0221, P=0.3645
Baseline vs BART (ROUGE-1): T=-1.1035, P=0.3317

Statistical Analysis for max_length = 100
Baseline vs Few-shot (ROUGE-1): T=9.6956, P=0.0006
Baseline vs Zero-shot (ROUGE-1): T=1.3249, P=0.2558
Baseline vs BART (ROUGE-1): T=-0.2031, P=0.8490

Statistical Analysis for max_length = 150
Baseline vs Few-shot (ROUGE-1): T=9.6281, P=0.0007
Baseline vs Zero-shot (ROUGE-1): T=1.3960, P=0.2352
Baseline vs BART (ROUGE-1): T=0.1544, P=0.8848
Scores comparison saved to 'scores_comparison.png'
