<a href="https://colab.research.google.com/github/Vibhu-Maurya/Machine-Learning-project/blob/main/T5ReadMe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
Optimized T5 Model Training Pipeline for CNN/DailyMail Text Summarization
"""

# Install required packages (needed on Colab; comment this line out if they are already installed)
!pip install -q datasets pandas pyarrow transformers evaluate rouge_score accelerate tensorboard sentencepiece

import os
import re
import json
import random
import pprint
import logging
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union

import torch
import numpy as np
import pandas as pd
import evaluate
from datasets import load_dataset, Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
# Set up logging: emit INFO-and-above records so the pipeline's progress
# messages appear in the cell output.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the classes and functions defined below.
logger = logging.getLogger(__name__)

# ====== CONFIGURATION ======
class Config:
    """Centralized configuration for data preparation and model training.

    All settings are class attributes, so they can be read either from the
    class or from an instance. Instantiating the class additionally makes sure
    the two output directories exist.
    """
    # Data preprocessing
    OUT_DIR = "cnn_dm_prepared"
    FILTER_STRATEGY = "strict"  # "strict" (60-70), "soft" (50-90), or "none"
    STRICT_RANGE = (60, 70)
    SOFT_RANGE = (50, 90)
    SHUFFLE_SEED = 42
    SAVE_FORMATS = ["jsonl", "csv", "parquet"]
    ADD_META = True

    # Model training
    MODEL_NAME = 't5-small'
    BATCH_SIZE = 8
    NUM_PROCS = min(4, os.cpu_count() or 1)
    EPOCHS = 3
    OUT_MODEL_DIR = '/content/drive/MyDrive/results_t5small'
    MAX_INPUT_LENGTH = 512
    MAX_TARGET_LENGTH = 150
    LEARNING_RATE = 3e-4
    WARMUP_STEPS = 500
    WEIGHT_DECAY = 0.01
    GRADIENT_ACCUMULATION_STEPS = 2

    def __init__(self):
        # ``__post_init__`` is only invoked automatically for dataclasses; this
        # is a plain class, so without this call the hook below never runs and
        # the output directories are never created.
        self.__post_init__()

    def __post_init__(self):
        """Create ``OUT_DIR`` and ``OUT_MODEL_DIR`` if they do not exist yet.

        A failure to create a directory (e.g. a read-only or unmounted
        location) is logged as a warning instead of raised, so building the
        configuration object never aborts the notebook by itself.
        """
        for directory in (self.OUT_DIR, self.OUT_MODEL_DIR):
            try:
                os.makedirs(directory, exist_ok=True)
            except OSError as exc:
                logging.getLogger(__name__).warning(
                    "Could not create directory %s: %s", directory, exc
                )

# Shared configuration instance passed to the pipeline components in main().
config = Config()

In [None]:
# =============================

class DataProcessor:
    """Loads, cleans, filters and persists the CNN/DailyMail dataset.

    Each raw example (``article`` / ``highlights``) is whitespace-normalised,
    optionally filtered by the word count of its summary, and converted into an
    ``input_text`` / ``target_text`` record.
    """

    def __init__(self, config: "Config"):
        """Store the configuration and pre-compile the regexes used per record.

        Args:
            config: Object exposing the attributes read by this class
                (``OUT_DIR``, ``FILTER_STRATEGY``, ``STRICT_RANGE``,
                ``SOFT_RANGE``, ``SHUFFLE_SEED``, ``SAVE_FORMATS``, ``ADD_META``).
        """
        self.config = config
        self._whitespace_re = re.compile(r"\s+")
        self._word_re = re.compile(r"[A-Za-z0-9']+")
        logger.info(f"Saving to: {os.path.abspath(config.OUT_DIR)}")

    def clean_text(self, text: str) -> str:
        """Strip ``text`` and collapse every whitespace run into one space.

        ``None`` (or any falsy value) is treated as the empty string.
        """
        text = (text or "").strip()
        return self._whitespace_re.sub(" ", text)

    def word_count(self, text: str) -> int:
        """Count the runs of ASCII letters, digits and apostrophes in ``text``."""
        return len(self._word_re.findall(text or ""))

    def in_range(self, n: int, low: int, high: int) -> bool:
        """Return True when ``low <= n <= high`` (both bounds inclusive)."""
        return low <= n <= high

    def get_filter_range(self) -> Optional[Tuple[int, int]]:
        """Return the summary word-count range for the configured strategy.

        Returns:
            ``(low, high)`` for ``"strict"`` / ``"soft"``, or ``None`` for
            ``"none"`` (no filtering).

        Raises:
            ValueError: If ``FILTER_STRATEGY`` is not one of the known names.
                Previously a typo silently disabled filtering.
        """
        strategy_map = {
            "strict": self.config.STRICT_RANGE,
            "soft": self.config.SOFT_RANGE,
            "none": None
        }
        strategy = self.config.FILTER_STRATEGY
        if strategy not in strategy_map:
            raise ValueError(
                f"Unknown FILTER_STRATEGY {strategy!r}; "
                f"expected one of {sorted(strategy_map)}"
            )
        return strategy_map[strategy]

    def transform_record(self, example: Dict, split_name: str) -> Optional[Dict]:
        """Clean one raw example and apply the summary-length filter.

        Args:
            example: Mapping with ``article`` and ``highlights`` (and optionally
                ``id``) keys.
            split_name: Name of the split, stored in the record's metadata.

        Returns:
            The transformed record, or ``None`` when the article or summary is
            empty after cleaning or the summary falls outside the filter range.
        """
        article = self.clean_text(example.get("article", ""))
        summary = self.clean_text(example.get("highlights", ""))

        if not article or not summary:
            return None

        word_count = self.word_count(summary)
        filter_range = self.get_filter_range()

        if filter_range and not self.in_range(word_count, *filter_range):
            return None

        record = {
            "input_text": article,
            "target_text": summary,
        }

        if self.config.ADD_META:
            record.update({
                "summary_word_count": word_count,
                "id": example.get("id"),
                "split": split_name,
                "source": "cnn_dailymail_3.0.0",
            })

        return record

    def process_split(self, dataset_split, split_name: str) -> pd.DataFrame:
        """Transform, filter and deterministically shuffle one dataset split.

        Args:
            dataset_split: Iterable of raw examples.
            split_name: Name used in the log line and record metadata.

        Returns:
            DataFrame with one row per kept record (empty if nothing was kept).
        """
        rows = []
        total = 0

        for example in dataset_split:
            total += 1
            record = self.transform_record(example, split_name)
            if record is not None:
                rows.append(record)

        kept = len(rows)

        # A dedicated RNG yields the same order as seeding the global one, but
        # does not reset the random state other code may rely on.
        random.Random(self.config.SHUFFLE_SEED).shuffle(rows)

        # Guard the ratio so an empty split is reported instead of crashing.
        kept_ratio = kept / total if total else 0.0
        logger.info(f"[{split_name}] kept {kept} / {total} ({kept_ratio:.1%})")
        return pd.DataFrame(rows)

    def save_dataframe(self, df: pd.DataFrame, filepath: str) -> None:
        """Write ``df`` once per entry of ``SAVE_FORMATS``.

        ``filepath`` is a path prefix; the format name is appended as the
        extension. Unrecognised formats are skipped with a warning.
        """
        for format_type in self.config.SAVE_FORMATS:
            if format_type == "jsonl":
                df.to_json(f"{filepath}.jsonl", orient="records", lines=True, force_ascii=False)
            elif format_type == "csv":
                df.to_csv(f"{filepath}.csv", index=False)
            elif format_type == "parquet":
                df.to_parquet(f"{filepath}.parquet", index=False)
            else:
                logger.warning(f"Skipping unsupported save format: {format_type!r}")

    def load_and_process_dataset(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Download CNN/DailyMail 3.0.0, process every split and save the result.

        Returns:
            ``(train_df, val_df, test_df)``.

        Raises:
            AssertionError: If any split is empty after filtering.
        """
        # Local import: the file-level import only brings in ``datetime``.
        from datetime import timezone

        logger.info("Loading CNN/DailyMail dataset...")
        dataset = load_dataset("cnn_dailymail", "3.0.0")

        # Process splits
        train_df = self.process_split(dataset["train"], "train")
        val_df = self.process_split(dataset["validation"], "validation")
        test_df = self.process_split(dataset["test"], "test")
        named_splits = [("train", train_df), ("validation", val_df), ("test", test_df)]

        # Validate non-empty dataframes
        for name, df in named_splits:
            assert len(df) > 0, f"{name} is empty after filtering — loosen FILTER_STRATEGY or ranges."

        # Display sample data
        for name, df in named_splits:
            logger.info(f"\n=== {name} ===")
            sample_cols = ["input_text", "target_text", "summary_word_count"] if "summary_word_count" in df.columns else ["input_text", "target_text"]
            print(df.head(2)[sample_cols])

        # Save processed data. ``datetime.utcnow()`` is deprecated; a
        # timezone-aware "now" formats to the same UTC timestamp string.
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        base_path = os.path.join(self.config.OUT_DIR, f"cnn_dm_{self.config.FILTER_STRATEGY}_{timestamp}")

        # Ensure the output directory exists
        os.makedirs(self.config.OUT_DIR, exist_ok=True)

        self.save_dataframe(train_df, f"{base_path}_train")
        self.save_dataframe(val_df, f"{base_path}_validation")
        self.save_dataframe(test_df, f"{base_path}_test")

        logger.info(f"Saved files with prefix: {base_path}")
        return train_df, val_df, test_df


class DataAnalyzer:
    """Computes and logs descriptive statistics about text collections"""

    @staticmethod
    def analyze_text_lengths(texts: List[str], text_type: str = "text") -> Dict:
        """Summarise whitespace-delimited word counts for a list of texts.

        Args:
            texts: Strings to measure; each is split on whitespace.
            text_type: Label used in the log messages.

        Returns:
            Dict with ``max_length`` (longest text in words, 0 when empty),
            ``avg_length`` (mean word count, 0 when empty) and ``counters``
            (how many texts exceed 4000 / 2000 / 1000 / 500 words).
        """
        lengths = [len(document.split()) for document in texts]

        # Label -> exclusive lower bound, in the order they are reported.
        thresholds = (("4k", 4000), ("2k", 2000), ("1k", 1000), ("500", 500))
        counters = {
            label: sum(1 for n_words in lengths if n_words > limit)
            for label, limit in thresholds
        }

        max_length = max(lengths, default=0)
        if lengths:
            avg_length = sum(lengths) / len(lengths)
        else:
            avg_length = 0

        logger.info(f"=== {text_type.upper()} ANALYSIS ===")
        logger.info(f"Longest {text_type}: {max_length} words")
        logger.info(f"Average {text_type}: {avg_length:.1f} words")
        for threshold in counters:
            logger.info(f"{text_type.capitalize()}s > {threshold} words: {counters[threshold]}")

        return {
            "max_length": max_length,
            "avg_length": avg_length,
            "counters": counters
        }


class ModelTrainer:
    """Fine-tunes a T5 model for summarization and evaluates it with ROUGE.

    Evaluation during training is teacher-forced: predictions are the argmax
    of the logits produced while feeding the reference summary to the decoder,
    not free-running generation. The reported ROUGE is therefore a training
    signal, not a substitute for generation-based evaluation.
    """

    def __init__(self, config: "Config"):
        """Load tokenizer, model and metric for ``config.MODEL_NAME``.

        Args:
            config: Object exposing the training attributes of ``Config``.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device: {self.device}")

        # Initialize tokenizer and model
        self.tokenizer = T5Tokenizer.from_pretrained(config.MODEL_NAME)
        self.model = T5ForConditionalGeneration.from_pretrained(config.MODEL_NAME)
        self.model.to(self.device)

        # Initialize evaluation metric
        self.rouge = evaluate.load("rouge")

        # Log model parameters
        self._log_model_params()

    def _log_model_params(self):
        """Log the total and trainable parameter counts of the model."""
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        logger.info(f"Total parameters: {total_params:,}")
        logger.info(f"Trainable parameters: {trainable_params:,}")

    def preprocess_function(self, examples: Dict) -> Dict:
        """Tokenize a batch of ``input_text`` / ``target_text`` pairs.

        Sequences are truncated but NOT padded here: padding every example to
        the maximum length wastes compute and made the reported generation
        length a constant. The ``DataCollatorForSeq2Seq`` used in ``train``
        pads each batch to its longest member and fills label padding with
        ``-100``.

        Args:
            examples: Batch dict with ``input_text`` and ``target_text`` lists.

        Returns:
            Tokenizer output for the inputs with an added ``labels`` entry.
        """
        inputs = [f"summarize: {article}" for article in examples['input_text']]
        model_inputs = self.tokenizer(
            inputs,
            max_length=self.config.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # ``text_target`` replaces the deprecated ``as_target_tokenizer()``
        # context manager.
        labels = self.tokenizer(
            text_target=examples['target_text'],
            max_length=self.config.MAX_TARGET_LENGTH,
            truncation=True,
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def compute_metrics(self, eval_pred) -> Dict:
        """Compute ROUGE-1/2/L and the mean prediction length.

        Args:
            eval_pred: Object with ``predictions`` (token ids, logits, or a
                tuple whose first element is one of those) and ``label_ids``.

        Returns:
            Dict of metric name -> value rounded to four decimals.
        """
        predictions, labels = eval_pred.predictions, eval_pred.label_ids

        # Handle different prediction formats
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.asarray(predictions)
        labels = np.asarray(labels)

        if predictions.ndim == 3:  # Logits
            predictions = np.argmax(predictions, axis=-1)

        pad_id = self.tokenizer.pad_token_id
        label_mask = labels != -100

        # With dynamic padding the evaluation loop fills ragged batches with
        # -100, which the tokenizer cannot decode.
        predictions = np.where(predictions != -100, predictions, pad_id)
        # Teacher-forced predictions are only meaningful where a label exists;
        # positions past the end of the reference hold arbitrary tokens that
        # would otherwise pollute ROUGE and the length statistic.
        if predictions.shape == labels.shape:
            predictions = np.where(label_mask, predictions, pad_id)
        labels = np.where(label_mask, labels, pad_id)

        # Decode predictions and labels
        decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Compute ROUGE scores
        result = self.rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True,
            rouge_types=['rouge1', 'rouge2', 'rougeL']
        )

        # Add generation length
        prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)

        return {k: round(float(v), 4) for k, v in result.items()}

    @staticmethod
    def preprocess_logits_for_metrics(logits, labels):
        """Reduce logits to token ids before they are accumulated for metrics.

        Keeping only the argmax avoids storing a full vocabulary-sized logit
        tensor for every evaluation example.

        Args:
            logits: Logits tensor, or a tuple/list whose first element is the
                logits tensor.
            labels: Passed through unchanged.

        Returns:
            ``(pred_ids, labels)``.
        """
        if isinstance(logits, (tuple, list)):
            logits = logits[0]
        pred_ids = torch.argmax(logits, dim=-1)
        return pred_ids, labels

    def _tokenize_dataset(self, dataset):
        """Apply ``preprocess_function`` to ``dataset`` and drop the raw columns."""
        return dataset.map(
            self.preprocess_function,
            batched=True,
            num_proc=1,  # single process: avoid CUDA fork problem
            remove_columns=dataset.column_names
        )

    def prepare_datasets(self, train_df: pd.DataFrame, val_df: pd.DataFrame) -> Tuple["Dataset", "Dataset"]:
        """Convert the DataFrames to tokenized HuggingFace datasets.

        Args:
            train_df: Training records with ``input_text`` / ``target_text``.
            val_df: Validation records with the same columns.

        Returns:
            ``(tokenized_train, tokenized_val)``.
        """
        # Convert to HuggingFace datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)

        logger.info(f"Train dataset size: {len(train_dataset)}")
        logger.info(f"Validation dataset size: {len(val_dataset)}")

        # Display sample (skipped for an empty dataset instead of raising)
        if len(train_dataset) > 0:
            logger.info("\nSample from training data:")
            sample = train_dataset[0]
            logger.info(f"Input: {sample['input_text'][:200]}...")
            logger.info(f"Target: {sample['target_text'][:200]}...")

        logger.info("Tokenizing datasets...")
        tokenized_train = self._tokenize_dataset(train_dataset)
        tokenized_val = self._tokenize_dataset(val_dataset)

        return tokenized_train, tokenized_val

    def create_training_args(self) -> "TrainingArguments":
        """Build the ``TrainingArguments`` from the configuration."""
        return TrainingArguments(
            output_dir=self.config.OUT_MODEL_DIR,
            num_train_epochs=self.config.EPOCHS,
            per_device_train_batch_size=self.config.BATCH_SIZE,
            per_device_eval_batch_size=self.config.BATCH_SIZE,
            gradient_accumulation_steps=self.config.GRADIENT_ACCUMULATION_STEPS,
            warmup_steps=self.config.WARMUP_STEPS,
            weight_decay=self.config.WEIGHT_DECAY,
            learning_rate=self.config.LEARNING_RATE,
            lr_scheduler_type='cosine',

            # Logging and evaluation
            logging_dir=f"{self.config.OUT_MODEL_DIR}/logs",
            logging_steps=100,
            eval_strategy='steps',
            eval_steps=500,

            # Saving
            save_strategy='steps',
            save_steps=1000,
            save_total_limit=3,
            load_best_model_at_end=True,
            metric_for_best_model='eval_rouge1',
            greater_is_better=True,

            # Performance optimizations
            # Use the CPU-aware worker count from the config rather than a
            # fixed 4, which oversubscribes small machines.
            dataloader_num_workers=getattr(self.config, "NUM_PROCS", 4),
            dataloader_pin_memory=True,
            fp16=torch.cuda.is_available(),
            report_to='tensorboard',
            remove_unused_columns=False,
        )

    def train(self, train_dataset: "Dataset", val_dataset: "Dataset") -> None:
        """Fine-tune the model and save it together with the tokenizer.

        Args:
            train_dataset: Tokenized training dataset.
            val_dataset: Tokenized validation dataset.
        """
        training_args = self.create_training_args()

        # Pads every batch to its longest sequence; labels are padded with
        # -100 so padding is ignored by the loss.
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
            preprocess_logits_for_metrics=self.preprocess_logits_for_metrics,
        )

        logger.info("Starting training...")
        trainer.train()

        # Save model and tokenizer
        trainer.save_model()
        self.tokenizer.save_pretrained(self.config.OUT_MODEL_DIR)
        logger.info(f"Model and tokenizer saved to {self.config.OUT_MODEL_DIR}")


def main():
    """Run the pipeline end to end: load data, report statistics, train."""
    logger.info("Starting CNN/DailyMail T5 Training Pipeline")

    # Build the three pipeline stages up front
    processor = DataProcessor(config)
    stats = DataAnalyzer()
    model_trainer = ModelTrainer(config)

    # Filtered train/validation/test frames (the test frame is not used here)
    train_df, val_df, _test_df = processor.load_and_process_dataset()

    # Length statistics for the articles first, then for the summaries
    for column, label in (("input_text", "article"), ("target_text", "summary")):
        stats.analyze_text_lengths(train_df[column].tolist(), label)

    # Tokenize, then fine-tune
    tokenized_pair = model_trainer.prepare_datasets(train_df, val_df)
    model_trainer.train(*tokenized_pair)

    logger.info("Training pipeline completed successfully!")


if __name__ == "__main__":
    main()

                                          input_text  \
0  By . Mark Duell . PUBLISHED: . 12:35 EST, 9 Ja...   
1  Paul Scholes wants England manager Roy Hodgson...   

                                         target_text  summary_word_count  
0  Father-of-two David Sharma, 47, vows to take l...                  60  
1  Former Man United midfielder urges Roy Hodgson...                  68  
                                          input_text  \
0  Killer: Triple killer Ian McLoughlin, pictured...   
1  A university lecturer is suing his ex-wife for...   

                                         target_text  summary_word_count  
0  Ian McLoughlin murdered Graham Buck, 66, while...                  69  
1  Lecturer suing ex-wife over claim she swapped ...                  67  
                                          input_text  \
0  This spot in Colorado's Vail Valley is so pict...   
1  Come the end of this season, Malky Mackay’s te...   

                                         ta

  timestamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")


Map:   0%|          | 0/27602 [00:00<?, ? examples/s]



Map:   0%|          | 0/1892 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
500,1.9985,1.830596,0.4492,0.2421,0.4006,150.0
1000,2.039,1.822697,0.4456,0.2405,0.3977,150.0
1500,2.0121,1.827228,0.4483,0.2422,0.4003,150.0
2000,1.9598,1.816276,0.445,0.2409,0.3973,150.0
2500,1.9629,1.811286,0.4488,0.2445,0.4019,150.0
3000,1.9523,1.799301,0.4484,0.2438,0.4013,150.0
3500,1.9265,1.798532,0.451,0.245,0.4029,150.0
4000,1.8753,1.799019,0.4496,0.2449,0.4019,150.0
4500,1.8813,1.793125,0.4511,0.2456,0.4031,150.0
5000,1.8435,1.792822,0.4506,0.2453,0.4028,150.0




In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# ===============================
# Configuration
# ===============================
# Directory from which the fine-tuned model and tokenizer are loaded below.
MODEL_DIR = "/content/drive/MyDrive/results_t5small"  # Replace with your OUT_MODEL_DIR
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===============================
# Load Model and Tokenizer
# ===============================
model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)
tokenizer = T5Tokenizer.from_pretrained(MODEL_DIR)
model.to(DEVICE)
model.eval()  # Set model to evaluation mode

# ===============================
# Summarization Function
# ===============================
def summarize_text(text, model, tokenizer, max_input_length=512, max_output_length=150, num_beams=4):
    """Summarize one piece of text with beam search.

    Args:
        text: The article to summarize.
        model: Seq2seq model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_output_length: Upper bound on the generated length in tokens.
        num_beams: Beam width used for decoding.

    Returns:
        The decoded summary with special tokens removed.
    """
    # Follow the model's own device instead of a notebook-level global, so the
    # function keeps working whichever cell last (re)loaded the model.
    device = model.device

    # Preprocess input; calling the tokenizer (rather than ``encode``) also
    # yields the attention mask that ``generate`` should receive.
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors='pt',
        max_length=max_input_length,
        truncation=True
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Generate summary
    with torch.no_grad():
        summary_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_output_length,
            min_length=20,
            num_beams=num_beams,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=2,
            do_sample=False
        )

    # Decode output
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# ===============================
# Example Usage
# ===============================
# Short hand-written article used to try out the loaded model.
article_text = """
NASA's Artemis program successfully launched the Artemis I mission,
sending the Orion spacecraft around the Moon. This is a key step toward
returning humans to the lunar surface.
"""

# Summarize with the function's default length and beam settings.
summary = summarize_text(article_text, model, tokenizer)
print("Summary:")
print(summary)


Summary:
NASA's Artemis program successfully launched the mission, sending the Orion spacecraft around the Moon. This is a key step toward restoring humans to the lunar surface.


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
import evaluate
import time
import pandas as pd
import gc
import random

# ====== Configuration ======
MODEL_DIR = '/content/drive/MyDrive/results_t5small'
# Fall back to the CPU when no GPU is present instead of failing at model.to().
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 150
BATCH_SIZE = 64  # Adjust based on GPU memory
NUM_SAMPLES = 1000  # Random articles to test

# ====== Load model & tokenizer ======
tokenizer = T5Tokenizer.from_pretrained(MODEL_DIR)
model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)
model.to(DEVICE)
model.eval()

# ====== Load test data ======
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
test_df = pd.DataFrame({
    "input_text": dataset["article"],
    "target_text": dataset["highlights"]
})

# Randomly sample NUM_SAMPLES articles (fixed seed for repeatability); the
# count is capped at the dataset size so an oversized request cannot raise.
test_df = test_df.sample(n=min(NUM_SAMPLES, len(test_df)), random_state=42).reset_index(drop=True)

# ====== ROUGE metric ======
rouge = evaluate.load("rouge")

# ====== Function to generate summaries efficiently ======
def generate_summaries(texts):
    """Summarize ``texts`` in batches of ``BATCH_SIZE`` using beam search.

    Reads the notebook-level ``tokenizer``, ``model``, ``DEVICE``,
    ``BATCH_SIZE``, ``MAX_INPUT_LENGTH`` and ``MAX_TARGET_LENGTH``.

    Args:
        texts: Iterable of article strings.

    Returns:
        ``(summaries, total_time, avg_time_per_article)`` with both times in
        seconds. For empty input this is ``([], 0.0, 0.0)``.
    """
    texts = list(texts)
    # Nothing to do; also avoids dividing by zero for the average below.
    if not texts:
        return [], 0.0, 0.0

    summaries = []
    start_time = time.time()

    for i in range(0, len(texts), BATCH_SIZE):
        batch_texts = texts[i:i+BATCH_SIZE]

        # Tokenize
        inputs = tokenizer(
            ["summarize: " + t for t in batch_texts],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(DEVICE)

        # Generate summaries
        with torch.no_grad():
            summary_ids = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=MAX_TARGET_LENGTH,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )

        # Decode
        batch_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
        summaries.extend(batch_summaries)

        # Free GPU memory
        del inputs, summary_ids, batch_summaries
        torch.cuda.empty_cache()
        gc.collect()

        # Report progress on every tenth batch
        if (i // BATCH_SIZE) % 10 == 0:
            print(f"Processed {i + len(batch_texts)} / {len(texts)} articles")

    total_time = time.time() - start_time
    avg_time_per_article = total_time / len(texts)
    return summaries, total_time, avg_time_per_article

# ====== Run inference on random articles ======
print("Generating summaries for random test articles...")
predictions, total_time, avg_time = generate_summaries(test_df['input_text'].tolist())

print(f"\nTotal inference time for {len(test_df)} articles: {total_time:.2f} sec")
print(f"Average time per article: {avg_time:.2f} sec")

# ====== Compute ROUGE scores ======
results = rouge.compute(predictions=predictions, references=test_df['target_text'].tolist(), use_stemmer=True)
results = {k: round(v, 4) for k, v in results.items()}
# Report the real sample count; the message previously hard-coded 500 while
# the evaluation ran on NUM_SAMPLES articles.
print(f"\nROUGE scores on {len(test_df)} random test articles:")
for k, v in results.items():
    print(f"{k}: {v}")

# ====== Optional: Show first 3 examples ======
# Capped at the number of available rows so a small sample cannot raise.
for i in range(min(3, len(test_df))):
    print(f"\n=== ARTICLE {i} ===")
    print(test_df['input_text'][i][:300] + "...")
    print("\n--- MODEL SUMMARY ---")
    print(predictions[i])
    print("\n--- REFERENCE SUMMARY ---")
    print(test_df['target_text'][i])


Generating summaries for 500 random test articles...
Processed 64 / 1000 articles
Processed 704 / 1000 articles

Total inference time for 1000 articles: 152.83 sec
Average time per article: 0.15 sec

ROUGE scores on 500 random test articles:
rouge1: 0.4081
rouge2: 0.1855
rougeL: 0.2829
rougeLsum: 0.3462

=== ARTICLE 0 ===
Down Augusta way they say the azaleas are in full bloom, which is more than can be said for England’s Justin Rose. A bruising Florida swing last month saw the Englishman fall outside the world’s top 10. For a player who has been virtually a fixture in the top five for the last three years it was cer...

--- MODEL SUMMARY ---
Justin Rose has been virtually a fixture in the top five for the last three years. The 34-year-old has spent long hours on the practice ground for the past two weeks. Rose hit 17 out of 18 greens in regulation and signed for a 69. Phil Mickelson enjoyed his best round in months with a 66. Rose has been virtually a fixture in the top five for the l

In [None]:
import os

def list_files_and_folders(directory):
    """Print every directory under ``directory`` with its subdirectories and files.

    Each visited directory produces one section: a ``Directory:`` line, one
    indented line per immediate subdirectory, one per file, and a 40-dash
    separator.
    """
    for current_dir, subdir_names, file_names in os.walk(directory):
        section = [f"Directory: {current_dir}"]
        section.extend(f"  Subdirectory: {name}" for name in subdir_names)
        section.extend(f"  File: {name}" for name in file_names)
        section.append("-" * 40)
        # One print per directory; the joined text matches line-by-line output.
        print("\n".join(section))

# Example usage
# List the contents of the directory the training run used as its output location.
directory_path = "/content/drive/MyDrive/results_t5small"  # Replace with your folder path
list_files_and_folders(directory_path)


Directory: /content/drive/MyDrive/results_t5small
  Subdirectory: logs
  Subdirectory: checkpoint-4000
  Subdirectory: checkpoint-5000
  Subdirectory: checkpoint-5178
  File: tokenizer_config.json
  File: config.json
  File: generation_config.json
  File: model.safetensors
  File: training_args.bin
  File: special_tokens_map.json
  File: added_tokens.json
  File: spiece.model
----------------------------------------
Directory: /content/drive/MyDrive/results_t5small/logs
  File: events.out.tfevents.1756281819.3290f58c98b8.558.0
----------------------------------------
Directory: /content/drive/MyDrive/results_t5small/checkpoint-4000
  File: config.json
  File: generation_config.json
  File: model.safetensors
  File: tokenizer_config.json
  File: special_tokens_map.json
  File: added_tokens.json
  File: spiece.model
  File: training_args.bin
  File: optimizer.pt
  File: scheduler.pt
  File: scaler.pt
  File: rng_state.pth
  File: trainer_state.json
---------------------------------------

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Directory (or checkpoint sub-directory) to load the fine-tuned weights from.
MODEL_DIR = "/content/drive/MyDrive/results_t5small"  # or checkpoint path

# Rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = T5Tokenizer.from_pretrained(MODEL_DIR)
model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)
# Lower-case `device` is the name the summarization loop further down reads.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to evaluation mode; as the cell's last expression, the returned model
# is also displayed.
model.eval()


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
from datasets import load_dataset

# Load CNN/DailyMail dataset
# Note: this rebinds `dataset`, which an earlier cell bound to the full test split.
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:100]")  # first 100 samples
print("Number of samples:", len(dataset))
# Show the first 500 characters of the first article.
print("Example article:", dataset[0]['article'][:500])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Number of samples: 100
Example article: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, includin


In [None]:
# Summarize every article in `dataset` one at a time with beam search and
# collect the decoded summaries in order.
summaries = []

for i, sample in enumerate(dataset):
    article = sample['article']
    inputs = tokenizer("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=150,   # adjust output length if needed
            num_beams=4,
            early_stopping=True
        )

    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    summaries.append(summary)

    # Progress every 10 articles; the total comes from the dataset so the
    # message stays correct if the slice size changes.
    if (i+1) % 10 == 0:
        print(f"Processed {i+1}/{len(dataset)} articles")


Processed 10/100 articles
Processed 20/100 articles
Processed 30/100 articles
Processed 40/100 articles
Processed 50/100 articles
Processed 60/100 articles
Processed 70/100 articles
Processed 80/100 articles
Processed 90/100 articles
Processed 100/100 articles


In [None]:
import pandas as pd

# Collect each article, its reference highlights and the generated summary
# side by side. `summaries` comes from the generation loop in the previous cell.
df = pd.DataFrame({
    "article": [s['article'] for s in dataset],
    "reference_summary": [s['highlights'] for s in dataset],
    "generated_summary": summaries
})

# Write the table to the current working directory without the index column.
df.to_csv("cnn_100_summaries.csv", index=False)
print("Saved summaries to cnn_100_summaries.csv")


Saved summaries to cnn_100_summaries.csv


In [None]:
import evaluate

rouge = evaluate.load("rouge")

# Reference summaries
references = [s['highlights'] for s in dataset]

# The generated summaries live in `summaries` (built by the generation loop);
# `generated_summaries` was never defined anywhere in the notebook.
# Stemming is enabled to match the other ROUGE evaluations in this notebook.
results = rouge.compute(predictions=summaries, references=references, use_stemmer=True)
print("ROUGE Scores:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")


Downloading builder script: 0.00B [00:00, ?B/s]

ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['rouge_score'] using 'pip install rouge_score' for instance'

In [None]:
!pip install evaluate
