# Text summarizer
Here I try out the DistilBART model. DistilBART is a distilled version of BART that is smaller and faster while retaining good performance on the summarization task.


### Imports

In [2]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict
from rouge_score import rouge_scorer
import numpy as np
from tqdm import tqdm

### Global variables 

In [None]:
# Read the "train" and "test" splits of the WikiHow "sep" configuration from
# the local data directory.
train, test = load_dataset(
    path="wikihow",
    name="sep",
    split=["train", "test"],
    data_dir="../Dataset/",
    trust_remote_code=True,
)

# Bundle both splits and keep only the two columns used downstream: "text"
# (the model input) and "headline" (the reference summary).
dataset = DatasetDict({"train": train, "test": test}).select_columns(
    ["text", "headline"]
)

In [3]:
# Tokenization limits (in tokens) for the source texts and the target summaries.
max_input_length = 512
max_target_length = 64

# String prepended to every source text before it is tokenized.
# NOTE(review): a "summarize: " task prefix is a T5 convention; BART
# checkpoints are not normally prompted this way - confirm this is intended.
prefix = "summarize: "

# Pretrained DistilBART checkpoint and its matching tokenizer.
model_name = "sshleifer/distilbart-cnn-12-6"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

### Preprocessing

In [None]:
# Pre processing
def preprocess_function(dataset):
    """Tokenize a batch of source texts and headlines for seq2seq training.

    Args:
        dataset: A batch mapping with "text" and "headline" entries, each a
            list of strings (this is the per-batch dict that ``Dataset.map``
            passes in when ``batched=True``, not a whole dataset).

    Returns:
        The tokenizer encoding of the prefixed texts, padded/truncated to
        ``max_input_length``, with an added "labels" entry holding the
        headline token ids padded/truncated to ``max_target_length``.
        Padding positions in "labels" are set to -100.
    """
    inputs = [prefix + text for text in dataset["text"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    labels = tokenizer(
        text_target=dataset["headline"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # Labels are padded to a fixed length, so padding positions must be
    # masked with -100 (the index the loss ignores); otherwise the model is
    # trained to predict pad tokens.
    label_ids = labels["input_ids"]
    model_inputs["labels"] = label_ids.masked_fill(
        label_ids == tokenizer.pad_token_id, -100
    )
    return model_inputs


# Tokenize every split in batches. The raw string columns are removed, so
# only the outputs of preprocess_function remain in `tokenized_datasets`.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Hold out 10% of the tokenized training split as a validation split.
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)
tokenized_datasets["train"], tokenized_datasets["validation"] = (
    train_test_split["train"],
    train_test_split["test"],
)


### Evaluation

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

# Mixed precision is only enabled on CUDA; on CPU both helpers are no-ops.
use_amp = device.type == "cuda"
# Not needed for inference (no gradients are scaled in this cell); kept,
# disabled off-GPU, in case other cells reference `scaler`.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# Read the raw strings from `dataset`: the "text" and "headline" columns were
# removed from `tokenized_datasets` by the preprocessing map. Each column is
# materialised once here rather than on every loop iteration.
test_texts = dataset["test"]["text"]
reference_summaries = dataset["test"]["headline"]
generated_summaries = []

# Define batch size
batch_size = 16  # Adjust the batch size based on your GPU memory

# Generate summaries for the test set in batches, with a tqdm progress bar.
for i in tqdm(range(0, len(test_texts), batch_size), desc="Generating summaries"):
    batch_texts = test_texts[i:i + batch_size]
    batch_inputs = tokenizer(
        [prefix + text for text in batch_texts],
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
        padding=True,
    ).to(device)

    with torch.autocast(device_type=device.type, enabled=use_amp):
        summary_ids = model.generate(
            batch_inputs["input_ids"],
            # The batch is padded, so the mask is required to keep the
            # encoder from attending to padding tokens.
            attention_mask=batch_inputs["attention_mask"],
            max_length=max_target_length,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )

    batch_summaries = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for g in summary_ids
    ]
    generated_summaries.extend(batch_summaries)

In [None]:
# Score the generated summaries against the references with ROUGE-1/2/L
# (F-measure, stemming enabled) and print the per-metric averages.
if generated_summaries:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for ref_summary, gen_summary in zip(reference_summaries, generated_summaries):
        # A pair with a blank reference or blank generation is reported and
        # left out of the averages.
        if not (ref_summary.strip() and gen_summary.strip()):
            print(f"Empty summary detected: ref='{ref_summary}', gen='{gen_summary}'")
            continue

        scores = scorer.score(ref_summary, gen_summary)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    # Averages are only defined when at least one pair was scored.
    if not (rouge1_scores and rouge2_scores and rougeL_scores):
        print("No valid scores to calculate averages.")
    else:
        avg_rouge1 = np.mean(rouge1_scores)
        avg_rouge2 = np.mean(rouge2_scores)
        avg_rougeL = np.mean(rougeL_scores)
        print(f'Average ROUGE-1 Score: {avg_rouge1:.4f}')
        print(f'Average ROUGE-2 Score: {avg_rouge2:.4f}')
        print(f'Average ROUGE-L Score: {avg_rougeL:.4f}')
else:
    print("No summaries were generated.")