# Text summarizer
Here I experiment with t5-base for text summarization. (Note: the model cell below currently loads `facebook/bart-large-cnn`.)

In [19]:
!pip install -q rouge_score
!pip install -q tqdm

import numpy as np
import torch
from datasets import load_dataset, DatasetDict
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments


### Global variables

In [12]:
# Load the "train" and "test" splits of the WikiHow "sep" configuration,
# reading the data from data_dir.
train, test = load_dataset(
    path="wikihow",
    name="sep",
    split=["train", "test"],
    data_dir="../Dataset/",
    trust_remote_code=True,
)

# Bundle both splits and keep only the article text and its headline.
dataset = DatasetDict({"train": train, "test": test}).select_columns(
    ["text", "headline"]
)

In [14]:
# Tokenization settings shared by the preprocessing and generation cells.
# NOTE(review): "summarize: " is the task prefix used by T5 checkpoints;
# confirm it is still wanted with the BART checkpoint loaded below.
prefix = "summarize: "
max_input_length = 512   # token cap applied to the (prefixed) article text
max_target_length = 64   # token cap applied to headlines / generated summaries

# Pretrained checkpoint; model and tokenizer are loaded from the same name.
model_name = "facebook/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Preprocessing


In [20]:
def preprocess_function(dataset):
    """Tokenize a batch of articles and their headlines.

    Args:
        dataset: batch mapping with a "text" column (articles) and a
            "headline" column (reference summaries).

    Returns:
        The tokenizer output for the prefixed articles, with the headline
        token ids stored under "labels".
    """
    # Prepend the task prefix to every article before tokenizing.
    prefixed_articles = [prefix + article for article in dataset["text"]]
    encoded = tokenizer(
        prefixed_articles,
        truncation=True,
        max_length=max_input_length,
    )

    # Headlines are tokenized as targets and attached as the labels.
    encoded["labels"] = tokenizer(
        text_target=dataset["headline"],
        truncation=True,
        max_length=max_target_length,
    )["input_ids"]
    return encoded

# Apply preprocessing to every split (batched so the tokenizer sees lists of texts)
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Hold out 10% of the training split for validation.
# The seed is fixed so the same examples land in "validation" on every run;
# without it the split changes each time the cell is executed.
SPLIT_SEED = 42
train_test_split = tokenized_datasets["train"].train_test_split(
    test_size=0.1,
    seed=SPLIT_SEED,
)
tokenized_datasets["train"] = train_test_split["train"]
tokenized_datasets["validation"] = train_test_split["test"]



Map:   0%|          | 0/1060732 [00:00<?, ? examples/s]

Map: 100%|██████████| 1060732/1060732 [01:08<00:00, 15550.42 examples/s]


## Evaluation

In [24]:

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Gradient scaler for mixed precision; not used by the generation loop below,
# kept so that other cells referring to `scaler` keep working.
scaler = torch.cuda.amp.GradScaler()

# Generate summaries for the test set with tqdm progress bar
generated_summaries = []
reference_summaries = tokenized_datasets["test"]["headline"]

# Read the text column once. Indexing the dataset column inside the loop
# would re-read the whole column for every single batch.
test_texts = list(tokenized_datasets["test"]["text"])

# Define batch size
batch_size = 16  # Adjust the batch size based on your GPU memory

# Process in batches
for start in tqdm(range(0, len(test_texts), batch_size), desc="Generating summaries"):
    batch_texts = test_texts[start:start + batch_size]
    batch_inputs = tokenizer(
        [prefix + text for text in batch_texts],
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
        padding=True,
    ).to(device)

    # Mixed precision only on CUDA; on CPU the context is a no-op.
    with torch.autocast(device_type=device.type, enabled=device.type == "cuda"):
        summary_ids = model.generate(
            input_ids=batch_inputs["input_ids"],
            # Batches are padded, so pass the mask explicitly instead of
            # relying on generate() to infer it from pad tokens.
            attention_mask=batch_inputs["attention_mask"],
            max_length=max_target_length,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )

    batch_summaries = tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    generated_summaries.extend(batch_summaries)


Generating summaries:   0%|          | 0/2363 [00:00<?, ?it/s]

Generating summaries: 100%|██████████| 2363/2363 [46:11<00:00,  1.17s/it]


In [25]:
# Score every (reference, generated) pair with ROUGE-1, ROUGE-2 and ROUGE-L
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

for reference, candidate in zip(reference_summaries, generated_summaries):
    pair_scores = scorer.score(reference, candidate)
    # Only the F-measure of each metric is kept.
    rouge1_scores.append(pair_scores['rouge1'].fmeasure)
    rouge2_scores.append(pair_scores['rouge2'].fmeasure)
    rougeL_scores.append(pair_scores['rougeL'].fmeasure)

# Average each metric over all pairs
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougeL = np.mean(rougeL_scores)

print(f'Average ROUGE-1 Score: {avg_rouge1:.4f}')
print(f'Average ROUGE-2 Score: {avg_rouge2:.4f}')
print(f'Average ROUGE-L Score: {avg_rougeL:.4f}')

Average ROUGE-1 Score: 0.1355
Average ROUGE-2 Score: 0.0416
Average ROUGE-L Score: 0.1130


In [26]:
# Define the compute_metrics function to calculate ROUGE scores
def compute_metrics(eval_pred):
    """Compute mean ROUGE-1/2/L F-measures for predicted vs. reference summaries.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds
            the generated token ids.

    Returns:
        dict mapping 'rouge1', 'rouge2' and 'rougeL' to the F-measure
        averaged over all prediction/label pairs (0.0 if there are none).
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and is not a real token id, so it must be
    # swapped for the pad id BEFORE decoding (decoding it first fails).
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # RougeScorer.score compares ONE target string with ONE prediction string,
    # so score pair by pair and average afterwards.
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    pair_scores = [
        scorer.score(reference, prediction)
        for reference, prediction in zip(decoded_labels, decoded_preds)
    ]

    return {
        key: float(np.mean([scores[key].fmeasure for scores in pair_scores]))
        if pair_scores
        else 0.0
        for key in rouge_types
    }

# Set up the evaluation arguments.
# bf16 is requested only when the GPU reports support for it; having CUDA
# alone is not enough, since not every CUDA GPU supports bf16.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
training_args = Seq2SeqTrainingArguments(
    output_dir="baseEval",
    per_device_eval_batch_size=16,
    predict_with_generate=True,  # metrics are computed on generated sequences
    bf16=use_bf16,
)

# Create the DataCollator for padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the Seq2SeqTrainer (evaluation only: no train_dataset is given)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate the model
results = trainer.evaluate()
print(results)