## Text Summarization

In [None]:
# Log this notebook session in to the Hugging Face Hub. The training cells
# further down push the fine-tuned model to the Hub (push_to_hub=True and
# trainer.push_to_hub()), so this has to run first.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import load_dataset

## Configs

In [None]:
# --- Data -------------------------------------------------------------------
dataset_name = "billsum"
split_prop = 0.2  # used as test_size when the dataset is split

# --- Model ------------------------------------------------------------------
model_name = "t5-small"
# Name shared by the training output directory and the Hub repository.
model_output_name = "-".join((model_name, dataset_name))
print(model_output_name)

# --- Training hyper-parameters ----------------------------------------------
learning_rate = 2e-5
batch_size = 8  # used for both the train and the eval per-device batch size
epochs = 2
save_total_limit = 3


In [None]:
# Load only the "ca_test" split of the dataset named by `dataset_name`.
billsum = load_dataset(dataset_name, split="ca_test")

## Split the dataset

In [None]:
# Split the single loaded split into "train" and "test" subsets, holding out
# `split_prop` of the rows for "test".
# NOTE(review): no seed is passed, so the partition presumably differs between
# runs — consider passing a fixed seed if results must be reproducible.
billsum = billsum.train_test_split(test_size=split_prop)

In [None]:
# Display the first training record (its "text" and "summary" fields are the
# ones consumed by the preprocessing function below).
billsum['train'][0]

## Preprocess the data

In [None]:
# Load the tokenizer that belongs to the base checkpoint `model_name`; it is
# used by preprocessing, metric computation and the trainer.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Task prefix prepended to every document before tokenization.
prefix = "summarize: "


def preprocess_sum(examples, max_len=1024, sum_max_len=128):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping with parallel ``"text"`` and ``"summary"``
            lists (the function is applied with ``batched=True``).
        max_len: Maximum number of tokens kept for each prefixed document.
        sum_max_len: Maximum number of tokens kept for each summary.

    Returns:
        The tokenizer output for the prefixed documents, with an extra
        ``"labels"`` entry holding the token ids of the summaries.
    """
    inputs = [prefix + doc for doc in examples["text"]]
    model_in = tokenizer(inputs, max_length=max_len, truncation=True)
    # Request truncation explicitly, exactly as for the inputs above. Passing
    # only `max_length` leaves enforcing the limit to the tokenizer's implicit
    # fallback, which emits a warning for every processed batch.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=sum_max_len,
        truncation=True,
    )

    model_in["labels"] = labels["input_ids"]
    return model_in


In [None]:
# Run preprocess_sum over the dataset in batches (batched=True hands the
# function lists of examples rather than single rows). The result is indexed
# with the same "train"/"test" keys when the trainer is built.
tokenized_billsum = billsum.map(preprocess_sum, batched=True)

In [None]:
# Load the ROUGE metric used by compute_metrics to score generated summaries.
import evaluate 
import numpy as np
rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Either
            array may contain ``-100`` as a padding marker.

    Returns:
        Dict with the ROUGE scores plus ``"gen_len"`` (mean number of
        non-padding tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple whose first element holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a real token id and cannot be decoded, so map it to the pad
    # token in BOTH arrays. The original only did this for the labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Counted after the -100 replacement so padding never inflates the length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


## Train the model

In [None]:
# Load the pretrained sequence-to-sequence checkpoint `model_name` that will be
# fine-tuned; the training-argument and trainer classes are imported here too.
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
import torch

# Training configuration for the fine-tuning run; values come from the config
# cell at the top of the notebook.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before changing it.
train_args = Seq2SeqTrainingArguments(
    output_dir=model_output_name,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=save_total_limit,
    num_train_epochs=epochs,
    # Generate full sequences during evaluation so compute_metrics receives
    # token ids it can decode and score.
    predict_with_generate=True,
    # Only request fp16 mixed precision when a CUDA device is present; a
    # hard-coded True can make this cell fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

In [None]:
# Exploration aid: print the source code of Seq2SeqTrainingArguments.
import inspect
print(inspect.getsource(Seq2SeqTrainingArguments))


In [None]:
# Exploration aid: show the built-in help text for Seq2SeqTrainingArguments.
help(Seq2SeqTrainingArguments)

## Create trainer

In [None]:
# Collator that assembles batches for the seq2seq model; it is given both the
# tokenizer and the model.
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


# Wire model, training arguments, tokenized splits, collator and the ROUGE
# metric function into a single trainer object.
# NOTE(review): recent transformers releases may expect `processing_class`
# instead of `tokenizer` here — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model, 
    args=train_args, 
    train_dataset = tokenized_billsum["train"],
    eval_dataset = tokenized_billsum["test"],
    tokenizer=tokenizer, 
    data_collator=data_collator, 
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the configuration assembled in `train_args`.
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login done at
# the top of the notebook).
trainer.push_to_hub()

## Inference

In [None]:
# Sample passage used for the inference cells below; it already starts with
# the same "summarize: " prefix that was applied during preprocessing.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [None]:
# Build a summarization pipeline from the checkpoint pushed to the Hub.
# NOTE(review): the "StatsGary" account name is hard-coded — change it to the
# account that actually owns the pushed model.
from transformers import pipeline
summarizer = pipeline("summarization", model=f"StatsGary/{model_output_name}")

In [None]:
# Summarize the sample passage with the pipeline.
# NOTE(review): `text` already carries the "summarize: " prefix; the pipeline
# may add a task prefix of its own from the model config — confirm the prefix
# is not applied twice.
summarizer(text)

In [None]:
# Same inference as the pipeline cell, done step by step: tokenize, generate,
# decode.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# NOTE: this rebinds the notebook-level `tokenizer` (and `model` below) to the
# checkpoint loaded from the Hub, replacing the objects used for training.
tokenizer = AutoTokenizer.from_pretrained(f"StatsGary/{model_output_name}")
# Keep only the token ids, requested as PyTorch tensors ("pt").
inputs = tokenizer(text, return_tensors="pt").input_ids

model = AutoModelForSeq2SeqLM.from_pretrained(f"StatsGary/{model_output_name}")
# Generate without sampling, producing at most 100 new tokens.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
# Decode the first generated sequence back to text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)
