### requirements

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the running kernel's environment.
%pip install accelerate -U

In [None]:
# Explicit %pip magic instead of relying on IPython's automagic for a bare `pip`.
%pip install transformers datasets evaluate rouge_score

In [None]:
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import pipeline

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the training cell sets push_to_hub=True
# and a later cell calls trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Prepare

### Load BillSum dataset


**Dataset Summary:** BillSum, summarization of US Congressional and California state bills.

There are several features:

* text: bill text.
* summary: summary of the bills.
* title: title of the bill. Available for US bills only; CA bills do not have one.
* text_len: number of chars in text.
* sum_len: number of chars in summary.

In [None]:
# Load only the "ca_test" split of the BillSum dataset.
billsum = load_dataset("billsum", split="ca_test")

In [None]:
# Split the dataset into a train and test set (80% / 20%).
# A fixed seed keeps the shuffle, and therefore the split, identical across runs.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [None]:
# example: display the first record of the training split
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 75.21 of the Revenue and Taxation Code is amended to read:\n75.21.\n(a) Exemptions shall be applied to the amount of the supplemental assessment, provided that the property is not receiving any other exemption on either the current roll or the roll being prepared except as provided for in subdivision (b), that the assessee is eligible for the exemption, and that, in those instances in which the provisions of this division require the filing of a claim for the exemption, the assessee makes a claim for the exemption.\n(b) If the property received an exemption on the current roll or the roll being prepared and the assessee on the supplemental roll is eligible for an exemption and, in those instances in which the provisions of this division require the filing of a claim for the exemption, the assessee makes a claim for an exemption of a greater amount, then the difference in the amount between the

### Preprocess

In [None]:
# load a T5 tokenizer to process text and summary

# The same checkpoint name is reused further down when the model is loaded.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The preprocessing function needs to:

1. Prefix the input with a prompt so T5 knows this is a summarization task
2. Use the `text_target` keyword argument when tokenizing labels
3. Truncate sequences to be no longer than the maximum length set by the max_length parameter.

In [None]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Mapping with a "text" entry (iterable of documents) and a
            "summary" entry (the matching reference summaries).

    Returns:
        The tokenizer output for the prefixed documents, with the input ids of
        the tokenized summaries stored under the "labels" key.
    """
    # Prepend the task prefix to every document.
    prompts = []
    for document in examples["text"]:
        prompts.append(prefix + document)

    # Inputs are truncated to 1024 tokens, targets to 128 tokens.
    encoded = tokenizer(prompts, max_length=1024, truncation=True)
    targets = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )

    encoded["labels"] = targets["input_ids"]
    return encoded

In [None]:
# apply the preprocessing function over the entire dataset
# (batched=True hands the function batches of examples rather than single rows)
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [None]:
# create a batch of examples using DataCollatorForSeq2Seq

# NOTE(review): `model` is given the checkpoint name (a string), not a loaded model
# instance — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

### evaluate

In [None]:
# load the ROUGE metric (compute_metrics calls rouge.compute to score the decoded summaries)
rouge = evaluate.load("rouge")

In [None]:
# create a function that passes predictions and labels to compute to calculate the ROUGE metric
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        dict: The ROUGE results plus ``"gen_len"`` (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id, so it must be replaced by the pad token id
    # before decoding. Predictions can carry -100 padding as well as labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Count after the replacement above so -100 padding does not inflate the length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Train

In [None]:
# Load T5 with AutoModelForSeq2SeqLM
# (same checkpoint name that was used to load the tokenizer)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Here, we're setting the Seq2SeqTrainingArguments and calling the trainer

In [None]:
# Training configuration. Mixed precision (fp16) is enabled only when a CUDA GPU is
# available, so the cell does not fail on CPU-only machines.
training_args = Seq2SeqTrainingArguments(
    output_dir="teachMy_sum",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,  # compute_metrics decodes generated token ids
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

# Wire the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.848788,0.1234,0.0342,0.104,0.1042,19.0
2,No log,2.641356,0.135,0.0443,0.1113,0.1112,19.0
3,No log,2.580542,0.1387,0.0491,0.1148,0.1147,19.0
4,No log,2.563722,0.1412,0.0488,0.1166,0.1164,19.0




TrainOutput(global_step=248, training_loss=3.012329593781502, metrics={'train_runtime': 279.3278, 'train_samples_per_second': 14.163, 'train_steps_per_second': 0.888, 'total_flos': 1070824333246464.0, 'train_loss': 3.012329593781502, 'epoch': 4.0})

In [None]:
# Push the trainer's model and training artifacts to the Hugging Face Hub.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1711230551.e65fc19f204c.4866.0:   0%|          | 0.00/7.97k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Oulaa/teachMy_sum/commit/29dfd3dd0d95de072ac717fc6a2294dd927edfd0', commit_message='End of training', commit_description='', oid='29dfd3dd0d95de072ac717fc6a2294dd927edfd0', pr_url=None, pr_revision=None, pr_num=None)

## Inference

This section shows how to run inference from scratch, from the imports through to generating a summary.

In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline

In [None]:
# Example input for inference; the "summarize: " task prefix is written directly into the string.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying up American workers and create good-paying, union jobs across the country. The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes. And no one making under $400,000 per year will pay a penny more in taxes."

In [None]:
# Second example input: an excerpt about the DSM-5 revision process.
# The literal previously opened with three quotes but closed with one, which is an
# unterminated string (SyntaxError); it is now an ordinary double-quoted string.
# NOTE(review): this replaces the `text` defined in the previous cell and carries no
# "summarize: " prefix — confirm that is intended.
text = " cess. A clear concept of the next evolutionary stage for the classification of mental disorders \nwas central to the efforts of the task force and the work groups. This vision emerged as the \ntask force and work groups recounted the history of DSM-IV's classification, its current \nstrengths and limitations, and strategic directions for its revision. An intensive 6-year pro\xad\ncess involved conducting literature reviews and secondary analyses, publishing research \nreports in scientific journals, developing draft diagnostic criteria, posting preliminary \ndrafts on the DSM-5 Web site for public comment, presenting preliminary findings at pro\xad\nfessional meetings, performing field trials, and revising criteria and text.\nProposals for Revisions\nProposals for the revision of DSM-5 diagnostic criteria were developed by members of the \nwork groups on the basis of rationale, scope of change, expected impact on clinical man\xad"

In [None]:
# set up pipeline
# (uses the fine-tuned "Oulaa/teachMy_sum" model; the last line displays the summary of `text`)
summarizer = pipeline("summarization", model="Oulaa/teachMy_sum")
summarizer(text)

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]

In [None]:
# tokenize text

tokenizer = AutoTokenizer.from_pretrained("Oulaa/teachMy_sum")
# Keep only the input ids, requested as PyTorch tensors ("pt").
inputs = tokenizer(text, return_tensors="pt").input_ids

In [None]:
# generate() to create the summarization

model = AutoModelForSeq2SeqLM.from_pretrained("Oulaa/teachMy_sum")
# Sampling is disabled (do_sample=False) and at most 100 new tokens are generated.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [None]:
# decode the generated tokens of the first sequence into text, dropping special tokens
tokenizer.decode(outputs[0], skip_special_tokens=True)

"the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in American history. it'll ask the ultra-wealthy and corporations to pay their fair share."

### Function to summarize

This is an initial, general-purpose function for generating a summary. It is not yet ready to be used for the user-facing feedback in the app.

In [None]:
# initialize the pipeline and model only once, outside the function
summarizer_pipeline = pipeline("summarization", model="Oulaa/teachMy_sum")
# NOTE(review): `tokenizer` and `model` are reloaded here, but summarize_text below only
# uses `summarizer_pipeline` — confirm whether these two loads are still needed.
tokenizer = AutoTokenizer.from_pretrained("Oulaa/teachMy_sum")
model = AutoModelForSeq2SeqLM.from_pretrained("Oulaa/teachMy_sum")

def summarize_text(text, max_length_output=200):
    """
    Summarizes the input text using a fine-tuned T5 model.

    Parameters:
    - text (str): text to summarize.
    - max_length_output (int): maximum length of summarization output. Default is 200 tokens.
      The minimum length passed to the pipeline is half of this value.

    Returns:
    - str: summarized text. An empty string is returned for empty or
      whitespace-only input, without calling the model.
    """
    # Nothing to summarize: skip the model call instead of asking it to generate
    # at least `min_length` tokens from a blank prompt.
    if isinstance(text, str) and not text.strip():
        return ""

    # Use summarization pipeline for a quick summarization (preferred for single or few texts)
    summarized = summarizer_pipeline(
        text,
        max_length=max_length_output,
        min_length=int(max_length_output / 2),
        do_sample=False,
    )
    return summarized[0]['summary_text']


# Example usage: summarize the `text` variable defined in an earlier cell and print the result
summarized_text = summarize_text(text)
print(summarized_text)


The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs . it's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying up american workers . It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share . And no one making under $400,000 per year will pay a penny more in taxes .
