# Abstractive summarisation using T5-small

# Pip installs + imports

In [None]:
# NLTK package plus the corpus readers and tokenizer used in this notebook
import nltk
from nltk.corpus import names, stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages (tokenizer model, stop-word list, names corpus)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('names')

# Punctuation characters from the standard library
from string import punctuation

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Dataset loading

In [None]:
# Folder containing the input CSV (placeholder value)
filepath = '...'

# Read the table of articles and reference highlights into a DataFrame
df = pd.read_csv(f"{filepath}/dati_TM.csv")

In [None]:
# Draw a reproducible random subset of 2000 rows (fixed seed)
df_sample = df.sample(n=2000, random_state=123)

In [None]:
# Preview the first five sampled rows
df_sample.head(5)

Unnamed: 0,id,article,highlights
12136,e27a8944f7c91a4c8c84757521def725c7c32231,By . Daily Mail Reporter . Last updated at 2:2...,Tycoon says his opposition to wind farm off th...
16812,1d5080b5e0b40ac587ce40543f60890b31d7dfd5,Iran has named a member of the radical student...,Member of group who seized American hostages i...
10072,0269203104295e3d6badc2808fff5c5e1f09e298,Waiting times for cancer treatment and other o...,Waiting times for non-emergency tests and surg...
5850,fd1a9aacdf6ea782f2c71d8bfff0d483a0c01e9d,"Pam Pope, 65, from Surrey had nine organs remo...",Pam Pope complained of symptoms for 15 months ...
4320,ee43cb823d45c314b94de26888a666559bdea287,"By . David Wilkes . PUBLISHED: . 07:20 EST, 8 ...",Pippa the long-haired dachshund had back pain ...


# Inference

## Loading the model and the tokenizer

**t5 small**: https://huggingface.co/t5-small


In [None]:
# Load the pretrained tokenizer and seq2seq model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint name, so the tokenizer and the
# model are always loaded from the same pretrained checkpoint.
MODEL_CHECKPOINT = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Defining the `batch_inference()` function

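- Inputs: a DataFrame with `article` and `highlights` columns, processed in batches of `batch_size` articles

- Outputs: the generated summaries together with the matching ground-truth highlights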

In [None]:
def batch_inference(input_dataset, batch_size=20, prefix="Summarize this article: ",
                    max_summary_length=60, min_summary_length=40):
  """Summarize every article of `input_dataset`, one batch at a time.

  Uses the notebook-level `tokenizer` and `model` objects.

  Args:
    input_dataset: DataFrame with an "article" column (texts to summarize)
      and a "highlights" column (reference summaries).
    batch_size: number of articles tokenized and generated together;
      must be positive.
    prefix: text prepended to each article before tokenization.
      NOTE(review): the T5 documentation uses "summarize: " as the
      summarization task prefix -- consider passing it here; verify the
      effect on summary quality before changing the default.
    max_summary_length: value passed to `model.generate` as `max_length`.
    min_summary_length: value passed to `model.generate` as `min_length`.

  Returns:
    A tuple `(summaries, labels)`: the decoded summaries in row order and
    the "highlights" values as a list in the same order.

  Raises:
    ValueError: if `batch_size` is not positive.
  """
  # A negative step would make range() empty and silently return no
  # summaries, so reject bad batch sizes explicitly.
  if batch_size <= 0:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

  # Prefix each article; iterating the column yields values in row order
  articles = [prefix + article for article in input_dataset["article"]]
  labels = input_dataset["highlights"].tolist()

  summaries = []

  # Batch processing: tokenize -> generate -> decode
  for start in range(0, len(articles), batch_size):

    # Define the batch
    batch = articles[start:start + batch_size]

    # Tokenize the batch. Pad only up to the longest article in the batch
    # rather than to the tokenizer's maximum length, which avoids running
    # the model over needless padding.
    inputs = tokenizer(batch, padding="longest", truncation=True, return_tensors="pt")

    # Run generation on the batch
    outputs = model.generate(**inputs, max_length=max_summary_length, min_length=min_summary_length)

    # Decode the batch and append to the flat result list (no quadratic
    # list-of-lists merge at the end)
    summaries.extend(tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs)

  return (summaries, labels)




Performing the inference

In [None]:
# Summarize the sampled articles, 32 per batch, and keep the reference highlights
(summary_results, ground_truth) = batch_inference(input_dataset=df_sample, batch_size=32)

Saving the results

In [None]:
# Destination folder for the results file (placeholder value)
output_dir = "..."

# Put generated summaries and reference highlights side by side
df_result = pd.DataFrame(
    data={
        "summary_results": summary_results,
        "ground_truth": ground_truth,
    }
)

# Write the table to CSV without the index column
df_result.to_csv(f"{output_dir}/abstractive_sum_T5.csv", index=False)