In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load data
path_input = 'mydata.csv'
df = pd.read_csv(path_input)
df['summary'] = df['summary'].replace(r'\n', '', regex=True)

# Load T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-base", max_len=1024, 
                                        do_lower_case=True, padding=True,
                                        bos_token="<s>", eos_token="</s>", unk_token="<unk>", pad_token="<pad>")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define dataset class
class SummaryDataset(Dataset):
    def __init__(self, data, tokenizer, text_max_token_len=512, summary_max_token_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data_row = self.data.iloc[index]
        text = data_row['text']
        summary = str(data_row['summary'])

        text_encoding = self.tokenizer(
            text,
            max_length=self.text_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        summary_encoding = self.tokenizer(
            summary,
            max_length=self.summary_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        
        # retrieves the tokenized representation 
        # of the summary from the summary_encoding dictionary.
        # The input_ids key holds the token IDs of the summary.
        labels = summary_encoding['input_ids']
        
        
        # 100 is often used to indicate that those positions should be ignored during training
        # The positions with padding tokens are not considered during this process.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': text_encoding['input_ids'].squeeze(),
            'attention_mask': text_encoding['attention_mask'].squeeze(),
            'labels': labels.squeeze(),
        }

# Create dataset and dataloader
dataset = SummaryDataset(df, tokenizer)
dataloader = DataLoader(dataset, shuffle=True, batch_size=8)

# Define summary function
def summarizeText(text, model=model, tokenizer=tokenizer, device=device):
    text_encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    generated_ids = model.generate(
        input_ids=text_encoding['input_ids'].to(device),
        attention_mask=text_encoding['attention_mask'].to(device),
        max_length=513,
        num_beams=4,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]
    return "".join(preds)

# Example usage
text = """Outcome of the information retrieval becomes necessary for user to find out concrete information for the abstraction
because of the stridently escalation of data on the web. Internet is widely used by people to come across information using
proficient information retrieval (IR) tools, such as Google, Yahoo, AltaVista, etc., where findings are abundant. In most of the
cases, users feel bore with the very tedious and time consuming job to reveal the main gist of the outcome of the IR.
Academics and researchers are very much benefitted by using automatic text summarization system as a tool to lessen the
amount of time spent manually extracting the chief thoughts from large documents. In addition to the above reason,
automatic text summarization also provides its users with numerous benefits as well as:
(i) Increase efficiency of other researches to choose documents/information from search engines’ output, which usually
contain an excess amount of replicated information.
(ii) Solve the limitation of information presentation on small communication devices such as PDA and mobile phone etc.,
which is able to display abridged version of the full document.
(iii) The running time of machine for translation is significantly reduced if a short version of text is given. """

summary = summarizeText(text)
print(summary)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned

information retrieval becomes necessary for user to find out concrete information for the abstraction because of the stridently escalation of data on the web. In most of the cases, users feel bore with the very tedious and time consuming job to reveal the main gist of the outcome of the IR. (i) Increase efficiency of other researches to choose documents/information from search engines’ output, which usually contain an excess amount of replicated information. (iv) Reduce the running time of machine for translation is significantly reduced
