In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Build a small, reproducible train/validation sample of CNN/DailyMail and
# write both splits to CSV.

# Tunable knobs, kept together so the sample can be resized in one place.
SAMPLE_SIZE = 100   # number of articles drawn from the training split
VAL_FRACTION = 0.1  # share of the sample held out for validation
SEED = 42           # shared by the shuffle and the split

dataset = load_dataset("cnn_dailymail", "3.0.0")

# Shuffle before selecting so the sample is not simply the first rows of the
# split. `select` accepts a range directly; no index list needs to be built.
dataset = dataset['train'].shuffle(seed=SEED).select(range(SAMPLE_SIZE))

# Keep only the article and its highlights, renamed to text/summary. An
# explicit mapping is used so the rename does not depend on column order.
df = dataset.to_pandas()[['article', 'highlights']].rename(
    columns={'article': 'text', 'highlights': 'summary'}
)

train_df, val_df = train_test_split(df, test_size=VAL_FRACTION, random_state=SEED)

# Persist both splits without the pandas index column.
train_df.to_csv("pretrained_dataset_train.csv", index=False)
val_df.to_csv("pretrained_dataset_val.csv", index=False)



  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 15.6k/15.6k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 257M/257M [01:59<00:00, 2.15MB/s] 
Downloading data: 100%|██████████| 257M/257M [01:42<00:00, 2.49MB/s] 
Downloading data: 100%|██████████| 259M/259M [02:35<00:00, 1.67MB/s] 
Downloading data: 100%|██████████| 34.7M/34.7M [00:08<00:00, 3.86MB/s]
Downloading data: 100%|██████████| 30.0M/30.0M [00:07<00:00, 4.04MB/s]
Generating train split: 100%|██████████| 287113/287113 [00:17<00:00, 16757.77 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 27086.64 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 25919.76 examples/s]


In [2]:
from flask import Flask, render_template, request
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd

# Flask application object; the route handlers in this cell register on it.
app = Flask(__name__)

# Load the pretrained "t5-base" tokenizer and model once, at cell execution
# time, so request handlers can reuse them instead of reloading per request.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Read the training CSV into a DataFrame.
# NOTE(review): `df` is not referenced by any handler visible in this cell;
# confirm whether it is still needed.
df = pd.read_csv(
    'pretrained_dataset_train.csv',
    encoding='utf-8',
)

@app.route('/')
def index():
    """Render the ``index.html`` template for requests to the site root."""
    landing_page = render_template('index.html')
    return landing_page

def _generate_summary(input_text, max_length):
    """Run beam-search generation on ``input_text`` and return the decoded text.

    Args:
        input_text: Full prompt (task prefix followed by the article).
        max_length: Upper bound forwarded to ``model.generate``. This limit is
            measured in tokens, not words, so the decoded summary will usually
            contain fewer words than this number.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # Prompts longer than 1024 tokens are truncated before generation.
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(
        inputs,
        max_length=max_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


@app.route('/summarize', methods=['POST'])
def summarize():
    """Summarize the posted article twice and render both results.

    Form fields read from the POST body:
        article: Text to summarize.
        summary_length: Positive integer used both in the prompt wording and
            as the generation length limit.
        style: Free-text style descriptor inserted into the prompt.

    Returns:
        The rendered ``result.html`` template with ``article``,
        ``personalized_summary`` and ``baseline_summary``; or a plain-text
        400 response when ``summary_length`` is not a positive integer.
        A missing ``article`` or ``style`` field raises Flask's standard
        bad-request error, as before.
    """
    article = request.form['article']
    style = request.form['style']

    # `type=int` makes .get() return None instead of raising when the value is
    # missing or not an integer, so bad input yields a 400 rather than a 500.
    summary_length = request.form.get('summary_length', type=int)
    if summary_length is None or summary_length < 1:
        return "summary_length must be a positive integer", 400

    # Personalized summary: the prompt carries the requested style and length.
    prompt_template = f"Create a {style} summary with {summary_length} words: "
    personalized_summary = _generate_summary(f"{prompt_template}{article}", summary_length)

    # Baseline summary: plain T5 summarization prefix without style/length
    # hints. Previously the baseline reused the personalized prompt's inputs
    # with identical generation settings, so both summaries were always equal.
    baseline_summary = _generate_summary(f"summarize: {article}", summary_length)

    return render_template(
        'result.html',
        article=article,
        personalized_summary=personalized_summary,
        baseline_summary=baseline_summary,
    )

# Start Flask's built-in server only when this code runs as the main module
# (which includes execution as a notebook cell); `app.run()` blocks until the
# server is stopped.
if __name__ == "__main__":
    app.run()


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [19/Jan/2024 19:02:08] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [19/Jan/2024 19:02:09] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [19/Jan/2024 19:14:34] "POST /summarize HTTP/1.1" 200 -
