In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [2]:
df = pd.read_csv('news_summary.csv', encoding='latin-1', usecols=['headlines', 'text'])

In [3]:
df.head()

Unnamed: 0,headlines,text
0,Daman & Diu revokes mandatory Rakshabandhan in...,The Administration of Union Territory Daman an...
1,Malaika slams user who trolled her for 'divorc...,Malaika Arora slammed an Instagram user who tr...
2,'Virgin' now corrected to 'Unmarried' in IGIMS...,The Indira Gandhi Institute of Medical Science...
3,Aaj aapne pakad liya: LeT man Dujana before be...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotel staff to get training to spot signs of s...,Hotels in Maharashtra will train their staff t...


In [4]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [5]:
#Initialize the T5 Tokenizer and Model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# Tokenization
def tokenize_data(texts, targets):
    inputs = tokenizer(texts.tolist(), padding='max_length', truncation=True, max_length=512, return_tensors="pt")
    outputs = tokenizer(targets.tolist(), padding='max_length', truncation=True, max_length=128, return_tensors="pt")
    inputs['labels'] = outputs['input_ids']
    return inputs

# Prepare the Datasets
train_dataset = tokenize_data(train_df['text'], train_df['headlines'])
test_dataset = tokenize_data(test_df['text'], test_df['headlines'])

# Convert to list of dictionaries
train_dataset = [{k: v[i] for k, v in train_dataset.items()} for i in range(len(train_dataset['input_ids']))]
test_dataset = [{k: v[i] for k, v in test_dataset.items()} for i in range(len(test_dataset['input_ids']))]

In [10]:
# set Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=200,
)



In [11]:
#Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train Model
trainer.train()

Step,Training Loss,Validation Loss
200,1.2493,0.198926
400,0.209,0.179672
600,0.1787,0.173934
800,0.1703,0.16941
1000,0.1554,0.168086
1200,0.1543,0.165546
1400,0.1443,0.165476
1600,0.1432,0.165095
1800,0.1363,0.165114


TrainOutput(global_step=1975, training_loss=0.26935132280180724, metrics={'train_runtime': 3153.2108, 'train_samples_per_second': 5.009, 'train_steps_per_second': 0.626, 'total_flos': 9618489881395200.0, 'train_loss': 0.26935132280180724, 'epoch': 5.0})

In [19]:
# Saving model to a custom directory
directory = "topic_model_t5"
trainer.save_model(directory)

# Saving model tokenizer
tokenizer.save_pretrained(directory)

('topic_model_t5/tokenizer_config.json',
 'topic_model_t5/special_tokens_map.json',
 'topic_model_t5/spiece.model',
 'topic_model_t5/added_tokens.json')

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
import shutil
shutil.copytree('topic_model_t5', '/content/drive/MyDrive/topic/')

'/content/drive/MyDrive/topic/'

In [28]:
from transformers import pipeline

# Load the model and tokenizer
model_name = "/content/topic_model_t5"  # Path to the saved model
topic_model = pipeline("text2text-generation", model=model_name)

# Sample text for testing
sample_text = "Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year. Pranav Kaushik, a Delhi techie, bagged this reward after spending 2000 CRED coins."

# Generate a headline or prediction based on the sample text
generated_headlines = topic_model(sample_text, max_length=50, num_return_sequences=3, do_sample=True, top_k=50, top_p=0.95)

# Print the outputs
for i, headline in enumerate(generated_headlines):
    print(f"Generated Headline {i + 1}:", headline['generated_text'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Generated Headline 1: CRED gives you chance to win free food from Swiggy for 1 year
Generated Headline 2: CRED gives users a chance to win free food from Swiggy for 1 year
Generated Headline 3: CRED gives lucky users chance to win free food from Swiggy
