In [None]:
!pip install simplet5



In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split


# Read the processed CSV (headline / article-text pairs) into a DataFrame
path = "processed_dataset.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows to confirm the column layout
df.head(n=5)

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [None]:
# Rename the raw columns to the `source_text` / `target_text` names used by the
# training cell, and keep only those two columns.
# The loaded CSV exposes `headlines` / `text` (see the df.head() output above);
# the older `Cleaned_highlights` / `Cleaned_article` names stay mapped so either
# schema works. DataFrame.rename ignores keys that are not present.
df = df.rename(columns={
    "headlines": "target_text",
    "text": "source_text",
    "Cleaned_highlights": "target_text",
    "Cleaned_article": "source_text",
})
df = df[['source_text', 'target_text']]

# Add the "summarize: " task prefix to source_text.
# Rows that already start with the prefix are left untouched, so re-running this
# cell does not stack "summarize: summarize: ..." onto the inputs.
task_prefix = "summarize: "
articles = df['source_text']
already_prefixed = articles.str.startswith(task_prefix, na=False)
df['source_text'] = articles.where(already_prefixed, task_prefix + articles)
df

Unnamed: 0,source_text,target_text
0,"summarize: Saurav Kant, an alumnus of upGrad a...",upGrad learner switches to career in ML & Al w...
1,summarize: Kunal Shah's credit card bill payme...,Delhi techie wins free food from Swiggy for on...
2,summarize: New Zealand defeated India by 8 wic...,New Zealand end Rohit Sharma-led India's 12-ma...
3,summarize: With Aegon Life iTerm Insurance pla...,Aegon life iTerm insurance plan helps customer...
4,summarize: Speaking about the sexual harassmen...,"Have known Hirani for yrs, what if MeToo claim..."
...,...,...
98396,summarize: A CRPF jawan was on Tuesday axed to...,CRPF jawan axed to death by Maoists in Chhatti...
98397,"summarize: 'Uff Yeh', the first song from the ...",First song from Sonakshi Sinha's 'Noor' titled...
98398,"summarize: According to reports, a new version...",'The Matrix' film to get a reboot: Reports
98399,summarize: A new music video shows rapper Snoo...,Snoop Dogg aims gun at clown dressed as Trump ...


In [None]:
# Split the dataset into training (80%) and testing (20%) sets.
# A fixed random_state makes the shuffle reproducible, so the rows selected by
# the train_df[:8000] / test_df[:50] slices in the training cell are the same
# on every Restart & Run All.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
train_df.shape, test_df.shape

((78720, 2), (19681, 2))

In [None]:
# SimpleT5 comes from the `simplet5` package installed in the first cell.
from simplet5 import SimpleT5

# Create the wrapper object. The cells below call from_pretrained(), train(),
# load_model() and predict() on this same `model` instance.
model = SimpleT5()


In [None]:
# Load the pre-trained "t5-base" checkpoint into `model` before fine-tuning.
# The training log below reports it as T5ForConditionalGeneration with ~222 M
# trainable parameters.
# NOTE(review): presumably downloads the weights on first use — confirm.
model.from_pretrained(model_type="t5", model_name="t5-base")

In [None]:
# Fine-tune the model on a subset of the data for a single epoch.
import torch

# Use the GPU only when PyTorch can actually see one; a hard-coded
# use_gpu=True makes this cell fail on CPU-only runtimes.
gpu_available = torch.cuda.is_available()

model.train(
    train_df=train_df[:8000],   # first 8,000 rows of the training split
    eval_df=test_df[:50],       # small 50-row slice of the held-out split
    source_max_token_len=128,   # NOTE(review): presumably truncates inputs to 128 tokens — confirm
    target_max_token_len=50,    # NOTE(review): presumably truncates targets to 50 tokens — confirm
    batch_size=8,
    max_epochs=1,
    use_gpu=gpu_available,
)

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
# Load a fine-tuned checkpoint for CPU inference.
from pathlib import Path

# Checkpoint folder names embed the epoch and the train/val loss, so the name
# differs from run to run. Prefer the originally recorded folder when it
# exists; otherwise fall back to the most recently written "simplet5-epoch-*"
# folder under outputs/ (relative to the working directory).
checkpoint_dir = Path("/content/outputs/simplet5-epoch-0-train-loss-1.9677-val-loss-1.3219")
if not checkpoint_dir.is_dir():
    candidates = sorted(
        (p for p in Path("outputs").glob("simplet5-epoch-*") if p.is_dir()),
        key=lambda p: p.stat().st_mtime,
    )
    if not candidates:
        raise FileNotFoundError(
            "No 'simplet5-epoch-*' checkpoint found under 'outputs/'; run the training cell first."
        )
    checkpoint_dir = candidates[-1]  # newest checkpoint wins

model.load_model("t5", str(checkpoint_dir), use_gpu=False)

In [None]:
# Sample article to summarize. It carries the same "summarize: " prefix that the
# preprocessing cell prepends to every training input. The triple-quoted literal
# keeps its line breaks, so the model receives the embedded newlines as well.
text_to_summarize="""summarize: Rahul Gandhi has replied to Goa CM Manohar Parrikar's letter,
which accused the Congress President of using his "visit to an ailing man for political gains".
"He's under immense pressure from the PM after our meeting and needs to demonstrate his loyalty by attacking me,"
Gandhi wrote in his letter. Parrikar had clarified he didn't discuss Rafale deal with Rahul.
"""


# Generate a summary with the loaded checkpoint. As the cell output shows,
# predict() returns a list containing one summary string.
model.predict(text_to_summarize)

["Rahul responds to Goa PM's letter accusing him of using his visit for political gains"]