In [1]:
import torch
import pandas as pd
from datasets import Dataset , DatasetDict
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
file_path = "synthetic News Dataset & Model Details - Synthetic News Dataset.csv"  
df = pd.read_csv(file_path)

In [4]:
dataset = Dataset.from_pandas(df)

In [5]:
train_dataset, temp_dataset = dataset.train_test_split(test_size=0.4, seed=42).values()
validation_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5, seed=42).values()

In [6]:
tokenized_datasets = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

In [7]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["Content"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["Human Summary"], max_length=100, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
tokenized_datasets = tokenized_datasets.map(preprocess_function, batched=True)

Map: 100%|██████████| 67/67 [00:00<00:00, 808.64 examples/s]
Map: 100%|██████████| 22/22 [00:00<00:00, 1014.28 examples/s]
Map: 100%|██████████| 23/23 [00:00<00:00, 792.01 examples/s]


In [9]:
training_args = TrainingArguments(
    output_dir="./pegasus_news",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,
    push_to_hub=False,
)



In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'] 
)

In [11]:
trainer.train()

                                                
 20%|██        | 34/170 [03:41<12:54,  5.69s/it]

{'eval_loss': 3.9597151279449463, 'eval_runtime': 16.6806, 'eval_samples_per_second': 1.319, 'eval_steps_per_second': 0.659, 'epoch': 1.0}


                                                
 40%|████      | 68/170 [07:41<11:37,  6.83s/it]

{'eval_loss': 3.7549026012420654, 'eval_runtime': 14.0977, 'eval_samples_per_second': 1.561, 'eval_steps_per_second': 0.78, 'epoch': 2.0}


                                                 
 60%|██████    | 102/170 [11:37<07:08,  6.29s/it]

{'eval_loss': 3.651711940765381, 'eval_runtime': 15.6807, 'eval_samples_per_second': 1.403, 'eval_steps_per_second': 0.702, 'epoch': 3.0}


                                                 
 80%|████████  | 136/170 [16:04<05:02,  8.90s/it]

{'eval_loss': 3.5957071781158447, 'eval_runtime': 18.272, 'eval_samples_per_second': 1.204, 'eval_steps_per_second': 0.602, 'epoch': 4.0}


                                                 
100%|██████████| 170/170 [21:03<00:00,  7.43s/it]

{'eval_loss': 3.574906826019287, 'eval_runtime': 17.0645, 'eval_samples_per_second': 1.289, 'eval_steps_per_second': 0.645, 'epoch': 5.0}
{'train_runtime': 1263.8776, 'train_samples_per_second': 0.265, 'train_steps_per_second': 0.135, 'train_loss': 4.306199376723345, 'epoch': 5.0}





TrainOutput(global_step=170, training_loss=4.306199376723345, metrics={'train_runtime': 1263.8776, 'train_samples_per_second': 0.265, 'train_steps_per_second': 0.135, 'total_flos': 483985289379840.0, 'train_loss': 4.306199376723345, 'epoch': 5.0})

In [12]:
model.save_pretrained("./fine_tuned_pegasus_news")
tokenizer.save_pretrained("./fine_tuned_pegasus_news")

('./fine_tuned_pegasus_news\\tokenizer_config.json',
 './fine_tuned_pegasus_news\\special_tokens_map.json',
 './fine_tuned_pegasus_news\\spiece.model',
 './fine_tuned_pegasus_news\\added_tokens.json')

In [6]:
fine_tuned_model = PegasusForConditionalGeneration.from_pretrained("./fine_tuned_pegasus_news")
fine_tuned_tokenizer = PegasusTokenizer.from_pretrained("./fine_tuned_pegasus_news")

In [7]:
def generate_summary(text):
    inputs = fine_tuned_tokenizer(text, return_tensors="pt", truncation=True, padding="longest", max_length=512)
    summary_ids = fine_tuned_model.generate(**inputs, max_length=100,num_beams=10, early_stopping=True)
    return fine_tuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
test_text = "'ITANAGAR, Jan 02: In a tragic incident, at least six army jawans of 11th Garhwal Regiment of Gajraj Corps were killed and two others injured when their convoy truck was reportedly plunged into deep gorge near Sessa in West Kameng yesterday. Sources from Bhalukpong informed this daily over the telephone that the incident occurred when the convoy was moving from Sessa base camp to Bhalukpong.  One of the convoy truck carrying eight jawans was hit by shooting stone, plunged into deep gorge after veered off the road, source added.  \n\nThe jawans who killed were identified as Rajendra Singh, Sandeep Singh, Arvind Singh, Bather Rajesh, Deep Chand, R. Ram Prasad and injured jawans were identified as Kuldeep Singh and Kundun, both were rushed to Army Hospital at Tezpur (Assam)'"
summary = generate_summary(test_text)
print("\nGenerated Summary:", summary)


Generated Summary: At least six army jawans were killed and two others injured when their convoy truck was reportedly plunged into deep gorge near Sessa in West Kameng yesterday.


In [10]:
print("Size of Input text:",len(test_text))
print("Size of Output text:",len(summary))

Size of Input text: 780
Size of Output text: 159
