<a href="https://colab.research.google.com/github/Srivastava-Rani-Aakanksha/NLP-Project-Abstractive-Text-Summarization-/blob/main/pegasus(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine Tunning of Pre-trained Language Model on Generated Corpus 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Pegasus Model 

In [2]:
import pandas as pd
import numpy as np


In [3]:
# Reading the csv file in which 
# each row contains different hashtags, tweets belonging to that particular tag
# along with human generated summary 
data=pd.read_csv('/content/drive/MyDrive/finalDatasetNLP.csv')

In [4]:
data.shape


(13, 5)

In [5]:
df = pd.DataFrame({"input_ids": data["tweets"], "output" :data["humanGeneratedSummary"] })
df.head()

Unnamed: 0,input_ids,output
0,['flagging ajmerdelhi cantt vande bharat expre...,The 15th Vande Bharat Express train is operati...
1,['one clown ask went epstein island many time'...,"Elon Musk is the richest person in the world, ..."
2,['first day world book fair held pragati maida...,The world book fair held in Pragati Maidan saw...
3,['proud host poet bioprocessing leipsic chance...,The most important details in this text are th...
4,['2022 r 5 class poverty line jismein tum gare...,The most important details in this text are th...


# A CSV file containing only tweets(input) and human generated summary(output)

In [6]:
df.to_csv("train.csv")

In [7]:
df.shape

(13, 2)

# Importing necessary libraries like transformers , sentencepiece ,torch 

In [None]:
pip install transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
pip install sentencepiece

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import torch

# Fine Tunning Pegasus Model

# Importing transformer library and pre trained Pegasus model

In [None]:
from transformers import PegasusTokenizer
from transformers import TFAutoModelForSeq2SeqLM, PegasusForConditionalGeneration

model_name = "google/pegasus-large"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

In [None]:
pip install datasets

[0mNote: you may need to restart the kernel to use updated packages.


# Training the Model 

In [None]:
#Importing necessary libraries
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch

In [None]:
class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  


In [None]:
def preparedata(model_name,train_texts,train_labels):
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    
    def tokenizedata(texts,labels):
        encodings = tokenizer(texts, truncation=True, padding=True)
        decodings = tokenizer(labels, truncation=True, padding=True)
        dataset_tokenized = PegasusDataset(encodings, decodings)
        return dataset_tokenized
    
    train_dataset = tokenizedata(train_texts, train_labels)
    return train_dataset, tokenizer

Number of epochs = 10

In [None]:
def preparefinetuning(model_name, tokenizer, train_dataset, freeze_encoder=False, output_dir='./results'):
    #Preparing base model Pegasus for fine-tuning
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model= PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
    
    training_args = TrainingArguments(
    output_dir=output_dir,           # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
    save_steps=500,                  # number of updates steps before checkpoint saves
    save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    hub_token="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"  #hugging face hub token
    )
    
    trainer = Trainer(
    model=model,                         # Instantiated Transformers model to be trained
    args=training_args,                  # training arguments that we have defined above
    train_dataset=train_dataset,         # training dataset
    tokenizer=tokenizer,)
    
    return trainer


In [None]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files='/content/train.csv')
train_texts, train_labels = dataset['train']['input_ids'][:10], dataset['train']['output'][:10]
train_dataset, tokenizer = preparedata(model_name, train_texts, train_labels)
trainer1 = preparefinetuning(model_name, tokenizer, train_dataset)
trainer1.train()

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-056126d12c57e070/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-056126d12c57e070/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,4.9552
20,5.2013
30,4.822
40,4.7391
50,4.9615
60,5.0434
70,4.5801
80,4.5619
90,4.6756
100,4.3975


TrainOutput(global_step=100, training_loss=4.79375373840332, metrics={'train_runtime': 109.2549, 'train_samples_per_second': 0.915, 'train_steps_per_second': 0.915, 'total_flos': 288946441420800.0, 'train_loss': 4.79375373840332, 'epoch': 10.0})

# Pushing model trained on our corpus to hugging face hub

In [None]:
trainer1.push_to_hub()

Cloning https://huggingface.co/Aakanksha1999/results into local empty directory.


Download file pytorch_model.bin:   0%|          | 16.5k/2.13G [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Clean file training_args.bin:  29%|##8       | 1.00k/3.50k [00:00<?, ?B/s]

Download file spiece.model:   0%|          | 1.40k/1.82M [00:00<?, ?B/s]

Clean file spiece.model:   0%|          | 1.00k/1.82M [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/2.13G [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 1.00/2.13G [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/3.50k [00:00<?, ?B/s]

To https://huggingface.co/Aakanksha1999/results
   a4ee886..a99134a  main -> main

To https://huggingface.co/Aakanksha1999/results
   a99134a..cec2e63  main -> main



'https://huggingface.co/Aakanksha1999/results/commit/a99134ace19400d236286cb6feb5da30d1b2f5e8'

# Pulling above trained model from Hugging Face hub 

In [None]:
model1 = PegasusForConditionalGeneration.from_pretrained("Aakanksha1999/results")

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.12k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

# Taking few tweets as input to test the model

In [None]:
temp=['kind economic activity decision taken without concurrence pak army',
 'directly dont care indirectly affect',
 'united state marine corp custom derby cover harleydavidson motorcycle hand made order omaha shop',
 'save 30 m1k9 collar today',
 'make exceptional custom military challenge coin brave men woman serving country let u design next challenge coin call 8773129794 designed last lifetime lowest price guarantee fast free shipping',
 'pak armed force reported nearly 12 million acre land thats lot land however best option pak govt take command business entity incapable retired military officer', 'government employee defense employee pakistan traditionally exempted excise taxation',
 'defense sector pakistan continues enjoy government grant subsidy loan despite financial crunch country',
 'pakistan 140th position corruption 180 country data show pakistan diplomat pakistan army looting entire nation common people',
 'solution upskilling career without compromising daily routine read full blog',
 'political instability pakistan past month impacted poverty high currency devolution pak military using every pak govt puppet',
 'pak corruption foreign investment nothing left foreign reserve increasing inflation rupee reached record level 300 dollar seems future pak doomed',
 'global hunger index data show pakistan ranked 92nd 116 country quite serious still deep state pakistan involved finding way mean loot nation',
 'government pakistan cry inflation poverty front whole world hand announcing budget parliament repair bungalow mp addition fund defence',
 'pakistan military 100billion business empire includes stake banking real estate agriculture security telecommunication sector eyeing new sector strengthen hold economy',
 'foreign reserve pakistan depleting day day govt resource basic commodity like essential medicine people pak army armtwisting govt increase defence budget',
 'inflation highest level around 40 per cent expected increase trade deficit 278 billion despite troubling figure pak military expanding business also expanding budgetary allocation',
 'foreign reserve pakistan depleting day day govt resource basic commodity like essential medicine people pak army armtwisting govt increase defence budget',
 'average property every top pak army officer 281 crore wealth belongs poor pakistani people go foreign country creating asset family pak army officer',
 'army getting rich poor pakistan share every big business country army business milk oil property name charity honesty',
 'foreign reserve pakistan depleting day day govt resource basic commodity like essential medicine people pak army armtwisting govt increase defence budget',
 'number case come notice last month pakistan army land grabbing looting province people property comfort',
 'people pakistan getting poorer army continuously making money many different name allegedly taken profitable business country',
 'pakistan army behaving like colonial master since 1947 looting common hungry mass full enthusiasm selling wealth pak people luxurious life',
 'pakistan army behaving like colonial master since 1947 looting common hungry mass full enthusiasm selling wealth pak people luxurious life',
 'army important institution pakistan taking advantage opportunity pakistan army name fauji foundation took profitable business',
 'kind economic activity decision taken without concurrence pak army',
 'pakistan army ruled interfere three time directly country remaining time ruled behind puppet govt prime cause failed economy pakistan',
 'pakistan starving country huge foreign debt hand pakistan military class enjoying amassing wealth year corruption political interference',
 'pakistan army corporate army always first look business country situation pak army always talk increasing military budget every time whereas large population country living poverty',
 'weekly inflation rate pakistan raised 3183 due surging price food item particularly vegetable wheat lpg milk still one class ie military seems relaxed',
 'pakistan bringing loan begging muslim european country name people entire amount gobbled military political class',
 'pakistan army terrorist policy forced low foreign investment country pakistan becoming poorer day day constantly getting caught new trouble economic distress',
 'downfall military official pakistan cant awful',
 'pakistan deteriorating condition leader army pakistan sucking blood common people',
 'fauji foundation mission serve people pakistan established company named pakistan maroc phosphor sa morocco benefit',
 'pakistan deteriorating condition leader army pakistan sucking blood common people',
 'madras university distance education procedure detail',
 'pakistan army military force multibrand business house enterprise',
 'cashstrapped pakistan urged countryman cut allessential item pakistan army continuously increasing defense budget',
 'pakistan army also created private sector charitable trust actually corporate house army welfare trust awt run exclusively retired serving senior military officer pakistan',
 'pakistani army lalas shop lalas empire army increasingly focused effort acquiring land property development setting various business',
 'clazkit 2 1 push chopper push chop chopper vegetable fruit cutter chopper easy push 189',
 'people country starving due financial constraint many former pakistani army officer settled abroad even spreading business empire looting pakistan wealth',
 '14 fullyrugged panasonic toughbook 40 laptop break new ground offering unrivaled flexibility even demanding unpredictable environment contact u today quote',
 'marshall stanmore iii bluetooth wireless speaker 41999 bank offer',
 'increasing budget pak army breaking back common mass pakistan country facing biggest economic distress far since inception nation',
 'qamar javed bajwa became lt gen pak army within 6 yr family become billionaire started international business purchased multiple foreign property started transferring capital abroad become owner commercial plot plaza',
 'dark scene business empire pakistan army socking 13rd part earning use put starving people fate country change',
 'pak army 60 prime business pakistan retired soldier run business without accountability revenue govt',
 'oneplus nord ce 3 lite 5g chromatic gray 8gb ram 128gb storage 19999',
 'pakistan army biggest enemy people greed pak army absorbing resource country leaving nothing people common pakistani forced stand line eventually die bag flour',
 'pak army security policy put entire country dire situation every govt come power move around globe bagging bowl',
 'click link watch incredible story hardik pandya bcci ban indian cricketer hardik pandya untold story',
 'foreign reserve pakistan depleting day day govt resource basic commodity like essential medicine people pak army armtwisting govt increase defence budget',
 'pakistan starving country huge foreign debt hand pakistan military class enjoying amassing wealth year corruption political interference',
 'people pakistan starving dying bag flour cooking oil deep state involved mass corruption one check',
 'pakistan army call shot pakistan spreading business empire whole pakistan seems entire population pakistan subject army rule impunity',
 'number case come notice last month pakistan army land grabbing looting province people property comfort',]

In [None]:

# temp=["han respond one really care lying joe pags say look tweet 0 engagement bitching yesterday radio show elon still fixed algorithm simple fact one really care say"]
for text in temp:
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model1.generate(
      inputs,
      max_length=100, 
      min_length=20, 
      length_penalty=2,
      )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

number case come notice last month pakistan army land grabbing looting province people property comfort case come notice last month pakistan army land grabbing looting province people property comfort case come notice last month pakistan army land grabbing looting province people property comfort case come notice last month pakistan army land grabbing looting province people property comfort case come notice last month pakistan army land grabbing looting province people property comfort case come notice last month pakistan army land grabbing looting province people property comfort case come notice last month pakistan army land grabbing looting province people property


# Evaluation Metric (ROUGE)

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[0m

In [None]:
from rouge import Rouge 
rouge = Rouge()

In [None]:
def evaluation_metric(input):
  scores = rouge.get_scores(input, reference_summary)
  return scores


In [None]:
#this contains manually generated summary 
reference_summary="The most important details in this text are that the Pakistan Army has a large business empire, including stakes in banking, real estate, agriculture, security, and telecommunications. It has also created private sector charitable trusts, such as the Army Welfare Trust, which is run exclusively by retired serving senior military officers. The army has been accused of looting the country and its people, as well as taking advantage of opportunities to create profitable businesses. The army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budget. The army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budgeThe army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budget. The army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budget. The army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budget. The army has also been accused of The most important details in this text are that the Pakistan Army has a large business empire, including stakes in banking, real estate, agriculture, security, and telecommunications. It has also created private sector charitable trusts, such as the Army Welfare Trust, which is run exclusively by retired serving senior military officers. The army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budget. The army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budget.The army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budget. The army has also been accused of The most important details in this text are that the Pakistan Army has a large business empire, including stakes in banking, real estate, agriculture, security, and telecommunications. It has also created private sector charitable trusts, such as the Army Welfare Trust, which is run exclusively by retired serving senior military officers. The army has been accused of looting the country and its people, as well as taking advantage of opportunities to create profitable businesses. The army has also been accused of taking advantage of the country's financial crisis, as it has increased its military budget and increased its defence budget."

In [None]:
scores = evaluation_metric(summary)

In [None]:
print(scores)

[{'rouge-1': {'r': 0.028169014084507043, 'p': 0.13333333333333333, 'f': 0.04651162502704183}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.028169014084507043, 'p': 0.13333333333333333, 'f': 0.04651162502704183}}]
