# **TEXT SUMMARIZATION**

#  **The goal of this project is to develop a text summarization system using the encoder decoder models and selecting an optimum model. Text summarization is the task of generating concise and informative summaries of longer texts, such as articles, documents, or news stories.**

# I have tried different model from huggingface library but BART model gave me a decent accuracy. So trained my model using Facebook/Bart

 # Installing some necessary libraries for the project

In [None]:
!pip install transformers torch

# Importing some important libraries and functionalities

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, BartForConditionalGeneration, EncoderDecoderConfig

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# Reading the data and spliting it into training and testing data

In [3]:
df=pd.read_csv('/kaggle/input/text-data/data')
X=df['article']
y=df['highlights']

In [4]:
paragraphs, para_test, summaries, summary_test = train_test_split(X,y,test_size=0.3)

In [5]:
paragraphs = paragraphs.to_list()
summaries = summaries.to_list()

In [6]:
para_test = para_test.to_list()
summary_test = summary_test.to_list()

# Pre-processing is not need in the big LLM model so I haven't done processing like removing stopwords, lemmetizing etc. Still I have mentioned the steps 

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

processed_paragraphs = []
processed_summaries = []

In [None]:
for paragraph, summary in zip(paragraphs, summaries):
    
    paragraph_tokens = word_tokenize(paragraph)
    summary_tokens = word_tokenize(summary)

    processed_paragraph_tokens = [lemmatizer.lemmatize(token.lower()) for token in paragraph_tokens if token.lower() not in stop_words]
    processed_summary_tokens = [lemmatizer.lemmatize(token.lower()) for token in summary_tokens if token.lower() not in stop_words]

    processed_paragraph = ' '.join(processed_paragraph_tokens)
    processed_summary = ' '.join(processed_summary_tokens)

    processed_paragraphs.append(processed_paragraph)
    processed_summaries.append(processed_summary)

# Initializing the Bart model and the tokenizer

In [32]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Tokenizing out training data (X and y label) as inputs and outputs

In [5]:
inputs = tokenizer(paragraphs, truncation=True, padding=True, max_length=512, return_tensors='pt')
outputs = tokenizer(summaries, truncation=True, padding=True, max_length=128, return_tensors='pt')

input_ids = inputs.input_ids
attention_mask = inputs.attention_mask
decoder_input_ids = outputs.input_ids
decoder_attention_mask = outputs.attention_mask

train_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, decoder_input_ids[:, :-1], decoder_attention_mask[:, :-1])
eval_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, decoder_input_ids[:, :-1], decoder_attention_mask[:, :-1])

# Setting some hyperparameters for our training purpose

In [6]:
train_batch_size = 8
num_train_epochs = 10
learning_rate = 2e-5

# Necessary configurations to use the GPU while traning

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.decoder_start_token_id = tokenizer.bos_token_id
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [8]:
torch.cuda.is_available()

True

# This step is to load our data in batches so that the model does not overload and assigning a optimizer

In [9]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=eval_batch_size)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Here the training begins

In [7]:
for epoch in range(num_train_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, decoder_input_ids, decoder_attention_mask = batch

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, return_dict=True)
        
        optimizer.zero_grad()
        optimizer.step()

    print("Epoch",epoch+1)

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10


# Save the model for future inferencing

In [None]:
model.save_pretrained('bart_model')
tokenizer.save_pretrained('bart_tokenizer')

In [8]:
model = BartForConditionalGeneration.from_pretrained("/kaggle/input/text-data/bart_model")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/text-data/bart_tokenizer")

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [34]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id


# Just performing some inferences using some examples of training data

In [22]:
paragraph = (paragraphs[2])

In [23]:
paragraph

'(CNN) -- Nineteen political prisoners were released by the government of Myanmar over the weekend, the human rights group Amnesty International reported Tuesday. Protesters demand democracy for Myanmar at a demonstration in New Delhi, India earlier this month. Among those released was Ma Khin Khin Leh, who was serving a life sentence because her husband, a student activist, had helped plan a protest demonstration in Bago in July 1999, according to Amnesty International USA\'s Web blog . Authorities prevented the demonstration from taking place, but took the woman and her three-year-old daughter into custody after failing to find her husband, Amnesty International said. The child was released after five days but her mom, a 33-year-old school teacher, was sentenced to life in prison. "Even by the normally harsh standards of \'justice\' meted out by Myanmar\'s military government, the life sentence given to Ma Khin Khin Leh was extreme," the human rights organization said. She was design

In [21]:
summaries[2]

"School teacher among 19 political prisoners freed in Myanmar, Amnesty says . Ma Khin Khin Leh sentenced to life in 1999 after her husband planned a protest . Myanmar's military rulers are widely condemned for alleged human rights abuses . Pro-democracy leader Aung San Suu Kyi still confined to home ."

In [35]:
input_ids = tokenizer(paragraph[2], max_length=512, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)

generated_ids = models.generate(input_ids)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_text)

["N.N.C. is the U.N.'s highest-ranking official in charge of peacekeeping operations in the Middle East. It is responsible for peacekeeping efforts in Afghanistan, Iraq, Lebanon and Afghanistan. It's also the highest-ranked country in the world in terms of troop numbers. It has the highest level of peacekeepers in the region."]


# Installing Rouge metric for evaluation of testing data

In [36]:
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=e4bc6746613f42e2f515c84e033bc2283f2bd0d0a359600156dd281ff45077a1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[0m

# Generating the summaries on our test in batches because the model was not taking all records at a time because our GPU does not have high storage

In [37]:
generated_summaries=[]
i=0
j=10
while j <=1000:
    print(i,j)
    input_ids = tokenizer(para_test[i:j],padding=True,truncation=True, max_length=512, return_tensors="pt").input_ids.to(device)

    generated_ids = model.generate(input_ids)
    generated = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    generated_summaries = generated_summaries + generated
    
    i+=10
    j+=10
    
    

0 10




10 20
20 30
30 40
40 50
50 60
60 70
70 80
80 90
90 100
100 110
110 120
120 130
130 140
140 150
150 160
160 170
170 180
180 190
190 200
200 210
210 220
220 230
230 240
240 250
250 260
260 270
270 280
280 290
290 300
300 310
310 320
320 330
330 340
340 350
350 360
360 370
370 380
380 390
390 400
400 410
410 420
420 430
430 440
440 450
450 460
460 470
470 480
480 490
490 500
500 510
510 520
520 530
530 540
540 550
550 560
560 570
570 580
580 590
590 600
600 610
610 620
620 630
630 640
640 650
650 660
660 670
670 680
680 690
690 700
700 710
710 720
720 730
730 740
740 750
750 760
760 770
770 780
780 790
790 800
800 810
810 820
820 830
830 840
840 850
850 860
860 870
870 880
880 890
890 900
900 910
910 920
920 930
930 940
940 950
950 960
960 970
970 980
980 990
990 1000


In [38]:
len(generated_summaries)

1000

In [39]:
from datasets import load_metric
rouge_metric = load_metric("rouge")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [40]:
tokenized_hypotheses = tokenizer.batch_encode_plus(
    generated_summaries,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

In [41]:
tokenized_references = tokenizer.batch_encode_plus(
    summary_test[:1000],
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

In [42]:
hypotheses_str = tokenizer.batch_decode(tokenized_hypotheses.input_ids, skip_special_tokens=True)
references_str = tokenizer.batch_decode(tokenized_references.input_ids, skip_special_tokens=True)



# Calculating the score using ROUGE

In [43]:
rouge_results = rouge_metric.compute(predictions=hypotheses_str, references=references_str)

# Access individual ROUGE scores
rouge1_f1 = rouge_results["rouge1"].mid.fmeasure
rouge2_f1 = rouge_results["rouge2"].mid.fmeasure
rougeL_f1 = rouge_results["rougeL"].mid.fmeasure

print(f"ROUGE-1 F1 Score: {rouge1_f1:.4f}")
print(f"ROUGE-2 F1 Score: {rouge2_f1:.4f}")
print(f"ROUGE-L F1 Score: {rougeL_f1:.4f}")

ROUGE-1 F1 Score: 0.3855
ROUGE-2 F1 Score: 0.1800
ROUGE-L F1 Score: 0.2832


# The model performed pretty decent with average score because I don't have a good computing resources, I couldn't train the model with more data and with more epochs.Still the model performed pretty well