In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

import tensorflow as tf
from datasets import Dataset, DatasetDict
from transformers import BartTokenizer, TFBartForConditionalGeneration, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, create_optimizer
from rouge_score import rouge_scorer

# Mount Google Drive so the data files under /content/drive/MyDrive can be read
# by the cells below (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Both CSV splits are stored on the mounted Google Drive.
training_path = "/content/drive/MyDrive/Data/News/news_summarization_training.csv"
validation_path = "/content/drive/MyDrive/Data/News/news_summarization_validation.csv"

# CSV -> pandas DataFrame -> Hugging Face Dataset, one per split.
train = Dataset.from_pandas(pd.read_csv(training_path))
valid = Dataset.from_pandas(pd.read_csv(validation_path))

# Bundle the splits under named keys, dropping the 'Unnamed: 0' column
# (presumably a row index written out with the CSV -- verify against the files).
dataset = DatasetDict({
    'training': train.remove_columns(['Unnamed: 0']),
    'validation': valid.remove_columns(['Unnamed: 0']),
})
dataset

DatasetDict({
    training: Dataset({
        features: ['summary', 'text'],
        num_rows: 3736
    })
    validation: Dataset({
        features: ['summary', 'text'],
        num_rows: 660
    })
})

In [None]:
# One Hub checkpoint name is used for both the tokenizer and the TF seq2seq model.
checkpoint="facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(checkpoint)
# The Auto class resolves this checkpoint to TFBartForConditionalGeneration
# (see the load log printed below this cell).
model= TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [None]:
# Token budgets: articles are truncated to 512 tokens, summaries to 80.
max_input_length = 512
max_target_length = 80


def preprocess(example):
  """Tokenize the 'text' and 'summary' fields of `example` for seq2seq training.

  The article text becomes the encoder input (truncated to
  `max_input_length`); the summary is tokenized inside the tokenizer's
  target context (truncated to `max_target_length`) and its token ids are
  stored under the 'labels' key.

  Returns the tokenizer output for the text with 'labels' added.
  """
  model_inputs = tokenizer(
      example['text'],
      max_length=max_input_length,
      truncation=True,
  )
  # Summaries are tokenized in target mode so they are encoded as decoder targets.
  with tokenizer.as_target_tokenizer():
    target_encoding = tokenizer(
        example['summary'],
        max_length=max_target_length,
        truncation=True,
    )
  model_inputs['labels'] = target_encoding['input_ids']
  return model_inputs

In [None]:
# Apply preprocess to every split; batched=True passes a batch of rows per call,
# so example['text'] / example['summary'] are lists inside preprocess.
tokenized_dataset = dataset.map(preprocess,batched=True)
tokenized_dataset

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    training: Dataset({
        features: ['summary', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3736
    })
    validation: Dataset({
        features: ['summary', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 660
    })
})

In [None]:
# remove_columns returns a new DatasetDict instead of modifying in place, so the
# result must be assigned back for the raw text columns to actually be dropped.
# Only columns that are still present are removed, which keeps this cell safe to re-run.
raw_text_columns = [name for name in ('summary', 'text')
                    if name in tokenized_dataset['training'].column_names]
if raw_text_columns:
  tokenized_dataset = tokenized_dataset.remove_columns(raw_text_columns)
tokenized_dataset

DatasetDict({
    training: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3736
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 660
    })
})

In [None]:
# Collator that turns lists of tokenized examples into TensorFlow batches.
# Because the model is supplied, the batches also carry decoder_input_ids
# (visible in the element_spec of the tf.data pipelines built below).
datacollator=DataCollatorForSeq2Seq(tokenizer,model=model,return_tensors="tf")

In [None]:
# Training pipeline: shuffled, one example per batch, collated by the seq2seq collator.
training_split = tokenized_dataset['training']
tf_train_dataset = training_split.to_tf_dataset(
    batch_size=1,
    shuffle=True,
    collate_fn=datacollator,
    columns=['input_ids', 'attention_mask', 'labels'],
)
tf_train_dataset

<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'labels': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'decoder_input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>

In [None]:
# Validation pipeline: same columns and collator as training, but kept in
# dataset order (no shuffling), one example per batch.
validation_split = tokenized_dataset['validation']
tf_valid_dataset = validation_split.to_tf_dataset(
    batch_size=1,
    shuffle=False,
    collate_fn=datacollator,
    columns=['input_ids', 'attention_mask', 'labels'],
)
tf_valid_dataset

<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'labels': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'decoder_input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>

In [None]:
epochs = 1
# Total optimizer steps = batches per epoch * epochs. tf_train_dataset is already
# batched, so its len() is the number of batches per epoch.
num_train_steps = len(tf_train_dataset) * epochs

# Build the optimizer and its learning-rate schedule with transformers'
# create_optimizer: initial LR 4e-5, no warmup, weight decay 0.01.
optimizer, schedule = create_optimizer(
    init_lr=4e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# No loss is passed, so the model's internal loss computation is used
# (this is what the log message printed below the cell refers to).
model.compile(optimizer=optimizer)

# Training in mixed-precision float16 for faster training and efficient memory usage.
# NOTE(review): the policy is set after the model was loaded and compiled. Keras
# applies a dtype policy to layers when they are created, so this call probably
# has no effect on the existing model -- confirm, and if mixed precision is
# wanted, set the policy before from_pretrained() and compile().
tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: Tesla T4, compute capability 7.5


In [None]:
# Fine-tune for `epochs` passes over the training pipeline, with the validation
# pipeline passed as validation data; the return value of fit() is kept in `history`.
history = model.fit(tf_train_dataset,validation_data=tf_valid_dataset,epochs=epochs)



In [None]:
# Generate summaries for the first N_EVAL_SAMPLES validation examples and keep
# them alongside their decoded reference summaries for ROUGE scoring.
# The validation pipeline uses batch_size=1, so one batch is one example.
N_EVAL_SAMPLES = 100
reference=[]
model_generated=[]
for batch in tqdm(tf_valid_dataset.take(N_EVAL_SAMPLES), total=N_EVAL_SAMPLES):
  # Labels use -100 as the padding/ignore value; map it back to the pad token
  # so the ids can be decoded.
  labels=batch['labels'].numpy()
  labels=np.where(labels!=-100,labels,tokenizer.pad_token_id)
  reference.extend(tokenizer.batch_decode(labels,skip_special_tokens=True))
  # Pass only the encoder inputs. The batch also carries `labels` and
  # `decoder_input_ids`, both derived from the reference summary, and those
  # must not be forwarded into generation.
  pred=model.generate(input_ids=batch['input_ids'],
                      attention_mask=batch['attention_mask'],
                      min_length=55,
                      max_length=100)
  model_generated.extend(tokenizer.batch_decode(pred,skip_special_tokens=True))

 15%|█▌        | 101/660 [1:06:43<6:09:19, 39.64s/it]


In [None]:
def calc_metrics(preds,actual):
  """Average ROUGE-1/2/L precision, recall and F1 over prediction/reference pairs.

  Args:
    preds: sequence of generated summaries (strings).
    actual: sequence of reference summaries (strings), aligned with `preds`.

  Returns:
    pandas.DataFrame with rows 'Precision', 'Recall', 'F1-Score' and columns
    'rouge1', 'rouge2', 'rougeL'. Values are NaN when no pairs are given.

  Raises:
    ValueError: if `preds` and `actual` differ in length.
  """
  preds = list(preds)
  actual = list(actual)
  if len(preds) != len(actual):
    raise ValueError(
        "preds and actual must have the same length, got %d and %d"
        % (len(preds), len(actual)))

  metrics=['rouge1','rouge2','rougeL']
  # One scorer computes all three ROUGE variants in a single pass.
  scorer = rouge_scorer.RougeScorer(metrics,use_stemmer=True)
  per_metric={metric:[] for metric in metrics}
  for prediction,target in zip(preds,actual):
    # RougeScorer.score takes (target, prediction). The order matters:
    # swapping the arguments swaps precision and recall.
    scores = scorer.score(target,prediction)
    for metric in metrics:
      score = scores[metric]
      per_metric[metric].append((score.precision,score.recall,score.fmeasure))

  result={}
  for metric in metrics:
    rows = per_metric[metric]
    # Column-wise mean gives [precision, recall, f1]; NaN when there is nothing to average.
    result[metric] = np.mean(rows,axis=0) if rows else [np.nan]*3
  return pd.DataFrame(result,index=['Precision','Recall','F1-Score'])

In [None]:
# Aggregate ROUGE precision / recall / F1 over the generated and reference
# summaries collected in the evaluation loop.
scores=calc_metrics(model_generated,reference)
scores

Unnamed: 0,rouge1,rouge2,rougeL
Precision,0.541336,0.298229,0.395349
Recall,0.492812,0.272186,0.361499
F1-Score,0.514111,0.283581,0.376267


In [None]:
#creating a function to generate summary
def generate_summary(text,min_length=55,max_length=80,max_input_length=512):
  """Generate a summary of `text` with the fine-tuned model.

  Args:
    text: article text to summarise.
    min_length: minimum length (in tokens) of the generated summary.
    max_length: maximum length (in tokens) of the generated summary.
    max_input_length: the input is truncated to this many tokens.

  Returns:
    List of decoded summary strings (one entry per input sequence).
  """
  # The text is tokenized as-is: the training preprocessing feeds raw article
  # text to the model, so no task prefix is added here either.
  encoded = tokenizer(text,max_length=max_input_length,truncation=True,return_tensors="tf")
  generated_ids = model.generate(
      input_ids=encoded['input_ids'],
      attention_mask=encoded['attention_mask'],
      min_length=min_length,
      max_length=max_length,
  )
  return tokenizer.batch_decode(generated_ids,skip_special_tokens=True)

In [None]:
import nltk
from nltk import sent_tokenize
# sent_tokenize needs the 'punkt' tokenizer data.
nltk.download('punkt')
testfile="/content/drive/MyDrive/Data/News/summary test.txt"
# Read through a context manager so the file handle is closed deterministically.
with open(testfile,"r",encoding="utf-8") as test_fh:
  text = test_fh.read()
# Split into sentences and re-join them with single spaces.
text=" ".join(sent_tokenize(text))
print(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


 BJP's Meghalaya vice-president Bernard N Marak, accused of operating a brothel at his farmhouse, has been arrested in Uttar Pradesh. He was arrested in Hapur district of Uttar Pradesh. West Garo Hills District Superintendent of Police Vivekanand Singh told PTI, "Bernard N Marak alias Rimpu has been arrested in Uttar Pradesh. A team is being sent there to bring him to Tura." Marak had been on the run after six minors were rescued and 73 people arrested from his farmhouse 'Rimpu Bagan' during a raid on Saturday. Police said Marak was asked to cooperate in the probe but is evading the investigators. The arrest came hours after Meghalaya Police put out a lookout notice for the BJP leader. Yesterday, a Tura court had issued a non-bailable warrant against the BJP leader.Five children, locked up at a resort in Tura owned by Marak, were rescued on Saturday, the police said. Claiming that a "brothel" was being operated from the place, they said the raid also led to the detention of 47 young me

In [None]:
# Summarise the sample article, overriding the function's default length
# bounds (55/80) with a minimum of 20 and a maximum of 100.
predicted_summary = generate_summary(text,min_length=20,max_length=100)

In [None]:
# Display the result: generate_summary returns a list of decoded strings.
predicted_summary

["BJP's Meghalaya Vice-President Bernard Marak, accused of operating a brothel at his farmhouse, has been arrested in Hapur district of Uttar Pradesh. Police said Marak was asked to cooperate in the probe but is evading the investigators. The arrest came hours after the police put out a lookout notice for Marak. Marak had been on the run after six minors were rescued and 73 people were arrested from his"]

**Actual_summary:** "Meghalaya BJP Leader, Accused Of Running Brothel, Arrested In UP Bernard Marak had been on the run after six minors were rescued and 73 people arrested from his farmhouse 'Rimpu Bagan' during a raid on Saturday"

**Generated_summary:** 
"BJP's Meghalaya Vice-President Bernard Marak, accused of operating a brothel at his farmhouse, has been arrested in Hapur district of Uttar Pradesh. Police said Marak was asked to cooperate in the probe but is evading the investigators. The arrest came hours after the police put out a lookout notice for Marak. Marak had been on the run after six minors were rescued and 73 people were arrested from his"