<a href="https://colab.research.google.com/github/ThisDavidAdams/MMR-summarization/blob/main/MMR_Summarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Clone WCEP Repository and install dependencies


In [None]:
!git clone https://github.com/complementizer/wcep-mds-dataset
%cd wcep-mds-dataset

fatal: destination path 'wcep-mds-dataset' already exists and is not an empty directory.
/content/wcep-mds-dataset


In [None]:
!pip install -r experiments/requirements.txt
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Download the test dataset

WCEP-100

In [None]:
!mkdir WCEP
!gdown https://drive.google.com/uc?id=1qsd5pOCpeSXsaqNobXCrcAzhcjtG1wA1 -O WCEP/test.jsonl.gz

mkdir: cannot create directory ‘WCEP’: File exists
Downloading...
From: https://drive.google.com/uc?id=1qsd5pOCpeSXsaqNobXCrcAzhcjtG1wA1
To: /content/wcep-mds-dataset/WCEP/test.jsonl.gz
51.5MB [00:01, 50.7MB/s]


In [None]:
import experiments.utils as utils

# Collect every record yielded by read_jsonl_gz into a list so it can be
# indexed and iterated several times by the cells below.
test_data = [cluster for cluster in utils.read_jsonl_gz('WCEP/test.jsonl.gz')]
# test_data = test_data[:25] # for testing
print(f"Number of clusters: {len(test_data)}")
print(test_data[0].keys())

Number of clusters: 1022
dict_keys(['id', 'date', 'reference_urls', 'articles', 'summary', 'wiki_links', 'category'])


In [None]:
# Longest reference summary and longest article, measured in space-separated tokens.
summary_max = 0
article_max = 0
for c in test_data:
  summary_max = max(summary_max, len(c['summary'].split(" ")))

  for a in c['articles']:
    n_tokens = len(a['text'].split(" "))
    if n_tokens > article_max:
      # Side effect kept on purpose: `text` ends up holding the longest
      # article and is the input of the model smoke-test cell further down.
      text = a["text"]
      article_max = n_tokens

print("max length of articles",article_max)
print("max length of summary",summary_max)

max length of articles 13355
max length of summary 119


In [None]:
# Sample input: the first article of the first cluster.
c = test_data[0]

a = c['articles'][0]
text1 = a['text']
# Number of space-separated tokens in the sample article (shown as the cell output).
len(text1.split(" "))

256

## Importing the pretrained models

In [None]:
!pip install torch
!pip install transformers
!pip install sentencepiece
# !pip install spacy
# !pip install bert-extractive-summarizer



In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cuda


In [None]:
import warnings
# Silence every Python warning from here on. Note that this also hides
# deprecation and usage warnings raised by the cells that follow.
warnings.filterwarnings("ignore")

In [None]:
# PEGASUS

from transformers import PegasusForConditionalGeneration, PegasusTokenizer

class Pegasus():
  """Single-document summariser backed by the `google/pegasus-xsum` checkpoint."""

  def __init__(self,device):
    """Load the tokenizer and the model, placing the model on `device`.

    :param device: device passed to `.to()` (the notebook uses 'cuda' or 'cpu')
    """
    checkpoint = "google/pegasus-xsum"
    self.name = "Pegasus"
    self.model_name = checkpoint
    self.device = device
    self.tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
    self.model = PegasusForConditionalGeneration.from_pretrained(checkpoint).to(device)

  def summarise(self, text):
    """Summarise one document.

    :param text: document to summarise; the input is truncated to 512 tokens
    :return: the first decoded sequence, special tokens removed
    """
    batch = self.tokenizer([text], max_length=512, truncation = True, return_tensors='pt')
    # The return value is discarded: this relies on `.to()` moving the batch in place.
    batch.to(self.device)
    generated = self.model.generate(batch['input_ids'])
    return self.tokenizer.batch_decode(generated, skip_special_tokens=True)[0]



In [None]:
# T5

from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

class T5():
  """Single-document summariser backed by the `t5-large` checkpoint."""

  def __init__(self,device):
    """Load the tokenizer and the model, placing the model on `device`."""
    self.name = "T5"
    self.device = device
    self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
    self.model = T5ForConditionalGeneration.from_pretrained('t5-large').to(device)

  def summarise(self, text):
    """Summarise one document.

    The text is prefixed with "summarize: " before encoding; generation uses
    beam search (4 beams) and produces between 20 and 50 tokens.

    :param text: document to summarise
    :return: decoded summary with special tokens removed
    """
    prompt = "summarize: " + text
    encoded = self.tokenizer.encode(prompt, return_tensors="pt", truncation = True)
    encoded = encoded.to(self.device)

    generation_kwargs = dict(num_beams=4,
                             no_repeat_ngram_size=2,
                             min_length=20,
                             max_length=50)
    summary_ids = self.model.generate(encoded, **generation_kwargs)

    return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# GPT2
class GPT2():
  """Zero-shot summariser: GPT-2 continues the article followed by " TL;DR:"."""

  def __init__(self,device):
    """Load the `gpt2` model and tokenizer, placing the model on `device`."""
    self.name = "GPT2"
    self.device = device
    self.model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
    self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

  def summarise(self,text):
    """Generate a summary of `text` as the continuation of a " TL;DR:" prompt.

    :param text: document to summarise; truncated to 971 tokens before the
        prompt suffix is appended
    :return: everything the decoded output contains after the last "TL;DR:"
    """
    # NOTE(review): 971 presumably leaves room for the suffix and the 50
    # generated tokens inside the model's context window - confirm.
    input_ids = self.tokenizer.encode(text, return_tensors = 'pt', truncation=True, max_length = 971)
    tldr = self.tokenizer.encode(" TL;DR:", return_tensors = 'pt')
    input_ids = torch.cat((input_ids,tldr),-1).to(self.device)
    prompt_length = input_ids.shape[-1]

    # Plain beam search. `temperature` and `top_p` were dropped because they
    # only take effect with do_sample=True, which was never set, so the output
    # is unchanged. pad_token_id is passed explicitly to avoid the
    # "Setting `pad_token_id` to `eos_token_id`" message on every call.
    beam_output = self.model.generate(
        input_ids,
        min_length = prompt_length + 20,
        max_length = prompt_length + 50,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=self.tokenizer.eos_token_id,
    )
    output = self.tokenizer.decode(beam_output[0], skip_special_tokens=True)

    # The decoded text still contains the prompt; keep only the continuation.
    return output.split("TL;DR:")[-1]

In [None]:
# XLNet

from transformers import XLNetTokenizer, XLNetLMHeadModel

class XLNet():
  """Zero-shot summariser: XLNet continues the article followed by " TL;DR:"."""

  def __init__(self,device):
    """Load the `xlnet-large-cased` tokenizer and model, placing the model on `device`."""
    self.name = "XLNet"
    self.device = device
    self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
    self.model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased').to(device)

  def summarise(self,text):
    """Generate a summary of `text` as the continuation of a " TL;DR:" prompt.

    :param text: document to summarise; truncated to 971 tokens before the
        prompt suffix is appended
    :return: everything the decoded output contains after the last "TL;DR:"
    """
    # add_special_tokens=False: by default the XLNet tokenizer appends its
    # separator/classifier tokens to every encoded sequence, which would leave
    # them in the middle and at the end of the concatenated prompt.
    input_ids = self.tokenizer.encode(text, return_tensors = 'pt', truncation=True,
                                      max_length = 971, add_special_tokens=False)
    tldr = self.tokenizer.encode(" TL;DR:", return_tensors = 'pt', add_special_tokens=False)
    input_ids = torch.cat((input_ids,tldr),-1).to(self.device)

    # Plain beam search. `temperature` and `top_p` were dropped because they
    # only take effect with do_sample=True, which was never set.
    beam_output = self.model.generate(input_ids,
                                max_length=input_ids.shape[-1] + 50,
                                num_beams=5,
                                no_repeat_ngram_size=2,
                                early_stopping=True
                                )

    output = self.tokenizer.decode(beam_output[0], skip_special_tokens=True)
    # The decoded text still contains the prompt; keep only the continuation.
    return output.split("TL;DR:")[-1]
  

In [None]:
# ProphetNet

from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration

class ProphetNet():
  """Summariser built on `microsoft/prophetnet-large-uncased`, primed with a decoder prefix."""

  def __init__(self,device):
    """Load the tokenizer and the model, placing the model on `device`."""
    self.name = "ProphetNet"
    self.device = device
    self.tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
    self.model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/prophetnet-large-uncased').to(self.device)

  def summarise(self,text):
    """Generate a summary of `text`.

    The decoder is started from the encoded prefix "To summarise", so the
    returned string begins with that prefix as decoded by the tokenizer.

    :param text: document to summarise (truncated by the tokenizer)
    :return: decoded output (prefix plus continuation), special tokens removed
    """
    input_ids = self.tokenizer(text, return_tensors="pt", truncation = True).input_ids
    # NOTE(review): the tokenizer may append special tokens to this decoder
    # prefix - confirm that this is the intended decoder input.
    decoder_input_ids = self.tokenizer("To summarise", return_tensors="pt").input_ids
    input_ids = input_ids.to(self.device)
    decoder_input_ids = decoder_input_ids.to(self.device)

    # Plain beam search. `temperature` and `top_p` were dropped because they
    # only take effect with do_sample=True, which was never set.
    beam_output = self.model.generate(input_ids,
                                decoder_input_ids = decoder_input_ids,
                                max_length= 50,
                                num_beams=5,
                                no_repeat_ngram_size=2,
                                early_stopping=True
                                )

    return self.tokenizer.decode(beam_output[0], skip_special_tokens=True)

In [None]:
# BART

from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

class BART():
  """Single-document summariser backed by `facebook/bart-large-cnn`."""

  def __init__(self,device):
    """Load the model and the tokenizer, placing the model on `device`."""
    self.name = "BART"
    self.device = device
    self.model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)
    self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

  def summarise(self, text, MDS = False):
    """Summarise one document with 4-beam search, at most 40 generated tokens.

    :param text: document to summarise; the input is truncated to 1024 tokens
    :param MDS: accepted but not used by this method
    :return: the decoded generated sequences joined with single spaces
    """
    encoded = self.tokenizer([text], max_length=1024, truncation = True, return_tensors='pt')
    encoded = encoded.to(self.device)
    summary_ids = self.model.generate(encoded['input_ids'], num_beams=4, max_length=40, early_stopping=True)

    decoded = []
    for sequence in summary_ids:
      decoded.append(self.tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=False))
    return ' '.join(decoded)

In [None]:
# LED

from transformers import LEDTokenizer, LEDForConditionalGeneration, LEDConfig

class LED():
  """Single-document summariser backed by `allenai/led-base-16384`."""

  def __init__(self,device):
    """Load the model and the tokenizer, placing the model on `device`."""
    self.name = "LED"
    self.device = device
    self.model = LEDForConditionalGeneration.from_pretrained('allenai/led-base-16384').to(self.device)
    self.tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')

  def summarise(self, text, MDS = False):
    """Summarise one document with 4-beam search, at most 50 generated tokens.

    :param text: document to summarise; the input is truncated to 1024 tokens
    :param MDS: accepted but not used by this method
    :return: the decoded generated sequences joined with single spaces
    """
    model_inputs = self.tokenizer([text], max_length=1024, truncation=True, return_tensors='pt').to(self.device)
    generated = self.model.generate(model_inputs['input_ids'], num_beams=4, max_length=50, early_stopping=True)
    decode_one = lambda ids: self.tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return ' '.join(map(decode_one, generated))

In [None]:
# Build every summariser once. Each constructor loads its pretrained tokenizer
# and weights and moves the model to `device`, so all seven models are held on
# that device at the same time.
prophetnet = ProphetNet(device)
gpt2 = GPT2(device)
xlnet = XLNet(device)
t5 = T5(device)
led = LED(device)
bart = BART(device)
pegasus = Pegasus(device)

In [None]:
# Registry of single-document summarisers, keyed by the label that is stored
# with every generated summary.
SDS_models = {
      "ProphetNet" : prophetnet,  # was misspelled "ProfetNet"
      "GPT2" : gpt2,
      "T5" : t5,
      "LED" : led,
      "BART" : bart,
      "PEGASUS" : pegasus,
      "XLNet" : xlnet
}

In [None]:
# Smoke test: run every single-document model on the same article. `text` is
# the longest article, set by the length-statistics cell above.
# Iterating over .values() avoids assigning to `_`, which IPython uses for the
# last cell output.
for model in SDS_models.values():
  print(model.name,"\n")
  print(model.summarise(text))
  print("-*-"*10,"\n")

ProphetNet 

to summarise the president ’ s week in iowa, sitting down for an exclusive interview with president donald trump, embedding him in the oval office for a day of meetings, greeting him for the first time in his life, and sitting
-*--*--*--*--*--*--*--*--*--*- 

GPT2 



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 If you want to be able to keep your health insurance, then you need to go through the process of getting a court order, which is a process that takes a long time. The process is very, very complicated. There are a number of things
-*--*--*--*--*--*--*--*--*--*- 

T5 

president's 30-hour visit to the white house aired on "20/20" on sunday, july 16. in the interview, president says he feels like this is his "great part of the country" president
-*--*--*--*--*--*--*--*--*--*- 

LED 

Over the course of two days, ABC News' chief anchor George Stephanopoulos spent 30 hours with President Donald Trump, flying on Air Force One to Iowa, traveling in his armored vehicle called “The Beast,” greeting him in his
-*--*--*--*--*--*--*--*--*--*- 

BART 

"President Trump - 30 Hours" airs on Sunday, June 16, 2019, at 8 p.m. ET. George Stephanopoulos spent 30 hours with President Donald Trump in Iowa.
-*--*--*--*--*--*--*--*--*--*- 

Pegasus 

President Donald Trump sat down for an exclusive intervie

## Setting up LDAMallet

In [None]:
!pip install --upgrade gensim

In [None]:
# install JAVA

import os       
def install_java():
  """Install headless OpenJDK 8 via apt-get, export JAVA_HOME and print the Java version.

  The apt-get output is redirected to /dev/null. JAVA_HOME is set in
  `os.environ`, so it applies to this process and the subprocesses it starts.
  """
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
# Run the installation as soon as the cell is executed.
install_java()

In [None]:
# Install Mallet

!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

## Compute MMR

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def maximal_marginal_relevance(sentence_vector, phrases, embedding_matrix, lambda_constant=0.5, threshold_terms=10):
    """
    Return ranked phrases using MMR. Cosine similarity is used as similarity measure.

    In each round the remaining candidate with the highest
    ``lambda_constant * sim(candidate, query) - (1 - lambda_constant) * max_sim(candidate, selected)``
    is moved to the result. Similarities to already selected phrases below 0
    are treated as 0.

    :param sentence_vector: Query vector
    :param phrases: list of candidate phrases as (phrase, score) pairs; the score only
        fixes the initial candidate order, which decides ties
    :param embedding_matrix: matrix having index as phrases and values as vector
        (rows are looked up with ``embedding_matrix.loc[phrase]``)
    :param lambda_constant: 0.5 to balance diversity and accuracy. if lambda_constant is high, then higher accuracy. If lambda_constant is low then high diversity.
    :param threshold_terms: maximum number of terms to include in result set
    :return: list of at most ``threshold_terms`` (phrase, mmr_score) pairs in selection order
    """
    selected = []
    # Candidates ordered by their supplied score, best first.
    remaining = [pair[0] for pair in sorted(phrases, key=lambda x: x[1], reverse=True)]

    # Relevance to the query never changes between rounds: compute it once.
    relevance = {
        phrase: cosine_similarity([sentence_vector], [embedding_matrix.loc[phrase]])[0][0]
        for phrase in remaining
    }
    # Highest similarity of each candidate to anything selected so far (floored at 0).
    redundancy = dict.fromkeys(remaining, 0)

    # Stop as soon as enough terms are selected; later rounds cannot change earlier picks.
    while remaining and len(selected) < threshold_terms:
        # Start from -inf so the best candidate is chosen even when every
        # MMR score is zero or negative.
        best_phrase, best_score = None, float("-inf")
        for phrase in remaining:
            mmr_score = lambda_constant * relevance[phrase] - (1 - lambda_constant) * redundancy[phrase]
            if mmr_score > best_score:
                best_phrase, best_score = phrase, mmr_score
        remaining.remove(best_phrase)
        selected.append((best_phrase, best_score))

        # Only the newly selected phrase can raise a candidate's redundancy.
        for phrase in remaining:
            similarity = cosine_similarity([embedding_matrix.loc[phrase]], [embedding_matrix.loc[best_phrase]])[0][0]
            if similarity > redundancy[phrase]:
                redundancy[phrase] = similarity
    return selected

## Generate Summaries

In [None]:
from experiments.evaluate import evaluate
from tqdm import tqdm
import pickle

# Generating summaries

# No multi-document models are defined in this notebook yet. Default to an
# empty registry so the single-document pass can run.
# TODO: register MDS models as {name: callable(list_of_articles) -> summary}.
try:
  MDS_models
except NameError:
  MDS_models = {}

summaries = []
# summary keys : type (MDS/SDS), model, index (None/article id), clusterId, summary, rouge, MMR_reduced

for cluster in tqdm(test_data):
  cluster_id = cluster["id"]
  reference = cluster["summary"]

  # MDS: one summary per model for the whole cluster
  for model_name, mds_model in MDS_models.items():
    summary = mds_model(cluster['articles'])
    summaries.append({
        "type": "MDS",
        "model": model_name,
        "index": None,
        "clusterId": cluster_id,
        "summary": summary,
        # NOTE(review): confirm the argument order expected by `evaluate`
        # (predictions vs. references); the original order is kept.
        "rouge": evaluate([summary], [reference]),
        "MMR_reduced": None,
    })

  # SDS: one summary per model and per article
  for model_name, sds_model in SDS_models.items():
    for article in cluster['articles']:
      # The model wrappers expose .summarise(text); they are not callable.
      summary = sds_model.summarise(article["text"])
      summaries.append({
          "type": "SDS",
          "model": model_name,
          "index": article["id"],  # assumes every article carries an "id" - TODO confirm
          "clusterId": cluster_id,
          "summary": summary,
          "rouge": evaluate([summary], [reference]),
          "MMR_reduced": None,
      })

# save the progress
utils.write_jsonl(summaries, "summaries.jsonl")

In [None]:
# Reload the generated summaries from disk so the following cells can be run
# without repeating the generation loop.
summaries = list(utils.read_jsonl("summaries.jsonl"))

## MMR

In [None]:
import experiments.sent_splitter as sent_splitter

# NOTE(review): the splitter instance is called like a function below -
# confirm that SentenceSplitter supports being called directly.
sentSplitter = sent_splitter.SentenceSplitter()

# MMR 1
# Intended first MMR pass: reduce each generated summary to its ranked sentences.
# TODO: the ranking itself is not implemented yet (see `output = None` below).
for summ in summaries:

  text = summ["summary"]
  # `sentences` is computed but not used until the MMR step is filled in.
  sentences = sentSplitter(text)

  # MMR
  # Placeholder: every record currently gets MMR_reduced = None.
  output = None # MMR ranked - least diverse
  summ["MMR_reduced"] = output

# save the progress
# Overwrites the file written by the generation cell.
utils.write_jsonl(summaries, "summaries.jsonl")

In [None]:
summaries = list(utils.read_jsonl("summaries.jsonl"))

# Unique cluster ids in first-seen order. A plain set() would make the order
# of the final output depend on hashing rather than on the data.
clusters = list(dict.fromkeys(c["clusterId"] for c in summaries))

## Final Summaries

In [None]:
# Generating final summary
# MMR 2

from collections import defaultdict

# Placeholder size of the final summary, used until MMR ranking is implemented.
TOP_N_SENTENCES = 5

# Index the inputs once instead of scanning both lists for every cluster.
summaries_by_cluster = defaultdict(list)
for summ in summaries:
  summaries_by_cluster[summ["clusterId"]].append(summ)

ground_truth_by_id = {}
for c in test_data:
  # setdefault keeps the first record if an id occurred more than once
  ground_truth_by_id.setdefault(c["id"], c["summary"])

final_summaries = []

for cid in clusters:
  d = {}
  d["clusterId"] = cid

  ground_truth = ground_truth_by_id[cid]

  cluster_sentences = []
  for summ in summaries_by_cluster[cid]:
    # Fall back to the unreduced summary while the first MMR pass still
    # stores None in "MMR_reduced".
    text = summ["MMR_reduced"] or summ["summary"]
    cluster_sentences.extend(sentSplitter(text))

  # MMR
  # TODO: rank `cluster_sentences` with maximal_marginal_relevance once
  # sentence embeddings are available. Until then the first TOP_N_SENTENCES
  # candidates are kept so the cell runs end to end; this is a placeholder,
  # not an MMR selection.
  topn_sentences = cluster_sentences[:TOP_N_SENTENCES]

  # pick top n sentences
  final_summary = ' '.join(topn_sentences)
  Rouge_score = evaluate([final_summary], [ground_truth])

  d["summary"] = final_summary
  d["rouge"] = Rouge_score

  final_summaries.append(d)

# save the progress
utils.write_jsonl(final_summaries, "finalsummaries.jsonl")