# SUPERT 

Example code for computing the SUPERT metric and for using it as the reward function of an RL-based summariser.

- Paper: https://arxiv.org/abs/2005.03724
- Git-Project: https://github.com/danieldeutsch/SUPERT

In [1]:
import sys
sys.path.append("SUPERT")
import time
import tracemalloc 
import numpy as np

In [2]:
# to get the reader working
# Fetches the NLTK 'punkt' and 'stopwords' packages; per the recorded output
# the calls only report "already up-to-date" when the packages are present.
# The value of the last call is what the cell displays as its result.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dwarakvittal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dwarakvittal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from ref_free_metrics.supert import Supert
from utils.data_reader import CorpusReader
from utils.evaluator import evaluate_summary_rouge, add_result

In [4]:
# read docs and summaries
# NOTE: the data path is relative, so it resolves against the directory the
# notebook is run from.
# source_docs feeds every Supert scorer below, summaries are the texts being
# scored, and refs are the references used by the ROUGE cells further down.
reader = CorpusReader('SUPERT/data/topic_1')
source_docs = reader()
summaries = reader.readSummaries() 
refs = reader.readReferences()

# Evaluate Summaries

### BERT

In [5]:
# Start tracing memory allocations for the BERT run; the counters are read
# and tracing is stopped in the cell after the scoring cell.
tracemalloc.start()      

In [6]:
# The timer spans both building the Supert scorer and scoring the summaries,
# so whatever setup Supert(...) performs is included in the measured time.
bert_start = time.perf_counter()
# compute the Supert scores
# (`supert` is reused later as the reward function of the BERT RL section)
supert = Supert(docs=source_docs, sentence_transformer='bert-base-nli-stsb-mean-tokens')
bert_scores = supert(summaries)
# NOTE: despite its name, bert_end holds the elapsed seconds, not an end timestamp.
bert_end = time.perf_counter() - bert_start

[{'idx': 0, 'name': '0', 'path': '0_BERT', 'type': 'sentence_transformers.models.BERT'}, {'idx': 1, 'name': '1', 'path': '1_Pooling', 'type': 'sentence_transformers.models.Pooling'}]


In [7]:
# Read tracemalloc's current/peak counters (bytes), end tracing, then report
# both values divided by 10**6.
# NOTE(review): tracemalloc only tracks allocations made through Python's
# memory allocators; memory a model allocates natively is presumably not
# counted -- confirm before comparing models on these figures.
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")

Current memory usage is 19.27694MB; Peak was 21.71365MB


In [8]:
# Show the Supert scores and the elapsed seconds measured in the timing cell.
# NOTE(review): the recorded output lists the same score twice -- confirm the
# entries in `summaries` are actually distinct.
print(bert_scores)
print("BERT calc time:", bert_end, "sec")

[0.8140881996572805, 0.8140881996572805]
BERT calc time: 23.014478084 sec


### RoBERTa

In [9]:
# Start tracing memory allocations for the RoBERTa run; the previous trace
# was ended by tracemalloc.stop() in the BERT section.
tracemalloc.start()      

In [10]:
# Same measurement as the BERT section: the timer covers building the scorer
# and scoring the summaries together.
roberta_start = time.perf_counter()
# (`roberta_supert` is reused later as the reward function of the RoBERTa RL section)
roberta_supert = Supert(docs=source_docs, sentence_transformer='roberta-base-nli-stsb-mean-tokens')
roberta_scores = roberta_supert(summaries)
# NOTE: despite its name, roberta_end holds the elapsed seconds, not an end timestamp.
roberta_end = time.perf_counter() - roberta_start

[{'idx': 0, 'name': '0', 'path': '0_Transformer', 'type': 'sentence_transformers.models.RoBERTa'}, {'idx': 1, 'name': '1', 'path': '1_Pooling', 'type': 'sentence_transformers.models.Pooling'}]


In [11]:
# Read tracemalloc's current/peak counters (bytes), end tracing, then report
# both values divided by 10**6.
# NOTE(review): tracemalloc only tracks allocations made through Python's
# memory allocators; memory a model allocates natively is presumably not
# counted -- confirm before comparing models on these figures.
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")

Current memory usage is 40.437305MB; Peak was 44.87028MB


In [12]:
# Show the Supert scores and the elapsed seconds measured in the timing cell.
# NOTE(review): the recorded output lists the same score twice -- confirm the
# entries in `summaries` are actually distinct.
print(roberta_scores)
print("RoBERTa calc time:", roberta_end, "sec")

[0.8320928335930919, 0.8320928335930919]
RoBERTa calc time: 22.108549291999996 sec


### DistilBERT

In [14]:
# Start tracing memory allocations for the DistilBERT run; the previous trace
# was ended by tracemalloc.stop() in the RoBERTa section.
tracemalloc.start()

In [15]:
# Same measurement as the BERT section: the timer covers building the scorer
# and scoring the summaries together.
distilbert_start = time.perf_counter()
# (`distilbert_supert` is reused later as the reward function of the DistilBERT RL section)
distilbert_supert = Supert(docs=source_docs, sentence_transformer='distilbert-base-nli-stsb-mean-tokens')
distilbert_scores = distilbert_supert(summaries)
# NOTE: despite its name, distilbert_end holds the elapsed seconds, not an end timestamp.
distilbert_end = time.perf_counter() - distilbert_start

[{'idx': 0, 'name': '0', 'path': '0_Transformer', 'type': 'sentence_transformers.models.DistilBERT'}, {'idx': 1, 'name': '1', 'path': '1_Pooling', 'type': 'sentence_transformers.models.Pooling'}]


In [16]:
# Read tracemalloc's current/peak counters (bytes), end tracing, then report
# both values divided by 10**6.
# NOTE(review): tracemalloc only tracks allocations made through Python's
# memory allocators; memory a model allocates natively is presumably not
# counted -- confirm before comparing models on these figures.
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")

Current memory usage is 18.528955MB; Peak was 20.969932MB


In [17]:
# Show the Supert scores and the elapsed seconds measured in the timing cell.
# NOTE(review): the recorded output lists the same score twice -- confirm the
# entries in `summaries` are actually distinct.
print(distilbert_scores)
print("DistilBERT calc time:", distilbert_end, "sec")

[0.832189884651843, 0.832189884651843]
DistilBERT calc time: 11.591048790999992 sec


# Create Summaries with RL

### RL summariser class (taken from another Python script)

In [18]:
from ref_free_metrics.supert import Supert
from summariser.ngram_vector.vector_generator import Vectoriser
from summariser.deep_td import DeepTDAgent as RLAgent
from utils.data_reader import CorpusReader
from utils.evaluator import evaluate_summary_rouge, add_result

class RLSummarizer():
    """Produce a summary by training an RL agent against a supplied reward function.

    The constructor only stores configuration; `summarize` does the work.
    """

    def __init__(self,reward_func, reward_strict=5.,rl_strict=5.,train_episode=5000, base_length=200, sample_summ_num=5000, gpu=False):
        """Store the reward function and the tuning parameters.

        Args:
            reward_func: callable that takes a list of summaries and returns
                one reward per summary (the length is checked in
                `get_sample_summaries`).
            reward_strict: stored as-is; no method of this class reads it.
            rl_strict: forwarded to the agent as `strict_para`.
            train_episode: forwarded to the agent as `train_round`.
            base_length: forwarded to the Vectoriser as `base` in `summarize`.
            sample_summ_num: how many random summaries to request for sampling.
            gpu: forwarded to the agent as `gpu`.
        """
        # What gets optimised.
        self.reward_func = reward_func
        self.reward_strict = reward_strict
        # Sampling / vectoriser settings.
        self.sample_summ_num = sample_summ_num
        self.base_length = base_length
        # Agent settings.
        self.rl_strict = rl_strict
        self.train_episode = train_episode
        self.gpu = gpu

    def get_sample_summaries(self, docs, summ_max_len=100):
        """Sample random summaries from `docs` and score them with the reward function.

        Args:
            docs: source documents, passed straight to the Vectoriser.
            summ_max_len: second positional argument of the Vectoriser.

        Returns:
            A `(summaries, rewards)` pair of equal length.

        Raises:
            AssertionError: if the reward function returns a different number
                of rewards than there are sampled summaries.
        """
        sampler = Vectoriser(docs,summ_max_len)
        candidates = sampler.sample_random_summaries(self.sample_summ_num)
        scores = self.reward_func(candidates)
        assert len(candidates) == len(scores)
        return candidates, scores

    def summarize(self, docs, summ_max_len=100):
        """Train an agent on sampled summaries and return what the agent produces.

        Args:
            docs: source documents.
            summ_max_len: only used when sampling the summaries.

        Returns:
            The value returned by calling the trained agent with the rewards.
        """
        # The sampled summaries and their rewards are what the agent is built from and called with.
        replay_summaries, replay_rewards = self.get_sample_summaries(docs, summ_max_len)
        agent = RLAgent(
            Vectoriser(docs,base=self.base_length),
            replay_summaries,
            strict_para=self.rl_strict,
            train_round=self.train_episode,
            gpu=self.gpu,
        )
        return agent(replay_rewards)

### BERT

In [19]:
# generate summaries using reinforcement learning, with supert as reward function
rl_summarizer = RLSummarizer(reward_func = supert, gpu = False)
summary = rl_summarizer.summarize(source_docs, summ_max_len=100)

# print out the generated summary
print(summary)

generating samples for memory replay: 100%|██████████| 5000/5000 [00:06<00:00, 729.42it/s]
neural-rl training episodes: 100%|██████████| 5000/5000 [00:39<00:00, 127.18it/s]


Juan Manuel Alvarez, 25, of Compton, the driver of the Jeep Grand Cherokee, was found near the wreckage and arrested on suspicion of homicide. Until Wednesday, Juan Manuel Alvarez was living the average life of an obscure and troubled man. ``These people were just going to work.'' He was told she was taken to the hospital and released. The charges include the special circumstances of using the train, which could make the man, Juan M. Alvarez, eligible for the death penalty. ``Nothing of this magnitude has happened (before) in Glendale,'' he said. As he watched, southbound Train No. ``It was.


In [20]:
# Score the current `summary` against every reference; add_result gathers the
# per-reference results into rouge_scores.
rouge_scores = {}
for ref in refs:
    add_result(rouge_scores, evaluate_summary_rouge(summary, ref))

# Print the mean of the collected values for four metrics, then the full dict.
rouge_names = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-SU4']
for metric in rouge_names:
    mean_score = np.mean(rouge_scores[metric])
    print(metric, mean_score)
print(rouge_scores)

execute_rouge command is perl /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge/ROUGE-RELEASE-1.5.5/ROUGE-1.5.5.pl -e /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge/ROUGE-RELEASE-1.5.5/data -n 4 -m -c 95 -r 1000 -f A -p 0.5 -t 0 -a -2 -4 -u -l 100 -a /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge_temp_files/69966-1628935528.03852/config.xml
b'---------------------------------------------\n1 ROUGE-1 Average_R: 0.29000 (95%-conf.int. 0.29000 - 0.29000)\n1 ROUGE-1 Average_P: 0.29000 (95%-conf.int. 0.29000 - 0.29000)\n1 ROUGE-1 Average_F: 0.29000 (95%-conf.int. 0.29000 - 0.29000)\n---------------------------------------------\n1 ROUGE-2 Average_R: 0.06061 (95%-conf.int. 0.06061 - 0.06061)\n1 ROUGE-2 Average_P: 0.06061 (95%-conf.int. 0.06061 - 0.06061)\n1 ROUGE-2 Average_F: 0.06061 (95%-conf.int. 0.06061 - 0.06061)\n---------------------------------------------\n1 ROUGE-3 Average_R: 0.01020 (95%-conf.int. 0.01020 - 0.01020)\n1 ROUGE-3 Average_P: 

### RoBERTa

In [21]:
# generate summaries using reinforcement learning, with supert as reward function
rl_summarizer = RLSummarizer(reward_func = roberta_supert, gpu = False)
summary = rl_summarizer.summarize(source_docs, summ_max_len=100)

# print out the generated summary
print(summary)

generating samples for memory replay: 100%|██████████| 5000/5000 [00:06<00:00, 719.43it/s]
neural-rl training episodes: 100%|██████████| 5000/5000 [00:39<00:00, 127.03it/s]


Distraught and remorseful, Alvarez told police he had left the vehicle and watched the derailment, Adams said. About 60 people were treated at the scene and released. The Los Angeles Police Department declared a citywide tactical alert that allows commanders to keep officers on the job beyond the end of their shifts, in order to help with the rescue and crowd control operations. On Thursday, the National Transportation Safety Board said it would not conduct a full investigation of the crash because it appeared to be the result of a criminal act. The car in front of us went sideways.


In [22]:
# Score the current `summary` against every reference; add_result gathers the
# per-reference results into rouge_scores.
rouge_scores = {}
for ref in refs:
    add_result(rouge_scores, evaluate_summary_rouge(summary, ref))

# Print the mean of the collected values for four metrics, then the full dict.
rouge_names = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-SU4']
for metric in rouge_names:
    mean_score = np.mean(rouge_scores[metric])
    print(metric, mean_score)
print(rouge_scores)

execute_rouge command is perl /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge/ROUGE-RELEASE-1.5.5/ROUGE-1.5.5.pl -e /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge/ROUGE-RELEASE-1.5.5/data -n 4 -m -c 95 -r 1000 -f A -p 0.5 -t 0 -a -2 -4 -u -l 100 -a /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge_temp_files/02545-1628935787.6957371/config.xml
b'---------------------------------------------\n1 ROUGE-1 Average_R: 0.31000 (95%-conf.int. 0.31000 - 0.31000)\n1 ROUGE-1 Average_P: 0.31000 (95%-conf.int. 0.31000 - 0.31000)\n1 ROUGE-1 Average_F: 0.31000 (95%-conf.int. 0.31000 - 0.31000)\n---------------------------------------------\n1 ROUGE-2 Average_R: 0.02020 (95%-conf.int. 0.02020 - 0.02020)\n1 ROUGE-2 Average_P: 0.02020 (95%-conf.int. 0.02020 - 0.02020)\n1 ROUGE-2 Average_F: 0.02020 (95%-conf.int. 0.02020 - 0.02020)\n---------------------------------------------\n1 ROUGE-3 Average_R: 0.00000 (95%-conf.int. 0.00000 - 0.00000)\n1 ROUGE-3 Average_P

### DistilBERT

In [23]:
# generate summaries using reinforcement learning, with supert as reward function
rl_summarizer = RLSummarizer(reward_func = distilbert_supert, gpu = False)
summary = rl_summarizer.summarize(source_docs, summ_max_len=100)

# print out the generated summary
print(summary)

generating samples for memory replay: 100%|██████████| 5000/5000 [00:06<00:00, 717.36it/s]
neural-rl training episodes: 100%|██████████| 5000/5000 [00:38<00:00, 129.04it/s]


The SUV was hit shortly after 6 a.m. Pacific time by a southbound Metrolink train, which then went off the rails and started a deadly chain reaction, authorities said. ``Nothing of this magnitude has happened (before) in Glendale,'' he said. Eleven people died and about 180 were injured in the crash at 6:02 a.m. Wednesday. With his tire, apparently caught between the tracks, Mr. Alvarez jumped out of the Jeep and ran. A fire broke out, and 5,000 gallons of diesel fuel spilled. ( Begin optional trim) Glendale Memorial Hospital treated 13 passengers. At least one train car caught fire.


In [24]:
# Disabled override: uncommenting the next line replaces the RL-generated `summary` with this fixed text before the ROUGE cell below runs.
#summary = "Alvarez is likely to face at least 10 counts of murder, the chief said. The driver of the SUV, identified as Juan Manuel Alvarez, 25, of Compton, Calif., was taken into custody, and police said he would be charged with homicide. A counterterrorism command post was set up near the crash site, but authorities determined within an hour that the disaster was almost certainly the work of one troubled man. He said he got on train 100 at 5:19 a.m. in Simi Valley. ``This is a complete outrage.'' He was told she was taken to the hospital and released."

In [25]:
# Score the current `summary` against every reference; add_result gathers the
# per-reference results into rouge_scores.
rouge_scores = {}
for ref in refs:
    add_result(rouge_scores, evaluate_summary_rouge(summary, ref))

# Print the mean of the collected values for four metrics, then the full dict.
rouge_names = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-SU4']
for metric in rouge_names:
    mean_score = np.mean(rouge_scores[metric])
    print(metric, mean_score)
print(rouge_scores)

execute_rouge command is perl /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge/ROUGE-RELEASE-1.5.5/ROUGE-1.5.5.pl -e /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge/ROUGE-RELEASE-1.5.5/data -n 4 -m -c 95 -r 1000 -f A -p 0.5 -t 0 -a -2 -4 -u -l 100 -a /Users/dwarakvittal/Documents/Projekte/NLP_Metric/SUPERT/rouge_temp_files/53489-1628935957.246093/config.xml
b'---------------------------------------------\n1 ROUGE-1 Average_R: 0.26000 (95%-conf.int. 0.26000 - 0.26000)\n1 ROUGE-1 Average_P: 0.25243 (95%-conf.int. 0.25243 - 0.25243)\n1 ROUGE-1 Average_F: 0.25616 (95%-conf.int. 0.25616 - 0.25616)\n---------------------------------------------\n1 ROUGE-2 Average_R: 0.03030 (95%-conf.int. 0.03030 - 0.03030)\n1 ROUGE-2 Average_P: 0.02941 (95%-conf.int. 0.02941 - 0.02941)\n1 ROUGE-2 Average_F: 0.02985 (95%-conf.int. 0.02985 - 0.02985)\n---------------------------------------------\n1 ROUGE-3 Average_R: 0.00000 (95%-conf.int. 0.00000 - 0.00000)\n1 ROUGE-3 Average_P: