<a href="https://colab.research.google.com/github/NileshPranami/SciBert/blob/master/Scientific_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setting up Runtime**

In [0]:
import os
import sys

if not os.path.exists('old_runtime'):
    !touch old_runtime
    !wget https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip > /dev/null
    !unzip scisummnet_release1.1__20190413.zip > /dev/null
    !git clone https://github.com/WING-NUS/scisumm-corpus.git > /dev/null
    !pip install -U sentence-transformers
    
    !pip install allennlp==1.0.0rc1 allennlp-models==1.0.0rc1 > /dev/null
    !pip install rouge > /dev/null
    
    !wget https://storage.googleapis.com/allennlp-public-models/snli-roberta-large-2020.02.27.tar.gz
    
    !git clone https://github.com/dandynaufaldi/particle-swarm-optimized-clustering pso
    !touch /content/pso/__init__.py
    !sudo apt-get install libxml-parser-perl
    
    # downloading GoogleNews-vectors-negative300.bin.gz
    !FILEID=0B7XkCwpI5KDYNlNUTTlSS21pQmM && FILENAME=GoogleNews-vectors-negative300.bin.gz && wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id='$FILEID -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id="$FILEID -O $FILENAME && rm -rf /tmp/cookies.
    # !wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz -o GoogleNews-vectors-negative300.bin.gz

    !git clone https://github.com/tagucci/pythonrouge.git

if not 'path_set_done' in os.environ:
    os.environ['path_set_done'] = 'True'
    sys.path.append("/content/pso")
    sys.path.append('/content/pythonrouge')
    sys.path.append('/content/pythonrouge/pythonrouge/RELEASE-1.5.5/ROUGE-1.5.5.pl')
    sys.path.append('/content/pythonrouge/pythonrouge/RELEASE-1.5.5/data')
    os.environ['PYTHONPATH'] = f"{os.environ['PYTHONPATH']}:/content/pso"

# sys.path
# os.environ['PYTHONPATH']

# **Import Packages**

In [0]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.nli
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup as bs
import tqdm
import torch
import glob
import sys

from sentence_transformers import SentenceTransformer

In [0]:
from pso.pso import ParticleSwarmOptimizedClustering
from particle import quantization_error, calc_sse
from utils import normalize
# from kmeans import KMeans
from sklearn.cluster import KMeans

In [0]:
from pythonrouge.pythonrouge import Pythonrouge
from rouge import Rouge

In [0]:
#  os.environ['CUDA_VISIBLE_DEVICES']='0'
#  os.environ['CUDA_VISIBLE_DEVICE']='0'

# **Downloading models**

In [0]:
# nltk.download('stopwords')
# nltk.download('punkt')

# **Generate Embeddings**

In [17]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

files = list(glob.glob("/content/scisumm-corpus/data/*/*/Reference_XML/*xml"))
# ===================
# files = files[:5] #
# ===================
out_dir = "/content/embeddnings"
if not os.path.exists(out_dir): os.mkdir(out_dir)

embd_xml = {}

for file in tqdm.tqdm(files):
    with open(file, encoding='latin') as f:
        soup = bs(f.read())
        sentences = list(map(lambda x: x.text, soup.find_all("s")))
        embd = np.array(model.encode(sentences))

        embd_file = os.path.join(out_dir, os.path.basename(file).replace(".xml", ".npy"))
        embd_xml[embd_file] = file
        
    with open(embd_file, "wb") as nf:
        np.save(nf, embd)


100%|██████████| 130/130 [03:35<00:00,  1.66s/it]


In [0]:
embds = '/content/embeddnings/E09-2008.npy'
xml_file = embd_xml[embds]

with open(xml_file) as f:
    xml = f.read()
    sentences = list(map(lambda x: x.text, bs(xml).find_all("s")))

data = np.load(embds)
n_clusters = data.shape[0] // 20

In [9]:
data.shape

(63, 768)

In [0]:
summ_files = list(glob.glob(os.path.join(os.path.dirname(os.path.dirname(xml_file)), "summary") + "/*"))

In [0]:
km = KMeans(n_clusters=n_clusters)
km.fit(data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [0]:
pred = km.predict(data)
pred

array([1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 0, 1, 0, 0],
      dtype=int32)

In [0]:
pso = ParticleSwarmOptimizedClustering(n_cluster=n_clusters, n_particles=10, data=data,
             hybrid=True, max_iter=2000, print_debug=50)
hist = pso.run()

Initial global best score 44.85419527689616
Iteration 0001/2000 current gbest score 44.854195276896156486
Iteration 0051/2000 current gbest score 43.848040898640952889
Iteration 0101/2000 current gbest score 43.834046045939125236
Iteration 0151/2000 current gbest score 43.834046045939125236
Iteration 0201/2000 current gbest score 43.834040959676109139
Iteration 0251/2000 current gbest score 43.834035873413085938
Iteration 0301/2000 current gbest score 43.832684199015297111
Iteration 0351/2000 current gbest score 43.831001281738281250
Iteration 0401/2000 current gbest score 43.831000010172523673
Iteration 0451/2000 current gbest score 43.831000010172523673
Iteration 0501/2000 current gbest score 43.831000010172523673
Iteration 0551/2000 current gbest score 43.831000010172523673
Iteration 0601/2000 current gbest score 43.831000010172523673
Iteration 0651/2000 current gbest score 43.830999374389648438
Iteration 0701/2000 current gbest score 43.830999374389648438
Iteration 0751/2000 curren

# **Helper functions**

In [0]:
def fill_upper(df):
    l = len(df)
    for i in range(l):
        for j in range(i, l):
            df.loc[i][j] = df.loc[j][i]
    return df

def n_sum(n):
    if (n < 1):
        return 0
    return (n * (n + 1)) // 2
    
def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0: 
       return v
    return v / norm

In [0]:
def get_document_center(df_wmd):
    return df_wmd.sum(axis=1).idxmin()

def get_cluster_center(df_wmd, cluster_idx):
    cluster_idx.sort()
    df_wmd_cluster = df_wmd.iloc[cluster_idx, cluster_idx]
    return df_wmd_cluster.sum(axis=1).idxmin()

# **WMD Matrix**

In [0]:
import gensim 

In [0]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True) 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
def generate_wmd_matrix_2(sentences):
    print("Generating WMD matrix...\n\n")
    l = len(sentences)
    data = [{}]
    row = 0
    for i in tqdm.tqdm_notebook(range(n_sum(l))):
        col = (i - n_sum(row)) % (row + 1)
        if (row == col):
            data[row][col] = 0
            data.append({})
            row += 1
            continue
        data[row][col] = data[col][row] = model.wmdistance(sentences[row], sentences[col])
    return pd.DataFrame.from_dict(data[:-1])

In [0]:
def generate_wmd_matrix(sentences):
    print("Generating WMD matrix...\n\n")
    l = len(sentences)
    df = pd.DataFrame(index=list(range(l)), columns=list(range(l)))
    row = 0
    for i in tqdm.tqdm_notebook(range(n_sum(l))):
        col = (i - n_sum(row)) % (row + 1)
        if (row == col):
            df.loc[row][col] = 0
            row += 1
            continue
        df.loc[row][col] = df.loc[col][row] = model.wmdistance(sentences[row], sentences[col])
    return df

In [0]:
df_wmd = generate_wmd_matrix_2(sentences)
# df_wmd

Generating WMD matrix...




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=2016.0), HTML(value='')))




In [0]:
del model

In [0]:
doc_center = get_document_center(df_wmd)
doc_center

53

In [0]:
def sort_clusters_ids(cluster_ids, df_wmd):
    cluster_df = df_wmd.iloc[cluster_ids, cluster_ids]
    sorted_val = cluster_df.sum().sort_values()
    return sorted_val

In [0]:
def get_cluster_ranking(n_clusters=n_clusters, pred=pred, df_wmd=df_wmd):
    clusters_ids = []
    for i in range(n_clusters):
        sorted_ids = sort_clusters_ids([k for k in range(len(pred)) if pred[k]==i], df_wmd)
        clusters_ids.append((list(sorted_ids.index), sorted_ids.values[0]))
    clusters_ids = dict(enumerate((map(lambda x: x[0], sorted(clusters_ids, key=lambda x: x[1])))))
    return clusters_ids

In [0]:
clusters_ids = get_cluster_ranking()

In [0]:
print(clusters_ids)

{0: [48, 9, 51, 18, 47, 17, 52, 45, 7, 3, 56, 16, 55], 1: [53, 42, 8, 12, 15, 30, 28, 5, 44, 29, 4, 33, 34, 26, 1, 14, 13, 46, 25, 50, 0, 54, 6, 43, 49, 11, 31, 2, 60, 10], 2: [23, 27, 20, 58, 61, 62, 19, 59, 37, 41, 40, 22, 24, 36, 32, 57, 39, 38, 35, 21]}


# **AllenNLP**

In [0]:
predictor = Predictor.from_path("snli-roberta-large-2020.02.27.tar.gz", predictor_name="textual-entailment", cuda_device=0)
# predictor.predict(hypothesis="Two women are sitting on a blanket near some rocks talking about politics.",
#                   premise="Two women are wandering along the shore drinking iced tea.")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [0]:
def pos_entail(sen1, sen2):
    out = predictor.predict(sen1, sen2)
    return out['probs'][0]

def contradiction(sen1, sen2):
    out = predictor.predict(sen1, sen2)
    return out['probs'][1]

def generate_contradiction_matrix(sentences):
    print("Generating contradiction matrix...\n\n")

    l = len(sentences)
    data = [{}]
    row = 0
    for i in tqdm.tqdm_notebook(range(n_sum(l))):
        col = (i - n_sum(row)) % (row + 1)
        if (row == col):
            data[row][col] = 0
            data.append({})
            row += 1
            continue
        data[row][col] = data[col][row] = contradiction(sentences[row], sentences[col])
    return pd.DataFrame.from_dict(data[:-1])

In [0]:
df_cont = generate_contradiction_matrix(sentences)

Generating contradiction matrix...




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=2016.0), HTML(value='')))




# **Summary Generation**

* **Given Summaries**

In [0]:
i = 0
for f in summ_files:
    with open(f) as sf:
        print(f, i)
        exec(f"summ_{i} = bs(sf.read()).text")
        exec(f"print(summ_{i})")
        i += 1
        print("="*100)

/content/scisumm-corpus/data/Training-Set-2017/E09-2008/summary/E09_2008.abstract.txt 0
Foma: a finite-state compiler and library
Foma is a compiler, programming language, and C library for constructing finite-state automata and transducers for various uses.
It has specific support for many natural language processing applications such as producing morphological and phonological analyzers.
Foma is largely compatible with the Xerox/PARC finite-state toolkit.
It also embraces Unicode fully and supports various different formats for specifying regular expressions: the Xerox/PARC format, a Perl-like format, and a mathematical format that takes advantage of the ‘Mathematical Operators’ Unicode block.
/content/scisumm-corpus/data/Training-Set-2017/E09-2008/summary/E09-2008.community.txt 1
Foma is largely compatible with the Xerox/PARC finite-state toolkit.
Foma is a finite-state compiler, programming language, and regular expression/finite-state library designed for multipurpose use with exp

In [0]:
ratio_sen_summ = .1
sen_length = len(sentences)

def check_limit_condition(summ_sentences_idx, sen_length=sen_length, ratio_sen_summ=ratio_sen_summ):
    return len(summ_sentences_idx) < len(sentences) * ratio_sen_summ

* **Summary 1**

Cluster centeres and sentences close to cluster center has been selected to generate summary

In [0]:
print(clusters_ids)

{0: [48, 9, 51, 18, 47, 17, 52, 45, 7, 3, 56, 16, 55], 1: [53, 42, 8, 12, 15, 30, 28, 5, 44, 29, 4, 33, 34, 26, 1, 14, 13, 46, 25, 50, 0, 54, 6, 43, 49, 11, 31, 2, 60, 10], 2: [23, 27, 20, 58, 61, 62, 19, 59, 37, 41, 40, 22, 24, 36, 32, 57, 39, 38, 35, 21]}


In [0]:
summ_sentences_idx = set()

i = 0
while (check_limit_condition(summ_sentences_idx)):
    summ_sentences_idx.add(clusters_ids[i % n_clusters][i // n_clusters])
    i += 1

summ_sentences_idx = sorted(summ_sentences_idx)

generated_summary_1 = "\n".join(map(lambda i: sentences[i], summ_sentences_idx))
print(generated_summary_1)

Foma is licensed under the GNU general public license: in keeping with traditions of free software, the distribution that includes the source code comes with a user manual and a library of examples.
N/A N/A ‘ignores’, left quotient, right quotient, ‘inside’ quotient ∈ ∈/ = /= N/A language membership, position equivalence ≺ < > precedes, follows ∨ ∪ ∧ ∩ - .P. .p. | & − .P. .p. union, intersection, set minus, priority unions => -> (->) @-> => -> (->) @-> context restriction, replacement rules <> shuffle (asynchronous product) × ◦ .x. .o. cross-product, composition Table 1: The regular expressions available in Foma from highest to lower precedence.
For instance, suppose we have defined an arbitrary regular language L, and want to further define a language that contains only one factor of L, we can do so by: OneL = (∃x)(x ∈ L ∧ (∃y)(y ∈ L ∧ (x = y))); Here, quantifiers apply to substrings, and we attribute the usual meaning to ∈ and ∧, and a kind of concatenative meaning to the predicate S

* **Summary 2**  
Cluster centres and sentences contradicting sentences has been selected.

In [0]:
def get_contr_sorted_ids(clusters_ids, df_cont):
    new_clusters_ids = {}
    for i in clusters_ids:
        new_clusters_ids[i] = (list(df_cont.iloc[clusters_ids[i][1:], [clusters_ids[i][0]]].sort_values([clusters_ids[i][0]], ascending=False).index), clusters_ids[i][0])
    return new_clusters_ids

In [0]:
clusters_ids2 = get_contr_sorted_ids(clusters_ids, df_cont)

In [0]:
print(clusters_ids2)

{0: ([16, 17, 45, 3, 47, 18, 52, 51, 56, 55, 9, 7], 48), 1: ([46, 42, 25, 50, 31, 15, 34, 29, 12, 26, 13, 11, 4, 33, 28, 30, 6, 0, 2, 1, 14, 43, 8, 54, 5, 44, 60, 49, 10], 53), 2: ([38, 59, 39, 61, 41, 21, 37, 35, 58, 40, 32, 62, 36, 24, 57, 27, 22, 19, 20], 23)}


In [0]:
summ_sentences_idx = set()

i = 0
while (check_limit_condition(summ_sentences_idx)):
    summ_sentences_idx.add(clusters_ids2[i][1])
    i += 1
    if (i == n_clusters): break

i = 0
while (check_limit_condition(summ_sentences_idx)):
    summ_sentences_idx.add(clusters_ids2[i % n_clusters][0][i // n_clusters])
    i += 1
summ_sentences_idx = sorted(summ_sentences_idx)

generated_summary_2 = "\n".join(map(lambda i: sentences[i], summ_sentences_idx))
print(generated_summary_2)

Unicode (UTF8) is fully supported and is in fact the only encoding accepted by Foma.
It has been successfully compiled on Linux, Mac OS X, and Win32 operating systems, and is likely to be portable to other systems without much effort.
N/A N/A ‘ignores’, left quotient, right quotient, ‘inside’ quotient ∈ ∈/ = /= N/A language membership, position equivalence ≺ < > precedes, follows ∨ ∪ ∧ ∩ - .P. .p. | & − .P. .p. union, intersection, set minus, priority unions => -> (->) @-> => -> (->) @-> context restriction, replacement rules <> shuffle (asynchronous product) × ◦ .x. .o. cross-product, composition Table 1: The regular expressions available in Foma from highest to lower precedence.
5. network = fsm_regex("a+ b+"); 6.
Operations such as unions of automata are also constructed by default with the product construction method that directly produces deterministic automata.
Though the main concern with Foma has not been that of efficiency, but of compatibility and extendibility, from a useful

* **Summary 3**

Sentences with minimun sentences id has been selected

In [0]:
for clid in clusters_ids:
    clusters_ids[clid] = sorted(clusters_ids[clid])
print(clusters_ids)

{0: [3, 7, 9, 16, 17, 18, 45, 47, 48, 51, 52, 55, 56], 1: [0, 1, 2, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 25, 26, 28, 29, 30, 31, 33, 34, 42, 43, 44, 46, 49, 50, 53, 54, 60], 2: [19, 20, 21, 22, 23, 24, 27, 32, 35, 36, 37, 38, 39, 40, 41, 57, 58, 59, 61, 62]}


In [0]:
summ_sentences_idx = set()

i = 0
while (check_limit_condition(summ_sentences_idx)):
    summ_sentences_idx.add(clusters_ids[i % n_clusters][i // n_clusters])
    i += 1

summ_sentences_idx = sorted(summ_sentences_idx)

generated_summary_3 = "\n".join(map(lambda i: sentences[i], summ_sentences_idx))
print(generated_summary_3)

Foma: a finite-state compiler and library
Foma is a compiler, programming language, and C library for constructing finite-state automata and transducers for various uses.
Foma is largely compatible with the Xerox/PARC finite-state toolkit.
One of Foma’s design goals has been compatibility with the Xerox/PARC toolkit.
Foma is licensed under the GNU general public license: in keeping with traditions of free software, the distribution that includes the source code comes with a user manual and a library of examples.
For example, one can either say: ContainsX = Σ* X Σ*; MyWords = {cat}|{dog}|{mouse}; MyRule = n -> m || p; ShortWords = [MyLex1]1 ∩ Σˆ<6; or: Proceedings of the EACL 2009 Demonstrations Session, pages 29–32, Athens, Greece, 3 April 2009.
Qc 2009 Association for Computational Linguistics Operators Compatibility variant Function [ ] () [ ] () grouping parentheses, optionality ∀ ∃ N/A quantifiers \ ‘ term negation, substitution/homomorphism : : cross-product + ∗ + ∗ Kleene closure

# **Rouge Score**

In [0]:
def rouge_su4(hypothesis, premises):
    return Pythonrouge(summary_file_exist=False, summary=hypothesis, reference=premises, n_gram=3, ROUGE_SU4=True,
                       ROUGE_L=True, recall_only=False, stemming=True, stopwords=False, word_level=True, length_limit=True,
                       length=1000, use_cf=False, cf=95, scoring_formula='average', resampling=True, samples=1000,
                       favor=True, p=0.5).calc_score()

def rouge_1_2(hypothesis, premises):
    return Rouge().get_scores(hypothesis, premises)

In [0]:
rouge_su4(generated_summary_1, summ_2)

{'ROUGE-1-F': 0.01615,
 'ROUGE-1-P': 0.01615,
 'ROUGE-1-R': 0.01615,
 'ROUGE-2-F': 0.0,
 'ROUGE-2-P': 0.0,
 'ROUGE-2-R': 0.0,
 'ROUGE-3-F': 0.0,
 'ROUGE-3-P': 0.0,
 'ROUGE-3-R': 0.0,
 'ROUGE-L-F': 0.01615,
 'ROUGE-L-P': 0.01615,
 'ROUGE-L-R': 0.01615,
 'ROUGE-SU4-F': 0.0,
 'ROUGE-SU4-P': 0.0,
 'ROUGE-SU4-R': 0.0}

In [0]:
rouge_su4(generated_summary_2, summ_2)

{'ROUGE-1-F': 0.02869,
 'ROUGE-1-P': 0.02869,
 'ROUGE-1-R': 0.02869,
 'ROUGE-2-F': 0.0,
 'ROUGE-2-P': 0.0,
 'ROUGE-2-R': 0.0,
 'ROUGE-3-F': 0.0,
 'ROUGE-3-P': 0.0,
 'ROUGE-3-R': 0.0,
 'ROUGE-L-F': 0.02869,
 'ROUGE-L-P': 0.02869,
 'ROUGE-L-R': 0.02869,
 'ROUGE-SU4-F': 0.0,
 'ROUGE-SU4-P': 0.0,
 'ROUGE-SU4-R': 0.0}

In [0]:
rouge_su4(generated_summary_3, summ_2)


{'ROUGE-1-F': 0.04263,
 'ROUGE-1-P': 0.04263,
 'ROUGE-1-R': 0.04263,
 'ROUGE-2-F': 0.0,
 'ROUGE-2-P': 0.0,
 'ROUGE-2-R': 0.0,
 'ROUGE-3-F': 0.0,
 'ROUGE-3-P': 0.0,
 'ROUGE-3-R': 0.0,
 'ROUGE-L-F': 0.04263,
 'ROUGE-L-P': 0.04263,
 'ROUGE-L-R': 0.04263,
 'ROUGE-SU4-F': 0.0,
 'ROUGE-SU4-P': 0.0,
 'ROUGE-SU4-R': 0.0}

In [0]:
rouge_1_2(generated_summary_1, summ_2)

[{'rouge-1': {'f': 0.3954802218084062,
   'p': 0.28150134048257375,
   'r': 0.6645569620253164},
  'rouge-2': {'f': 0.3100188994175979,
   'p': 0.22043010752688172,
   'r': 0.5222929936305732},
  'rouge-l': {'f': 0.3921568585250571,
   'p': 0.28112449799196787,
   'r': 0.6481481481481481}}]

In [0]:
rouge_1_2(generated_summary_2, summ_2)

[{'rouge-1': {'f': 0.3979328117057603,
   'p': 0.33624454148471616,
   'r': 0.4873417721518987},
  'rouge-2': {'f': 0.249350644520695,
   'p': 0.21052631578947367,
   'r': 0.3057324840764331},
  'rouge-l': {'f': 0.37275985188576716,
   'p': 0.30409356725146197,
   'r': 0.48148148148148145}}]

In [0]:
rouge_1_2(generated_summary_3, summ_2)

[{'rouge-1': {'f': 0.3786982198732538,
   'p': 0.35555555555555557,
   'r': 0.4050632911392405},
  'rouge-2': {'f': 0.21428570930715007,
   'p': 0.2011173184357542,
   'r': 0.22929936305732485},
  'rouge-l': {'f': 0.30705393696251787,
   'p': 0.2781954887218045,
   'r': 0.3425925925925926}}]