In [3]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from tqdm import tqdm

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the preprocessed dataset from the Kaggle input directory and preview
# the first rows (the preview shows `text` and `summary` columns).
df = pd.read_json("/kaggle/input/preprocessed-scien-data/preprocessed.json")
df.head()

Unnamed: 0,text,summary
0,Human evaluation machine translation ( MT ) we...,Bleu: A Method For Automatic Evaluation Of Mac...
1,A large number current language processing sys...,TnT - A Statistical Part-Of-Speech Tagger\nTri...
2,Current automatic summarizers usually rely sen...,Sentence Reduction For Automatic Text Summariz...
3,Even moderately long document typically addres...,Advances In Domain Independent Linear Text Seg...
4,Word sense disambiguation often cast problem s...,A Simple Approach To Building Ensembles Of Nai...


In [11]:
# Show column dtypes and non-null counts for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     1009 non-null   object
 1   summary  1009 non-null   object
dtypes: object(2)
memory usage: 15.9+ KB


In [14]:
# Discard rows whose `text` field is the empty string, then re-inspect the
# frame to see how many rows remain.
df = df.loc[df['text'].ne('')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 946 entries, 0 to 1008
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     946 non-null    object
 1   summary  946 non-null    object
dtypes: object(2)
memory usage: 22.2+ KB


In [4]:
# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced by any other cell visible here —
# confirm it is still needed before keeping this (slow) load.
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
!pip install sentence_transformers

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2 ')

In [19]:
# Renumber the rows 0..n-1 (discarding the old index) so that integer labels
# and positions agree again after rows were filtered out.
df = df.reset_index(drop=True)

In [9]:
# Run the encoder on the GPU when one is available; otherwise stay on the CPU
# so the notebook still executes (more slowly) instead of raising.
import torch
model.to('cuda' if torch.cuda.is_available() else 'cpu')
def get_summary(text, num_words: int=1000):
    """Build an extractive summary of ``text`` by ranking its sentences.

    The text is split with ``nltk.sent_tokenize``; each sentence is embedded
    with the module-level ``model``; a graph weighted by pairwise cosine
    similarity is built over the sentences and scored with PageRank.
    Sentences are then taken in descending score order while the running
    word count stays below ``num_words``; selection stops at the first
    sentence that would reach the budget. The chosen sentences are returned
    in their original document order.

    Args:
        text: Document to summarise.
        num_words: Word budget, counted in whitespace-delimited tokens. The
            summary always contains fewer than this many words.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when
        ``text`` is empty/whitespace-only or no sentence fits the budget.

    Raises:
        Any exception raised by the encoder, ``cosine_similarity`` or
        ``nx.pagerank`` is propagated unchanged.
    """
    # Nothing to rank: return early instead of feeding an empty batch to the
    # encoder and similarity computation.
    if not text or not text.strip():
        return ""

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return ""

    embeddings = model.encode(sentences, show_progress_bar=False)
    # Failures are allowed to propagate: previously the error was printed and
    # execution continued into a NameError on the unbound similarity matrix,
    # which hid the real cause.
    sim_matrix = cosine_similarity(embeddings)
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Highest score first; ties fall back to the sentence text, then position.
    ranked = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)

    selected = []
    total_length = 0
    for _score, sentence, position in ranked:
        total_length += len(sentence.split())
        # Stop at the first sentence that reaches the budget; lower-ranked
        # (possibly shorter) sentences are deliberately not considered.
        if total_length >= num_words:
            break
        selected.append((position, sentence))

    # Restore document order so the summary reads naturally.
    selected.sort(key=lambda item: item[0])
    return " ".join(sentence for _, sentence in selected)

def evaluate_fn(text, summary):
    """Score ``text`` against ``summary`` with smoothed sentence-level BLEU.

    Both strings have every whitespace run collapsed to a single space and
    are trimmed, then split on whitespace. ``summary`` is used as the single
    reference and ``text`` as the hypothesis.

    Returns:
        The value of ``sentence_bleu`` computed with
        ``SmoothingFunction().method4`` smoothing.
    """
    def _squash(raw):
        # Collapse whitespace runs and trim both ends.
        return re.sub(r'\s+', ' ', raw).strip()

    reference_tokens = _squash(summary).split()
    hypothesis_tokens = _squash(text).split()

    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

# test 
# get_summary(df['text'][0])

In [21]:
# Summarise every document. Iterating over the column's values (rather than
# looking rows up by integer label) keeps this correct even when the index has
# gaps, e.g. after rows have been filtered out.
summaries = [get_summary(doc_text) for doc_text in tqdm(df['text'], total=len(df))]

100%|██████████| 946/946 [24:19<00:00,  1.54s/it]  


In [22]:
# Attach the generated summaries as a new column, write the frame to CSV
# (without the index column), and preview the first rows.
df['extracted_summary'] = summaries
df.to_csv("extracted_summary.csv", index=False)
df.head()

Unnamed: 0,text,summary,extracted_summary
0,Human evaluation machine translation ( MT ) we...,Bleu: A Method For Automatic Evaluation Of Mac...,"For part , various human evaluation approach q..."
1,A large number current language processing sys...,TnT - A Statistical Part-Of-Speech Tagger\nTri...,"Furthermore , large interest part-ofspeech tag..."
2,Current automatic summarizers usually rely sen...,Sentence Reduction For Automatic Text Summariz...,We call operation remove extraneous phrase ext...
3,Even moderately long document typically addres...,Advances In Domain Independent Linear Text Seg...,", 1997b ) improve document navigation visually..."
4,Word sense disambiguation often cast problem s...,A Simple Approach To Building Ensembles Of Nai...,A learning algorithm induce representative mod...


In [10]:
# BLEU comparisons per document. `evaluate_fn(hypothesis, reference)` treats
# its first argument as the hypothesis and its second as the reference.
#   scores       : extracted summary vs. original summary
#   txt_sum      : full text         vs. original summary
#   txt_exts_sum : full text         vs. extracted summary
scores, txt_sum, txt_exts_sum = [], [], []

# Walk the three columns in lockstep by value, so the loop does not depend on
# the index labels being exactly 0..n-1.
eval_rows = zip(df['text'], df['summary'], df['extracted_summary'])
for doc_text, ref_summary, ext_summary in tqdm(eval_rows, total=len(df)):
    scores.append(evaluate_fn(ext_summary, ref_summary))
    txt_sum.append(evaluate_fn(doc_text, ref_summary))
    txt_exts_sum.append(evaluate_fn(doc_text, ext_summary))

print("Average BLEU Score between extracted summary and the original summary: ", np.mean(scores))
print("Average BLEU Score between text and the original summary: ", np.mean(txt_sum))
print("Average BLEU Score between text and the extracted summary: ", np.mean(txt_exts_sum))

100%|██████████| 945/945 [00:39<00:00, 24.05it/s]

Average BLEU Score between extracted summary and the original summary:  0.003896099575859292
Average BLEU Score between text and the the original summary:  0.0019454657108385835
Average BLEU Score between text and the extracted summary:  0.3277067856069346



