Libraries:

In [None]:
# general
import time, winsound, random
from random import randint
#/general

# process arrays and dataframes
import pandas as pd
import numpy as np
import collections
import fuzzy_pandas as fpd
from collections import Counter
#/process arrays and dataframes

# parallel calculations
from tqdm import tqdm
#/parallel calculations

# web parsing
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from bs4 import BeautifulSoup
from bs4.element import Tag
import chromedriver_binary
from requests import get
#/web parsing

# parsing libs
import arxiv
from googlesearch import search 
#/parsing libs

# read .pdf
from tika import parser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
#/read .pdf

# text processing
import spacy,nltk,string,re
import neuralcoref
import networkx as nx
from spacy.symbols import nsubj, nsubjpass, VERB
from nltk.tokenize import sent_tokenize,word_tokenize
from more_itertools import unique_everseen
from textblob import TextBlob

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# download the 'punkt' tokenizer data from NLTK (presumably needed by the
# sent_tokenize / word_tokenize imports above)
nltk.download('punkt')

# load the large English spaCy pipeline once at import time and raise its
# max_length limit so very long documents are accepted
nlp = spacy.load('en_core_web_lg')
nlp.max_length = 50000000
#/text processing

# extractive summarizer
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
#/extractive summarizer

# many-to-many evaluation
from rouge import Rouge
from rouge_score import rouge_scorer
#/many-to-many evaluation

CDFs:

In [None]:
############## Extend abstract ##########
def get_ngrams(text):
    """Return the word bigrams of *text* as a list of 2-tuples.

    The text is split on whitespace and every pair of adjacent tokens
    becomes one bigram, in order of appearance.
    """
    tokens = text.split()
    return list(nltk.ngrams(tokens, 2))

def get_jaccard_sim(a, b):
    """Return the share of *a*'s distinct bigrams that also occur in *b*.

    NOTE: despite the name, this is not the symmetric Jaccard index. The
    size of the intersection is divided by the number of distinct bigrams
    in *a* only (not by the size of the union), so the score measures how
    much of *a* is covered by *b*.

    Args:
        a: text whose bigrams are looked up.
        b: text the bigrams are looked up in.

    Returns:
        A float in [0, 1] rounded to two decimals; 0.0 when *a* yields no
        bigrams (fewer than two whitespace-separated tokens).
    """
    grams_a = set(get_ngrams(a))
    grams_b = set(get_ngrams(b))

    # a text with fewer than two tokens has no bigrams; score it as 0
    # instead of dividing by zero
    if not grams_a:
        return 0.0

    shared = grams_a & grams_b
    return round(len(shared) / len(grams_a), 2)

def filter_text(content, abstract, threshold=0.5):
    """Extend *abstract* with the sentences of *content* that resemble it.

    Both texts are split on '.'. Every piece of *content* whose
    ``get_jaccard_sim`` score against the whole abstract is strictly
    greater than *threshold* is appended after the abstract's own pieces.
    Exact duplicate pieces are dropped, keeping the first occurrence.

    Args:
        content: full text to pick additional sentences from.
        abstract: reference text; its pieces always come first.
        threshold: minimum similarity (exclusive) for a piece to be kept.

    Returns:
        The selected pieces joined with '. '.
    """
    selected = []

    for sentence in content.split('.'):
        try:
            sim_score = get_jaccard_sim(sentence, abstract)
        except Exception:
            # best effort: a piece that cannot be scored is treated as
            # unrelated (KeyboardInterrupt/SystemExit still propagate)
            sim_score = 0

        if sim_score > threshold:
            selected.append(sentence)

    # build the de-duplicated result once, after all pieces were scored;
    # dict.fromkeys keeps the first occurrence and the original order
    final_list = list(dict.fromkeys(abstract.split('.') + selected))

    return '. '.join(final_list)
##############/Extend abstract ##########

##### HTML parsing #####
def parse_google_page(url):
    """Fetch the page at *url* and return ``(text, title)``.

    The title is read from the page's <title> element. The text is
    produced by running the sumy summarizer over the page and joining the
    returned sentences with single spaces.

    Any failure (network error, missing <title>, parsing problem) is
    treated as "nothing found": the function then returns ``('', '')``
    instead of raising.

    Args:
        url: address of the page to download.

    Returns:
        Tuple ``(txt, title)`` of strings; both empty on failure.
    """
    try:
        title = BeautifulSoup(get(url).content, 'html.parser').title.getText()
        # named html_parser so the module-level tika `parser` is not shadowed
        html_parser = HtmlParser.from_url(url, Tokenizer("English"))

        summarizer = Summarizer(Stemmer("English"))
        summarizer.stop_words = get_stop_words("English")

        # the sentence limit is very large, presumably so that no sentence
        # of the page is dropped - TODO confirm intent
        txt = ' '.join(
            str(sentence)
            for sentence in summarizer(html_parser.document, 1000000)
        )
    except Exception:
        # best effort: report an unusable page as empty, but let
        # KeyboardInterrupt/SystemExit through
        txt = ''
        title = ''

    return txt, title

def _clean_patent_text(raw):
    """Normalise scraped text: keep ASCII letters, digits and '.', collapse
    every other run of characters to one space, drop the 'Google Patents'
    label and trim the ends."""
    return re.sub('[^A-Za-z0-9.]+', ' ', raw).replace('Google Patents', '').strip()


def parse_patent_page(url):
    """Fetch a patent page and return ``(paragraphs, title)``.

    The abstract, the description section and the claims section are
    extracted from the HTML, cleaned and concatenated with '; ' in that
    order. The title comes from the page's <title> element, cleaned the
    same way.

    Any failure (network error, a missing section, ...) is treated as
    "nothing found": the function then returns ``('', '')`` instead of
    raising.

    Args:
        url: address of the patent page to download.

    Returns:
        Tuple ``(paragraphs, title)`` of strings; both empty on failure.
    """
    try:
        soup = BeautifulSoup(get(url).text, 'html.parser')

        title = _clean_patent_text(soup.title.getText())
        descr = _clean_patent_text(
            soup.find('section', attrs={'itemprop': 'description'}).getText()
        )
        claims = _clean_patent_text(
            soup.find('section', {'itemprop': 'claims'}).getText()
        )
        abstract = _clean_patent_text(soup.abstract.getText())

        paragraphs = abstract + '; ' + descr + '; ' + claims

    except Exception:
        # best effort: report an unusable page as empty, but let
        # KeyboardInterrupt/SystemExit through
        paragraphs = ''
        title = ''

    return paragraphs, title

def striphtml(data):
    """Return *data* with every ``<...>`` span removed.

    The match is non-greedy and does not cross line breaks, so a '<' whose
    closing '>' is on another line is left untouched.
    """
    return re.sub(r'<.*?>', '', data)

def get_unique_text(document):
    """Return *document* with repeated sentences removed.

    The text is split into sentences with TextBlob; each distinct sentence
    is kept once, at the position of its first occurrence, and the result
    is joined with single spaces.
    """
    sentences = (sent.raw for sent in TextBlob(document).sentences)
    # dict.fromkeys de-duplicates in one pass while preserving order,
    # instead of scanning a growing list for every sentence
    return ' '.join(dict.fromkeys(sentences))

def get_text(url):
    """Download *url* and return the text of all its <p> elements.

    Args:
        url: address of the HTML page to fetch.

    Returns:
        The text of every <p> element, in document order, joined with
        single spaces.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
    """
    # urlopen is not imported at the top of the file; import it here so
    # the function does not fail with NameError
    from urllib.request import urlopen

    # close the HTTP response as soon as the document has been parsed
    with urlopen(url) as page:
        # explicit parser, consistent with the other HTML helpers in this
        # section and independent of which optional parsers are installed
        soup = BeautifulSoup(page, 'html.parser')

    fetched_text = ' '.join(p.text for p in soup.find_all('p'))
    return fetched_text
#####/HTML parsing #####

Parameters:

In [None]:
# arXiv abstract page of the article to process; later cells scrape the
# abstract from this page and derive the .pdf address from it
url = 'https://arxiv.org/abs/2006.10213'

Extend test article:

In [None]:
# load driver
driver = webdriver.Chrome(ChromeDriverManager().install())
#/load driver

# get page source
try:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source,'lxml')
finally:
    # quit() closes the browser and ends the chromedriver process, also
    # when loading the page fails, so no driver is left running
    driver.quit()
#/get page source

# extract abstract: first <blockquote class="abstract mathjax"> on the page
result_div = soup.find_all('blockquote', attrs={'class': 'abstract mathjax'})[0]
abstract = result_div.get_text().replace('\n',' ').replace('\t',' ').strip()
#/extract abstract

print(abstract)

Collect .pdf data:

In [None]:
# the .pdf address differs from the abstract page only in the '/abs/' path
# segment; replace just that segment so an 'abs' elsewhere in the address
# is left alone
file_data = parser.from_file(url.replace('/abs/', '/pdf/', 1))['content']
# NOTE(review): tika appears to return None as content when no text could
# be extracted - fall back to an empty string instead of failing; confirm
content = (file_data or '').replace('\n',' ').replace('\t',' ').strip()

Extend abstract:

In [None]:
# add to the abstract every sentence of the .pdf text scoring above 0.1
extended_abstract = filter_text(content, abstract, threshold=0.1)
# turn line breaks and tabs into spaces in one pass, then trim the ends
extended_abstract = extended_abstract.translate(str.maketrans('\n\t', '  ')).strip()
print(extended_abstract)