### Libraries

In [1]:
# general
import time, winsound, random
from random import randint
#/general

# process arrays and dataframes
import pandas as pd
import numpy as np
import collections
import fuzzy_pandas as fpd
from collections import Counter
#/process arrays and dataframes

# parallel calculations
from tqdm import tqdm
#/parallel calculations

# web parsing
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from bs4 import BeautifulSoup
from bs4.element import Tag
import chromedriver_binary
from requests import get
#/web parsing

# parsing libs
import arxiv
import wikipediaapi
from googlesearch import search  
#/parsing libs

# read .pdf
from tika import parser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
#/read .pdf

# text processing
import spacy,nltk,string,re
import neuralcoref
import networkx as nx
from spacy.symbols import nsubj, nsubjpass, VERB
from nltk.tokenize import sent_tokenize,word_tokenize
from more_itertools import unique_everseen
from textblob import TextBlob

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')

nlp = spacy.load('en_core_web_lg')
nlp.max_length = 50000000
#/text processing

# create .docx
import docx
from docx import Document
from docx.shared import Cm
from docx.shared import Pt
from docx.enum.dml import MSO_THEME_COLOR_INDEX
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
#/create .docx

# create .pdf
from reportlab.lib.styles import ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
from reportlab.platypus import Table, TableStyle
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
from reportlab.platypus import Image
from reportlab import platypus
#/create .pdf

# keywords extraction
import yake
#/keywords extraction

# extractive summarizer
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
#/extractive summarizer

# abstractive summarizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
import textstat
import torch
#/abstractive summarizer

# many-to-many evaluation
from rouge import Rouge
from rouge_score import rouge_scorer
#/many-to-many evaluation

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\skamenshchikov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Parameters

In [2]:
# Output folder for generated documents.
# NOTE(review): save_doc writes to the literal 'docs/' rather than reading this constant.
UPLOAD_FOLDER = 'docs/'

sent_number = 50 # total sentences (presumably the target summary length; confirm against usage)
threshold = 0.8 # similarity cut-off passed to filter_text when extending a Wikipedia summary

months_delta = 12 # look-back window in months, appended to the search URL as "past_<n>"

page_number = 10 # maximum number of search results / pages processed per source
keys_number = 10 # number of tags (keywords / entities) kept

max_length = 60 # number of tokens (NOTE(review): get_response hard-codes 60 instead of reading this)
title_num = 5 # number of titles (not referenced in the visible part of the file)

### Feature toggles

In [3]:
# Text post-processing switches.
# NOTE(review): a function named `compress` is defined later in this file; once
# that definition runs, this name no longer holds the boolean, so a check such
# as `compress == True` evaluates to False.
compress = True
paraphrase = False

# Per-source switches (Wikipedia, Google, arXiv, patents).
# NOTE(review): `gogle_sum` looks like a typo of `google_sum`; check the readers
# of this flag before renaming it.
wiki_sum = True
gogle_sum = True
arxiv_sum = True
patent_sum = True

### CDFs

In [4]:
##### HTML parsing #####
def parse_google_page(url):
    """Download a web page and return its text and its <title>.

    The page title is read with BeautifulSoup; the body text is obtained by
    running the sumy summarizer with a very large sentence budget (1000000),
    i.e. effectively every sentence sumy can extract from the page.

    Parameters:
        url: address of the page to fetch.

    Returns:
        (txt, title) - both are empty strings when anything goes wrong
        (network error, missing <title>, parsing failure). The function is
        deliberately best-effort and never raises for ordinary errors.
    """
    try:
        title = BeautifulSoup(get(url).content, 'html.parser').title.getText()
        # named html_doc so the module-level tika `parser` import is not shadowed
        html_doc = HtmlParser.from_url(url, Tokenizer("English"))

        summarizer = Summarizer(Stemmer("English"))
        summarizer.stop_words = get_stop_words("English")

        txt = ' '.join(str(sentence) for sentence in summarizer(html_doc.document, 1000000))
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # interrupt a long scraping run
        txt = ''
        title = ''

    return txt, title

def parse_patent_page(url):
    """Download a Google Patents page and return its abstract + claims and title.

    Parameters:
        url: address of the patent page.

    Returns:
        (paragraphs, title) where paragraphs is "<abstract>; <claims>", both
        reduced to letters, digits, dots and single spaces. Both values are
        empty strings when the page cannot be fetched or lacks any of the
        expected sections (title, description, claims, abstract).
    """
    def _clean(raw):
        # keep only letters, digits and dots; drop the site name
        return re.sub('[^A-Za-z0-9.]+', ' ', raw).replace('Google Patents','').strip()

    try:
        soup = BeautifulSoup(get(url).text, 'html.parser')

        title = _clean(soup.title.getText())

        # The description text itself is not part of the result, but a page
        # without a description section is still rejected, as before.
        soup.find('section', attrs={'itemprop': 'description'}).getText()

        claims = _clean(soup.find('section', {'itemprop':'claims'}).getText().replace('\n',' ').strip())
        abstract = _clean(soup.abstract.getText().replace('\n',' ').strip())

        paragraphs = abstract + '; ' + claims

    except Exception:
        # best-effort: any failure yields empty results, but Ctrl-C / SystemExit
        # are no longer swallowed
        paragraphs = ''
        title = ''

    return paragraphs, title

def striphtml(data):
    """Return `data` with every `<...>` tag removed (non-greedy, single-line match)."""
    return re.sub(r'<.*?>', '', data)

def get_unique_text(document):
    """Drop repeated sentences from `document`, keeping first occurrences in order.

    Sentences are split with TextBlob and re-joined with single spaces.
    """
    raw_sentences = (sent.raw for sent in TextBlob(document).sentences)
    # dict keys keep insertion order, so this removes duplicates without reordering
    return ' '.join(dict.fromkeys(raw_sentences))

def get_text(url):
    """Fetch `url` and return the text of all its <p> elements joined by spaces.

    Parameters:
        url: address of the page to download.

    Returns:
        A single string with the paragraph texts separated by one space.

    Raises:
        urllib.error.URLError (or a subclass) when the page cannot be opened.
    """
    # `urlopen` is not imported at the top of the file, so import it here
    from urllib.request import urlopen

    # the context manager closes the HTTP response once it has been read
    with urlopen(url) as page:
        # explicit parser, matching the other BeautifulSoup calls in this file
        soup = BeautifulSoup(page.read(), 'html.parser')

    return ' '.join(p.text for p in soup.find_all('p'))
#####/HTML parsing #####

############# Parse Wiki ############# 
def parse_wiki(google_url):
    """Open a Google results page in Chrome and return Wikipedia page titles.

    Parameters:
        google_url: complete Google search URL to load.

    Returns:
        List of page titles (the URL path segment after /wiki/) taken from the
        first `page_number` result links that point to en.wikipedia.org.
    """
    # load driver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    #/load driver

    # get urls
    try:
        driver.get(google_url)
        time.sleep(randint(1,5))
        soup = BeautifulSoup(driver.page_source,'lxml')
    finally:
        # always end the browser session, even when loading the page fails
        driver.quit()

    def _is_blank(node):
        # a found-but-empty element disqualifies the result; a missing one does not
        return isinstance(node, Tag) and node.get_text() == ''

    links = []
    for r in soup.find_all('div', attrs={'class': 'g'}):
        link = r.find('a', href=True)
        if link is None:
            continue
        if _is_blank(r.find('h3')) or _is_blank(r.find('span', attrs={'class': 'st'})):
            continue
        links.append(link['href'])

    title_list = []
    for url in links[:page_number]:
        if 'https://en.wikipedia.org' not in url:
            continue
        parts = url.split('/')
        # 'https://en.wikipedia.org/wiki/<Title>' -> the title is the fifth segment
        if len(parts) > 4:
            title_list.append(parts[4])
    #/ get urls

    return title_list
############# Parse Wiki ##############

############## Extend abstract ##########
def get_ngrams(text):
    """Return the list of word bigrams (2-tuples) of the whitespace-split `text`."""
    words = text.split()
    return list(nltk.ngrams(words, 2))

def get_jaccard_sim(a,b):
    """Return the share of the bigrams of `a` that also occur in `b`.

    Despite the name, the denominator is the number of distinct bigrams of
    `a` (not the size of the union), so the score is an asymmetric
    containment ratio in [0, 1], rounded to two decimals.

    Parameters:
        a: text fragment being scored.
        b: reference text.

    Returns:
        float; 0.0 when `a` has no bigrams (fewer than two words) instead of
        raising ZeroDivisionError.
    """
    grams_a, grams_b = set(get_ngrams(a)), set(get_ngrams(b))
    if not grams_a:
        return 0.0

    shared = grams_a.intersection(grams_b)

    return round(float(len(shared)/len(grams_a)), 2)

def filter_text(content, abstract, threshold=0.5):
    """Extend `abstract` with the fragments of `content` that resemble it.

    `content` is split on '.'; every fragment whose get_jaccard_sim score
    against the abstract is strictly greater than `threshold` is kept.

    Parameters:
        content: full text to pick fragments from.
        abstract: reference text; its own '.'-separated fragments come first.
        threshold: minimum similarity (exclusive) for a fragment to be kept.

    Returns:
        The abstract fragments followed by the selected content fragments,
        de-duplicated in first-seen order and joined with '. '.
    """
    content_list = []
    for j in content.split('.'):
        try:
            sim_score = get_jaccard_sim(j, abstract)
        except Exception:
            # a fragment that cannot be scored (e.g. too short) counts as dissimilar
            sim_score = 0

        if sim_score > threshold:
            content_list.append(j)

    # built once after the loop instead of being rebuilt on every iteration
    final_list = list(dict.fromkeys(abstract.split('.') + content_list))

    return '. '.join(final_list)
##############/Extend abstract #########

############# Parse Arxiv #############
def parse_arxiv(query, delta_months):
    """Search arxiv.org and collect the abstracts of the first results.

    Parameters:
        query: search string inserted into the arXiv search URL.
        delta_months: number appended to the `date-filter_by=past_` parameter.

    Returns:
        (arxivtext, titles, df):
        arxivtext - all fetched abstracts joined with '; ' and reduced to
                    letters, digits, dots and single spaces;
        titles    - the result titles joined with ', ';
        df        - DataFrame with columns text / link / page, one row per
                    fetched abstract that has a matching title.
    """
    closest_value = 100
    req = 'https://arxiv.org/search/?query='+query+'&size='+str(closest_value)
    req = req + '&searchtype=all&source=header&start=0&date-filter_by=past_' + str(delta_months)

    soup = BeautifulSoup(get(req).content, 'html5lib')
    hrefs = soup.find_all('a', {'href': re.compile(r'arxiv.org/abs/')})

    title_tags = list(soup.find_all('p', {'class' : 'title is-5 mathjax'}))[:page_number]
    titles_r = [i.text.replace('\n','').replace('  ','') for i in title_tags]
    titles = ', '.join(titles_r)

    urls = [i['href'] for i in hrefs]

    txt = []
    rows = []
    for idx, url in enumerate(urls[:page_number]):
        time.sleep(random.randint(1,8))
        page = BeautifulSoup(get(str(url)).content, 'html5lib')

        block = page.find('blockquote')
        if block is None:
            # no abstract on this page: skip it instead of aborting the whole run
            continue

        abstract = ' '.join(block.text.replace('  ',' ').split())
        txt.append(abstract)

        # pair by the result position so a skipped page cannot shift later rows
        if idx < len(titles_r):
            rows.append((abstract, url, titles_r[idx]))

    arxivtext = re.sub('[^A-Za-z0-9.]+', ' ', '; '.join(txt))
    df = pd.DataFrame(rows, columns=['text','link', 'page'])

    return arxivtext, titles, df
#############/Parse Arxiv #############

############# Parse Google ###############
def parse_google(query):
    """Run a Google search in Chrome and download the text of the result pages.

    Parameters:
        query: search string appended to the Google search URL.

    Returns:
        (googletext, errors, df, titles):
        googletext - text of all kept pages joined with '; ' and reduced to
                     letters, digits, dots and single spaces;
        errors     - URLs whose download or parsing raised;
        df         - DataFrame with columns text / link / page (page title),
                     without pages whose title contains 403 or 404 and
                     without rows that have an empty field;
        titles     - the `page` column of df as a list.
    """
    errors = []

    # get urls
    google_url = "https://www.google.com/search?q=" + query + "&num=" + str(page_number+1)
    google_url = google_url + '&hl=en&gl=en' + '&lr=lang_en&cr=countryGB'

    # load driver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    #/load driver
    try:
        driver.get(google_url)
        time.sleep(randint(1,5))
        soup = BeautifulSoup(driver.page_source,'lxml')
    finally:
        # the browser is only needed for the results page; always shut it down
        driver.quit()

    def _is_blank(node):
        # a found-but-empty element disqualifies the result; a missing one does not
        return isinstance(node, Tag) and node.get_text() == ''

    links = []
    for r in soup.find_all('div', attrs={'class': 'g'}):
        link = r.find('a', href=True)
        if link is None:
            continue
        if _is_blank(r.find('h3')) or _is_blank(r.find('span', attrs={'class': 'st'})):
            continue
        links.append(link['href'])

    # de-duplicate while keeping Google's ranking order
    url_list = list(dict.fromkeys(links))[:(page_number)]
    #/ get urls

    # one (text, link, page title) row per successfully parsed URL, so the
    # three columns can never get out of step
    rows = []
    for j in tqdm(url_list):
        delta = random.randint(1,8)
        time.sleep(delta)

        try:
            if str(j).endswith('.pdf'):
                file_data = parser.from_file(str(j))
                t = file_data['content'].replace('\n','')
                page_title = t[:100]
            else:
                # parse the page once and reuse both results
                page_text, page_title = parse_google_page(j)
                t = page_text.replace('\n','')
                page_title = page_title.replace('\n','')
        except Exception:
            print('Parsing error:',str(j))
            errors.append(str(j))
            continue

        rows.append((t, j, page_title))

    df = pd.DataFrame(rows, columns=['text','link', 'page'])
    if not df.empty:
        df = df[~df['page'].str.contains('|'.join(['403','404']))]
    df = df.replace('', np.nan).dropna()

    googletext = re.sub('[^A-Za-z0-9.]+', ' ', '; '.join(list(df['text'])))
    titles = list(df['page'])

    return googletext, errors, df, titles
#############/Parse Google ###############

############# Parse Patents #############
def parse_patents(query, keys_number, page_number):
    """Run a Google search in Chrome and download the patent pages it returns.

    Parameters:
        query: search string appended to the Google search URL.
        keys_number: not used by this function (kept for interface compatibility).
        page_number: maximum number of result links to process.

    Returns:
        (patenttext, errors, df):
        patenttext - list with the cleaned "<abstract>; <claims>" text per page;
        errors     - always an empty list (page failures yield empty strings);
        df         - DataFrame with columns text / link / page.
    """
    errors = []

    # get urls
    google_url = "https://www.google.com/search?q=" + query + "&num=" + str(page_number+1)

    # load driver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    #/load driver
    try:
        driver.get(google_url)
        time.sleep(randint(1,5))
        soup = BeautifulSoup(driver.page_source,'lxml')
    finally:
        # the browser is only needed for the results page; always shut it down
        driver.quit()

    def _is_blank(node):
        # a found-but-empty element disqualifies the result; a missing one does not
        return isinstance(node, Tag) and node.get_text() == ''

    links = []
    for r in soup.find_all('div', attrs={'class': 'g'}):
        link = r.find('a', href=True)
        if link is None:
            continue
        if _is_blank(r.find('h3')) or _is_blank(r.find('span', attrs={'class': 'st'})):
            continue
        # keep only links whose address contains '/en'
        if '/en' in link['href']:
            links.append(link['href'])

    url_list = links[:(page_number)]
    #/ get urls

    # load content
    txt = []
    titles = []
    for j in tqdm(url_list):
        delta = random.randint(1,8)
        time.sleep(delta)

        # fetch each page once and reuse both results
        page_text, page_title = parse_patent_page(j)
        txt.append(re.sub('[^A-Za-z0-9.]+', ' ', page_text))
        titles.append(re.sub('[^A-Za-z0-9.]+', ' ', page_title.split('\n')[0]))

    patenttext = txt
    df = pd.DataFrame(list(zip(txt, url_list, titles)), columns=['text','link', 'page'])

    return patenttext, errors, df
#############/Parse Patents #############

##### Text processing #####
def text_from_html(body):
    """Return the visible text of an HTML document as one space-joined string.

    Every text node accepted by tag_visible is stripped and kept.
    """
    soup = BeautifulSoup(body, 'html.parser')
    pieces = [node.strip() for node in soup.findAll(text=True) if tag_visible(node)]
    return u" ".join(pieces)

def text_normalize(txt):
    """Turn raw text into a list of lemmatized noun tokens.

    Steps: replace every non-letter with a space, collapse the resulting
    runs, tokenize, drop English stop words, lemmatize, and keep only
    tokens that `tags` labels NN / NNP / NNS / NNPS.

    Parameters:
        txt: input text.

    Returns:
        List of token strings.
    """
    processed_text = re.sub('[^a-zA-Z]', ' ', txt)
    # The former HTML-tag substitution was removed: its pattern had been
    # HTML-escaped ("&lt;...&gt;") and, since '&' and ';' are already gone at
    # this point, it could never match.
    processed_text = re.sub("(\\d|\\W)+", " ", processed_text)

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(processed_text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    tokens = [i for i in tokens if (tags(i) in ['NN', 'NNP', 'NNS', 'NNPS'])]

    return tokens

def tag_visible(element):
    """Tell whether a BeautifulSoup text node is visible page text.

    Parameters:
        element: a text node (anything exposing `.parent.name`).

    Returns:
        False for nodes inside style / script / head / title / meta or the
        document root, and for HTML comments; True otherwise.
    """
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False

    # `Comment` is not imported at the top of the file (only `Tag` is), so it
    # is imported here where it is needed
    from bs4.element import Comment

    if isinstance(element, Comment):
        return False
    return True

def filter_triplet(final_text):
    """Keep only the unique sentences of `final_text` that pass syntax_full.

    The text is de-duplicated with get_unique_text, parsed with the global
    spaCy pipeline, and the accepted sentences are joined with spaces.
    """
    parsed = nlp(get_unique_text(final_text))
    kept = [sentence.text for sentence in list(parsed.sents) if syntax_full(sentence)]
    return ' '.join(kept)

def coref_res(rawtext, coref_greedn = 0.5):
    """Replace coreferent mentions in `rawtext` with their main mention.

    neuralcoref is added to the global spaCy pipeline for the duration of the
    call and removed again afterwards, also when parsing fails.

    Parameters:
        rawtext: text to resolve.
        coref_greedn: neuralcoref `greedyness` setting.

    Returns:
        The text with every non-main mention of each coreference cluster
        replaced by the cluster's main mention. A mention that starts with an
        upper-case letter gets a capitalized replacement.
    """
    neuralcoref.add_to_pipe(nlp, greedyness = coref_greedn, store_scores=False)
    try:
        doc = nlp(rawtext)

        resolved = [tok.text_with_ws for tok in doc]

        for cluster in (doc._.coref_clusters or []):
            main_text = cluster.main.text
            for coref in cluster:
                if coref == cluster.main:
                    # the main mention stays as written
                    continue

                first_char = coref.text[0]
                if first_char.isalpha() and first_char.isupper():
                    main_words_list = word_tokenize(main_text)
                    main_words_list[0] = main_words_list[0].capitalize()
                    replacement = detokenizer(main_words_list)
                else:
                    # previously this case kept the mention's first token and
                    # blanked the rest, truncating multi-word mentions
                    replacement = main_text

                # the first token carries the replacement, the others are emptied
                resolved[coref.start] = replacement + doc[coref.end-1].whitespace_
                for i in range(coref.start+1, coref.end):
                    resolved[i] = ""
    finally:
        # leave the shared pipeline as it was, otherwise the next call fails
        # when it tries to add the component again
        nlp.remove_pipe("neuralcoref")

    return ''.join(resolved)

def compress(spacy_sents,sents_whitelist):
    """Collect the token indices to drop from the whitelisted sentences.

    Sentences are numbered from 1. In every sentence whose number is in
    `sents_whitelist`, each token with dependency 'appos' or 'advmod'
    contributes the indices (`.i`) of all tokens in its subtree.

    Returns:
        List of token indices (may contain repeats).
    """
    dropped = []
    for position, sentence in enumerate(spacy_sents, start=1):
        if position not in sents_whitelist:
            continue
        for tok in sentence:
            if tok.dep_ in ['appos','advmod']:
                dropped.extend(node.i for node in tok.subtree)
    return dropped

def spacy_compress(rawtext):
    """Shorten `rawtext` by dropping unwanted sentences and token subtrees.

    Pipeline: parse with the global spaCy model, choose the sentences to keep
    (get_sents_ids_whitelist), collect the tokens to drop (compress), rebuild
    each kept sentence (get_list_sents_tokens + detokenizer), tidy the
    punctuation (sentence_grammar_fix) and join with spaces.
    """
    parsed = nlp(rawtext)
    keep_ids = get_sents_ids_whitelist(parsed.sents)

    drop_tokens = compress(parsed.sents, keep_ids)
    kept_sentences = get_list_sents_tokens(parsed.sents, keep_ids, drop_tokens)

    rebuilt = [detokenizer(sentence_tokens) for sentence_tokens in kept_sentences]
    rebuilt = sentence_grammar_fix(rebuilt)

    return ' '.join(rebuilt)
##### Text processing #####

############## Get summary #############
def get_summary(rawtext, sentences):
    """Build an extractive summary whose sentences are each followed by '=='.

    The first six sentences of `rawtext` are always kept; the remainder is
    summarized with the sumy summarizer down to `sentences` sentences. The
    combined text is re-split into sentences and each one gets the '=='
    marker appended.
    """
    all_sentences = sent_tokenize(rawtext)
    lead, remainder = all_sentences[:6], all_sentences[6:]

    summarizer = Summarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")
    plain_doc = PlaintextParser.from_string(' '.join(remainder), Tokenizer("english"))

    picked = [str(sentence) for sentence in summarizer(plain_doc.document, sentences)]
    txt = ' '.join(lead) + ' ' + ' '.join(picked)

    return ''.join(str(sentence) + '==' for sentence in nltk.sent_tokenize(txt))
##############/Get summary #############

############## Get tags and entities ###########
def graph_keys(final_text, top_number):
    """Return the most frequent noun bigrams of `final_text`.

    The lower-cased text is passed through text_normalize; bigrams made of
    the same word twice are skipped.

    Parameters:
        final_text: text to analyse.
        top_number: how many bigrams to return.

    Returns:
        The top bigrams, most frequent first, joined with ', '
        (an empty string when the text yields no bigrams).
    """
    tokens = text_normalize(final_text.lower())
    bigrams = [' '.join(pair) for pair in nltk.ngrams(tokens, 2) if pair[0] != pair[1]]

    # The former DataFrame step computed percentages that were never returned
    # and crashed on empty input, so the ranking is taken from the Counter.
    ranked = collections.Counter(bigrams).most_common()[:top_number]

    return ', '.join(bigram for bigram, _ in ranked)

def yake_keys(text, keys_number):
    """Extract up to `keys_number` keywords (at most two words each) with YAKE.

    Returns:
        The second element of every tuple produced by extract_keywords,
        joined with ', '.
        NOTE(review): which tuple position holds the keyword depends on the
        installed yake version - confirm that index 1 is the keyword.
    """
    extractor = yake.KeywordExtractor(lan="en", n=2, top=keys_number)
    scored = extractor.extract_keywords(text)

    return ', '.join([item[1] for item in scored])

def get_entities(rawtext, tops):
    """Return the most frequent organisation and person names in `rawtext`.

    Parameters:
        rawtext: text to analyse.
        tops: how many names to return.

    Returns:
        The `tops` most frequent ORG / PERSON entity texts, most frequent
        first, joined with ', ' (an empty string when none are found).
    """
    spacy_nlp = spacy.load('en_core_web_lg', disable=["tagger","parser"])
    # raise the limit on the pipeline that is actually used here; the old code
    # changed the global `nlp` instead, leaving this one at its default
    spacy_nlp.max_length = 1000000000000
    doc = spacy_nlp(rawtext)

    ner_counts = collections.Counter(
        ent.text for ent in doc.ents if ent.label_ in ['ORG', 'PERSON']
    )

    # an empty Counter simply yields '', so no exception handling is needed
    return ', '.join(name for name, _ in ner_counts.most_common()[:tops])
############## Get tags and entities #############

############## Add keyurls ################
def add_keyurls(final_keys, query):
    """Build one tagged Google search URL per comma-separated key.

    For each key the parenthesised parts are removed and the remaining words
    are joined with '+', followed by '+' and `query`. The original key (as
    split, including any leading space) is appended after the '/keyword/'
    marker so it can be recovered later.

    Returns:
        List of strings "<search url>/keyword/<key>".
    """
    prefix = 'https://www.google.com/search?q='
    tagged_urls = []
    for key in final_keys.split(','):
        without_parens = re.sub(r" ?\([^)]+\)", "", key)
        search_terms = '+'.join(without_parens.strip().split())
        tagged_urls.append(prefix + search_terms + '+' + query + '/keyword/' + key)

    return tagged_urls
##############/Add urls ###################

##### Abstractive summarization #############
def get_response(input_text,num_return_sequences):
    """Generate `num_return_sequences` rewrites of `input_text`.

    Uses the module-level `tokenizer`, `model` and `torch_device`, which are
    defined outside this block. Input and output are limited to 60 tokens;
    generation uses 10 beams.

    Returns:
        List of decoded strings.
    """
    encoded = tokenizer.prepare_seq2seq_batch(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    )
    encoded = encoded.to(torch_device)

    generated = model.generate(
        **encoded,
        max_length=60,
        num_beams=10,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )

    return tokenizer.batch_decode(generated, skip_special_tokens=True)
#####/Abstractive summarization #############

############# Doc preparation ##########
def add_hyperlink(paragraph, text, url, flag):
    """Append a clickable external link to a python-docx paragraph.

    Parameters:
        paragraph: paragraph that receives the link.
        text: visible link text.
        url: link target.
        flag: value assigned to the underline property of the carrying run.

    Returns:
        The created w:hyperlink XML element.
    """
    oxml = docx.oxml.shared

    # register the target as an external relationship of the document part
    rel_id = paragraph.part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True
    )

    # w:hyperlink element pointing at that relationship
    hyperlink = oxml.OxmlElement('w:hyperlink')
    hyperlink.set(oxml.qn('r:id'), rel_id, )

    # inner run: empty run properties first, then the visible text
    link_run = oxml.OxmlElement('w:r')
    link_run.append(oxml.OxmlElement('w:rPr'))
    link_run.text = text
    hyperlink.append(link_run)

    # carrier run of the paragraph that holds the hyperlink element
    carrier = paragraph.add_run()
    carrier._r.append (hyperlink)

    # Workaround for templates without a hyperlink style: colour the run with
    # the theme's hyperlink colour (it will not turn purple after a visit).
    carrier.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    carrier.font.underline = flag

    return hyperlink

def _add_link_list(paragraph, tagged_urls):
    """Append the '<url>/keyword/<label>' entries as a ', '-separated link list."""
    last = len(tagged_urls) - 1
    for position, entry in enumerate(tagged_urls):
        parts = entry.split('/keyword/')
        target, label = parts[0], parts[1]
        # decide by position, so an entry equal to the last one still gets its separator
        if position != last:
            label = label + ', '
        add_hyperlink(paragraph, str(label), str(target), False)


def save_doc(final_summary, summary, query, docs_number, sent_number, url_keys, url_marks, score):
    """Write the summary to 'docs/<summary>.docx'.

    Parameters:
        final_summary: summary text; sentences are separated by '<hr>' and may
            contain an `<a href=... target='_blank'` link.
        summary: file name without extension.
        query: the user request, or the string 'none' when there is none.
        docs_number: number of source documents (printed in the header).
        sent_number: number of sentences (printed in the header).
        url_keys: '<url>/keyword/<label>' strings listed under "Keys".
        url_marks: '<url>/keyword/<label>' strings listed under "Benchmarks".
        score: bigram percentage (printed in the header).

    Returns:
        True once the file has been saved.

    With a real query, only sentences that carry a link are written, each
    followed by a "More" hyperlink; with query == 'none' every sentence is
    written as plain text.
    """
    import os  # only needed here, to make sure the output folder exists

    sent_list = list(final_summary.split(sep='<hr>'))
    doc = Document()

    font = doc.styles['Normal'].font
    font.name = 'Times New Roman'
    font.size = Pt(12)

    hd = doc.add_paragraph()
    hd.alignment = WD_ALIGN_PARAGRAPH.LEFT
    hd.add_run('Summary').bold = True

    if query != 'none':
        doc.add_paragraph('Request: ' + "''" + query + "''")

    doc.add_paragraph('Sentences: ' + str(sent_number))
    doc.add_paragraph('Bigrams: ' + str(score) + '%')
    doc.add_paragraph('Documents: ' + str(docs_number))

    hd = doc.add_paragraph('')
    hd.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    hd.add_run('Keys:\n').underline = True
    _add_link_list(hd, url_keys)

    hd.add_run('\n\nBenchmarks:\n').underline = True
    _add_link_list(hd, url_marks)

    for i in sent_list:
        plain = striphtml(str(i)).replace('<hr>','').replace('<u>','').replace('More','')

        if query == 'none':
            par = doc.add_paragraph(plain)
        else:
            found = re.search(r"<a href=(.*?)target='_blank'", str(i))
            if found is None:
                # sentences without a source link are left out, as before
                continue
            link = found.group(1).replace(' ','')
            try:
                par = doc.add_paragraph(plain)
                add_hyperlink(par, 'More', link, True)
            except ValueError:
                # text that cannot be stored in the document XML is skipped
                continue

        # justify the paragraph that was just created, including the last one
        par.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    os.makedirs('docs', exist_ok=True)
    doc.save('docs/' + summary + '.docx')

    return True
#############/Doc preparation ##########

############## Sandbox functions ##########
def longest_common_substring(s1, s2):
    """Return the longest contiguous run shared by the sequences `s1` and `s2`.

    Works on any sliceable sequences (strings, lists of words). When several
    runs share the maximum length, the one ending earliest in `s1` wins. The
    result is a slice of `s1`.
    """
    best_len, best_end = 0, 0
    # previous[k] = length of the common run ending at the previous s1 item and s2[k-1]
    previous = [0] * (len(s2) + 1)
    for end1, item1 in enumerate(s1, start=1):
        current = [0] * (len(s2) + 1)
        for end2, item2 in enumerate(s2, start=1):
            if item1 != item2:
                continue
            run = previous[end2 - 1] + 1
            current[end2] = run
            if run > best_len:
                best_len, best_end = run, end1
        previous = current
    return s1[best_end - best_len:best_end]

def longest_common_sentence(s1, s2):
    """Return the longest run of consecutive words shared by `s1` and `s2`.

    Both strings are split on single spaces; the shared words are re-joined
    with single spaces.
    """
    shared_words = longest_common_substring(s1.split(' '), s2.split(' '))
    return ' '.join(shared_words)

def css(a,b):
    """Return the share of the words of `a` covered by the longest word run shared with `b`.

    Returns 0 when `a` contains no words.
    """
    words_a = a.split()
    if not words_a:
        return 0
    return len(longest_common_sentence(a,b).split())/len(words_a)

def readingTime(mytext):
    """Return the token count of `mytext` divided by 200, rounded to one decimal."""
    return round(len(word_tokenize(mytext))/200.0,1)

def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random grey as an HSL colour string.

    The arguments are ignored; the lightness is drawn uniformly from 60 to 100.
    """
    lightness = random.randint(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness

def tags(x):
    """Return the NLTK part-of-speech tag of the first token of `x`."""
    tagged = nltk.pos_tag(nltk.word_tokenize(x))
    first_pair = tagged[0]
    return first_pair[1]

def syntax_full(spacy_sentence):
    """Tell whether the sentence has a subject attached to a verb.

    Returns True as soon as a token with dependency nsubj or nsubjpass whose
    head is a VERB is found, False otherwise.
    """
    for token in spacy_sentence:
        is_subject = token.dep == nsubj or token.dep == nsubjpass
        if is_subject and token.head.pos == VERB:
            return True
    return False

def check_min_num_of_clauses(spacy_sentence, n):
    """Tell whether the sentence contains at least `n` subject-verb links.

    A link is a token with dependency nsubj / nsubjpass / csubj / expl whose
    head is tagged VERB or AUX.
    """
    subject_deps = ['nsubj','nsubjpass','csubj','expl']
    clause_heads = [
        token.head.text
        for token in spacy_sentence
        if token.dep_ in subject_deps and token.head.pos_ in ('VERB', 'AUX')
    ]
    return len(clause_heads) >= n

def get_sents_ids_whitelist(spacy_sents):
    """Return the 1-based positions of the sentences worth keeping.

    A sentence is kept when its text has not been kept before and it passes
    check_min_num_of_clauses(sent, 1).
    """
    kept_positions = []
    kept_texts = set()
    for position, sent in enumerate(spacy_sents, start=1):
        if sent.text in kept_texts:
            continue
        if check_min_num_of_clauses(sent,1):
            kept_positions.append(position)
            kept_texts.add(sent.text)
    return kept_positions

def get_list_sents_tokens(spacy_sents,sents_whitelist,blacklist_tokens):
    """Return the token texts of the whitelisted sentences, minus dropped tokens.

    Parameters:
        spacy_sents: iterable of sentences (each an iterable of tokens with
            `.i` and `.text`); sentences are numbered from 1.
        sents_whitelist: numbers of the sentences to keep.
        blacklist_tokens: token indices (`.i`) to leave out.

    Returns:
        One list of token texts per kept sentence, in document order. A kept
        sentence whose tokens are all dropped yields an empty list.
    """
    # sets give constant-time membership tests; the lists can hold thousands
    # of entries and are probed once per sentence / token
    kept_ids = set(sents_whitelist)
    dropped = set(blacklist_tokens)

    return [
        [token.text for token in sent if token.i not in dropped]
        for n, sent in enumerate(spacy_sents, start=1)
        if n in kept_ids
    ]

def detokenizer(list_of_tokens):
    """Join tokens into a string, attaching punctuation and clitics to the left.

    A token is glued to the preceding text (no space before it) when it
    starts with an apostrophe (' or ’), is empty, or is a substring of
    string.punctuation; every other token is preceded by one space. The
    result is stripped.
    """
    pieces = []
    for tok in list_of_tokens:
        glued = (
            tok.startswith("'")
            or tok.startswith("’")
            or tok == ''
            or tok in string.punctuation
        )
        pieces.append(tok if glued else " " + tok)
    return "".join(pieces).strip()

def sentence_grammar_fix(sentences):
    """Normalise spacing, punctuation and capitalisation of each sentence.

    For every input string, in order:
      1. strip it, delete newlines and empty ``()`` pairs;
      2. collapse whitespace runs to one space;
      3. append a full stop, then reduce every run of ``, . - — :``
         characters to the last character of that run;
      4. drop one leading ``.``, ``,``, ``-`` or ``—`` and strip again;
      5. upper-case the first character when it is a letter.

    Parameters:
        sentences: iterable of str.

    Returns:
        list of str, one cleaned sentence per input (an empty input
        becomes ``'.'``).
    """
    # Raw-string patterns: the previous '\s+' literal relied on an invalid
    # escape sequence, which newer Python versions warn about. Compiled once
    # because they do not change between sentences.
    whitespace_run = re.compile(r'\s+')
    punctuation_run = re.compile(r'([,.\-—:])+')

    fixed = []
    for sent in sentences:
        sent = sent.strip().replace('\n', '').replace('()', '')
        sent = whitespace_run.sub(' ', sent)
        # The repeated group keeps only its last match, so ",,." -> ".".
        sent = punctuation_run.sub(r'\1', sent + '.')

        if len(sent) > 1 and sent[0] in ('.', ',', '-', '—'):
            sent = sent[1:]
        sent = sent.strip()

        if len(sent) > 1 and sent[0].isalpha():
            sent = sent[0].upper() + sent[1:]
        fixed.append(sent)

    return fixed
##############/Sandbox functions ##########

### Query

In [5]:
# Search topic typed in by the user; the parsing cells below build their
# source-specific searches from this string.
query = input()

quantum computer


### Parse web sources

Parse Wiki:

In [10]:
%%time

# Collect Wikipedia pages for the query: filtered page text, keywords derived
# from the page titles, named entities, and a per-page dataframe (df_wiki).
# Everything stays empty when wiki_sum is switched off.
wikitext = ''
wikikeys = ''

df_wiki = pd.DataFrame()

if wiki_sum == True:
    wiki_wiki = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.WIKI)

    # Google search restricted to en.wikipedia.org; parse_wiki turns the
    # result page into a list of page titles.
    red_query = "https://www.google.com/search?q=" + 'site:https://en.wikipedia.org ' + query + "&num=" + str(page_number+1)
    red_query = red_query + '&searchtype=all&source=header&start=0&date-filter_by=past_' + str(months_delta)

    wiki_titles = parse_wiki(red_query)

    txts = []
    titles = []

    for i in tqdm(wiki_titles):
        # Build the page object once so summary and text come from the same
        # lookup instead of two separate page objects.
        page = wiki_wiki.page(i)
        page_sum = page.summary
        page_txt = page.text
        sent_list = filter_text(page_txt, page_sum, threshold=threshold)

        titles.append(i)
        txts.append(''.join(sent_list).replace('\n', ''))

    wikitext = ''.join(txts).replace('\n','')

    if compress == True:
        wikitext = coref_res(filter_triplet(wikitext))

    # Keywords are the multi-word page titles, capped at keys_number.
    wikikeys = ', '.join(titles).lower().replace('_',' ')
    wikikeys = ', '.join([i for i in wikikeys.split(', ') if len(i.split()) > 1][:keys_number])
    wiki_entities = get_entities(wikitext, keys_number)

    url_list = [str('https://en.wikipedia.org/wiki/' + i)  for i in wiki_titles]

    df_wiki = pd.DataFrame(list(zip(txts, url_list, titles)), columns=['text','link', 'page'])
    df_wiki.replace('', np.nan, inplace=True)
    df_wiki.dropna(inplace=True)

    print('|Keywords|:', wikikeys, '\n')
    print('|Entities|:', wiki_entities, '\n')

    # Show one random page as a sanity check. dropna() leaves gaps in the
    # index labels, so the row must be picked by position (.iloc); an empty
    # frame simply has nothing to show.
    if not df_wiki.empty:
        random_num = randint(1, len(df_wiki))

        print(df_wiki['page'].iloc[random_num-1]+'\n')
        print(df_wiki['text'].iloc[random_num-1][:1000]+'...'+'\n')

# Audible signal that the cell has finished.
winsound.Beep(2500, 1000)

 


[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\skamenshchikov\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.78it/s]


|Keywords|: quantum computing, quantum computing, superconducting quantum computing, timeline of quantum computing and communication, shor%27s algorithm, quantum algorithm 

|Entities|: Quantum, RSA, Turing, Paul Benioff, Richard Feynman, Yuri Manin, Peter Shor, Google AI, National Aeronautics and Space Administration, NASA 

Qubit

In quantum computing, a qubit () or quantum bit (sometimes qbit) is the basic unit of quantum information—the quantum version of the classical binary bit physically realized with a two-state device.   A qubit is a two-state (or two-level) quantum-mechanical system, one of the simplest quantum systems displaying the peculiarity of quantum mechanics.   Examples include: the spin of the electron in which the two levels can be taken as spin up and spin down; or the polarization of a single photon in which the two states can be taken to be the vertical polarization and the horizontal polarization.  In a classical system, a bit would have to be in one state or th

In [11]:
df_wiki.head()

Unnamed: 0,text,link,page
0,Quantum computing is the use of quantum phenom...,https://en.wikipedia.org/wiki/Quantum_computing,Quantum_computing
1,Quantum computing is the use of quantum phenom...,https://en.wikipedia.org/wiki/Quantum_computing,Quantum_computing
2,Superconducting quantum computing is an implem...,https://en.wikipedia.org/wiki/Superconducting_...,Superconducting_quantum_computing
3,"In quantum computing, a qubit () or quantum bi...",https://en.wikipedia.org/wiki/Qubit,Qubit
4,This is a timeline of quantum computing.,https://en.wikipedia.org/wiki/Timeline_of_quan...,Timeline_of_quantum_computing_and_communication


Parse Arxiv:

In [None]:
%%time

# Collect arXiv material for the query: joined text, keywords, titles,
# entities and a per-paper dataframe (df_arxiv). Everything stays empty when
# arxiv_sum is switched off or the lookup fails.
arxivtext = ''
arxivkeys = ''

df_arxiv = pd.DataFrame()

if arxiv_sum == True:

    try:
        # parse_arxiv returns a sequence whose third item is the dataframe.
        df_arxiv = parse_arxiv(query, months_delta)[2]
        df_arxiv.replace('', np.nan, inplace=True)
        df_arxiv.dropna(inplace=True)

        arxivtext = ''.join(list(df_arxiv['text']))

        if compress == True:
            arxivtext = coref_res(filter_triplet(arxivtext))

        arxiv_entities = get_entities(arxivtext, keys_number)
        arxiv_titles = '; '.join(list(df_arxiv['page'])[:title_num]).replace('\n','')
        # Keep only multi-word key phrases.
        arxivkeys = ', '.join([i for i in yake_keys(arxivtext, keys_number).split(', ') if len(i.split()) > 1][:keys_number])

        random_num = randint(1, len(df_arxiv))

        print('|Keywords|:', arxivkeys, '\n')
        print('|Titles|:', arxiv_titles, '\n')
        print('|Entities|:', arxiv_entities, '\n')

        # dropna() leaves gaps in the index labels, so the sample row is
        # selected by position rather than by label.
        print(df_arxiv['page'].iloc[random_num-1]+'\n')
        print(df_arxiv['text'].iloc[random_num-1][:1000]+'...'+'\n')

    except Exception as err:
        # Best effort: a failed source must not stop the notebook, but the
        # reason is reported, and KeyboardInterrupt is no longer swallowed.
        print('No data')
        print('Reason:', repr(err))

# Audible signal that the cell has finished.
winsound.Beep(2500, 1000)

Parse Google:

In [None]:
%%time

# Collect general Google results for the query: joined text, keywords,
# titles, entities and a per-page dataframe (df_google). Everything stays
# empty when the flag is off or the lookup fails.
googletext = ''
googlekeys = ''

df_google = pd.DataFrame()

# NOTE(review): the flag is spelled `gogle_sum`; it is defined outside this
# cell, so the spelling is kept as-is - confirm against its definition.
if gogle_sum == True:

    try:
        # parse_google returns a sequence whose third item is the dataframe.
        df_google = parse_google(query)[2]
        df_google.replace('', np.nan, inplace=True)
        # Drop incomplete rows like the other source cells do; a NaN left in
        # 'text' would make the join below fail and report "No data".
        df_google.dropna(inplace=True)

        googletext = ''.join(list(df_google['text']))

        if compress == True:
            googletext = coref_res(filter_triplet(googletext))

        google_entities = get_entities(googletext, keys_number)
        google_titles = '; '.join(list(df_google['page'])).replace('\n','')
        # Keep only multi-word key phrases.
        googlekeys = ', '.join([i for i in yake_keys(googletext, keys_number).split(', ') if len(i.split()) > 1][:keys_number])

        random_num = randint(1,len(df_google))

        print('|Keywords|:', googlekeys, '\n')
        print('|Titles|:', google_titles, '\n')
        print('|Entities|:', google_entities, '\n')

        # After dropna() the index labels can have gaps, so the sample row is
        # selected by position rather than by label.
        print(df_google['page'].iloc[random_num-1]+'\n')
        print(df_google['text'].iloc[random_num-1][:1000]+'...'+'\n')

    except Exception as err:
        # Best effort: a failed source must not stop the notebook, but the
        # reason is reported, and KeyboardInterrupt is no longer swallowed.
        print('No data')
        print('Reason:', repr(err))

# Audible signal that the cell has finished.
winsound.Beep(2500, 1000)

Parse Google Patents:

In [None]:
%%time

# Collect Google Patents results for the query: joined text, keywords,
# titles, entities and a per-patent dataframe (df_patent). Everything stays
# empty when patent_sum is switched off.
patenttext = ''
patentkeys = ''

df_patent = pd.DataFrame()

if patent_sum == True:

    # parse_patents returns a sequence: item 0 holds the text pieces,
    # item 2 the dataframe.
    z = parse_patents("site:https://patents.google.com " + query, keys_number, page_number)
    patenttext = ''.join(z[0])

    df_patent = z[2]
    df_patent.replace('', np.nan, inplace=True)
    df_patent.dropna(inplace=True)

    patent_entities = get_entities(patenttext, keys_number)
    patent_titles = '; '.join(list(df_patent['page'])).replace('\n','')
    # Keep only multi-word key phrases.
    patentkeys = ', '.join([i for i in yake_keys(patenttext, keys_number).split(', ') if len(i.split()) > 1][:keys_number])

    print('|Keywords|:', patentkeys, '\n')
    print('|Titles|:', patent_titles, '\n')
    print('|Entities|:', patent_entities, '\n')

    # Show one random patent as a sanity check. dropna() leaves gaps in the
    # index labels, so the row is picked by position (.iloc); an empty frame
    # simply has nothing to show.
    if not df_patent.empty:
        random_num = randint(1, len(df_patent))

        print(df_patent['page'].iloc[random_num-1]+'\n')
        print(df_patent['text'].iloc[random_num-1][:1000]+'...'+'\n')

# Audible signal that the cell has finished.
winsound.Beep(2500, 1000)

### Extractive summary

Concatenate dataframes:

In [None]:
# Stack the per-source frames (wiki, google, arxiv, patents) into one table.
# pd.concat replaces the chained DataFrame.append calls, which no longer
# exist in pandas 2.x; row order and index labels are the same as before.
df = pd.concat([df_wiki, df_google, df_arxiv, df_patent])
df.head(3)

Get text and tags:

In [None]:
%%time

# Merge the text of all sources and extract entities from the combined text.
final_text = (wikitext + arxivtext + googletext + patenttext)
final_entities = get_entities(final_text, keys_number)

# Merge the per-source keyword strings into one comma-separated string.
# Duplicates are removed while keeping first-seen order (a frozenset gave an
# arbitrary order), and the empty entry produced by splitting an empty
# keyword string is dropped so no blank key ends up in the result.
final_keys = googlekeys.split(', ') + wikikeys.split(', ') + arxivkeys.split(', ') + patentkeys.split(', ')
final_keys = ', '.join(unique_everseen(key for key in final_keys if key))

Get extractive summary:

In [None]:
%%time

# Build the extractive summary and measure its ROUGE-L overlap with the
# full text.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

report_summary = get_summary(final_text, sent_number)

# Only 'rougeL' was requested, so the first value of the result is that
# metric; index 2 of it is taken as the score.
# NOTE(review): index 2 is presumably the F-measure field - confirm against
# the scorer's result type.
scores = scorer.score(report_summary, final_text)
scores = next(iter(scores.values()))[2]
scores = round(100 * scores)

print('Information extracted:', (str(scores) + ' %'))
print('\n', (report_summary[:1000])+'...', '\n')

# Audible signal that the cell has finished.
winsound.Beep(2500, 1000)

### Abstractive summary

Paraphrase generation:

In [None]:
%%time

# Optionally rewrite every summary sentence with the Pegasus paraphrase model
# and report how much of the original wording survives (ROUGE-L).
if paraphrase == True:

    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # NOTE(review): tokenizer/model/torch_device are presumably read as
    # globals by get_response - confirm before renaming any of them.
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    counter = 0
    summ_list = []

    # Sentences are read with split('==')[:-1], i.e. the last fragment is
    # discarded. The paraphrased text is therefore rebuilt with '==' AFTER
    # each sentence: with the marker in front, the same split (used again
    # when references are attached) would throw away the last paraphrased
    # sentence and yield an empty first one.
    # NOTE(review): this assumes get_summary emits delimiter-terminated text
    # ("a==b==") - confirm.
    for i in report_summary.split('==')[:-1]:
        summ_list.append(get_response(i,1)[0] + '==')

    summary = ' '.join(summ_list)

    # Overlap between paraphrase and original summary; index 2 of the
    # rougeL result is used as the score, as in the extractive step.
    scores = scorer.score(summary, report_summary)
    scores = round(100*list(list(scores.values())[0])[2])
    report_summary = summary

    print('Plagiarism:', (str(scores) + ' %'))

    # Audible signal that the cell has finished.
    winsound.Beep(2500, 1000)

### Extend the content

Create keys with urls:

In [None]:
# Turn keywords and entities into linked keys for the report.
url_keys = add_keyurls(final_keys, query)
mark_keys = add_keyurls(final_entities, query)

# pdf_list[k] is a summary sentence and ref_list[k] the link of the source
# text that scores highest against it; the two lists must stay the same
# length because they are zipped together afterwards.
ref_list = []
pdf_list = []

sent_list = list(report_summary.split(sep='=='))[:-1]

for i in sent_list:
    try:
        df_score = df.copy()
        df_score['score'] = df_score['text'].apply(lambda x: css(i,x))
        df_score = df_score.sort_values(by=['score'], ascending=False)
        best_link = str(df_score['link'].iloc[0])
    except Exception:
        # No source could be scored for this sentence (e.g. empty sentence
        # or empty source table). Skip it entirely: adding a placeholder to
        # pdf_list alone would shift every later sentence onto the wrong link.
        continue

    if best_link:
        pdf_list.append(str(i))
        ref_list.append(best_link)

pdf_summary = ''.join(pdf_list)
# Audible signal that the cell has finished.
winsound.Beep(2500, 1000)

Create dataframe from tags and urls:

In [None]:
# One row per source link: all sentences attributed to the same link are
# joined with spaces, and links keep the order in which they first appear
# in ref_list.
df_merged = pd.DataFrame(list(zip(ref_list, pdf_list)), columns=['link', 'text'])
df_merged = df_merged.sort_index(ascending=True)
df_merged = df_merged.groupby('link', as_index=True).agg(lambda x: ' '.join(x))

# groupby sorts by link, so restore first-seen order before flattening.
df_merged = df_merged.reindex(list(unique_everseen(ref_list)))
df_merged = df_merged.reset_index()

# Rows with an empty link or text are discarded.
df_merged.replace('', np.nan, inplace=True)
df_merged.dropna(inplace=True)

df_merged.head()

Add new sources:

In [None]:
# Rebuild both lists from the merged table: every source becomes one
# numbered HTML entry ("<n>. ...<text> More-link <hr>") and ref_list holds
# the matching link.
ref_list = []
pdf_list = []

trc = 0
for trc, (link, text) in enumerate(zip(df_merged['link'], df_merged['text']), start=1):
    entry_parts = [
        str(trc), '. ...', str(text),
        " <u><a href=", str(link), " target='_blank'>", "More", "</a></u>",
        "<hr>",
    ]
    pdf_list.append(''.join(entry_parts))
    ref_list.append(str(link))

pdf_summary = ''.join(pdf_list)

Print sample:

In [None]:
# Preview: the first 1000 characters of the assembled summary markup.
print(pdf_summary[:1000] + '...')

Save docx:

In [None]:
# Write the report. `scores` is whichever ROUGE-L percentage was assigned
# last: the extraction score, or the plagiarism score if paraphrasing ran.
save_doc(pdf_summary, 'summary', query, len(df_merged), sent_number, url_keys, mark_keys, scores)
# Longer beep (5000 ms) than the other cells' 1000 ms, after the save call.
winsound.Beep(2500, 5000)