### Libraries

In [1]:
# general
import time, winsound, random
from random import randint
#/general

# process arrays and dataframes
import pandas as pd
import numpy as np
import collections
import fuzzy_pandas as fpd
from collections import Counter
#/process arrays and dataframes

# parallel calculations
from tqdm import tqdm
#/parallel calculations

# web parsing
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from bs4 import BeautifulSoup
from bs4.element import Tag
import chromedriver_binary
from requests import get
#/web parsing

# parsing libs
import arxiv
import wikipediaapi
from googlesearch import search  
#/parsing libs

# read .pdf
from tika import parser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
#/read .pdf

# text processing
import spacy,nltk,string,re
import neuralcoref
import networkx as nx
from spacy.symbols import nsubj, nsubjpass, VERB
from nltk.tokenize import sent_tokenize, word_tokenize
from more_itertools import unique_everseen
from textblob import TextBlob

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')

nlp = spacy.load('en_core_web_lg')
nlp.max_length = 50000000
#/text processing

# create .docx
import docx
from docx import Document
from docx.shared import Cm
from docx.shared import Pt
from docx.enum.dml import MSO_THEME_COLOR_INDEX
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
#/create .docx

# keywords extraction
import yake
#/keywords extraction

# extractive summarizer
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
#/extractive summarizer

# abstractive summarizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
import textstat
import torch
#/abstractive summarizer

# many-to-many evaluation
from rouge import Rouge
from rouge_score import rouge_scorer
#/many-to-many evaluation

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\skamenshchikov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Parameters

In [2]:
# Output folder for generated documents.
# NOTE(review): save_doc() in this notebook hard-codes 'docs/' instead of reading this constant.
UPLOAD_FOLDER = 'docs/'

threshold = 0.8 # similarity cut-off passed to filter_text() when extending Wikipedia summaries
page_number = 10 # number of search results / arXiv entries requested per source
max_length = 60 # number of tokens; NOTE(review): get_response() hard-codes 60 rather than reading this value

### Feature toggles

In [3]:
# Switches read by the parsing cells and by parse_google().
filter_request = False  # when True, parse_google() appends date/language filters to the search URL
paraphrase = False  # not referenced by the code visible in this part of the notebook
# NOTE(review): the CDFs cell later defines a function that is also named `compress`;
# once that cell has run, `compress == True` compares a function with True and is False.
compress = True

# Per-source switches for the three parsing cells.
wiki_sum = True
gogle_sum = True  # name kept as spelled; the Google parsing cell reads `gogle_sum`
arxiv_sum = True

### CDFs

In [6]:
##### HTML parsing #####
def parse_google_page(url):
    """Fetch a web page and return its sentence text together with its title.

    The page is requested once with ``requests.get`` to read ``<title>`` and
    once more by sumy's ``HtmlParser.from_url`` to extract the sentences.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    tuple
        ``(txt, title)``. Both are empty strings when anything goes wrong
        (network error, missing ``<title>``, parsing failure).
    """
    try:
        title = BeautifulSoup(get(url).content, 'html.parser').title.getText()
        # Named html_parser so the module-level tika `parser` import is not shadowed
        html_parser = HtmlParser.from_url(url, Tokenizer("English"))

        summarizer = Summarizer(Stemmer("English"))
        summarizer.stop_words = get_stop_words("English")

        # The very large sentence budget makes the summarizer return every sentence it finds
        txt = ' '.join(str(sentence) for sentence in summarizer(html_parser.document, 1000000))
    except Exception:
        # Best effort: a failing page yields empty strings, but Ctrl-C / kernel
        # interrupts are no longer swallowed as they were by the bare `except:`
        txt = ''
        title = ''

    return txt, title

def striphtml(data):
    """Return ``data`` with every ``<...>`` span (shortest match) removed."""
    return re.sub(r'<.*?>', '', data)

def get_unique_text(document):
    """Drop repeated sentences from ``document``, keeping first occurrences in order.

    Sentences are split with TextBlob and re-joined with single spaces.
    """
    raw_sentences = (sent.raw for sent in TextBlob(document).sentences)
    # dict.fromkeys keeps insertion order, so only the first copy of each sentence survives
    return ' '.join(dict.fromkeys(raw_sentences))

def get_text(url):
    """Download ``url`` and return the text of all its ``<p>`` elements.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    str
        Paragraph texts joined with single spaces.
    """
    # Imported here because the notebook's import cell never brings `urlopen`
    # into scope; without it this function fails with NameError.
    from urllib.request import urlopen

    # The context manager closes the HTTP response once the page has been parsed
    with urlopen(url) as page:
        soup = BeautifulSoup(page)

    fetched_text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))
    return fetched_text
#####/HTML parsing #####

############# Parse Wiki ############# 
def parse_wiki(google_url):
    """Open a Google results page and return the Wikipedia page titles it links to.

    Reads the module-level ``page_number`` to limit how many result links are
    considered.

    Parameters
    ----------
    google_url : str
        Fully built Google search URL.

    Returns
    -------
    list of str
        Fifth ``/``-separated component of every considered link that contains
        ``https://en.wikipedia.org`` (the page title for ``/wiki/<title>`` links).
    """
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get(google_url)
        # Random pause between requests, as in the other parsers
        time.sleep(randint(1,5))
        soup = BeautifulSoup(driver.page_source,'lxml')
    finally:
        # Always shut the browser down, even when loading the page fails
        driver.quit()

    links = []
    for result in soup.find_all('div', attrs={'class': 'g'}):
        anchor = result.find('a', href=True)
        if anchor is None:
            continue

        heading = result.find('h3')
        snippet = result.find('span', attrs={'class': 'st'})

        # Results whose title or description is present but empty are skipped
        if isinstance(heading, Tag) and heading.get_text() == '':
            continue
        if isinstance(snippet, Tag) and snippet.get_text() == '':
            continue

        links.append(anchor['href'])

    wiki_urls = [link for link in links[:page_number] if 'https://en.wikipedia.org' in link]

    title_list = []
    for link in wiki_urls:
        parts = link.split('/')
        # https://en.wikipedia.org/wiki/<title> -> index 4 holds the title
        if len(parts) > 4:
            title_list.append(parts[4])

    return title_list
#############/Parse Wiki #############

############# Parse Arxiv #############
def parse_arxiv(query):
    """Collect abstract-related text for the top arXiv hits of ``query``.

    For each hit the abstract is read from the ``export.arxiv.org`` abstract
    page, the PDF is converted to text with tika, and ``filter_text`` keeps the
    PDF sentences that overlap with the abstract. Reads the module-level
    ``page_number`` for the number of results.

    Parameters
    ----------
    query : str
        Search string passed to ``arxiv.query``.

    Returns
    -------
    tuple
        ``(arxivtext, titles, df)`` where ``arxivtext`` is the concatenated,
        character-filtered text, ``titles`` is the list of paper titles and
        ``df`` has columns ``text``, ``link``, ``page`` (one row per hit).
        Hits whose abstract or PDF text cannot be read get an empty ``text``.
    """
    arxiv_data = arxiv.query(query=query, max_results=page_number)

    urls = [entry['id'].replace('arxiv.org/', 'export.arxiv.org/') for entry in arxiv_data]
    titles = [entry['title'] for entry in arxiv_data]

    txts = []
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        for abs_url in tqdm(urls):

            driver.get(abs_url)
            soup = BeautifulSoup(driver.page_source,'lxml')
            abstract_nodes = soup.find_all('blockquote', attrs={'class': 'abstract mathjax'})

            if not abstract_nodes:
                # Keep txts aligned with urls/titles instead of aborting the whole run
                txts.append('')
                continue

            abstract = abstract_nodes[0].get_text().replace('\n',' ').replace('\t',' ').strip()

            # Only the path segment is rewritten, so an 'abs' inside the id is left alone
            file_data = parser.from_file(abs_url.replace('/abs/', '/pdf/', 1))
            # tika may yield no content for a PDF; treat that as empty text
            content = (file_data.get('content') or '').replace('\n',' ').replace('\t',' ').strip()

            extended_abstract = filter_text(content, abstract, threshold=0.01)
            txts.append(extended_abstract.replace('\n',' ').replace('\t',' ').strip())
    finally:
        # Always shut the browser down, even when a page fails to load
        driver.quit()

    arxivtext = re.sub('[^A-Za-z0-9.]+', ' ', '; '.join(txts))

    df = pd.DataFrame(list(zip(txts, urls, titles)), columns=['text','link', 'page'])

    return arxivtext, titles, df
#############/Parse Arxiv #############

############# Parse Google ###############
def parse_google(query):
    """Search Google for ``query`` and download the text of the result pages.

    Wikipedia, arXiv and Google Patents links are excluded. PDF links are
    converted with tika; other pages go through ``parse_google_page``. Reads
    the module-level ``page_number`` and ``filter_request``.

    Parameters
    ----------
    query : str
        Search string appended to the Google search URL.

    Returns
    -------
    tuple
        ``(googletext, errors, df, titles)``: the concatenated,
        character-filtered page text, the list of URLs that failed to parse,
        a frame with columns ``text``, ``link``, ``page`` (page title) and the
        list of page titles kept in ``df``.
    """
    errors = []

    # get urls
    google_url = "https://www.google.com/search?q=" + query + "&num=" + str(page_number+1)

    if filter_request:
        # NOTE(review): `months_delta` is not defined in the visible part of the notebook
        google_url = google_url + '&searchtype=all&source=header&start=0&date-filter_by=past_' + str(months_delta)
        google_url = google_url + '&hl=en&gl=en' + '&lr=lang_en&cr=countryGB'

    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get(google_url)
        time.sleep(randint(1,5))
        soup = BeautifulSoup(driver.page_source,'lxml')
    finally:
        # The browser was previously never closed by this function
        driver.quit()

    links = []
    for result in soup.find_all('div', attrs={'class': 'g'}):
        anchor = result.find('a', href=True)
        if anchor is None:
            continue
        href = anchor['href']

        heading = result.find('h3')
        snippet = result.find('span', attrs={'class': 'st'})

        # Results whose title or description is present but empty are skipped
        if isinstance(heading, Tag) and heading.get_text() == '':
            continue
        if isinstance(snippet, Tag) and snippet.get_text() == '':
            continue

        if 'wikipedia.org' in href or 'arxiv.org' in href or 'patents.google.com/' in href:
            continue

        links.append(href)

    # De-duplicate while keeping Google's ranking order (a set would shuffle it)
    url_list = list(dict.fromkeys(links))[:page_number]
    #/ get urls

    # One (text, link, title) record per successfully parsed URL keeps the
    # three columns aligned even when some pages fail.
    records = []
    for url in tqdm(url_list):
        time.sleep(random.randint(1,8))

        try:
            if str(url).endswith('.pdf'):
                file_data = parser.from_file(str(url))
                text = file_data['content'].replace('\n','')
                title = text[:100]
            else:
                # Fetch once and reuse both values
                page_text, page_title = parse_google_page(url)
                text = page_text.replace('\n','')
                title = page_title.replace('\n','')
        except Exception:
            print('Parsing error:',str(url))
            errors.append(str(url))
            continue

        records.append((text, url, title))

    df = pd.DataFrame(records, columns=['text','link', 'page'])
    if not df.empty:
        # Drop pages whose title contains 403 or 404
        df = df[~df['page'].str.contains('|'.join(['403','404']), na=False)]
    df = df.replace('', np.nan).dropna()

    googletext = re.sub('[^A-Za-z0-9.]+', ' ', '; '.join(list(df['text'])))
    titles = list(df['page'])

    return googletext, errors, df, titles
#############/Parse Google ###############

############## Text processing #########
def text_from_html(body):
    """Return the visible text nodes of an HTML document joined by spaces.

    A text node is kept when ``tag_visible`` returns True for it; each kept
    node is stripped of surrounding whitespace before joining.
    """
    soup = BeautifulSoup(body, 'html.parser')
    visible_nodes = (node for node in soup.findAll(text=True) if tag_visible(node))
    return u" ".join(node.strip() for node in visible_nodes)

def text_normalize(txt):
    """Turn raw text into a list of lemmatized noun tokens.

    Non-letters are replaced by spaces, English stop words are removed, the
    remaining words are lemmatized with WordNet, and only tokens whose POS tag
    (via ``tags``) is NN, NNP, NNS or NNPS are returned.
    """
    cleaned = re.sub('[^a-zA-Z]', ' ', txt)
    cleaned = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",cleaned)
    cleaned = re.sub("(\\d|\\W)+"," ",cleaned)

    english_stops = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    noun_tags = ['NN', 'NNP', 'NNS', 'NNPS']

    lemmas = []
    for word in word_tokenize(cleaned):
        if word in english_stops:
            continue
        lemmas.append(lemmatizer.lemmatize(word))

    return [lemma for lemma in lemmas if tags(lemma) in noun_tags]

def tag_visible(element):
    """Tell whether a BeautifulSoup text node is visible page content.

    Returns False for text inside style/script/head/title/meta elements or at
    document level, and for HTML comments; True otherwise.
    """
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False

    # Imported here because the notebook's import cell only brings in `Tag`;
    # without it the comment check fails with NameError.
    from bs4.element import Comment

    if isinstance(element, Comment):
        return False
    return True

def filter_triplet(final_text):
    """Keep only sentences for which ``syntax_full`` is true.

    The text is first de-duplicated with ``get_unique_text``, then parsed with
    the module-level spaCy pipeline; surviving sentences are joined by spaces.
    """
    doc = nlp(get_unique_text(final_text))
    return ' '.join(sentence.text for sentence in doc.sents if syntax_full(sentence))

def coref_res(rawtext, coref_greedn = 0.5):
    """Replace coreferent mentions in ``rawtext`` with their cluster's main mention.

    neuralcoref is added to the module-level ``nlp`` pipeline for the duration
    of the call and removed again afterwards, including when processing fails.

    Parameters
    ----------
    rawtext : str
        Text to resolve.
    coref_greedn : float
        Passed to neuralcoref as ``greedyness``.

    Returns
    -------
    str
        The text with every non-main mention replaced by the main mention.
        A mention that starts with an upper-case letter gets a capitalized
        replacement.
    """
    neuralcoref.add_to_pipe(nlp, greedyness = coref_greedn, store_scores=False)
    try:
        doc = nlp(rawtext)

        resolved = [tok.text_with_ws for tok in doc]

        for cluster in doc._.coref_clusters:
            for coref in cluster:
                if coref == cluster.main:
                    # The main mention already reads as itself
                    continue

                trailing_ws = doc[coref.end-1].whitespace_

                if coref.text[0].isalpha() and coref.text[0].isupper():
                    main_words_list = word_tokenize(cluster.main.text)
                    main_words_list[0] = main_words_list[0].capitalize()
                    replacement = detokenizer(main_words_list)
                else:
                    # Previously this branch was attached to the outer `if`, so
                    # lower-case mentions kept their first token and lost the rest
                    replacement = cluster.main.text

                resolved[coref.start] = replacement + trailing_ws
                for i in range(coref.start+1, coref.end):
                    resolved[i] = ""

        text_resolved = ''.join(resolved)
    finally:
        # Leaving the component in place would make the next add_to_pipe call fail
        nlp.remove_pipe("neuralcoref")

    return text_resolved

def compress(spacy_sents,sents_whitelist):
    """Collect the indices of tokens to drop from whitelisted sentences.

    Sentences are numbered from 1. For every whitelisted sentence, each token
    whose dependency label is ``appos`` or ``advmod`` contributes the ``i`` of
    every token in its subtree.
    """
    droppable_deps = ['appos','advmod']
    blacklist_tokens = []

    for position, sent in enumerate(spacy_sents, start=1):
        if position not in sents_whitelist:
            continue
        for token in sent:
            if token.dep_ in droppable_deps:
                blacklist_tokens.extend(node.i for node in token.subtree)

    return blacklist_tokens

def spacy_compress(rawtext):
    """Shorten ``rawtext`` sentence by sentence and return the rebuilt text.

    Pipeline: parse with the module-level ``nlp``; pick sentences with
    ``get_sents_ids_whitelist``; collect droppable token indices via the
    sibling token-blacklisting helper; rebuild each kept sentence with
    ``detokenizer``; tidy punctuation with ``sentence_grammar_fix``; join with
    spaces.
    """
    doc1 = nlp(rawtext)
    sents_whitelist = get_sents_ids_whitelist(doc1.sents)

    tokens_blacklist = compress(doc1.sents,sents_whitelist)
    kept_tokens = get_list_sents_tokens(doc1.sents,sents_whitelist,tokens_blacklist)

    rebuilt_sentences = [detokenizer(tokens) for tokens in kept_tokens]
    return ' '.join(sentence_grammar_fix(rebuilt_sentences))
##############/Text processing #########

############## Get summary #############
def get_summary(rawtext, sentences):
    """Build an extractive summary: the first six sentences plus an LSA selection.

    The first six sentences of ``rawtext`` are always kept; the summarizer
    picks ``sentences`` more from the remainder. The combined text is split
    into sentences again and each one is terminated with ``==``.
    """
    all_sentences = sent_tokenize(rawtext)
    lead, remainder = all_sentences[:6], all_sentences[6:]

    summarizer = Summarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")
    plain_parser = PlaintextParser.from_string(' '.join(remainder), Tokenizer("english"))

    picked = [str(sentence) for sentence in summarizer(plain_parser.document, sentences)]
    combined = ' '.join(lead) + ' '+' '.join(picked)

    return ''.join(str(sentence) + '==' for sentence in nltk.sent_tokenize(combined))
##############/Get summary #############

############## Get tags and entities ###########
def graph_keys(final_text, top_number):
    """Return the ``top_number`` most frequent noun bigrams as a comma-separated string.

    Bigrams are built from ``text_normalize`` applied to the lower-cased text;
    bigrams made of the same word twice are ignored.
    """
    tokens = text_normalize(final_text.lower())
    pairs = [' '.join(pair) for pair in nltk.ngrams(tokens,2) if pair[0] != pair[1]]
    pair_counts = collections.Counter(pairs)

    ranked = pair_counts.most_common(len(pair_counts))
    df = pd.DataFrame(ranked, columns=['bigram', 'count'])[:top_number]
    # Counts are rescaled to percentages of the kept rows; only the bigram column is returned
    df['count'] = 100*df['count']/df['count'].sum().astype(int)

    return ', '.join(list(df['bigram'].astype(str)))

def yake_keys(text, keys_number):
    """Extract up to ``keys_number`` keywords (n-grams up to size 2) with YAKE.

    Returns the second element of every extracted item, joined by ``', '``.
    NOTE(review): whether index 1 is the keyword or its score depends on the
    installed yake version — confirm.
    """
    extractor = yake.KeywordExtractor(lan="en", n=2, top=keys_number)
    extracted = extractor.extract_keywords(text)
    return ', '.join([item[1] for item in extracted])

def get_entities(rawtext, tops):
    """Return the most frequent ORG and PERSON entities of ``rawtext``.

    Parameters
    ----------
    rawtext : str
        Text to analyse.
    tops : int
        Maximum number of entity names to return.

    Returns
    -------
    str
        Entity names ordered by frequency, joined by ``', '``; an empty string
        when no such entity is found.
    """
    spacy_nlp = spacy.load('en_core_web_lg', disable=["tagger","parser"])
    # The limit must be raised on the pipeline that processes the text; it was
    # previously assigned to the module-level `nlp`, leaving this one at its default.
    spacy_nlp.max_length = 1000000000000
    doc = spacy_nlp(rawtext)

    ners = [ent.text for ent in doc.ents if ent.label_ in ['ORG', 'PERSON']]
    ner_counts = collections.Counter(ners)

    top_names = [name for name, _count in ner_counts.most_common()[:tops]]
    return ', '.join(top_names)
##############/Get tags and entities ###########

############## Add keyurls ################
def add_keyurls(final_keys, query):
    """Build one Google search URL per comma-separated key.

    For each key, parenthesised parts are removed and the remaining words are
    joined with ``+``; the query and ``/keyword/<original key>`` are appended.
    """
    search_prefix = 'https://www.google.com/search?q='
    url_keys = []

    for key in final_keys.split(','):
        without_parens = re.sub(r" ?\([^)]+\)", "", key)
        search_terms = '+'.join(without_parens.strip().split())
        url_keys.append(search_prefix + search_terms + '+' + query + '/keyword/' + key)

    return url_keys
##############/Add keyurls ################

##### Abstractive summarization #############
def get_response(input_text,num_return_sequences):
    """Generate ``num_return_sequences`` rewrites of ``input_text``.

    Uses the module-level ``tokenizer``, ``model`` and ``torch_device``; input
    and output are both limited to 60 tokens and decoding uses 10 beams.
    Returns the decoded strings as a list.
    """
    encoded = tokenizer.prepare_seq2seq_batch([input_text], truncation=True, padding='longest', max_length=60, return_tensors="pt")
    batch = encoded.to(torch_device)

    generated = model.generate(**batch, max_length=60, num_beams=10, num_return_sequences=num_return_sequences, temperature=1.5)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)
#####/Abstractive summarization #############

############# Doc preparation ##########
def add_hyperlink(paragraph, text, url, flag):
    """Append a clickable external link to a python-docx paragraph.

    Parameters: ``paragraph`` receives the link, ``text`` is the visible
    label, ``url`` the target and ``flag`` the underline setting of the run.
    Returns the created ``w:hyperlink`` XML element.
    """
    oxml = docx.oxml.shared

    # Register the URL as an external relationship of the document part
    relationship_id = paragraph.part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    # <w:hyperlink r:id="..."> wrapping <w:r><w:rPr/>text</w:r>
    hyperlink = oxml.OxmlElement('w:hyperlink')
    hyperlink.set(oxml.qn('r:id'), relationship_id, )

    link_run = oxml.OxmlElement('w:r')
    link_run.append(oxml.OxmlElement('w:rPr'))
    link_run.text = text
    hyperlink.append(link_run)

    # A fresh run of the paragraph hosts the hyperlink element
    host_run = paragraph.add_run()
    host_run._r.append (hyperlink)

    # Workaround for templates without a hyperlink style: colour and underline set by hand
    host_run.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_run.font.underline = flag

    return hyperlink

def save_doc(final_summary, summary, query, score, compression):
    """Write the summary to ``docs/<summary>.docx``.

    Parameters
    ----------
    final_summary : str
        Summary chunks separated by ``<hr>``; each chunk may contain HTML.
    summary : str
        File name (without extension) of the document to create.
    query : str
        Original request, or the string ``'none'``. With a real query only
        chunks containing an ``<a href=... target='_blank'`` link are written,
        each followed by a "More" hyperlink.
    score, compression : numbers
        Reported in the header; their ratio is printed as model efficiency.

    Returns
    -------
    bool
        Always True once the file has been saved.
    """
    sent_list = list(final_summary.split(sep='<hr>'))
    doc = Document()
    style = doc.styles['Normal']

    font = style.font
    font.name = 'Times New Roman'
    font.size = Pt(12)

    hd = doc.add_paragraph()
    hd.alignment = WD_ALIGN_PARAGRAPH.LEFT
    hd.add_run('Summary').bold = True

    if query != 'none':
        doc.add_paragraph('Request: ' + "''" + query + "''")

    doc.add_paragraph('Information: ' + str(score))
    doc.add_paragraph('Word compression: ' + str(compression))
    doc.add_paragraph('Model efficiency: ' + str(round((score/compression),2)))

    for chunk in sent_list:
        body_text = striphtml(str(chunk)).replace('<hr>','').replace('<u>','').replace('More','')

        if query == 'none':
            paragraph = doc.add_paragraph(body_text)
        else:
            match = re.search(r"<a href=(.*?)target='_blank'", str(chunk))
            if match is None:
                # Chunks without a source link are left out, as before
                continue
            link = match.group(1).replace(' ','')

            try:
                paragraph = doc.add_paragraph(body_text)
                # add_hyperlink returns a raw XML element; the former `.add_run()`
                # call on it always raised and was hidden by a bare except
                add_hyperlink(paragraph, 'More', link, True)
            except Exception as error:
                # Best effort: text that python-docx rejects skips this chunk only
                print('Skipped summary chunk:', error)
                continue

        # Justify the paragraph just written (previously the preceding one was justified)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    doc.save('docs/' + summary + '.docx')

    return True
#############/Doc preparation ##########

############## Sandbox functions ##########
def longest_common_substring(s1, s2):
    """Return the longest contiguous run shared by two sequences.

    Works on strings and lists alike; the result is a slice of ``s1``. When
    several runs share the maximum length, the first one met while scanning
    ``s1`` wins. No common element gives an empty slice.
    """
    best_len, best_end = 0, 0
    # previous_row[col] = length of the common run ending at s1[end-2], s2[col-1]
    previous_row = [0] * (len(s2) + 1)

    for end, left in enumerate(s1, start=1):
        current_row = [0] * (len(s2) + 1)
        for col, right in enumerate(s2, start=1):
            if left == right:
                run = previous_row[col - 1] + 1
                current_row[col] = run
                if run > best_len:
                    best_len, best_end = run, end
        previous_row = current_row

    return s1[best_end - best_len: best_end]

def longest_common_sentence(s1, s2):
    """Return the longest run of consecutive words shared by two strings.

    Both strings are split on single spaces; the shared word run is re-joined
    with single spaces.
    """
    words_first = s1.split(' ')
    words_second = s2.split(' ')
    shared_words = longest_common_substring(words_first, words_second)
    return ' '.join(shared_words)

def css(a,b):
    """Share of the words of ``a`` covered by its longest word run common with ``b``.

    Returns 0 when ``a`` contains no words.
    """
    word_total = len(a.split())
    if word_total == 0:
        return 0
    return len(longest_common_sentence(a,b).split())/word_total

def readingTime(mytext):
    """Estimate reading time as token count divided by 200, rounded to one decimal."""
    token_count = len(word_tokenize(mytext))
    return round(token_count/200.0,1)

def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an HSL grey with a random lightness between 60% and 100%.

    All arguments are accepted for callback compatibility and ignored.
    """
    lightness = random.randint(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness

def tags(x):
    """Return the NLTK part-of-speech tag of the first token of ``x``."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(x))
    first_pair = tagged_tokens[0]
    return first_pair[1]

def syntax_full(spacy_sentence):
    """Tell whether a sentence has a (passive) nominal subject attached to a verb.

    True as soon as one token has dependency ``nsubj`` or ``nsubjpass`` and a
    head whose part of speech is ``VERB``; False otherwise.
    """
    subject_deps = (nsubj, nsubjpass)
    return any(
        token.dep in subject_deps and token.head.pos == VERB
        for token in spacy_sentence
    )

def check_min_num_of_clauses(spacy_sentence, n):
    """Tell whether the sentence holds at least ``n`` subject-plus-verb pairs.

    A token counts when its dependency label is nsubj, nsubjpass, csubj or
    expl and its head is tagged VERB or AUX.
    """
    subject_labels = ('nsubj','nsubjpass','csubj','expl')
    verbal_tags = ('VERB', 'AUX')

    clause_count = sum(
        1
        for token in spacy_sentence
        if token.dep_ in subject_labels and token.head.pos_ in verbal_tags
    )
    return clause_count >= n

def get_sents_ids_whitelist(spacy_sents):
    """Return the 1-based positions of sentences worth keeping.

    A sentence is kept when its text has not been kept before and
    ``check_min_num_of_clauses(sent, 1)`` holds.
    """
    whitelist = []
    kept_texts = []

    for position, sent in enumerate(spacy_sents, start=1):
        if sent.text in kept_texts:
            continue
        if check_min_num_of_clauses(sent,1):
            whitelist.append(position)
            kept_texts.append(sent.text)

    return whitelist

def get_list_sents_tokens(spacy_sents,sents_whitelist,blacklist_tokens):
    """Return the surviving token texts of every whitelisted sentence.

    Sentences are numbered from 1; a token survives when its ``i`` is not in
    ``blacklist_tokens``. One list of token texts is produced per whitelisted
    sentence, possibly empty.
    """
    sents_tokens = []

    for position, sent in enumerate(spacy_sents, start=1):
        if position not in sents_whitelist:
            continue
        sents_tokens.append(
            [token.text for token in sent if token.i not in blacklist_tokens]
        )

    return sents_tokens

def detokenizer(list_of_tokens):
    """Join tokens into a string, attaching punctuation and clitics to the left.

    A token is glued to the preceding text when it starts with an apostrophe
    (straight or curly), is empty, or is found in ``string.punctuation``;
    every other token is preceded by a space. The result is stripped.
    """
    pieces = []
    for w in list_of_tokens:
        glue_to_previous = (
            w.startswith("'")
            or w.startswith("’")
            or w == ''
            or w in string.punctuation
        )
        pieces.append(w if glue_to_previous else " " + w)

    return "".join(pieces).strip()

def sentence_grammar_fix(sentences):
    """Tidy whitespace, punctuation and capitalization of each sentence.

    For every sentence: strip it, remove newlines and empty parentheses,
    collapse whitespace, append a full stop, reduce each run of ``, . - — :``
    to its last character, drop a leading ``. , - —``, and upper-case the
    first letter.

    Parameters
    ----------
    sentences : iterable of str

    Returns
    -------
    list of str
        One cleaned sentence per input sentence (an empty input gives ``'.'``).
    """
    fixed = []
    for sent in sentences:

        sent = sent.strip()
        sent = sent.replace('\n','')
        sent = sent.replace('()','')

        # Raw string: '\s' in a normal literal is an invalid escape sequence
        sent = re.sub(r'\s+',' ',sent)
        sent = sent+'.'
        sent = re.sub(r'([,.\-—:])+',r'\1',sent)

        if len(sent)>1 and sent[0] in ['.',',','-','—']:
            sent = sent[1:]
        sent = sent.strip()

        if len(sent)>1 and sent[0].isalpha():
            sent = sent[0].upper()+sent[1:]
        fixed.append(sent)

    return fixed

def get_scores(report_summary, final_text):
    """Score a summary against the full text with ROUGE-2.

    Both texts are reduced to lower-cased noun lemmas via ``text_normalize``.
    The summary is passed as the scorer's first argument and the full text as
    the second; the third field of the resulting score tuple is returned,
    rounded to two decimals.
    """
    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)

    summary_terms = ' '.join(text_normalize(report_summary)).lower()
    text_terms = ' '.join(text_normalize(final_text)).lower()

    first_score = list(scorer.score(summary_terms, text_terms).values())[0]
    return round(list(first_score)[2],2)
##############/Sandbox functions ##########

############## Extend abstract ##########
def get_ngrams(text):
    """Return the list of word bigrams of ``text`` (split on whitespace)."""
    return list(nltk.ngrams(text.split(), 2))

def get_jaccard_sim(a,b):
    """Share of the distinct bigrams of ``a`` that also occur in ``b``.

    The denominator is the bigram count of ``a`` alone, so the measure is not
    symmetric. Raises ZeroDivisionError when ``a`` has no bigram.
    """
    grams_a = set(get_ngrams(a))
    grams_b = set(get_ngrams(b))
    shared = grams_a & grams_b

    return round(float(len(shared)/len(grams_a)), 2)

def filter_text(content, abstract, threshold=0.5, content_type='arxiv'):
    """Keep the sentences of ``content`` that overlap enough with ``abstract``.

    ``content`` is split on ``.``; a piece is kept when its
    ``get_jaccard_sim`` score against the abstract exceeds ``threshold``
    (pieces that cannot be scored count as 0).

    Parameters
    ----------
    content : str
        Full text to filter.
    abstract : str
        Reference text.
    threshold : float
        Minimum score (exclusive) for a piece to be kept.
    content_type : str
        ``'wiki'`` puts the abstract's own pieces first, followed by the kept
        pieces not already in the abstract, without duplicates; any other
        value returns the kept pieces only.

    Returns
    -------
    str
        The selected pieces joined with ``'. '``.
    """
    content_list = []

    for piece in content.split('.'):
        try:
            sim_score = get_jaccard_sim(piece, abstract)
        except Exception:
            # e.g. a piece with fewer than two words has no bigram to compare
            sim_score = 0

        if sim_score > threshold:
            content_list.append(piece)

    # Built once after the scan; it used to be rebuilt on every iteration
    if content_type == 'wiki':
        abstract_pieces = abstract.split('.')
        abstract_lookup = set(abstract_pieces)
        reduced_list = [piece for piece in content_list if piece not in abstract_lookup]
        final_list = list(dict.fromkeys(abstract_pieces + reduced_list))
    else:
        final_list = content_list

    return '. '.join(final_list)
##############/Extend abstract #########

### Query

In [4]:
# Search request typed by the user; the parsing cells below pass it to Google, Wikipedia and arXiv.
query = input()

window smart glass


### Parse web sources

Parse Wiki:

In [7]:
%%time

# Collect Wikipedia text for the query: find pages through a site-restricted
# Google search, then extend each page summary with closely related sentences.
wikitext = ''
wikikeys = ''

df_wiki = pd.DataFrame()

if wiki_sum == True:

    wiki_wiki = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.WIKI)

    red_query = "https://www.google.com/search?q=" + 'site:https://en.wikipedia.org ' + query + "&num=" + str(page_number+1)
    red_query = red_query + '&searchtype=all&source=header'

    wiki_titles = parse_wiki(red_query)

    txts = []
    titles = []

    for i in tqdm(wiki_titles):

        # One page object serves both the summary and the full text
        wiki_page = wiki_wiki.page(i)
        sent_list = filter_text(wiki_page.text, wiki_page.summary, threshold=threshold, content_type='wiki')

        titles.append(i)
        txts.append(''.join(sent_list).replace('\n', ''))

    wikitext = ''.join(txts).replace('\n','')

    # NOTE(review): if the CDFs cell has already run, `compress` names the
    # function defined there and this comparison is False.
    if compress == True:
        wikitext = coref_res(filter_triplet(wikitext))

    url_list = [str('https://en.wikipedia.org/wiki/' + i)  for i in wiki_titles]

    df_wiki = pd.DataFrame(list(zip(txts, url_list, titles)), columns=['text','link', 'page'])
    df_wiki = df_wiki.replace('', np.nan).dropna()

    if len(df_wiki) > 0:
        random_num = randint(1,len(df_wiki))

        # Positional lookup: dropna() can leave gaps in the index labels
        sample_row = df_wiki.iloc[random_num-1]
        print(sample_row['page']+'\n')
        print(sample_row['text'][:1000]+'...'+'\n')
    else:
        print('No data')

winsound.Beep(2500, 1000)

 


[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\skamenshchikov\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.62it/s]


Heated_glass

Heated glass is a resistance heater created when a transparent, electrically conductive coating is applied to float glass and then subjected to an electric current.  The electric current in the coating creates heat energy, which warms the glass until the glass radiates heat. ...

Wall time: 13 s


Wiki output:

In [8]:
# Preview the first 2000 characters of the Wikipedia text gathered above.
wikitext[:2000] + '...'

'Smart glass or switchable glass (also smart windows or switchable windows in those applications) is a glass or glazing whose light transmission properties are altered when voltage, light, or heat is applied.  In general, the glass changes from transparent to translucent and vice versa, changing from letting light pass through to blocking some (or all) wavelengths of light and vice versa. Smart glass technologies include electrochromic, photochromic, thermochromic, suspended-particle, micro-blind, and polymer-dispersed liquid-crystal devices. When installed in the envelope of buildings, smart glass creates climate adaptive building shells. Smart film, also called Switchable film, is a product that is capable of adjusting light transmission between transparent and opaque using AC power.  Due to moisture sensitivity, earlier versions of the film were used only to make smart glass by lamination on glass.  With continual improvement in moisture resistance, the new (3rd) generation of the f

Parse Arxiv:

In [9]:
%%time

# Collect arXiv text for the query and print one randomly chosen paper as a sample.
arxivtext = ''
arxivkeys = ''

df_arxiv = pd.DataFrame()

if arxiv_sum == True:

    try:
        df_arxiv = parse_arxiv(query)[2]
        df_arxiv = df_arxiv.replace('', np.nan).dropna()

        arxivtext = ''.join(list(df_arxiv['text']))

        # NOTE(review): if the CDFs cell has already run, `compress` names the
        # function defined there and this comparison is False.
        if compress == True:
            arxivtext = coref_res(filter_triplet(arxivtext))

        random_num = randint(1, len(df_arxiv))

        # Positional lookup: dropna() can leave gaps in the index labels
        sample_row = df_arxiv.iloc[random_num-1]
        print(sample_row['page']+'\n')
        print(sample_row['text'][:1000]+'...'+'\n')

    except Exception as error:
        print('No data')
        # Show why, so a real failure is not mistaken for an empty result
        print('Reason:', repr(error))

winsound.Beep(2500, 1000)

 


[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\skamenshchikov\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]2021-01-20 19:24:00,102 [MainThread  ] [INFO ]  Retrieving http://export.arxiv.org/pdf/1911.05273v1 to C:\Users\SKAMEN~1\AppData\Local\Temp/pdf-1911.05273v1.
 10%|████████▎                                                                          | 1/10 [00:08<01:16,  8.52s/it]2021-01-20 19:24:04,595 [MainThread  ] [INFO ]  Retrieving http://export.arxiv.org/pdf/1911.02990v2 to C:\Users\SKAMEN~1\AppData\Local\Temp/pdf-1911.02990v2.
 20%|████████████████▌                                                                  | 2/10 [00:14<01:01,  7.68s/it]2021-01-20 19:24:10,344 [MainThread  ] [INFO ]  Retrieving http://export.arxiv.org/pdf/1610.08807v1 to C:\User

Hybrid electrochromic device with Tungsten oxide (WO3-x) and nafion
  membrane: performance with varying tungsten oxide thickness

Microsoft Word - Hybrid electrochromic device with Tungsten oxide- corrected   1     Hybrid electrochromic device with Tungsten oxide (WO3-x) and   nafion membrane: performance with varying tungsten oxide thickness   K Uday Kumar1, S D Bhat2,V V Giridhar2 and A Subrahmanyam1   1 Semiconductor laboratory, Department of Physics, Indian Institute of Technology Madras,   Chennai, 600036, India   2 CSIR-Central Electrochemical Research Institute-Madras Unit, CSIR Madras Complex,   Chennai 600 113, India   Abstract:   Electrochromic devices, which dynamically change color under the applied potential, are   widely studied because of its wide range of applications such as energy-efficient smart   windows, rear view mirrors and display devices etc.  In this study we are reporting four layer   electrochromic device based on tungsten oxide as a electrochromic layer an

Parse Google:

In [32]:
%%time

# Collect text from general Google results and print one randomly chosen page as a sample.
googletext = ''
df_google = pd.DataFrame()

if gogle_sum == True:

    try:
        df_google = parse_google(query)[2]
        # Rows without text, link or title are dropped so join() only sees strings
        df_google = df_google.replace('', np.nan).dropna()

        googletext = ''.join(list(df_google['text']))

        # NOTE(review): if the CDFs cell has already run, `compress` names the
        # function defined there and this comparison is False.
        if compress == True:
            googletext = coref_res(filter_triplet(googletext))

        random_num = randint(1,len(df_google))

        print(list(df_google['page'])[random_num-1]+'\n')
        print(list(df_google['text'])[random_num-1])

    except Exception as error:
        print('No data')
        # Show why, so a real failure is not mistaken for an empty result
        print('Reason:', repr(error))

winsound.Beep(2500, 1000)

 


[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\skamenshchikov\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache
100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [01:19<00:00,  6.54s/it]


What Are Smart Glass Windows? | Intelligent Glass



In the Switchable Smart Glass industry, one of the questions we are asked most frequently is ‘what are Smart Windows?’, which is a broad question that has just as many answers as there are ways to ask it. Once a customer has ascertained what Smart Glass windows actually are, questions pertaining to how Smart Windows work begin to surface, however, to get any meaning out of these questions, the customer first needs to understand what a Smart Window is and how the different options involved can impact the answers. In this post, we will be looking at various types of Smart Glass Windows that Intelligent Glass offers and the differences between them in order to help you find the right product for you. For the purposes of keeping this post relevant, we will be looking at PDLC Switchable Smart Glass products and their window-based applications rather than wider definitions of ‘Smart Windows’ that may include electrochromic or photochromic glass. This post will instead focus on ‘Smart Windows

Wall time: 1min 30s


### Extractive summary

Concatenate dataframes:

In [33]:
# Stack the three per-source frames; pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0. Row labels of each source are kept as they are.
df = pd.concat([df_wiki, df_google, df_arxiv])
df.head()

Unnamed: 0,text,link,page
0,Smart glass or switchable glass (also smart wi...,https://en.wikipedia.org/wiki/Smart_glass,Smart_glass
1,"Smart film, also called Switchable film, is a ...",https://en.wikipedia.org/wiki/Smart_film,Smart_film
2,An electrochromic device (ECD) controls optica...,https://en.wikipedia.org/wiki/Electrochromic_d...,Electrochromic_device
3,Electrochromism is the phenomenon where the co...,https://en.wikipedia.org/wiki/Electrochromism,Electrochromism
4,Smartglasses or smart glasses are wearable com...,https://en.wikipedia.org/wiki/Smartglasses,Smartglasses


Define optimal compression rate:

In [34]:
%%time

# Grow the summary in steps of 10 sentences until its ROUGE-2 score against
# the full text exceeds 0.5; the first volume that does is kept in sent_number.
final_text = (wikitext + arxivtext + googletext)

# Reset so a re-run never silently reuses a value from an earlier execution
sent_number = None

for i in tqdm(range(10, 10000, 10)):

    report_summary = get_summary(final_text, i)
    scores = get_scores(report_summary, final_text)

    print('Volume:', i)
    print('Score:', scores, '\n')

    if scores > 0.5:
        print('Optimal volume:', i, 'sentences', '\n')
        sent_number = i
        break
else:
    # The target was never reached: fall back to the largest volume tried
    sent_number = i
    print('Target score not reached; using volume:', sent_number, 'sentences', '\n')

  0%|                                                                                          | 0/999 [00:00<?, ?it/s]

Volume: 10
Score: 0.04 



  0%|                                                                                | 1/999 [00:28<7:59:14, 28.81s/it]

Volume: 20
Score: 0.07 



  0%|▏                                                                               | 2/999 [00:52<7:30:45, 27.13s/it]

Volume: 30
Score: 0.09 



  0%|▏                                                                               | 3/999 [01:13<7:03:49, 25.53s/it]

Volume: 40
Score: 0.12 



  0%|▎                                                                               | 4/999 [01:36<6:47:21, 24.56s/it]

Volume: 50
Score: 0.13 



  1%|▍                                                                               | 5/999 [01:58<6:35:46, 23.89s/it]

Volume: 60
Score: 0.15 



  1%|▍                                                                               | 6/999 [02:20<6:28:09, 23.45s/it]

Volume: 70
Score: 0.17 



  1%|▌                                                                               | 7/999 [02:43<6:23:30, 23.20s/it]

Volume: 80
Score: 0.19 



  1%|▋                                                                               | 8/999 [03:06<6:22:22, 23.15s/it]

Volume: 90
Score: 0.21 



  1%|▋                                                                               | 9/999 [03:31<6:31:01, 23.70s/it]

Volume: 100
Score: 0.22 



  1%|▊                                                                              | 10/999 [03:54<6:28:30, 23.57s/it]

Volume: 110
Score: 0.23 



  1%|▊                                                                              | 11/999 [04:18<6:28:32, 23.60s/it]

Volume: 120
Score: 0.25 



  1%|▉                                                                              | 12/999 [04:42<6:31:07, 23.78s/it]

Volume: 130
Score: 0.26 



  1%|█                                                                              | 13/999 [05:07<6:38:20, 24.24s/it]

Volume: 140
Score: 0.27 



  1%|█                                                                              | 14/999 [05:31<6:35:25, 24.09s/it]

Volume: 150
Score: 0.3 



  2%|█▏                                                                             | 15/999 [05:59<6:54:10, 25.25s/it]

Volume: 160
Score: 0.31 



  2%|█▎                                                                             | 16/999 [06:25<6:55:54, 25.39s/it]

Volume: 170
Score: 0.33 



  2%|█▎                                                                             | 17/999 [06:50<6:52:21, 25.19s/it]

Volume: 180
Score: 0.34 



  2%|█▍                                                                             | 18/999 [07:16<6:55:36, 25.42s/it]

Volume: 190
Score: 0.35 



  2%|█▌                                                                             | 19/999 [07:47<7:25:45, 27.29s/it]

Volume: 200
Score: 0.36 



  2%|█▌                                                                             | 20/999 [08:14<7:24:06, 27.22s/it]

Volume: 210
Score: 0.39 



  2%|█▋                                                                             | 21/999 [08:42<7:26:58, 27.42s/it]

Volume: 220
Score: 0.4 



  2%|█▋                                                                             | 22/999 [09:11<7:33:52, 27.87s/it]

Volume: 230
Score: 0.41 



  2%|█▊                                                                             | 23/999 [09:38<7:30:26, 27.69s/it]

Volume: 240
Score: 0.42 



  2%|█▉                                                                             | 24/999 [10:05<7:24:20, 27.34s/it]

Volume: 250
Score: 0.43 



  3%|█▉                                                                             | 25/999 [10:31<7:18:56, 27.04s/it]

Volume: 260
Score: 0.43 



  3%|██                                                                             | 26/999 [10:58<7:16:07, 26.89s/it]

Volume: 270
Score: 0.45 



  3%|██▏                                                                            | 27/999 [11:24<7:13:13, 26.74s/it]

Volume: 280
Score: 0.47 



  3%|██▏                                                                            | 28/999 [11:51<7:13:37, 26.79s/it]

Volume: 290
Score: 0.48 



  3%|██▎                                                                            | 29/999 [12:18<7:14:16, 26.86s/it]

Volume: 300
Score: 0.49 



  3%|██▎                                                                            | 30/999 [12:45<7:14:57, 26.93s/it]

Volume: 310
Score: 0.49 



  3%|██▍                                                                            | 31/999 [13:13<7:18:33, 27.18s/it]

Volume: 320
Score: 0.5 



  3%|██▌                                                                            | 32/999 [13:40<7:18:44, 27.22s/it]

Volume: 330
Score: 0.51 

Optimal volume: 330 sentences 

Wall time: 14min 7s


Get extractive summary:

In [35]:
%%time

final_text = (wikitext + arxivtext + googletext)
report_summary = get_summary(final_text, sent_number)

Wall time: 8.94 s


Summary metrics:

In [36]:
# Compression = summary length / source length, both counted in word tokens.
summary_words = len(word_tokenize(report_summary))
source_words = len(word_tokenize(final_text))
# An empty source would otherwise raise ZeroDivisionError.
compression = round(summary_words / source_words, 2) if source_words else 0.0
scores = get_scores(report_summary, final_text)

# Efficiency = score per unit of (rounded) compression. The rounded compression
# can be 0.0 for a very short summary, so report NaN rather than dividing by zero.
efficiency = round(scores / compression, 2) if compression else float('nan')

print('Bigrams extracted:', str(scores))
print('\nCompression:', compression, '\n')
print('Efficiency:', str(efficiency), '\n')

# Preview only the first 1000 characters of the summary.
print((report_summary[:1000])+'...', '\n')

# Audible signal that the cell has finished (winsound is Windows-only).
winsound.Beep(2500, 1000)

Bigrams extracted: 0.51

Compression: 0.33 

Efficiency: 1.55 

Smart glass or switchable glass (also smart windows or switchable windows in those applications) is a glass or glazing whose light transmission properties are altered when voltage, light, or heat is applied.==In general, the glass changes from transparent to translucent and vice versa, changing from letting light pass through to blocking some (or all) wavelengths of light and vice versa.==Smart glass technologies include electrochromic, photochromic, thermochromic, suspended-particle, micro-blind, and polymer-dispersed liquid-crystal devices.==When installed in the envelope of buildings, smart glass creates climate adaptive building shells.==Smart film, also called Switchable film, is a product that is capable of adjusting light transmission between transparent and opaque using AC power.==Due to moisture sensitivity, earlier versions of the film were used only to make smart glass by lamination on glass.==With continual imp

### Abstractive summary

Paraphrase generation:

In [37]:
%%time

if paraphrase == True:
    
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
    
    counter = 0
    summ_list = []
    
    for i in report_summary.split('==')[:-1]:
        summ_list.append('=='+ get_response(i,1)[0])
    
    summary = ' '.join(summ_list)

    scores = scorer.score(summary, report_summary)
    scores = round(100*list(list(scores.values())[0])[2])
    report_summary = summary 

    print('Plagiarism:', (str(scores) + ' %'))

    winsound.Beep(2500, 1000)

Wall time: 0 ns


### Extend the content

Match each summary sentence to the URL of its best-scoring source text:

In [38]:
# Parallel lists: pdf_list[k] is a summary sentence, ref_list[k] the link of the
# source text that scored highest against it. They must stay the same length
# because the next cell zips them together.
ref_list = []
pdf_list = []

# report_summary is '=='-terminated, so the piece after the last '==' is dropped.
sent_list = list(report_summary.split(sep='=='))[:-1]

skipped = 0
for sentence in sent_list:
    try:
        # Score every source text against this sentence and take the best match.
        df_score = df.copy()
        df_score['score'] = df_score['text'].apply(lambda x: css(sentence, x))
        df_score = df_score.sort_values(by=['score'], ascending=False)
        best_link = str(df_score['link'].iloc[0])
    except Exception:
        # Best effort: skip a sentence that cannot be scored. The old bare
        # `except` appended '' to pdf_list only, which shifted every later
        # sentence onto the wrong link when the lists were zipped.
        skipped += 1
        continue

    if best_link:
        pdf_list.append(str(sentence))
        ref_list.append(best_link)

if skipped:
    print('Sentences skipped (could not be matched to a source):', skipped)

pdf_summary = ''.join(pdf_list)
# Audible signal that the cell has finished (winsound is Windows-only).
winsound.Beep(2500, 1000)

Group the summary sentences by source URL, keeping the URLs in the order they first appear:

In [39]:
# One row per (link, sentence) pair, in summary order.
link_text_pairs = list(zip(ref_list, pdf_list))

df_merged = (
    pd.DataFrame(link_text_pairs, columns=['link', 'text'])
    .sort_index(ascending=True)
    # Merge all sentences that share a link into one space-separated text.
    .groupby('link', as_index=True)
    .agg(lambda x: ' '.join(x))
    # groupby sorts by link; restore first-appearance order of the links.
    .reindex(list(unique_everseen(ref_list)))
    .reset_index()
    # Treat empty strings as missing and drop those rows.
    .replace('', np.nan)
    .dropna()
)

df_merged.head()

Unnamed: 0,link,text
0,https://en.wikipedia.org/wiki/Smart_glass,Smart glass or switchable glass (also smart wi...
1,https://en.wikipedia.org/wiki/Smart_film,"Smart film, also called Switchable film, is a ..."
2,https://en.wikipedia.org/wiki/Electrochromism,"By doing so, an electrochromic smart window ca..."
3,https://en.wikipedia.org/wiki/Smartglasses,"Alternatively, smartglasses are sometimes defi..."
4,"https://en.wikipedia.org/wiki/View,_Inc.","Founded in 2007, the company is headquartered ..."


Format each source's text as a numbered HTML entry with a "More" link to its URL:

In [40]:
# Rebuild both lists from df_merged: one numbered HTML entry per source
# (text followed by an underlined "More" link and a rule) plus the bare link.
ref_list = []
pdf_list = []

for number, (link, text) in enumerate(zip(df_merged['link'], df_merged['text']), start=1):
    entry = f"{number}. ...{text!s} <u><a href={link!s} target='_blank'>More</a></u><hr>"
    pdf_list.append(entry)
    ref_list.append(str(link))

# Single HTML string containing every entry, in df_merged row order.
pdf_summary = ''.join(pdf_list)

In [None]:
# Preview the first 1000 characters of the assembled HTML summary.
print(f"{pdf_summary[:1000]}...")

Save docx:

In [41]:
# Hand the assembled HTML summary, the output name 'summary', the search query and
# the two summary metrics to save_doc (defined elsewhere; presumably writes the .docx).
# NOTE(review): `scores` is the value from the metrics cell unless the paraphrase
# branch ran, which overwrites it with the plagiarism percentage — confirm which is intended.
save_doc(pdf_summary, 'summary', query, scores, compression)
# Longer beep than in the earlier cells, signalling that the final step has finished.
winsound.Beep(2500, 3000)