In [24]:
import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
import bs4
from bs4 import BeautifulSoup
import time
import re
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from collections import defaultdict

def extractor(url,wait_time):
    """
    Accepts a url and stores its html code before parsing and extracting the abstract as text.
    Feeds directly into parser, so don't call this function unless you want to obtain the abstract
    from a single url.
    """
    driver = webdriver.Chrome()
    driver.get(url)
    #driver.find_element_by_id('').send_keys('')#
    #driver.find_element_by_id ('').send_keys('')#
    #driver.find_element_by_id('submit').click()#
    time.sleep(wait_time) # important

    html_doc = driver.page_source # stores the source HTML code in the driver's page_source attribute
    soup = BeautifulSoup(html_doc, 'html.parser')
    abstract = soup.find('div', {'class':"abstract author"}).find('p').text
    
    driver.quit()
    return abstract

def parse_all(driver):
    """
    The following method is designed to automatically parse each url contained in a long list 
    of scraped urls, and writes the title, abstract, and doi to a new text file with a user
    input "file_name.txt."
    """
    url_lst = input("Enter name of file with .txt extension with list of urls: ")
    data = pd.read_csv(url_lst,header=None,names=['url']) #text file containing a list of the scraped urls (should be in same directory)
    file_name = input("Input the file name with .txt extension you wish to store abstracts in: ")
    file = open(file_name,'w')

    max_iters = len(data) #total number of scraped urls to be parsed
    print("The parser will parse: " + str(max_iters) + " urls.")

    for i in range(0,max_iters):
        print('On url ',i)
        driver.refresh()
        time.sleep(2)
        urli = str(extractor(data.iloc[i,0],3))
        file.write(urli)
        file.write('\n')
    driver.quit()

    return file_name

### Actual executable code is shown below ###
### driver = webdriver.Chrome()
### parse_all(driver)

def tokenizer(file_name):
    """ 
    Accepts text file as a string (e.g. "abstracts.txt") containing a list of abstracts as input and cleans up text using regex.
    """
    with open(file_name) as file:
        corpus = file.readlines()
        processed_abstracts = [w.lower() for w in corpus]
        processed_abstracts = [re.sub('[^a-zA-Z]', ' ', w) for w in processed_abstracts]
        processed_abstracts = [re.sub(r'\s+', ' ', w) for w in processed_abstracts]
    tokens = [nltk.word_tokenize(sent) for sent in processed_abstracts]

    for i in range(len(processed_abstracts)):
        tokens[i] = [w for w in tokens[i] if w not in stopwords.words('english')]

    # Passes all tokens to Word2Vec to train model
    model = Word2Vec(tokens, size=100, min_count=2, iter=10) 
    vocabulary = model.wv.vocab

    return model, vocabulary

def single_abstract_tkzr(url,wait_time):
    """
    This method tokenizes an abstract from a single url. Wait time is an integer number of seconds you
    want to wait for the page to load.
    """
    driver = webdriver.Chrome()
    abstract_text = extractor(url,wait_time)
    test_abstract = abstract_text.lower()
    test_abstract = re.sub('[^a-zA-Z]', ' ', test_abstract) 
    test_abstract = re.sub(r'\s+', ' ', test_abstract)
        
    abstract_tokens = nltk.word_tokenize(test_abstract)

    text_tokens = [tkn for tkn in abstract_tokens if tkn not in stopwords.words('english')]
    driver.quit()
    return text_tokens

def cosine_scores(search_terms,text_tokens,model,n_tokens):
    """
    Extracts the top n most similar tokens and their respective cosine similarity scores by 
    comparing the tokens from a single abstract to the trained vocabulary.
    Parameters:
    search_terms: desired search terms written as a list of strings; 
    text_tokens: A list of tokens for the text_tokens;
    model: The trained word2vec model;
    n_tokens: the number of top most similar tokens you'd like to return
    """

    store = defaultdict(int)
    for word in search_terms:
        for tkn in text_tokens:
            store[tkn] += model.wv.similarity(word,tkn)
    
    # Orders dictionary from highest to lowest cosine similarity score
    cos_scores = sorted(store.items() , reverse=True, key=lambda x: x[1])
    
    # Extracts top 20 most similar tokens
    return cos_scores[:int(n_tokens)]

In [8]:

df = pd.read_csv("organic.laser.url.csv")

#df.loc[df['PDF only'] == 0, 'PDF only'] = 'Nan'; for changing the values of 'PDF only' from 0 to 'NaN'#

first_condition = df['PDF only'] == 0.0
second_condition = df['Score AC'] >= 0.0 
third_condition = df['Score TG'] >= 0.0
fourth_condition = df['Score WT'] >= 0.0
msk = (first_condition & (second_condition | third_condition | fourth_condition ))

new_df = df[msk]
new_df = new_df.reindex(columns = ['URL', 'Abstract', 'Tokens', 'Cosine Top 20', 'Cumulative Sum', 'Score AC', 'Score TG', 'Score WT'])


driver = webdriver.Chrome()


for i in range(0, 3):
    print('On url', i)
    driver.refresh()
    time.sleep(2)
    new_df['Abstract'].iloc[i] = str(extractor(new_df['URL'].iloc[i],3))

driver.quit()

#Need to save new_df and abstracts.txt

print (new_df)


On url 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


On url 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


On url 2
                                                    URL  \
0     https://www-sciencedirect-com.offcampus.lib.wa...   
1     https://www-sciencedirect-com.offcampus.lib.wa...   
2     https://www-sciencedirect-com.offcampus.lib.wa...   
3     https://www-sciencedirect-com.offcampus.lib.wa...   
4     https://www-sciencedirect-com.offcampus.lib.wa...   
5     https://www-sciencedirect-com.offcampus.lib.wa...   
6     https://www-sciencedirect-com.offcampus.lib.wa...   
7     https://www-sciencedirect-com.offcampus.lib.wa...   
8     https://www-sciencedirect-com.offcampus.lib.wa...   
9     https://www-sciencedirect-com.offcampus.lib.wa...   
10    https://www-sciencedirect-com.offcampus.lib.wa...   
11    https://www-sciencedirect-com.offcampus.lib.wa...   
13    https://www-sciencedirect-com.offcampus.lib.wa...   
15    https://www-sciencedirect-com.offcampus.lib.wa...   
16    https://www-sciencedirect-com.offcampus.lib.wa...   
17    https://www-sciencedirect-com.offcampus.l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [25]:
test_url = 'https://www-sciencedirect-com.offcampus.lib.washington.edu/science/article/pii/S1566119911002801'

print(str(extractor(test_url, 3)))

Introduction of microstructures into an organic light-emitting device (OLED) is being considered as an effective approach to outcouple photons trapped in waveguide (WG) and surface plasmon-polariton (SPP) modes within the devices. However, the attempt has been hampered by the difficulty in applying lithographic patterning technologies on organic materials. Here, we show the end has been simply reached by one-step directly laser ablating the hole-transporting layer of the OLEDs without inducing any optical or electrical deterioration. Three times efficiency enhancement has been experimentally attained from the corrugated OLEDs, which has then been ascribed by numerical simulation to the efficient outcoupling of the SPP and WG modes to radiation.


In [None]:
new_df.to_csv('Organic_Laser_Abstracts.csv')
new_df['Abstracts'].to_txt('Organic_Laser_Tokenizer.txt')

In [None]:
#Have tokens added to the new_df and teach Word2Vec model#

txt_abstracts = pd.read_txt('.txt')
new_df = pd.read_csv('.csv')

for i in range(0,len(new_df)):
    

      Year                                                URL  Score AC  \
0     2011  https://www-sciencedirect-com.offcampus.lib.wa...       1.0   
1     2011  https://www-sciencedirect-com.offcampus.lib.wa...       2.0   
2     2011  https://www-sciencedirect-com.offcampus.lib.wa...       1.0   
3     2011  https://www-sciencedirect-com.offcampus.lib.wa...       1.0   
4     2011  https://www-sciencedirect-com.offcampus.lib.wa...       2.0   
5     2011  https://www-sciencedirect-com.offcampus.lib.wa...       0.0   
6     2011  https://www-sciencedirect-com.offcampus.lib.wa...       0.0   
7     2011  https://www-sciencedirect-com.offcampus.lib.wa...       0.0   
8     2011  https://www-sciencedirect-com.offcampus.lib.wa...       0.0   
9     2011  https://www-sciencedirect-com.offcampus.lib.wa...       0.0   
10    2011  https://www-sciencedirect-com.offcampus.lib.wa...       0.0   
11    2011  https://www-sciencedirect-com.offcampus.lib.wa...       0.0   
12    2011  https://www-s