In [23]:
import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
import bs4
from bs4 import BeautifulSoup
import time
import re
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from collections import defaultdict

def extractor(url,wait_time):
    """
    Accepts a url and stores its html code before parsing and extracting the abstract and date as text.
    Feeds directly into parser, so don't call this function unless you want to obtain the abstract
    from a single url.
    """
    driver = webdriver.Chrome()
    driver.get(url)
    #driver.find_element_by_id('').send_keys('')#
    #driver.find_element_by_id ('').send_keys('')#
    #driver.find_element_by_id('submit').click()#
    time.sleep(wait_time) # important

    html_doc = driver.page_source # stores the source HTML code in the driver's page_source attribute
    soup = BeautifulSoup(html_doc, 'html.parser')
    abstract = soup.find('div', {'class':"abstract author"}).find('p').text
    pub_date = soup.find('meta', {'name': "citation_publication_date"})['content']
    
    driver.quit()
    return pub_date, abstract


def parse_all(driver):
    """
    The following method is designed to automatically parse each url contained in a long list 
    of scraped urls, and writes the title, abstract, and doi to a new text file with a user
    input "file_name.txt."
    """
    url_lst = input("Enter name of file with .txt extension with list of urls: ")
    data = pd.read_csv(url_lst,header=None,names=['url']) #text file containing a list of the scraped urls (should be in same directory)
    file_name = input("Input the file name with .txt extension you wish to store abstracts in: ")
    file = open(file_name,'w')

    max_iters = len(data) #total number of scraped urls to be parsed
    print("The parser will parse: " + str(max_iters) + " urls.")

    for i in range(0,max_iters):
        print('On url ',i)
        driver.refresh()
        time.sleep(2)
        urli = str(extractor(data.iloc[i,0],3))
        file.write(urli)
        file.write('\n')
    driver.quit()

    return file_name

### Actual executable code is shown below ###
### driver = webdriver.Chrome()
### parse_all(driver)

def tokenizer(file_name):
    """ 
    Accepts text file as a string (e.g. "abstracts.txt") containing a list of abstracts as input and cleans up text using regex.
    """
    with open(file_name) as file:
        corpus = file.readlines()
        processed_abstracts = [w.lower() for w in corpus]
        processed_abstracts = [re.sub('[^a-zA-Z]', ' ', w) for w in processed_abstracts]
        processed_abstracts = [re.sub(r'\s+', ' ', w) for w in processed_abstracts]
    tokens = [nltk.word_tokenize(sent) for sent in processed_abstracts]

    for i in range(len(processed_abstracts)):
        tokens[i] = [w for w in tokens[i] if w not in stopwords.words('english')]

    # Passes all tokens to Word2Vec to train model
    model = Word2Vec(tokens, size=100, min_count=2, iter=10) 
    vocabulary = model.wv.vocab

    return model, vocabulary

def single_abstract_tkzr(abstract):
    """
    This method tokenizes an abstract from a single url. Wait time is an integer number of seconds you
    want to wait for the page to load.
    """
    #driver = webdriver.Chrome()
    #abstract_text = extractor(url,wait_time)
    test_abstract = abstract.lower()
    test_abstract = re.sub('[^a-zA-Z]', ' ', test_abstract) 
    test_abstract = re.sub(r'\s+', ' ', test_abstract)
        
    abstract_tokens = nltk.word_tokenize(test_abstract)

    text_tokens = [tkn for tkn in abstract_tokens if tkn not in stopwords.words('english')]
    #driver.quit()
    return text_tokens

def cosine_scores(search_terms,text_tokens,model,n_tokens):
    """
    Extracts the top n most similar tokens and their respective cosine similarity scores by 
    comparing the tokens from a single abstract to the trained vocabulary.
    Parameters:
    search_terms: desired search terms written as a list of strings; 
    text_tokens: A list of tokens for the text_tokens;
    model: The trained word2vec model;
    n_tokens: the number of top most similar tokens you'd like to return
    """

    store = defaultdict(int)
    for word in search_terms:
        for tkn in text_tokens:
            store[tkn] += model.wv.similarity(word,tkn)
    
    # Orders dictionary from highest to lowest cosine similarity score
    cos_scores = sorted(store.items() , reverse=True, key=lambda x: x[1])
    
    # Extracts top 20 most similar tokens
    return cos_scores[:int(n_tokens)]

In [59]:
df = pd.read_csv("organic.laser.url.csv")

first_condition = df['PDF only'].isnull() #NaN actually means that nothing is there, not a string
second_condition = df['Score AC'] >= 0.0 
third_condition = df['Score TG'] >= 0.0
fourth_condition = df['Score WT'] >= 0.0
keep = (first_condition & (second_condition | third_condition | fourth_condition )) #Masking of URLs with PDFs

new_df = df[keep]
new_df = new_df.reindex(columns = ['Date', 'URL', 'Abstract', 'Tokens', 'Cosine Top 20', 'Cumulative Sum', 'Score AC', 'Score TG', 'Score WT'])


Unnamed: 0,Date,URL,Abstract,Tokens,Cosine Top 20,Cumulative Sum,Score AC,Score TG,Score WT
0,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,1.0,,
1,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,2.0,,
2,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,1.0,,
3,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,1.0,,
4,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,2.0,,
5,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,0.0,,
6,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,0.0,,
7,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,0.0,,
8,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,0.0,,
9,,https://www-sciencedirect-com.offcampus.lib.wa...,,,,,0.0,,


In [3]:
driver = webdriver.Chrome()

for i in range(0, len(new_df)):
    print('On url', i)
    driver.refresh()
    time.sleep(2)
    pub_date, abstract = extractor(new_df['URL'].iloc[i],3) #Extracting both publication date and abstract
    new_df.iloc[i,2] = str(abstract) #Date is position 0 and Abstract is position 2 in the array
    new_df.iloc[i,0] = str(pub_date)
    
driver.quit()

#Need to save new_df as csv for later and abstracts as txt for tokenizer
new_df.to_csv('Organic_Laser_Abstracts.csv')
with open('Organic_Laser_Tokenizer.txt', "w") as output:
    output.write(str(new_df['Abstract']))
                 

On url 0
On url 1
On url 2
On url 3
On url 4
On url 5
On url 6
On url 7
On url 8
On url 9
On url 10
On url 11
On url 12
On url 13
On url 14
On url 15
On url 16
On url 17
On url 18
On url 19
On url 20
On url 21
On url 22
On url 23
On url 24
On url 25
On url 26
On url 27
On url 28
On url 29
On url 30
On url 31
On url 32
On url 33
On url 34
On url 35
On url 36
On url 37
On url 38
On url 39
On url 40
On url 41
On url 42
On url 43
On url 44
On url 45
On url 46
On url 47
On url 48
On url 49
On url 50
On url 51
On url 52
On url 53
On url 54
On url 55
On url 56
On url 57
On url 58
On url 59
On url 60
On url 61
On url 62
On url 63
On url 64
On url 65
On url 66
On url 67
On url 68
On url 69
On url 70
On url 71
On url 72
On url 73
On url 74
On url 75
On url 76
On url 77
On url 78
On url 79
On url 80
On url 81
On url 82
On url 83
On url 84
On url 85
On url 86
On url 87
On url 88
On url 89
On url 90
On url 91
On url 92
On url 93
On url 94
On url 95
On url 96
On url 97
On url 98
On url 99
On url 100

On url 755
On url 756
On url 757
On url 758
On url 759
On url 760
On url 761
On url 762
On url 763
On url 764
On url 765
On url 766
On url 767
On url 768
On url 769
On url 770
On url 771
On url 772
On url 773
On url 774
On url 775
On url 776
On url 777
On url 778
On url 779
On url 780
On url 781
On url 782
On url 783
On url 784
On url 785
On url 786
On url 787
On url 788
On url 789
On url 790
On url 791
On url 792
On url 793
On url 794
On url 795
On url 796
On url 797
On url 798
On url 799
On url 800
On url 801
On url 802
On url 803
On url 804
On url 805
On url 806
On url 807
On url 808
On url 809
On url 810
On url 811
On url 812
On url 813
On url 814
On url 815
On url 816
On url 817
On url 818
On url 819
On url 820
On url 821
On url 822
On url 823
On url 824
On url 825
On url 826
On url 827
On url 828
On url 829
On url 830
On url 831
On url 832
On url 833
On url 834
On url 835
On url 836
On url 837
On url 838
On url 839
On url 840
On url 841
On url 842
On url 843
On url 844
On url 845

Unnamed: 0,Date,URL,Abstract,Tokens,Cosine Top 20,Cumulative Sum,Score AC,Score TG,Score WT
0,2011/04/01,https://www-sciencedirect-com.offcampus.lib.wa...,Functional laser printed Organic Thin Film Tra...,,,,1.0,,
1,2011/08/01,https://www-sciencedirect-com.offcampus.lib.wa...,"We studied laser dynamics of poly(9,9′-dioctyl...",,,,2.0,,
2,2011/06/01,https://www-sciencedirect-com.offcampus.lib.wa...,In this paper we present a potentially fast me...,,,,1.0,,
3,2011/04/01,https://www-sciencedirect-com.offcampus.lib.wa...,Laser-induced forward transfer (LIFT) has been...,,,,1.0,,
4,2011/11/01,https://www-sciencedirect-com.offcampus.lib.wa...,Introduction of microstructures into an organi...,,,,2.0,,
5,2011/04/01,https://www-sciencedirect-com.offcampus.lib.wa...,We report on the development of hybrid organic...,,,,0.0,,
6,2011/07/01,https://www-sciencedirect-com.offcampus.lib.wa...,The aim of this study was to characterize the ...,,,,0.0,,
7,2011/08/01,https://www-sciencedirect-com.offcampus.lib.wa...,With the aim to study and to improve LIBS capa...,,,,0.0,,
8,2011/01/01,https://www-sciencedirect-com.offcampus.lib.wa...,With the objective of detection and identifica...,,,,0.0,,
9,2011/01/15,https://www-sciencedirect-com.offcampus.lib.wa...,We examined the correlation between thickness ...,,,,0.0,,


In [64]:
new_df = pd.read_csv('Organic_Laser_Abstracts.csv', index_col=[0])
for i in range(0, len(new_df)):
    new_df.iloc[i,3] = str(single_abstract_tkzr(new_df.iloc[i,2])) #Single_abstract_tkzr repurposed for abstracts already in new_df


In [69]:
new_df.to_csv('Organic_Laser_Tokens.csv', index=False) #Saving new_df after adding Tokens
new_df

Unnamed: 0,Date,URL,Abstract,Tokens,Cosine Top 20,Cumulative Sum,Score AC,Score TG,Score WT
0,2011/04/01,https://www-sciencedirect-com.offcampus.lib.wa...,Functional laser printed Organic Thin Film Tra...,"['functional', 'laser', 'printed', 'organic', ...",,,1.0,,
1,2011/08/01,https://www-sciencedirect-com.offcampus.lib.wa...,"We studied laser dynamics of poly(9,9′-dioctyl...","['studied', 'laser', 'dynamics', 'poly', 'dioc...",,,2.0,,
2,2011/06/01,https://www-sciencedirect-com.offcampus.lib.wa...,In this paper we present a potentially fast me...,"['paper', 'present', 'potentially', 'fast', 'm...",,,1.0,,
3,2011/04/01,https://www-sciencedirect-com.offcampus.lib.wa...,Laser-induced forward transfer (LIFT) has been...,"['laser', 'induced', 'forward', 'transfer', 'l...",,,1.0,,
4,2011/11/01,https://www-sciencedirect-com.offcampus.lib.wa...,Introduction of microstructures into an organi...,"['introduction', 'microstructures', 'organic',...",,,2.0,,
5,2011/04/01,https://www-sciencedirect-com.offcampus.lib.wa...,We report on the development of hybrid organic...,"['report', 'development', 'hybrid', 'organic',...",,,0.0,,
6,2011/07/01,https://www-sciencedirect-com.offcampus.lib.wa...,The aim of this study was to characterize the ...,"['aim', 'study', 'characterize', 'organic', 'c...",,,0.0,,
7,2011/08/01,https://www-sciencedirect-com.offcampus.lib.wa...,With the aim to study and to improve LIBS capa...,"['aim', 'study', 'improve', 'libs', 'capabilit...",,,0.0,,
8,2011/01/01,https://www-sciencedirect-com.offcampus.lib.wa...,With the objective of detection and identifica...,"['objective', 'detection', 'identification', '...",,,0.0,,
9,2011/01/15,https://www-sciencedirect-com.offcampus.lib.wa...,We examined the correlation between thickness ...,"['examined', 'correlation', 'thickness', 'epit...",,,0.0,,


In [72]:
for i in range(0, len(new_df)):
    tokens_for_model += (new_df.iloc[i,3]) #Concatenating all tokens for model
    
model = Word2Vec(tokens_for_model, size = 100, min_count = 1) #Feed the Word2Vec model with collected abstracts
vocabulary = model.wv.vocab 

In [74]:
for i in range(0, len(new_df)):
    new_df.iloc[i,5] = cosine_scores('Organic Laser',new_df.iloc[i,3],model,20)
new_df

KeyError: "word 'O' not in vocabulary"

In [None]:
#Year vs score plot#

