In [1]:
import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
import bs4
from bs4 import BeautifulSoup
import time
import re
import nltk
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as mdates

def extractor(url,wait_time):
    """
    Open a url in a new Chrome session and extract the publication date and abstract as text.

    Feeds directly into parser, so don't call this function unless you want to obtain the
    abstract from a single url.

    Parameters:
    url: address of the page to load;
    wait_time: number of seconds to sleep after requesting the page, before its HTML is read.

    Returns:
    (pub_date, abstract): the 'content' attribute of the "citation_publication_date" meta tag
    and the text of the first <p> inside the div with class "abstract author".

    Raises:
    AttributeError / TypeError if the page does not contain the expected abstract div or
    meta tag (the lookups return None in that case).
    """
    driver = webdriver.Chrome()
    # try/finally guarantees the browser is closed even when the page cannot be
    # loaded or does not have the expected structure.
    try:
        driver.get(url)
        #driver.find_element_by_id('').send_keys('')#
        #driver.find_element_by_id ('').send_keys('')#
        #driver.find_element_by_id('submit').click()#
        time.sleep(wait_time) # important

        html_doc = driver.page_source # stores the source HTML code in the driver's page_source attribute
        soup = BeautifulSoup(html_doc, 'html.parser')
        abstract = soup.find('div', {'class':"abstract author"}).find('p').text
        pub_date = soup.find('meta', {'name': "citation_publication_date"})['content']
    finally:
        driver.quit()

    return pub_date, abstract


def parse_all(driver):
    """
    Parse every url listed in a text file and write the extracted results to a new text file.

    The user is prompted for two file names: the file holding the scraped urls (one per
    line, read with pandas) and the output file. For each url the string form of the
    (publication date, abstract) tuple returned by extractor() is written on its own line.

    Parameters:
    driver: an open selenium driver; it is refreshed before each url and quit when the
        function finishes, whether or not an error occurred.

    Returns:
    The name of the output file that was written.
    """
    url_lst = input("Enter name of file with .txt extension with list of urls: ")
    data = pd.read_csv(url_lst,header=None,names=['url']) #text file containing a list of the scraped urls (should be in same directory)
    file_name = input("Input the file name with .txt extension you wish to store abstracts in: ")

    max_iters = len(data) #total number of scraped urls to be parsed

    try:
        # The context manager flushes and closes the output file even if a url fails.
        with open(file_name,'w') as file:
            print("The parser will parse: " + str(max_iters) + " urls.")

            for i in range(0,max_iters):
                print('On url ',i)
                driver.refresh()
                time.sleep(2)
                urli = str(extractor(data.iloc[i,0],3))
                file.write(urli)
                file.write('\n')
    finally:
        # Release the browser on both the success and the failure path.
        driver.quit()

    return file_name

### Actual executable code is shown below ###
### driver = webdriver.Chrome()
### parse_all(driver)

def tokenizer(file_name):
    """ 
    Train a Word2Vec model on the abstracts stored in a text file.

    Accepts the file name as a string (e.g. "abstracts.txt"). Every line of the file is
    treated as one document: it is lower-cased, every non-letter character is replaced by a
    space, runs of whitespace are collapsed, and the result is tokenized with nltk. English
    stop words are then removed.

    Returns:
    (model, vocabulary): the trained Word2Vec model (min_count=2) and the list of words in
    its vocabulary.
    """
    with open(file_name) as file:
        corpus = file.readlines()

    processed_abstracts = [w.lower() for w in corpus]
    processed_abstracts = [re.sub('[^a-zA-Z]', ' ', w) for w in processed_abstracts]
    processed_abstracts = [re.sub(r'\s+', ' ', w) for w in processed_abstracts]

    # Build the stop-word set once; calling stopwords.words() for every token
    # re-creates the whole list each time and does a linear scan over it.
    stop_words = set(stopwords.words('english'))
    tokens = [
        [w for w in nltk.word_tokenize(sent) if w not in stop_words]
        for sent in processed_abstracts
    ]

    # Passes all tokens to Word2Vec to train model
    model = Word2Vec(tokens, min_count=2)
    print(model)
    # NOTE(review): `wv.vocab` is the pre-4.0 gensim attribute - confirm the installed version.
    vocabulary = list(model.wv.vocab)

    return model, vocabulary

def single_abstract_tkzr(abstract):
    """
    Tokenize the text of a single abstract.

    Parameters:
    abstract: the abstract as a string.

    The text is lower-cased, every non-letter character is replaced by a space, runs of
    whitespace are collapsed, and the result is tokenized with nltk.

    Returns:
    The list of tokens that are not English stop words, in their original order.
    """
    test_abstract = abstract.lower()
    test_abstract = re.sub('[^a-zA-Z]', ' ', test_abstract) 
    test_abstract = re.sub(r'\s+', ' ', test_abstract)

    abstract_tokens = nltk.word_tokenize(test_abstract)

    # Build the stop-word set once instead of re-creating the list for every token.
    stop_words = set(stopwords.words('english'))
    text_tokens = [tkn for tkn in abstract_tokens if tkn not in stop_words]
    return text_tokens

def cosine_scores(search_terms,text_tokens,model,n_tokens):
    """
    Extracts the top n most similar tokens and their respective cosine similarity scores by 
    comparing the tokens from a single abstract to the trained vocabulary.

    Parameters:
    search_terms: desired search terms written as a list of strings; 
    text_tokens: A list of tokens for the text_tokens;
    model: The trained word2vec model;
    n_tokens: the number of top most similar tokens you'd like to return

    Returns:
    A list of at most n_tokens (token, score) tuples ordered from highest to lowest score.
    A token's score is the sum of its similarity to every search term, accumulated once per
    occurrence of the token in text_tokens. Tokens that are not in the model's vocabulary
    are skipped instead of raising KeyError.
    """

    store = defaultdict(int)
    for word in search_terms:
        for tkn in text_tokens:
            # A model trained with min_count > 1 drops rare words, so an abstract
            # can contain tokens the model has no vector for.
            if tkn not in model.wv:
                continue
            store[tkn] += model.wv.similarity(word,tkn)

    # Orders dictionary from highest to lowest cosine similarity score
    cos_scores = sorted(store.items() , reverse=True, key=lambda x: x[1])

    # Extracts the top n_tokens most similar tokens
    return cos_scores[:int(n_tokens)]

In [None]:
# Load the table of scored urls.
df = pd.read_csv("organic.laser.url.csv")

# Row filter: the 'PDF only' cell must be empty (NaN means nothing is there,
# it is not a string) and at least one of the three score columns must be >= 0.
first_condition = df['PDF only'].isna()
second_condition = df['Score AC'].ge(0.0)
third_condition = df['Score TG'].ge(0.0)
fourth_condition = df['Score WT'].ge(0.0)
any_score = second_condition | third_condition | fourth_condition
keep = first_condition & any_score #Masking of URLs with PDFs

# Keep the selected rows and lay the columns out in the order used by the later
# cells; columns that do not exist in the csv are created empty by reindex.
new_df = df.loc[keep].reindex(
    columns=['Date', 'URL', 'Abstract', 'Tokens', 'Cosine Top 20', 'Cumulative Sum', 'Score AC', 'Score TG', 'Score WT']
)


In [None]:
# extractor() opens and closes its own Chrome session for every url, so no
# separate driver is created here.
pub_dates = []
abstract_texts = []

for i in range(0, len(new_df)):
    print('On url', i)
    time.sleep(2) # pause between consecutive page loads
    pub_date, abstract = extractor(new_df['URL'].iloc[i],3) #Extracting both publication date and abstract
    pub_dates.append(str(pub_date))
    abstract_texts.append(str(abstract))

# Fill the two columns by name in one step rather than cell by cell by position.
new_df['Date'] = pub_dates
new_df['Abstract'] = abstract_texts

#Need to save new_df as csv for later and abstracts as txt for tokenizer
new_df.to_csv('Organic_Laser_Abstracts.csv')
                 

In [None]:
new_df = pd.read_csv('Organic_Laser_Abstracts.csv', index_col=[0])
# Single_abstract_tkzr repurposed for abstracts already in new_df: the token list of
# every abstract is stored in its string form. The columns are addressed by name and
# the Tokens column is assigned in one step instead of writing strings cell by cell
# into a column that is empty (NaN) after loading.
new_df['Tokens'] = [str(single_abstract_tkzr(abstract_text)) for abstract_text in new_df['Abstract']]

In [None]:
#Saving new_df after adding Tokens (the index is not written to the file)
new_df.to_csv(path_or_buf='Organic_Laser_Tokens.csv', index=False)

In [None]:
# Put every abstract on its own line: tokenizer() reads the saved text file with
# readlines() and treats each line as one document, so joining the abstracts
# without a separator would merge the whole corpus into a single line.
abstracts = '\n'.join(str(abstract_text) for abstract_text in new_df.iloc[:, 2])

In [None]:
# Save .txt file for tokenizer: the combined abstract text is written as-is.
with open('Organic_Laser_Tokenizer.txt', "w") as tokenizer_txt:
    tokenizer_txt.write(abstracts)

In [None]:
# Reload the table that was saved after the Tokens column was filled in
new_df = pd.read_csv(filepath_or_buffer='Organic_Laser_Tokens.csv')

In [None]:
# Work on an independent copy. A plain `test_df = new_df` only creates a second
# name for the same frame, so stripping the brackets here would also strip them
# in new_df.
test_df = new_df.copy()
test_df['Tokens'] = new_df['Tokens'].str[1:-1] #Getting rid of the brackets around the tokens in Tokens column

In [None]:
#Getting rid of the brackets around the tokens in Tokens column.
#strip('[]') only removes leading/trailing bracket characters, so running this
#cell again (or after the brackets were already removed) leaves the tokens intact,
#whereas slicing with [1:-1] would cut off one more character on each side every time.
new_df['Tokens'] = new_df['Tokens'].str.strip('[]')

In [None]:
#Saving new_df after eliminating brackets in Tokens (the index is not written to the file)
new_df.to_csv(path_or_buf='Organic_Laser_Tokens2.csv', index=False)

In [None]:
# Reload the table whose Tokens column no longer has the surrounding brackets
new_df = pd.read_csv(filepath_or_buffer='Organic_Laser_Tokens2.csv')

In [None]:
def remove_apostrophe(word):
    """
    Return word without the apostrophes that enclose it.

    A word that both starts and ends with an apostrophe (e.g. "'laser'") loses
    that pair; any other word is returned unchanged, so an unquoted token is not
    truncated.
    """
    if len(word) >= 2 and word[0] == "'" and word[-1] == "'":
        return word[1:-1]
    return word

In [None]:
#For Word2Vec model
# "tokens" holds one list of tokens per abstract and is used for model training.
# Each Tokens cell is a string of quoted tokens separated by ', '; it is split on
# that separator and the apostrophes around every token are removed.
tokens = [
    [remove_apostrophe(quoted_token) for quoted_token in new_df.iloc[row, 3].split(', ')]
    for row in range(len(new_df))
]


In [None]:
# Training model with the token list from above (one list of tokens per abstract),
# then storing it on disk so later cells can load it again.
model = Word2Vec(
    tokens,
    size=100,
    min_count=1,
    iter=30,
)
print(model)
vocabulary = list(model.wv.vocab) #saving vocabulary as list for visualization
model.save("NLP_Project_test.model")

In [None]:
#For cosine score
# "tokens_cs" is a single flat list with the tokens of all abstracts, in row order.
# The apostrophes around each token are removed after splitting the Tokens string on ', '.
tokens_cs = [
    remove_apostrophe(quoted_token)
    for row in range(len(new_df))
    for quoted_token in new_df.iloc[row, 3].split(', ')
]

In [None]:
#Lemmatized tokens for cosine score, was not source of error, turned out to be min_count
#'successively' would not be lemmatized, so it is left out. Filtering (instead of
#list.remove) drops every occurrence and does not fail when the word is absent.
lemmatized_tokens = [
    lemma
    for lemma in map(lemmatizer.lemmatize, tokens_cs)
    if lemma != 'successively'
]

In [None]:
#Cosine scores of Top 20 tokens
# tokens[i] belongs to row i of new_df, so the results are computed in row order
# and assigned to the column in one step. Writing through
# new_df['Cosine Top 20'].loc[i] is chained assignment, which pandas does not
# guarantee to propagate back to new_df.
new_df['Cosine Top 20'] = [
    cosine_scores(['organic','laser'], tokens[i], model, 20)
    for i in range(0, len(new_df))
]


In [None]:
#Cumulative Sum of Top 20
# Each 'Cosine Top 20' cell is a list of (token, score) pairs; the scores of a row
# are added up. The column is assigned directly instead of through chained
# new_df['Cumulative Sum'].loc[i] writes, which pandas does not guarantee to
# propagate back to new_df.
new_df['Cumulative Sum'] = [
    sum(score for _token, score in top_pairs)
    for top_pairs in new_df['Cosine Top 20']
]

In [None]:
#Transfer date strings to dates, and sort the list in ascending order of dates  
# assign() and sort_values() each return a new frame, so new_df keeps its original
# row order. `data_df = new_df` followed by an in-place sort would reorder new_df
# as well, because both names would refer to the same frame.
data_df = (
    new_df
    .assign(Date=pd.to_datetime(new_df['Date']))
    .sort_values(by=['Date'], ascending=True)
)
    
 

In [None]:
#Saving data_df after filling out all columns and organizing based on date (the index is not written to the file)
data_df.to_csv(path_or_buf='NLP_Organic_Laser_Data.csv', index=False)

In [None]:
# A csv stores dates as text; parse_dates turns the Date column back into
# datetimes so that it is plotted on a real time axis.
data_df = pd.read_csv('NLP_Organic_Laser_Data.csv', parse_dates=['Date'])

In [None]:
#Saving new_df with unsorted by date data (the index is not written to the file)
new_df.to_csv(path_or_buf='Unsorted_Organic_laser_Data.csv', index=False)

In [None]:
# A csv stores dates as text; parse_dates turns the Date column back into
# datetimes so that it is plotted on a real time axis.
new_df = pd.read_csv('Unsorted_Organic_laser_Data.csv', parse_dates=['Date'])

In [None]:
#Plot of Cumulative Top 20 Cosine Score for ['Organic', 'Laser'] versus Publication Date
# Line plot drawn from data_df on an explicitly created axes.
x = data_df['Date']
y = data_df['Cumulative Sum']

fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xlabel('Date of Publication')
ax.set_ylabel('Cumulative Top 20 Cosine Score')

plt.show()

In [None]:
#Scatterplot with data not sorted by date
# Every row of new_df is drawn as a small marker (s=5) on an explicitly created axes.
x = new_df['Date']
y = new_df['Cumulative Sum']

fig, ax = plt.subplots()
ax.scatter(x, y, s=5)
ax.set_xlabel('Date of Publication')
ax.set_ylabel('Cumulative Top 20 Cosine Score')

plt.show()

In [None]:
#TG Notes
#Train model for a certain amount of epochs (saving them each time .save). 
#Observing the cosine score between two words in the W2V vocabulary.
#model.similarity('france', 'spain')
#words to compare: laser, plqy, photoluminescence quantum yield, amplified spontaneous emission, ase, lasing threshold
#model = gensim.models.Word2Vec.load("NLP_Project.model")
#model = Word2Vec(tokens, size=100, min_count=1, iter=30)
#model.save("NLP_Project.model")


In [None]:
#Tracks the cosine score between 'emission' and 'laser' while the saved model is trained further.
#The model is loaded once and training continues on that same object; after every
#round it is saved under a file name that contains the total number of epochs, so
#earlier checkpoints are not overwritten.
#NLP_Project_test.model is just the model trained above that has been renamed
epochs_per_round = 30 # the model saved above was already trained with iter=30
n_rounds = 101

model_test = gensim.models.Word2Vec.load("NLP_Project_test.model")

records = []
for i in range(n_rounds):
    total_epochs = (1+i)*epochs_per_round

    # Round 0 measures the model exactly as it was loaded; every later round
    # first trains for another epochs_per_round passes over the same token lists.
    if i > 0:
        model_test.train(tokens, total_examples=model_test.corpus_count, epochs=epochs_per_round)

    records.append({
        'Epochs': total_epochs,
        'Cosine Score': model_test.wv.similarity('emission', 'laser'),
    })

    print('On epoch:', i)
    model_test.save("NLP_Project_test_{}.model".format(total_epochs))

# A list fixes the column order (a set has no defined order), and building the
# frame from the collected records avoids uninitialised values and chained
# .iloc assignment.
test_array_df = pd.DataFrame(records, columns=['Epochs', 'Cosine Score'])

test_array_df