In [4]:
import numpy as np
import pandas as pd
import os
import random

In [5]:
# Absolute, machine-specific locations used by the cells below:
#   beto   - the local BETO2020 project directory (its scripts/ folder holds tfidf_wrapper)
#   google - the shared-drive folder from which the rank-2 corpus file is opened
# NOTE(review): these paths only resolve on the original author's machine; adjust before re-running.
beto = '/Users/DavidJuergens/Desktop/BETO2020/'
google = "/Users/DavidJuergens/Google Drive File Stream/Shared drives/ChemE NLP Team Drive/Corrosion Inhibitors"

In [6]:
# Change into the project's scripts directory before importing; presumably this is what
# makes the local tfidf_wrapper module importable (it is resolved from the working directory).
os.chdir('/Users/DavidJuergens/Desktop/BETO2020/scripts/')
import tfidf_wrapper
from tfidf_wrapper import tfidf_model

In [7]:
# Move to the shared-drive folder and open the rank-2 corpus file for reading.
# The handle is left open here; it is read and closed in the cell that builds `corpus`.
os.chdir(google)
file = open('rank-2-CIpubs.txt','r')

In [8]:
# Switch the working directory back to the BETO2020 project directory.
os.chdir(beto)

# The cells above import tfidf_wrapper and open the rank-2 corpus file from the Google shared drive.

In [9]:
# Read the open file into a list with one element per line (one publication per line);
# tfidf_wrapper takes its corpus as a list. Close the handle once everything is read.
corpus = list(file)
file.close()

In [10]:
# Number of full texts now held in the list (3346 at the time of this run).
len(corpus)

3346

In [11]:
def scrambled(orig):
    """
    Return a new list holding the items of ``orig`` in random order.

    The input is copied with ``list(orig)`` rather than ``orig[:]`` so that any
    iterable is accepted (tuples, generators, ...) and so that types whose
    slices are views (e.g. numpy arrays) are never shuffled in place by accident.

    Parameters:
    -----------
    orig (iterable, required): The items to shuffle. It is left unmodified.

    Returns:
    --------
    list: A shuffled copy of ``orig``. The order is drawn from the ``random``
    module's global generator, so seed it beforehand for reproducible results.
    """
    dest = list(orig)
    random.shuffle(dest)
    return dest

In [14]:
# Shuffle the publication order so later steps do not depend on the order of the file.
# The generator is seeded first so the shuffled order, and therefore every row index
# used further down (e.g. mat[1001]), is the same on every Restart & Run All.
random.seed(0)
corpus = scrambled(corpus)

In [15]:
# For documentation on how to use tfidf_wrapper, uncomment the line below, or read its source code.
# help(tfidf_wrapper)

In [16]:
# First step with tfidf_wrapper: create the tfidf_model object.
# Its constructor is given the input corpus (here, the shuffled list of full texts).
model = tfidf_model(corpus)

In [17]:
# Build the tf-idf matrix: one vector per paper, holding a tf-idf score for every
# word in the vocabulary. The result is read back from the model's attributes in the next cell.
model.tfidf_matrix()

In [18]:
# Pull the tf-idf scores (model.array) and the matching vocabulary (model.vocab) off the model.
mat, vocab = model.array, model.vocab

In [19]:
# The tf-idf matrix has 3346 rows, one per paper, and 255764 columns,
# one per vocabulary term -- i.e. the corpus contains 255764 unique terms.
mat.shape

(3346, 255764)

In [20]:
# The tf-idf vector of a single paper (row 1001); each entry is the score of one vocabulary term.
mat[1001]

array([0.0076284 , 0.00327592, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [21]:
# np.argpartition(a, -k)[-k:] gives the indices of the k largest entries of `a`
# (in no guaranteed order). Here k = 2000, so `ind` holds the vocabulary indices
# of the 2000 highest tf-idf scores for paper 1001.
ind = np.argpartition(mat[1001], -2000)[-2000:]

In [22]:
# Helper used to filter out words that contain digits.
def hasNumbers(inputString):
    """Return True if ``inputString`` contains at least one digit character, otherwise False."""
    import re
    digit_match = re.search(r'\d', inputString)
    return digit_match is not None

In [29]:
def good_words(matrix,vocab,n,stopcount):
    """
    Collect the most relevant words in a corpus, as ranked by tfidf scores.

    Rows of ``matrix`` are visited in order. For each row the ``n`` highest-scoring
    columns are looked up in ``vocab``; words containing digits are skipped and
    every word is kept only the first time it is seen.

    Parameters:
    -----------
    matrix (object, required): The object corresponding to the array attribute of a tfidf_wrapper model
        (2-D, one row per paper, one column per vocabulary word)

    vocab (object, required): The object corresponding to the vocab attribute of a tfidf_wrapper model
        (indexable by column number)

    n (int, required): How many of the highest-scoring words to take from each row.
        Values larger than the row length are clamped to the row length.

    stopcount (int, required): Stop visiting further rows once at least this many
        words have been collected. The check runs after each complete row, so the
        result can be somewhat longer than ``stopcount``.

    Returns:
    --------
    list: The collected words in order of discovery. If ``stopcount`` is never
    reached, every word found is returned (possibly an empty list).
    """
    goodwords = []
    seen = set()  # O(1) duplicate check; `goodwords` preserves discovery order

    if n <= 0:
        # Nothing can be selected per row, so there is nothing to collect.
        return goodwords

    for i in range(matrix.shape[0]):
        row = matrix[i]
        top = min(n, len(row))  # argpartition raises if asked for more entries than exist
        if top == 0:
            continue

        # Indices of the `top` largest scores in this row (in no guaranteed order).
        highscore_indices = np.argpartition(row, -top)[-top:]

        for index in highscore_indices:
            word = vocab[index]

            if word not in seen and not hasNumbers(word):
                seen.add(word)
                goodwords.append(word)

        if len(goodwords) >= stopcount:
            break

    return goodwords
               
               
        
                
        

In [31]:
# Take the 50 highest-scoring words from each paper (n=50) and stop scanning papers
# once at least 5000 distinct, digit-free words have been collected (stopcount=5000).
words = good_words(mat,vocab,50,5000)
print(words)

['activation', 'values', 'charge', 'efficiency', 'time', 'curves', 'concentration', 'by', 'were', 'mol', 'absence', 'molecule', 'δg', 'indole', 'cor', 'resistance', 'presence', 'on', 'was', 'mild', 'steel', 'with', 'from', 'metal', 'to', 'polarization', 'acetic', 'surface', 'hcl', 'immersion', 'at', 'calculated', 'corrosion', 'of', 'inhibitor', 'is', 'this', 'as', 'inhibition', 'and', 'in', 'adsorption', 'ads', 'the', 'that', 'are', 'molecules', 'value', 'com', 'phenol', 'benzimidazole', 'https', 'image', 'prod', 'thumbnail', 'methylimidazole', 'gif', 'us', 'eu', 'chlorophenyl', 'for', 'east', 'yl', 'μd', 'sml', 'bromo', 'ucs', 'aminomethyl', 'pii', 'content', 'amazonaws', 'west', 'imidazol', 'stripin', 'imidazole', 'store', 'downsampled', 'jpg', 'âˆ', 'acid', 'quantum', 'ha', 'figure', 'epzc', 'nitrobenzylidene', 'sci', 'thione', 'el', 'corros', 'ïƒ', 'proceedings', 'phys', 'chem', 'merimi', 'today', 'triazolic', 'compound', 'dihydro', 'triazole', 'adhesion', 'healant', 'nanocapsule',