In [None]:
# This notebook prototypes a script that automatically finds the PII (Publisher Item Identifier) numbers of the publications matching a query against the ScienceDirect database.

# To test queries, go to https://www.scopus.com/search/form.uri?display=advanced

In [None]:
# Link to Elsevier's list of active journals: https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls

In [1]:
import pybliometrics
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus.exception import Scopus429Error
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import os
import multiprocessing
from os import system, name
import json
import time
from IPython.display import clear_output
from pybliometrics.scopus import config

# Goals for the algorithm

List of things by which the algorithm will parse searches:

1. Year
2. Journal
3. Keyword search

Here is an example search syntax: `s = ScopusSearch('FIRSTAUTH ( kitchin  j.r. )')`

### The following method creates a dataframe that only contains journals mentioning certain keywords in their 'Full_Category' column. 
### It still needs work on user-friendliness.

In [2]:
# URL of Elsevier's spreadsheet of active journals and their subjects; it is the input to make_jlist.
els_jpath = 'https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls'

def make_jlist(jlist_url, keywords=None):
    """
    Build a dataframe of the journals that are relevant to query.

    The active-journal spreadsheet is loaded, de-duplicated by journal title,
    shuffled with a fixed seed, and reduced to the journals whose subject
    category ('Full_Category') mentions at least one keyword.

    Parameters
    ----------
    jlist_url(str, required): path or URL of the spreadsheet; it is handed
    straight to `pd.read_excel`.

    keywords(iterable of str, optional): substrings to look for in the subject
    category. They are matched literally and case-insensitively. When omitted,
    the default chemistry-related set listed below is used.

    Returns
    -------
    A dataframe containing two columns:
    (1) 'Journal_Title' - the names of the journals
    (2) 'ISSN' - the issns of the journals
    """
    if keywords is None:
        # The default set of strings used to decide which journals we want
        keywords = ('polymer', 'chemistry', 'energy', 'molecular', 'colloid',
                    'biochem', 'organic', 'biotech', 'chemical')

    # This creates a dataframe of the active journals and their subjects from elsevier
    active_journals = pd.read_excel(jlist_url)
    active_journals = active_journals.rename(
        columns={'Display Category Full Name': 'Full_Category', 'Full Title': 'Journal_Title'})

    # lowercase topics for searching
    active_journals['Full_Category'] = active_journals['Full_Category'].str.lower()
    # drop any duplicate journals
    active_journals = active_journals.drop_duplicates(subset='Journal_Title')
    # fixed seed so the journal order is the same on every run
    active_journals = shuffle(active_journals, random_state=42)

    # Keep a journal when its category contains any of the keywords.
    # na=False makes a missing category an explicit "no match".
    categories = active_journals['Full_Category']
    keep = pd.Series(False, index=active_journals.index)
    for keyword in keywords:
        keep |= categories.str.contains(keyword.lower(), regex=False, na=False)

    journal_frame = active_journals.loc[keep, ['Journal_Title', 'ISSN']]

    return journal_frame
    
    
    
    

In [3]:
def clear_stdout():
    """Clear the terminal screen by running the platform's clear-screen command."""
    # Windows reports os.name == 'nt' and uses `cls`; every other platform gets `clear`.
    if os.name == 'nt':
        command = 'cls'
    else:
        command = 'clear'
    os.system(command)

In [4]:
# Build the filtered journal table (columns 'Journal_Title' and 'ISSN') from the Elsevier spreadsheet.
journal_list = make_jlist(els_jpath)

In [5]:
# Preview the first rows of the journal table.
journal_list.head()

Unnamed: 0,Journal_Title,ISSN
2580,Gene: X,25901583
3917,Journal of Infection,1634453
6729,Thermochimica Acta,406031
6587,Surface Science,396028
3554,Journal of Colloid and Interface Science,219797


### The following method builds the keyword search portion of a query. There is an example below that can be copy-pasted into the Scopus advanced Search.

In [6]:
def build_search_terms(kwds):
    """
    Build the keyword search portion of the query string.

    The keywords are separated by ' OR ' and the result ends with one trailing
    space, so that the next clause of the query can be appended directly.
    An empty keyword list gives an empty string.
    """
    if len(kwds) == 0:
        return ""

    return ' OR '.join(kwds) + ' '

### The following method builds the entire query to be put into pybliometrics

In [None]:
# Here is a model test query 
# test = search(verbose = True, query = 'polymer OR organic OR molecular AND PUBYEAR IS 2019 AND ISSN(00404020)')

In [7]:
def build_query_dict(term_list,issn_list,year_list):
    """
    Create a nested dictionary holding one query string per journal and year
    for a given keyword search.

    The outer keys are the issns, the inner keys are the years, and each value
    is the complete query string for that journal in that year.

    Parameters
    ----------
    term_list(list, required): the list of search terms looked for in papers by the api.

    issn_list(list, required): the list of journal issn's (as strings) to be queried.
    Can be created by getting the '.values' of the 'ISSN' column of a dataframe
    made by the 'make_jlist' method.

    year_list(list, required): the list of years which will be searched through
    """
    keyword_clause = build_search_terms(term_list)

    queries_by_issn = {}
    for issn in issn_list:
        issn_clause = ' AND ISSN(' + issn + ')'
        # keyword_clause already ends with a space, so the year clause follows it directly
        queries_by_issn[issn] = {
            year: keyword_clause + "AND PUBYEAR IS " + str(year) + issn_clause
            for year in year_list
        }

    return queries_by_issn



In [8]:
# Search terms for the example below; build_search_terms joins them with ' OR '.
term_list = ['polymer','organic','molecular','molecule']

In [9]:
# example of how to use the dictionary builder
issn_list = journal_list['ISSN'].values
dictionary = build_query_dict(term_list,issn_list,range(1995,2021))
dictionary['00404020'][2015]

'polymer OR organic OR molecular OR molecule AND PUBYEAR IS 2015 AND ISSN(00404020)'

In [20]:
# Store the third entry of `fresh_keys` as the API key in the pybliometrics config.
# NOTE(review): `fresh_keys` is only assigned in a cell further down the notebook,
# so this cell raises NameError when the notebook is run top to bottom on a fresh kernel.
newkey = fresh_keys[2]
config["Authentication"]["APIKey"] = newkey

In [21]:
# Run one generated query (ISSN 00404020, year 2003) to check that the search works.
test = ScopusSearch(verbose=True, query = dictionary['00404020'][2003])

Downloading results for query "polymer OR organic OR molecular OR molecule AND PUBYEAR IS 2003 AND ISSN(00404020)":
Progress: |██████████████████████████████████████████████████| 100.00% Complete


In [None]:
#len(test.results)

### Here is a method to clear the cache. Doesn't matter too much because 1.1 million pubs stored in cache only took 2 GB of memory 

In [10]:
def clear_cache(cache_path):
    """
    Delete the cached search results found directly inside `cache_path`.

    Be very careful with this method: it permanently removes files. Two guards
    limit what can be deleted:
    (1) nothing happens unless `cache_path` contains '.scopus/scopus_search/'
    (2) only regular files whose name is exactly as long as a standard
        pybliometrics cache file name (32 characters) are removed

    Parameters
    ----------
    cache_path(str, required): the directory holding the cached search results.
    It may be given with or without a trailing '/'.
    """
    if '.scopus/scopus_search/' not in cache_path:
        return

    # Standard length of a pybliometrics cache file name
    cache_name_length = len('8805245317ccb15059e3cfa219be2dd4')

    for file in os.listdir(cache_path):
        if len(file) != cache_name_length:
            continue

        # os.path.join gives the right path whether or not cache_path ends with '/'
        full_path = os.path.join(cache_path, file)

        # Sub-directories are left alone; os.remove would raise on them
        if os.path.isfile(full_path):
            os.remove(full_path)

### The method below loops through the entire journal list and collects article metadata, including PII

Things we probably want to just grab because we have them:
1. Author names
2. Author keywords
3. Cited by count
4. title
5. PII
6. DOI
7. Description

### The cell below here is test code for making a key replacement method

In [11]:
def _sanitize_journal_name(title):
    """Turn a journal title into a name that can be used for a folder and its files."""
    # Every substitution is applied: a title such as 'Gene: X' needs the colon
    # removed AND the space replaced.
    return title.replace(':', '').replace('/', '_').replace(' ', '_')


def _search_with_retry(query, keymaster, fresh_keys):
    """Run one Scopus query, retrying once after a 429 error from the API."""
    try:
        return ScopusSearch(verbose = True, query = query)
    except Scopus429Error:
        if keymaster:
            if not fresh_keys:
                # No replacement key is available, so surface the original error
                raise
            print('entered scopus 429 error loop... replacing key')
            # NOTE(review): this changes `config` in the current process only; whether
            # the other worker processes ever see the new key is not visible here - confirm.
            config["Authentication"]["APIKey"] = fresh_keys.pop()
            time.sleep(5)
        else:
            # Wait a few seconds for the keymaster to replace the key, then try again
            time.sleep(15)
        return ScopusSearch(verbose = True, query = query)


def _result_to_dict(result):
    """Copy the fields we keep from a single search result into a plain dictionary."""
    return {
        'pii': result.pii,
        'doi': result.doi,
        'title': result.title,
        'num_authors': result.author_count,
        'authors': result.author_names,
        'description': result.description,
        'citation_count': result.citedby_count,
        'keywords': result.authkeywords,
    }


def get_piis(term_list, journal_frame, year_list, cache_path, output_path,keymaster=False,fresh_keys=None,config_path='/Users/DavidCJ/.scopus/config.ini'):
    """
    Query Scopus for every journal and year and store the article metadata,
    including the PII, as one json file per journal. It is meant to be mappable
    to multiple parallel processes, each working on its own part of the journals.

    For each journal a folder `{output_path}{name}` is created holding
    `{name}.json` (the results, grouped by year) and `{name}.txt` (a one-line
    count of the publications found). `name` is the journal title with ':'
    removed and '/' and ' ' replaced by '_'.

    Parameters
    ----------
    term_list(list, required): the list of search terms looked for in papers by the api.

    journal_frame(dataframe, required): the journals to query, with the columns
    'Journal_Title' and 'ISSN'. It is not modified.

    year_list(list, required): the list of years which will be searched through

    cache_path(str, required): path of the search cache. It is only validated here;
    the cache is not cleared, because several processes clearing the same cache is ugly.

    output_path(str, required): folder receiving the output. Must end with '/'.

    keymaster(bool, optional): when True, this process answers a 429 error by
    installing a new API key taken from `fresh_keys`. Otherwise it waits and retries.

    fresh_keys(list, optional): spare API keys for the keymaster. Keys are popped
    off the end of the list as they are used.

    config_path(str, optional): currently not used by this function.

    Raises
    ------
    Exception: if `output_path` does not end with '/' or `cache_path` does not
    contain '.scopus/scopus_search'.

    Scopus429Error: if a query still fails on the retry, or if the keymaster has
    no key left to install.
    """
    if not output_path.endswith('/'):
        raise Exception('Output file path must end with /')

    if '.scopus/scopus_search' not in cache_path:
        raise Exception('Cache path is not a sub-directory of the scopus_search. Make sure cache path is correct.')

    # Two sequences whose values correspond to each other. The names are built
    # as a new list so that the caller's dataframe is left untouched.
    issn_list = journal_frame['ISSN'].values
    journal_list = [_sanitize_journal_name(title) for title in journal_frame['Journal_Title'].values]

    # Build the dictionary that can be used to sequentially query elsevier for different journals and years
    query_dict = build_query_dict(term_list,issn_list,year_list)

    for issn, journal_name in zip(issn_list, journal_list):
        # At the start of every journal, clear the standard output screen
        clear_stdout()
        paper_counter = 0

        issn_dict = {}
        for year in year_list:
            # for every year in every journal, query the keywords
            print(f'{journal_name} in {year}.')
            query_results = _search_with_retry(query_dict[issn][year], keymaster, fresh_keys)

            # Some of the query results might be None; the year is then stored
            # with an empty dictionary.
            year_dict = {}
            if query_results.results is not None:
                for k, result in enumerate(query_results.results):
                    paper_counter += 1
                    year_dict[k] = _result_to_dict(result)

            issn_dict[year] = year_dict

        # Store all of the results for this journal in a folder as json file.
        # exist_ok keeps a re-run from failing after all the queries were made.
        os.makedirs(f'{output_path}{journal_name}', exist_ok=True)
        with open(f'{output_path}{journal_name}/{journal_name}.json','w') as file:
            json.dump(issn_dict, file)

        with open(f'{output_path}{journal_name}/{journal_name}.txt','w') as file2:
            file2.write(f'This file contains {paper_counter} publications.')

In [12]:
# Inputs for the get_piis runs below.
# NOTE(review): cache_path is an absolute path on one particular machine; adjust it before running elsewhere.
cache_path = '/Users/DavidCJ/.scopus/scopus_search/COMPLETE/'
# This replaces the term_list defined earlier in the notebook ('property' takes the place of 'molecule').
term_list = ['polymer','organic','molecular','property']
journal_frame = make_jlist(els_jpath)

# the below command worked well for a single process
#get_piis(term_list,journal_frame,range(1995,2021),cache_path=cache_path,output_path = '/Users/DavidJuergens/Desktop/pyblio_test/')

### Multi process cell

In [None]:
# Split the journal table into two parts, one for each worker process started below.
df1, df2 = np.array_split(journal_frame,2)

In [15]:
def load_api_keys(env_var='SCOPUS_API_KEYS'):
    """
    Read the Scopus API keys from an environment variable.

    The variable holds the keys as a comma-separated list. Whitespace around a
    key and empty entries are dropped. An unset variable gives an empty list.

    Parameters
    ----------
    env_var(str, optional): name of the environment variable to read.

    Returns
    -------
    A list of the keys, in the order they appear in the variable.
    """
    raw_value = os.environ.get(env_var, '')
    return [key.strip() for key in raw_value.split(',') if key.strip()]


# API keys are secrets, so they are read from the environment instead of being
# written into the notebook. Set SCOPUS_API_KEYS before starting the kernel.
fresh_keys = load_api_keys()

In [None]:
# Create one worker process per half of the journal table. Both write to the same
# output folder. Only p1 is started as the keymaster and receives the spare API keys.
p1 = multiprocessing.Process(target = get_piis, args = [term_list,df1,range(1995,2021),cache_path,'/Volumes/My Passport/Davids Stuff/pyblio_test3/',True,fresh_keys])
p2 = multiprocessing.Process(target = get_piis, args = [term_list,df2,range(1995,2021),cache_path,'/Volumes/My Passport/Davids Stuff/pyblio_test3/'])
#p3 = multiprocessing.Process(target = get_piis, args = [term_list,df3,range(1995,2021),cache_path,'/Volumes/My Passport/Davids Stuff/pyblio_test2/'])
#p4 = multiprocessing.Process(target = get_piis, args = [term_list,df4,range(1995,2021),cache_path,'/Volumes/My Passport/Davids Stuff/pyblio_test2/'])

# Start both workers.
p1.start()
p2.start()
#p3.start()
#p4.start()

# Disabled: periodic cache clearing (every 20 seconds) while the workers run.
# starttime=time.time()
# while True:
#     clear_cache(cache_path)
#     clear_output()
#     time.sleep(20.0 - ((time.time() - starttime) % 20.0)) 

# Wait here until both workers have finished.
p1.join()
p2.join()
#p3.join()
#p4.join()

### Stuff below is for counting how many publications are located in an output directory

In [None]:
import os

def absoluteFilePaths(directory):
    """
    Generate the absolute path of every file below `directory`.

    The whole tree is walked, so files in sub-directories are included.
    Directories themselves are not yielded.
    """
    for root, _subdirs, names in os.walk(directory):
        yield from (os.path.abspath(os.path.join(root, name)) for name in names)

In [None]:
# Open one journal's '.txt' file for reading (presumably a summary file written by get_piis - confirm).
# NOTE(review): this handle is never closed in the notebook.
file2 = open('/Volumes/My Passport/Davids Stuff/pyblio_test/Gene: X/Gene: X.txt','r')

In [None]:
# Show the first line of the file opened in the previous cell.
file2.readline()

In [None]:
def count_pubs(output_path):
    """
    Add up the publication counts stored in the '.txt' files below `output_path`.

    Every whitespace-separated token that consists only of digits is read as a
    number, and all of these numbers are summed over all of the files.

    Parameters
    ----------
    output_path(str, required): the directory to search, including its sub-directories.

    Returns
    -------
    The total count as an int. It is 0 when no '.txt' file is found.
    """
    count = 0
    for path in absoluteFilePaths(output_path):
        filename = os.path.basename(path)

        # Look at the file name only: a folder name may itself contain 'txt' or '._'.
        # Files starting with '._' are skipped.
        if not filename.endswith('.txt') or filename.startswith('._'):
            continue

        with open(path,'r') as file:
            contents = file.read()

        count += sum(int(token) for token in contents.split() if token.isdigit())

    return count

In [None]:
# Total number of publications recorded in the pyblio_test2 output directory.
count_pubs('/Volumes/My Passport/Davids Stuff/pyblio_test2')