In [117]:
# This notebook is meant to prototype a script that can find the publication PII identification numbers automatically for a query into the science direct database.

# To test queries, go to https://www.scopus.com/search/form.uri?display=advanced

In [118]:
# The link to elsevier active journals link: https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls

In [158]:
from pybliometrics.scopus import ScopusSearch
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import os
import multiprocessing
from os import system, name
import json

# Goals for the algorithm

List of things by which the algorithm will parse searches:

1. Year
2. Journal
3. Keyword search

Here is an example search syntax: `s = ScopusSearch('FIRSTAUTH ( kitchin  j.r. )')`

### The following method creates a dataframe that only contains journals mentioning certain keywords in their 'Full_Category' column. 
### It still needs work on user-friendliness.

In [144]:
els_jpath = 'https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls'

def make_jlist(jlist_url, keywords=None):
    """
    This method creates a dataframe of relevant journals to query.

    A journal is relevant when its (lowercased) 'Full_Category' text contains
    at least one of the keywords.

    Parameters
    ----------
    jlist_url(str, required): location of the elsevier active journals spreadsheet.
    It is passed straight to `pd.read_excel`.

    keywords(iterable of str, optional): substrings to look for in the journal
    category. They are matched literally and case-insensitively. Defaults to the
    chemistry related terms listed at the top of the method body.

    Returns
    -------
    A dataframe with two columns:
    (1) 'Journal_Title': the names of the Journals
    (2) 'ISSN': the issns of the Journals, exactly as pandas parsed them
    """
    if keywords is None:
        # The default set of strings used to pick which journals we want
        keywords = ('polymer', 'chemistry', 'energy', 'molecular', 'colloid',
                    'biochem', 'organic', 'biotech', 'chemical')

    # This creates a dataframe of the active journals and their subjects from elsevier
    active_journals = pd.read_excel(jlist_url).rename(
        columns={'Display Category Full Name': 'Full_Category',
                 'Full Title': 'Journal_Title'})

    active_journals['Full_Category'] = active_journals['Full_Category'].str.lower() # lowercase topics for searching
    active_journals = active_journals.drop_duplicates(subset='Journal_Title') # drop any duplicate journals
    active_journals = shuffle(active_journals, random_state=42)

    # Flag every journal whose topic description contains at least one keyword.
    # na=False makes journals without a category count as "no match" instead of
    # poisoning the boolean mask with NaN.
    categories = active_journals['Full_Category']
    wanted = pd.Series(False, index=active_journals.index)
    for keyword in keywords:
        wanted |= categories.str.contains(str(keyword).lower(), regex=False, na=False)

    # new dataframe full of only the journals that matched, reduced to title and issn
    journal_list = active_journals.loc[wanted, ['Journal_Title', 'ISSN']]

    return journal_list
    
    
    
    

In [151]:
def clear_stdout():
    """
    Clear the terminal by running the clear-screen command of the current platform.
    """
    # 'cls' on windows; 'clear' on mac and linux (here, os.name is 'posix')
    command = 'cls' if name == 'nt' else 'clear'
    system(command)

In [139]:
# Build the table of relevant journals from the elsevier active journals spreadsheet
journal_list = make_jlist(els_jpath)

In [121]:
# Show every journal whose title contains 'edron'
# NOTE(review): in the cells shown here `active_journals` is only assigned inside `make_jlist`,
# so on a fresh kernel it must be defined at notebook scope before this cell runs - confirm where it comes from.
active_journals[active_journals['Journal_Title'].str.contains('edron')]

Unnamed: 0,Journal_Title,ISSN,Product ID,Change History,Parent Category,Full_Category
6707,Tetrahedron,404020,942,,"Biochemistry, Genetics and Molecular Biology","biochemistry, genetics and molecular biology::..."
5723,Polyhedron,2775387,218,Incorporating Inorganic and Nuclear Chemistry ...,"Biochemistry, Genetics and Molecular Biology","biochemistry, genetics and molecular biology::..."
6710,Tetrahedron Letters,404039,233,,"Biochemistry, Genetics and Molecular Biology","biochemistry, genetics and molecular biology::..."


### The following method builds the keyword search portion of a query. There is an example below that can be copy-pasted into the Scopus advanced Search.

In [122]:
def build_search_terms(kwds):
    """
    This builds the keyword search portion of the query string.

    The keywords are separated by ' OR ' and the result ends with a single
    space so that further clauses can be appended directly after it. An empty
    keyword list gives an empty string.
    """
    if len(kwds) == 0:
        return ""

    return ' OR '.join(kwds) + ' '

### The following method builds the entire query to be put into pybliometrics

In [123]:
# Here is a model test query 
# test = search(verbose = True, query = 'polymer OR organic OR molecular AND PUBYEAR IS 2019 AND ISSN(00404020)')

In [154]:
def build_query_dict(term_list,issn_list,year_list):
    """
    This method takes the list of journals and creates a nested dictionary
    containing all accessible queries, in each year, for each journal,
    for a given keyword search.

    The result is indexed by journal first and by year second, i.e.
    `result[issn][year]` is the query string for that journal and year.

    Parameters
    ----------
    term_list(list, required): the list of search terms looked for in papers by the api.

    issn_list(list, required): the list of journal issn's to be queried. Can be created by getting the '.values'
    of a 'journal_list' dataframe that has been created from the 'make_jlist' method. Entries may be strings
    or integers; inside the query text they are written as strings left-padded with zeros to 8 characters.
    The dictionary keys are the entries exactly as they were passed in.

    year_list(list, required): the list of years which will be searched through

    Returns
    -------
    dict: {issn: {year: query string}}
    """
    search_terms = build_search_terms(term_list)
    query_dict = {}

    for issn in issn_list:

        # ISSNs that were parsed as numbers lose their leading zeros (0040-4020 becomes 404020),
        # and a number cannot be concatenated to a string, so normalise before building the clause.
        issn_terms = ' AND ISSN(' + str(issn).zfill(8) + ')'

        query_dict[issn] = {
            year: search_terms + "AND PUBYEAR IS " + str(year) + issn_terms
            for year in year_list
        }

    return query_dict



In [155]:
# Keywords for the example query; build_search_terms joins them with ' OR '
term_list = ['polymer','organic','molecular','molecule']

In [156]:
# example of how to use the dictionary builder
# NOTE(review): `issn_list` is not assigned in any notebook-level cell shown here -
# presumably a list of ISSN strings such as '00404020'; confirm where it is defined.
dictionary = build_query_dict(term_list,issn_list,range(1995,2021))

In [157]:
# Look up one query: first key is the journal ISSN, second key is the year
dictionary['00404020'][2015]

'polymer OR organic OR molecular OR molecule AND PUBYEAR IS 2015 AND ISSN(00404020)'

### Below is a method(s) to:
#### 1) clear the cache so we don't run out of memory
#### 2) redirect download output to the desired filepath


In [79]:
# The query dictionary is keyed by ISSN first and by year second
test = ScopusSearch(verbose=True, query = dictionary['00404020'][2015])

Downloading results for query "polymer OR organic OR molecular OR molecule AND PUBYEAR IS 2015 AND ISSN(00404020)":
Progress: |██████████████████████████████████████████████████| 100.00% Complete


In [80]:
# Number of entries in the result list of the test query
len(test.results)

1077

In [153]:
# The PII of a single entry of the result list
test.results[10].pii

'S0040402015301630'

In [108]:
# Show every field that is available for a single entry of the result list
test.results[14]

Document(eid='2-s2.0-84947976373', doi='10.1016/j.tet.2015.10.020', pii='S0040402015301204', pubmed_id=None, title='A simple synthesis of bannucine and 5′-epibannucine from (-)-vindoline', subtype='ar', creator='Ilkei V.', afid='60030035;60027811;60026753;116005646', affilname='Budapest University of Technology and Economics;Magyar Tudomanyos Akademia;Gedeon Richter Plc;XiMo Hungary Ltd.', affiliation_city='Budapest;Budapest;Budapest;Budapest', affiliation_country='Hungary;Hungary;Hungary;Hungary', author_count='13', author_names='Ilkei, Viktor;Bana, Péter;Tóth, Flórián;Palló, Anna;Holczbauer, Tamás;Czugler, Mátyás;Sánta, Zsuzsanna;Dékány, Miklós;Szigetvári, Áron;Hazai, László;Szántay, Csaba;Kalaus, György', author_ids='56974381900;56974499200;13604903200;22981097600;26323231800;7004891749;15722681900;56395306900;56974239900;6602468474;57191519955;6603841259', author_afids='60030035;60030035;116005646;60027811;60027811;60027811;60026753;60026753;60030035;60030035;60030035-60026753;6003

In [58]:
# Scopus search cache folder inside the home directory of whoever runs the notebook
cache_path = os.path.join(os.path.expanduser('~'), '.scopus', 'scopus_search', 'COMPLETE')

In [145]:
def clear_cache(cache_path):
    """
    Delete the files that sit directly inside `cache_path`.

    Be very careful with this method. It permanently removes files from
    whatever directory it is given.

    As a safety measure nothing is deleted when the directory holds four or
    more entries. Sub-directories are never removed, and the working directory
    of the process is left unchanged.

    Parameters
    ----------
    cache_path(str, required): the directory whose files should be deleted.
    """
    entries = os.listdir(cache_path)

    # TODO: add something about checking that you're within the .scopus or something
    if len(entries) >= 4:
        return

    for entry in entries:
        # Work with the full path instead of changing directory, so that the
        # caller's relative paths keep pointing at the same place afterwards.
        entry_path = os.path.join(cache_path, entry)

        # os.remove cannot delete a directory, so leave real directories alone
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            continue

        os.remove(entry_path)

### The method below loops through the entire journal list and collects article metadata, including PII

Things we probably want to just grab because we have them:
1. Author names
2. Author keywords
3. Cited by count
4. title
5. PII
6. DOI
7. Description

In [159]:
def get_piis(term_list, journal_fame, year_list, cache_path, output_path):
    """
    Query Scopus for every journal and every year and save the article metadata,
    including the PII, as one json file per journal.

    Each output file is named after the journal title and holds a dictionary
    that maps the year to a list with one entry per article found.

    Parameters
    ----------
    term_list(list, required): the list of search terms looked for in papers by the api.

    journal_fame(dataframe, required): the journals to query. Must have the columns 'ISSN' and
    'Journal_Title', like the dataframe returned by 'make_jlist'. (The spelling of the
    parameter name is kept so that existing keyword calls keep working.)

    year_list(list, required): the list of years which will be searched through

    cache_path(str, required): the folder of the scopus search cache.
    NOTE(review): this value is accepted but not used yet - the cache is not cleared here.

    output_path(str, required): the folder in which the json files are written. It must already exist.
    """
    issn_list = journal_fame['ISSN'].values
    journal_list = journal_fame['Journal_Title'].values

    # Build the dictionary that can be used to sequentially query elsevier for different journals and years
    query_dict = build_query_dict(term_list,issn_list,year_list)

    # Results are collected per journal and written out before the next journal starts
    for issn, journal_title in zip(issn_list, journal_list):

        issn_dict = {}
        for year in year_list:

            # for every year and every journal, query the keywords
            print(f'Downloading results from {journal_title} in {year}.')
            query_results = ScopusSearch(query = query_dict[issn][year])

            # store relevant information from every result of this query;
            # `.results` is None when the query matched nothing
            records = []
            for result in query_results.results or []:
                records.append({
                    'pii': result.pii,
                    'doi': result.doi,
                    'title': result.title,
                    'num_authors': result.author_count,
                    'authors': result.author_names,
                    'description': result.description,
                    'citation_count': result.citedby_count,
                    'keywords': result.authkeywords,
                })

            # Store all of the results for this year in the dictionary belonging to this journal.
            # json object keys are strings anyway; converting here also supports numpy integer years.
            issn_dict[str(year)] = records

        # Store all of the results for this journal in the output folder as a json file.
        # A path separator inside a journal title would otherwise be read as a sub-folder.
        file_name = str(journal_title).replace(os.sep, '_') + '.json'
        with open(os.path.join(output_path, file_name), 'w') as file:
            json.dump(issn_dict, file)