In [1]:
# This notebook prototypes a script that automatically finds the publication PII numbers for the results of a query against the ScienceDirect database.

# To test queries, go to https://www.scopus.com/search/form.uri?display=advanced

In [2]:
# Link to Elsevier's list of active journals: https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls

In [106]:
# Read the API key from the environment rather than committing it to the
# notebook. Set ELSEVIER_API_KEY before starting the kernel; `api` is an empty
# string when the variable is not set.
# NOTE(review): the key that used to be hardcoded here has been exposed in the
# notebook history and should be revoked/regenerated.
import os

api = os.environ.get('ELSEVIER_API_KEY', '')

In [20]:
from pybliometrics.scopus import ScopusSearch as search
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# Goals for the algorithm

List of things by which the algorithm will parse searches:

1. Year
2. Journal
3. Keyword search

Here is an example search syntax: `s = ScopusSearch('FIRSTAUTH ( kitchin  j.r. )')`

In [12]:
# Download Elsevier's spreadsheet of active journals and their subjects, and
# give the two columns used later shorter, identifier-friendly names.
active_journals = pd.read_excel(
    'https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls'
).rename(
    columns={
        'Display Category Full Name': 'Full_Category',
        'Full Title': 'Journal_Title',
    }
)

### The following method creates a dataframe that only contains journals mentioning certain keywords in their 'Full_Category' column.
### It still needs work on user-friendliness.

In [60]:
# Lowercase the category text so the keyword matching below is case-insensitive.
active_journals['Full_Category'] = active_journals['Full_Category'].str.lower()
# Keep a single row per journal title.
active_journals = active_journals.drop_duplicates(subset = 'Journal_Title')
# Shuffle with a fixed seed so the row order (and the order of issn_list) is reproducible.
active_journals = shuffle(active_journals,random_state = 42)

# The set of default strings that will be used to sort which journals we want.
# NOTE(review): this list is not consumed by the filter below; it additionally
# contains 'atomic' ('chemical engineering' is already matched by 'chemical').
# Confirm whether the filter is meant to be driven by this list.
journal_strings = ['chemistry','energy','molecular','atomic','chemical','biochem'
                  ,'organic','polymer','chemical engineering','biotech','colloid']

# making this an easier command to type
name = active_journals['Full_Category'].str.contains

# Keywords actually applied by the filter. They are plain substrings, so joining
# them with '|' yields a single regex that matches exactly when any one of them
# occurs in the category text.
filter_keywords = ['polymer','chemistry','energy','molecular','colloid',
                   'biochem','organic','biotech','chemical']

# New dataframe containing only the journals whose topic description mentions
# at least one of the desired keywords. na=False treats a missing category as
# "no match" instead of producing NaN entries that cannot be used as a mask.
keyword_mask = name('|'.join(filter_keywords), na=False)
active_journals = active_journals[keyword_mask]
issn_list = active_journals['ISSN'].values

In [63]:
# Spot-check: show the filtered journals whose title contains 'edron'
# (the recorded output lists Polyhedron, Tetrahedron Letters and Tetrahedron).
active_journals[active_journals['Journal_Title'].str.contains('edron')]

Unnamed: 0,Journal_Title,ISSN,Product ID,Change History,Parent Category,Full_Category
5723,Polyhedron,2775387,218,Incorporating Inorganic and Nuclear Chemistry ...,"Biochemistry, Genetics and Molecular Biology","biochemistry, genetics and molecular biology::..."
6710,Tetrahedron Letters,404039,233,,"Biochemistry, Genetics and Molecular Biology","biochemistry, genetics and molecular biology::..."
6707,Tetrahedron,404020,942,,"Biochemistry, Genetics and Molecular Biology","biochemistry, genetics and molecular biology::..."


In [134]:
# Display the ISSN values taken from the keyword-filtered journals.
issn_list

array(['10434666', '87563282', '18766102', '03702693', '15677249',
       '00219517', '18751741', '10049541', '03605442', '00489697',
       '0018506X', '0960894X', '03790738', '23727705', '00162361',
       '25901524', '01956701', '25889125', '16720229', '00098981',
       '25889133', '03601323', '18794068', '09582118', '00260495',
       '01689525', '24680230', '00032670', '00092614', '1567133X',
       '00224073', '23521864', '03783812', '22151532', '07173458',
       '00796565', '15345807', '18961126', '00166480', '24523364',
       '22150161', '25901826', '01694332', '17406749', '13881981',
       '00102180', '00221759', '13861425', '08986568', '0147619X',
       '1744117X', '01681656', '14384221', '23521791', '13595113',
       '09684328', '01959255', '10462023', '25423649', '15504131',
       '09248579', '23523204', '09213449', '03043894', '24520721',
       '13835718', '03603199', '00222828', '10106030', '09232508',
       '03603016', '19365233', '25425293', '09262040', '037851

### The following method builds the keyword search portion of a query. There is an example below that can be copy-pasted into the Scopus advanced Search.

In [142]:
def build_search_terms(kwds):
    """
    Assemble the keyword portion of the query string.

    The keywords are joined with ' OR ' and the result is terminated by a
    single trailing space. An empty keyword list produces an empty string.
    """
    if len(kwds) == 0:
        return ""

    return ' OR '.join(kwds) + ' '

In [126]:
# Example call. NOTE(review): build_search_terms as defined above appends a
# trailing space after the last keyword, while the recorded output below has
# none — the output appears to come from an earlier version of the function.
build_search_terms(['polymer','organic','molecular'])

'polymer OR organic OR molecular'

### The following method builds the entire query to be put into pybliometrics

In [90]:
# Count the results returned for one keyword/year/ISSN query.
# NOTE(review): make_query is not defined in any cell visible in this section —
# confirm it is defined elsewhere, otherwise this cell fails on a fresh kernel.
len(search(make_query(build_search_terms(['polymer','organic','molecular','property']),2018,'00404020')).results)

202

In [161]:
# Here is a model test query 
# test = search(verbose = True, query = 'polymer OR organic OR molecular AND PUBYEAR IS 2019 AND ISSN(00404020)')

In [159]:
def build_query_dict(term_list,issn_list,year_list):
    """
    Build a nested dictionary of query strings, one for every combination
    of publication year and journal, for a given keyword search.

    Parameters
    ----------
    term_list : list of str
        Keywords handed to build_search_terms, which joins them with ' OR '.
    issn_list : iterable
        Journal ISSNs. Each one is converted with str() before being placed
        in the query, so non-string values are accepted.
    year_list : iterable
        Publication years; each is converted with str().

    Returns
    -------
    dict
        {year: {issn: querystring}}, where every query string has the form
        '<kw1> OR <kw2> ... AND PUBYEAR IS <year> AND ISSN(<issn>)'.
    """
    # Strip the helper's output so that a trailing space does not double up
    # with the leading space of the year clause.
    search_terms = build_search_terms(term_list).strip()
    queries_by_year = {}

    for year in year_list:
        # The year clause must be part of the query; appending the bare year
        # would add it as just another search word instead of a PUBYEAR filter.
        year_terms = " AND PUBYEAR IS " + str(year)

        queries_by_year[year] = {
            issn: search_terms + year_terms + ' AND ISSN(' + str(issn) + ')'
            for issn in issn_list
        }

    return queries_by_year



In [160]:
# One query per (year, ISSN) pair for every filtered journal; range(1995, 2021)
# covers the years 1995 through 2020.
dictionary = build_query_dict(
    term_list=[
        'polymer',
        'organic',
        'molecular',
        'molecule',
        'property',
        'corrosion inhibitor',
        'flame retardant',
    ],
    issn_list=issn_list,
    year_list=range(1995, 2021),
)

### Below is a method to clear the cache so we don't run out of memory. 