In [None]:
# This notebook prototypes a script that automatically finds the PII (Publisher Item Identifier) numbers of the publications returned by a query against the ScienceDirect database.

# To test queries, go to https://www.scopus.com/search/form.uri?display=advanced

In [None]:
# Link to Elsevier's list of active journals: https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls

In [2]:
from pybliometrics.scopus import ScopusSearch as search
import pandas as pd
import numpy as np

# Goals for the algorithm

The algorithm will narrow its searches by the following criteria:

1. Year
2. Journal
3. Keyword search

Here is an example search syntax: `s = ScopusSearch('FIRSTAUTH ( kitchin  j.r. )')`

In [3]:
# Read Elsevier's spreadsheet of active journals and their subjects into a dataframe
active_journals = pd.read_excel(
    io='https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls',
)

In [5]:
# Preview the first rows of the journal table to see which columns it provides
active_journals.head()

Unnamed: 0,Full Title,ISSN,Product ID,Change History,Parent Category,Display Category Full Name
0,Academic Pediatrics,18762859,7802,Formerly known as Ambulatory Pediatrics,Medicine and Dentistry,"Medicine and Dentistry::Perinatology, Pediatri..."
1,Academic Pediatrics,18762859,7802,Formerly known as Ambulatory Pediatrics,Medicine and Dentistry,Medicine and Dentistry::Public Health and Heal...
2,Academic Radiology,10766332,13351,,Medicine and Dentistry,Medicine and Dentistry::Radiology and Imaging
3,Accident Analysis & Prevention,14575,336,,Chemical Engineering,Chemical Engineering::Chemical Health and Safety
4,Accident Analysis & Prevention,14575,336,,Engineering,"Engineering::Safety, Risk, Reliability and Qua..."


In [39]:
# Count the rows of the journal table whose 'Display Category Full Name' contains a given substring (e.g. "chem")
def j_contains(active_journals,kwd,output=False,case=True):
    """
    Report how many rows of the journal table match a keyword in their subject category.

    Only the 'Display Category Full Name' column is searched; journal titles
    are not. The printed number is a count of matching rows, so a journal that
    is listed under several matching categories is counted once per row.

    Parameters
    ----------
    active_journals : pandas.DataFrame
        Table containing a 'Display Category Full Name' column of strings.
    kwd : str
        Pattern to look for. It is handed to ``Series.str.contains``, which
        interprets it as a regular expression by default.
    output : bool, optional
        When truthy, the matching rows are returned; otherwise the function
        only prints the count and returns None.
    case : bool, optional
        Set to False for a case-insensitive match. Defaults to True, which
        keeps the original case-sensitive behaviour.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows when ``output`` is truthy, else None.
    """
    categories = active_journals['Display Category Full Name']

    # na=False makes rows with a missing category count as non-matches.
    # Without it the mask can contain NaN, and boolean indexing with such a
    # mask raises a ValueError.
    mask = categories.str.contains(kwd, case=case, na=False)
    df = active_journals[mask]
    print(len(df),"journals with the substring ",kwd)

    if output:
        return df

In [53]:
# Show the value at row 0, column position 5 on its own, so the string is displayed without the truncation seen in the table preview
active_journals.iloc[0,5]

'Medicine and Dentistry::Perinatology, Pediatrics and Child Health'

In [50]:
# Try a handful of keywords to see how many category rows each one matches.
# NOTE(review): the last keyword is an empty string, whereas the recorded
# output for this cell reports "edron" -- the output looks stale; confirm.
for keyword in ('chem', 'Chem', 'polymer', 'Polymer', 'Molecul', ''):
    j_contains(active_journals, keyword)


662 journals with the substring  chem
428 journals with the substring  Chem
0 journals with the substring  polymer
19 journals with the substring  Polymer
669 journals with the substring  Molecul
0 journals with the substring  edron


In [None]:
def make_query(kwds,year,journal):
    """
    This method creates a query string from the input information that can be used in the ScopusSearch method. 
    """
    
    # This builds the keyword search portion of the query string
    tak = "TITLE-ABS-KEY("
    for i in range(len(kwds)):
        if i != len(kwds)-1:
            tak += kwds[i] + 'OR'
        else:
            tak += kwds[i] + ')'
    
    # This builds the year portion of the query string 
    yr = "PUBYEAR IS " + year
    
    # This builds the journal portion of the query string 
    