# Data Practices Analysis Text Mining

## Notebook Configuration

### Library Imports and Logging Configuration

In [1]:
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval
import pandas as pd
import requests
import logging
from logging.config import dictConfig
import pathlib
from secret import APIKEY

pd.set_option('display.max_columns', None)

logger = logging.getLogger()
fhandler = logging.FileHandler(filename='textmining.log', mode='a')
formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

logger.info("Initializing logfile")

### Helper Functions

In [2]:
# This is a wrap-around function designed to work around a download issue with Scopus.
# It keeps trying the search until there are no KeyErrors.
def robust_query(q, refresh=False):
    try:
        return ScopusSearch(q, refresh=refresh).results
    except KeyError:
        return ScopusSearch(q, refresh=True).results

### Constants

In [3]:
# OHSU Affiliation ID
af_id = "60016733"

# Publication Year
pubyear = 2010

# Scopus Query
query = f"AF-ID({af_id}) AND PUBYEAR > {pubyear}"

# HTTP Request header for fulltext requests
headers = {'X-ELS-APIKey': APIKEY }

## Scopus Search

### Execute search

In [4]:
search = robust_query(query)

### Results -> DataFrame

In [5]:
df = pd.DataFrame(search)

### DataFrame stats

In [6]:
df.shape

(23000, 33)

In [7]:
list(df)

['eid',
 'doi',
 'pii',
 'pubmed_id',
 'title',
 'subtype',
 'creator',
 'afid',
 'affilname',
 'affiliation_city',
 'affiliation_country',
 'author_count',
 'author_names',
 'author_ids',
 'author_afids',
 'coverDate',
 'coverDisplayDate',
 'publicationName',
 'issn',
 'source_id',
 'eIssn',
 'aggregationType',
 'volume',
 'issueIdentifier',
 'article_number',
 'pageRange',
 'description',
 'authkeywords',
 'citedby_count',
 'openaccess',
 'fund_acr',
 'fund_no',
 'fund_sponsor']

### Inspect Data

In [8]:
df.head()

Unnamed: 0,eid,doi,pii,pubmed_id,title,subtype,creator,afid,affilname,affiliation_city,affiliation_country,author_count,author_names,author_ids,author_afids,coverDate,coverDisplayDate,publicationName,issn,source_id,eIssn,aggregationType,volume,issueIdentifier,article_number,pageRange,description,authkeywords,citedby_count,openaccess,fund_acr,fund_no,fund_sponsor
0,2-s2.0-85065488887,10.1007/978-3-030-17795-9_47,,,Evaluating the Implementation of Deep Learning...,cp,Purkayastha S.,60024609;60016733,Indiana University-Purdue University Indianapo...,Indianapolis;Portland,United States;United States,5,"Purkayastha, Saptarshi;Buddi, Surendra Babu;Nu...",36480509900;57208665364;57208673405;5720867500...,60024609;60024609;60024609;60024609;60016733,2020-01-01,2020,Advances in Intelligent Systems and Computing,21945357.0,5100152904,,Book Series,943,,,648-657,"© 2020, Springer Nature Switzerland AG. Respir...",Chest X-ray | CheXNet | Deep learning | LibreH...,0,0,,undefined,
1,2-s2.0-85062605139,10.1038/s41598-019-40518-5,,30846826.0,Maintenance of MYC expression promotes de novo...,ar,Coleman D.,60016733,Oregon Health and Science University,Portland,United States,13,"Coleman, Daniel J.;Gao, Lina;Schwartzman, Jaco...",57190757543;55448906400;35725963500;6602509295...,60016733;60016733;60016733;60016733;60016733;6...,2019-12-01,1 December 2019,Scientific Reports,,21100200805,20452322.0,Journal,9,1.0,3823.0,,"© 2019, The Author(s). The BET bromodomain pro...",,1,1,OCTRI,CA178610,National Cancer Institute
2,2-s2.0-85063067563,10.1038/s41598-019-41402-y,,30886396.0,MR1 recycling and blockade of endosomal traffi...,ar,Karamooz E.,60016733;109869224,Oregon Health and Science University;Research ...,Portland;Portland,United States;United States,5,"Karamooz, Elham;Harriff, Melanie J.;Narayanan,...",14071474600;21734451400;57207847064;5720446108...,109869224-60016733;109869224-60016733;60016733...,2019-12-01,1 December 2019,Scientific Reports,,21100200805,20452322.0,Journal,9,1.0,4797.0,,"© 2019, The Author(s). The MHC-Ib molecule MR1...",,0,1,,R01AI29976,
3,2-s2.0-85061782645,10.1038/s41598-019-39003-w,,30783177.0,Toughening of Dental Composites with Thioureth...,ar,Fugolin A.,60016733;105621292,Oregon Health and Science University;Maringa U...,Portland,United States;Brazil,4,"Fugolin, Ana P.;Sundfeld, Daniel;Ferracane, Ja...",55207293800;55910743000;7004872922;57206657237,60016733;105621292;60016733;60016733,2019-12-01,1 December 2019,Scientific Reports,,21100200805,20452322.0,Journal,9,1.0,2286.0,,"© 2019, The Author(s). Stress of polymerizatio...",,0,1,,1R15-DE023211-01A1,
4,2-s2.0-85065568634,10.1038/s41598-019-43682-w,,31073224.0,Differential microRNA profile underlies the di...,ar,Simões A.,60027561;60016733;60008088,University of Illinois at Chicago;Oregon Healt...,Chicago;Portland;Sao Paulo,United States;United States;Brazil,9,"Simões, Alyne;Chen, Lin;Chen, Zujian;Zhao, Yan...",23566972200;8584978600;36698710900;57060978700...,60027561-60008088;60027561;60027561;60027561;6...,2019-12-01,1 December 2019,Scientific Reports,,21100200805,20452322.0,Journal,9,1.0,7160.0,,"© 2019, The Author(s). Oral mucosal wounds hea...",,0,1,,R01GM50875,Fundação de Amparo à Pesquisa do Estado de São...


### Add PII and DOI URI columns for ScienceDirect full text pull

In [9]:
sd_pii_uri = "https://api.elsevier.com/content/article/pii/"

df['sd_pii_uri'] = df.apply(lambda row: f"{sd_pii_uri}" + row['pii'] if row['pii'] else "", axis=1)

In [10]:
sd_doi_uri = "https://api.elsevier.com/content/article/doi/"

df['sd_doi_uri'] = df.apply(lambda row: f"{sd_doi_uri}" + row['doi'] if row['doi'] else "", axis=1)

### Export DataFrame to CSV

In [11]:
df.to_csv("ohsu_scopus_after_2010.csv")

## ScienceDirect Document Retrieval

### Create list of PII/URI tuples

In [12]:
piis = list(df[['eid', 'sd_pii_uri']].itertuples(index=False, name=False))
piis

[('2-s2.0-85065488887', ''),
 ('2-s2.0-85062605139', ''),
 ('2-s2.0-85063067563', ''),
 ('2-s2.0-85061782645', ''),
 ('2-s2.0-85065568634', ''),
 ('2-s2.0-85061716153', ''),
 ('2-s2.0-85061187949', ''),
 ('2-s2.0-85060548523', ''),
 ('2-s2.0-85059755044', ''),
 ('2-s2.0-85066234687', ''),
 ('2-s2.0-85067335747', ''),
 ('2-s2.0-85064248961', ''),
 ('2-s2.0-85062976120', ''),
 ('2-s2.0-85062827768', ''),
 ('2-s2.0-85061500872', ''),
 ('2-s2.0-85060620988', ''),
 ('2-s2.0-85065796826', ''),
 ('2-s2.0-85065674973', ''),
 ('2-s2.0-85060187210', ''),
 ('2-s2.0-85060154328', ''),
 ('2-s2.0-85063003050', ''),
 ('2-s2.0-85065567276', ''),
 ('2-s2.0-85060371877', ''),
 ('2-s2.0-85064901827', ''),
 ('2-s2.0-85062963558', ''),
 ('2-s2.0-85064531830', ''),
 ('2-s2.0-85061793788', ''),
 ('2-s2.0-85061494936', ''),
 ('2-s2.0-85062726457', ''),
 ('2-s2.0-85060528251', ''),
 ('2-s2.0-85065741319',
  'https://api.elsevier.com/content/article/pii/S0952791519300147'),
 ('2-s2.0-85058631034', ''),
 ('2-s2.

### Create list of DOI/URI tuples

In [13]:
dois = list(df[['eid', 'sd_doi_uri']].itertuples(index=False, name=False))
dois

[('2-s2.0-85065488887',
  'https://api.elsevier.com/content/article/doi/10.1007/978-3-030-17795-9_47'),
 ('2-s2.0-85062605139',
  'https://api.elsevier.com/content/article/doi/10.1038/s41598-019-40518-5'),
 ('2-s2.0-85063067563',
  'https://api.elsevier.com/content/article/doi/10.1038/s41598-019-41402-y'),
 ('2-s2.0-85061782645',
  'https://api.elsevier.com/content/article/doi/10.1038/s41598-019-39003-w'),
 ('2-s2.0-85065568634',
  'https://api.elsevier.com/content/article/doi/10.1038/s41598-019-43682-w'),
 ('2-s2.0-85061716153',
  'https://api.elsevier.com/content/article/doi/10.1038/s41526-019-0063-6'),
 ('2-s2.0-85061187949',
  'https://api.elsevier.com/content/article/doi/10.1097/AIA.0000000000000210'),
 ('2-s2.0-85060548523',
  'https://api.elsevier.com/content/article/doi/10.1038/s41598-018-37348-2'),
 ('2-s2.0-85059755044',
  'https://api.elsevier.com/content/article/doi/10.1038/s41467-018-07872-w'),
 ('2-s2.0-85066234687',
  'https://api.elsevier.com/content/article/doi/10.1038

### Iterate over PIIs, then iterate over DOIs

* We iterate over PIIs first because they are more likely to return HTTP Status 200.  This saves us lots of potential 404 requests.
* These loops may error out before they complete, so we check to see if we've already pulled the file in case we have to start over.  This saves us from making duplicate requests for files.
* The Scopus EID is used for filenames.

In [45]:
for eid, pii in piis:
    path = pathlib.Path(f"fulltext/{eid}")
    if pii and not path.exists():
        response = requests.get(pii, headers=headers)
        logger.info(f"Status {response.status_code}: {eid}, {pii}")ab
        if response.status_code == 200:
            with open(f"fulltext/{eid}", 'w') as f:
                f.write(response.text)


In [None]:
for eid, doi in dois:
    path = pathlib.Path(f"fulltext/{eid}")
    if doi and not path.exists():
        response = requests.get(doi, headers=headers)
        logger.info(f"Status {response.status_code}: {eid}, {doi}")
        if response.status_code == 200:
            with open(f"fulltext/{eid}", 'w') as f:
                f.write(response.text)