# PKP crawler

Collect altmetric data for PKP publications

1. Collect FB shares from Altmetric.com via DOI
2. Collect FB shares from FB directly via URLs
    - Resolved DOI
    - Original PKP URL
    - (optional) PMID
    - (optional) PMCID

In [35]:
import datetime
import time
import sys
import re
import requests
import json
import urllib
from dateutil.parser import parse
from random import shuffle

import pandas as pd
import numpy as np
import lxml.etree as ET
from pathlib import Path
import configparser
from ATB.ATB.Facebook import Facebook
from ATB.ATB.Altmetric import Altmetric

from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()




In [3]:
# Load config
# API credentials are read from the local file `config.cnf`, which must provide
# a [facebook] section with `app_id` and `app_secret` and an [altmetric]
# section with `key`.
Config = configparser.ConfigParser()
Config.read('config.cnf')
FACEBOOK_APP_ID = Config.get('facebook', 'app_id')
FACEBOOK_APP_SECRET = Config.get('facebook', 'app_secret')
ALTMETRIC_KEY = Config.get('altmetric', 'key')

# Module-level API clients; fb_query and collect_am_engagement use them as globals.
fb_graph = Facebook(app_id=FACEBOOK_APP_ID, app_secret=FACEBOOK_APP_SECRET)
altmetric = Altmetric(api_key = ALTMETRIC_KEY)

Generated access token: 287299458433880|6Y_ml710QWnU7HBYLWjaneoWVKU


In [4]:
# Folder that holds the CSV files read and written by the cells in this notebook.
data_folder = Path("data/pkp/")
# PKP export file (NOTE(review): not referenced again in the visible cells).
input_file = data_folder / "PKP_20171220.csv"

In [5]:
## Functions
def load_dataset(ids_file, resolv_dois_file):
    """Combine the NCBI id table with the resolved DOI URLs.

    Both CSV files are indexed by their ``doi`` column. Only DOIs present in
    both files are kept (inner join on the index), only the ``doi_url``
    column is taken from the resolved-DOI file, and fully identical rows are
    removed from the result.

    Parameters
    ----------
    ids_file : path-like
        CSV with a ``doi`` column and an ``ncbi_ts`` date column.
    resolv_dois_file : path-like
        CSV with ``doi``, ``doi_url`` and a ``doi_resolve_ts`` date column.

    Returns
    -------
    pandas.DataFrame
        The columns of ``ids_file`` plus ``doi_url``, indexed by DOI.
    """
    ncbi_frame = pd.read_csv(ids_file, index_col="doi", parse_dates=['ncbi_ts'])
    resolved_frame = pd.read_csv(
        resolv_dois_file,
        index_col="doi",
        parse_dates=['doi_resolve_ts'],
    )

    url_column = resolved_frame[['doi_url']]
    combined = ncbi_frame.merge(
        url_column,
        how="inner",
        left_index=True,
        right_index=True,
    )
    return combined.drop_duplicates()

# Facebook
def fb_query(url):
    """Ask the Facebook Graph API for the Open Graph data of one URL.

    The URL is percent-encoded and passed as the object id, requesting the
    ``engagement`` and ``og_object`` fields. Any exception raised while
    querying or reading the response is captured and returned instead of
    being propagated, so a caller can keep iterating over many URLs.

    Returns
    -------
    tuple
        ``(og_object, og_engagement, og_error)``; each element is ``None``
        when the corresponding piece is not available.
    """
    found = {'og_object': None, 'engagement': None}
    failure = None

    try:
        response = fb_graph.get_object(
            id=urllib.parse.quote_plus(url),
            fields="engagement, og_object"
        )

        # Same lookup order as the fields are returned in the result tuple.
        for field in ('og_object', 'engagement'):
            if field in response:
                found[field] = response[field]
    except Exception as exc:
        failure = exc

    return (found['og_object'], found['engagement'], failure)

def collect_fb_engagement(df):
    """Query Facebook for every URL in ``df`` and attach the raw responses.

    One request is issued per row via ``fb_query``. Results are collected
    positionally and written back in a single assignment per column, so each
    row receives exactly the response for its own URL and the cost stays
    linear in the number of rows (no per-row boolean mask over the frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``url`` column. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with four additional columns:

        - ``og_obj``: JSON string of the Open Graph object, ``None`` if absent/empty
        - ``og_eng``: JSON string of the engagement data, ``None`` if absent/empty
        - ``og_err``: ``str()`` of the captured exception, ``None`` on success
        - ``og_ts``: ``str()`` of the local timestamp taken just before the request
    """
    df = df.copy()

    og_objs, og_engs, og_errs, og_tss = [], [], [], []
    for url in tqdm_notebook(df['url'], total=len(df)):
        now = datetime.datetime.now()
        og_object, og_engagement, og_error = fb_query(url)

        # Empty responses are stored as missing values, not as "{}" / "null".
        og_objs.append(json.dumps(og_object) if og_object else None)
        og_engs.append(json.dumps(og_engagement) if og_engagement else None)
        og_errs.append(str(og_error) if og_error is not None else None)
        og_tss.append(str(now))

    # Build the columns on df's own index so values stay in row order even
    # when index labels repeat; dtype=object keeps None as None.
    df['og_obj'] = pd.Series(og_objs, index=df.index, dtype=object)
    df['og_eng'] = pd.Series(og_engs, index=df.index, dtype=object)
    df['og_err'] = pd.Series(og_errs, index=df.index, dtype=object)
    df['og_ts'] = pd.Series(og_tss, index=df.index, dtype=object)

    return df

def extract_fb_shares(df):
    """Pivot raw Facebook responses into one row of share counts per DOI.

    For every row of ``df`` that has an Open Graph object, the object id is
    stored in ``<prefix>_ogid`` and, when engagement data is present as well,
    its ``share_count`` is stored in ``<prefix>``. ``<prefix>`` is the part
    of the row's ``type`` before the first underscore (e.g. ``pkp`` for
    ``pkp_url``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``doi``, ``type``, ``og_obj`` and ``og_eng``; the
        latter two hold JSON strings or missing values.

    Returns
    -------
    pandas.DataFrame
        Indexed by DOI (in order of first appearance in ``df``) with the
        columns ``pkp``, ``pmid``, ``pmc``, ``doi``, their ``*_ogid``
        counterparts and ``total``. Cells without data stay NaN; ``total``
        is created but not filled in by this function.
    """
    result_cols = ['pkp', 'pkp_ogid',
                   'pmid', 'pmid_ogid',
                   'pmc', 'pmc_ogid',
                   'doi', 'doi_ogid',
                   'total']
    # unique() keeps first-appearance order, giving a reproducible row order.
    shares = pd.DataFrame(columns=result_cols, index=df.doi.unique())

    for row in df.itertuples():
        if pd.isnull(row.og_obj):
            continue

        prefix = row.type.split("_")[0]
        shares.loc[row.doi, prefix + "_ogid"] = json.loads(row.og_obj)['id']

        # An Open Graph object can exist without engagement data; parsing a
        # missing value would raise, so the share count is simply left empty.
        if pd.notnull(row.og_eng):
            engagement = json.loads(row.og_eng)
            shares.loc[row.doi, prefix] = int(engagement['share_count'])

    return shares
        
# Altmetric
def collect_am_engagement(df):
    """Query Altmetric for every DOI in the index of ``df``.

    One request is issued per index entry. Failures are captured per row so
    a single bad DOI does not abort the whole run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by DOI. Its columns are ignored and the input frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``df`` and three columns:

        - ``am_resp``: JSON string of the response; the string ``"null"``
          when the request failed or returned ``None``
        - ``am_err``: ``str()`` of the captured exception, ``None`` on success
        - ``am_ts``: ``str()`` of the local timestamp taken just before the request
    """
    df = df[[]].copy()

    am_resps, am_errs, am_tss = [], [], []
    for doi in tqdm_notebook(df.index, total=len(df)):
        # Timestamp each request individually, as collect_fb_engagement does.
        now = datetime.datetime.now()
        try:
            am_resp = altmetric.doi(doi=doi, fetch=True)
            am_err = None
        except Exception as e:
            am_resp = None
            am_err = e

        am_resps.append(json.dumps(am_resp))
        # Keep "no error" as a real missing value rather than the text "None".
        am_errs.append(None if am_err is None else str(am_err))
        am_tss.append(str(now))

    # Build the columns on df's own index so values stay in row order even
    # when a DOI occurs more than once in the index.
    df['am_resp'] = pd.Series(am_resps, index=df.index, dtype=object)
    df['am_err'] = pd.Series(am_errs, index=df.index, dtype=object)
    df['am_ts'] = pd.Series(am_tss, index=df.index, dtype=object)

    return df

## Filter out invalid records

- Different DOIs with identical PKP URLs
- DOIs resolved to the same URL

In [82]:
# Articles indexed by DOI, with one column per URL variant.
article_with_urls = pd.read_csv(data_folder / "articles_with_urls.csv", index_col="doi")

# Long format: one row per (doi, url type, url). Placeholder strings are
# treated as missing, then incomplete and duplicated rows are discarded.
df_urls = (
    article_with_urls
    .reset_index()
    .melt(
        id_vars='doi',
        value_vars=['pmc_url', 'pmid_url', 'pkp_url', 'doi_url'],
        var_name="type",
        value_name="url",
    )
    .replace(["None", "null", ""], np.nan)
    .dropna()
    .drop_duplicates()
)

In [83]:
# Within each URL type, drop every row whose (netloc, path) pair occurs more
# than once, i.e. different DOIs that point at the same location. Scheme and
# query string are not part of the comparison.
for url_type in ["doi_url", "pkp_url", "pmc_url", "pmid_url"]:
    selection = df_urls[df_urls.type == url_type].copy()
    parsed_urls = [urllib.parse.urlparse(address) for address in selection.url]
    selection['netloc'] = [parts.netloc for parts in parsed_urls]
    selection['path'] = [parts.path for parts in parsed_urls]
    # keep=False flags all members of a duplicate group, not just the repeats.
    is_repeated = selection.duplicated(subset=['netloc', 'path'], keep=False)
    bad_ones = selection.index[is_repeated.values]
    df_urls = df_urls.drop(bad_ones)
    print("Removed {} rows with bad {}".format(len(bad_ones), url_type))

Removed 14049 rows with bad doi_url
Removed 1190 rows with bad pkp_url
Removed 0 rows with bad pmc_url
Removed 0 rows with bad pmid_url


In [84]:
# Sanity check: duplicate URLs need to appear in both PKP and DOI
# Selects every row whose URL occurs more than once in df_urls and counts
# those rows per URL type; the counts for pkp_url and doi_url should match.
df_urls[df_urls.url.isin(df_urls[df_urls.url.duplicated()].url)].type.value_counts()

pkp_url    117861
doi_url    117861
Name: type, dtype: int64

## Collect FB engagement

In [96]:
# Query Facebook for every remaining URL (one request per row) and persist
# the raw responses before any further processing.
fb_results = collect_fb_engagement(df_urls)
fb_results.to_csv(data_folder / "fb_responses.csv")




In [97]:
# Reduce the raw responses to one row of share counts / Open Graph ids per DOI
# and save the result.
fb_shares = extract_fb_shares(fb_results)
fb_shares.to_csv(data_folder / "fb_shares.csv")
# Summary statistics, computed after casting every column to float.
fb_shares.astype(float).describe()

Unnamed: 0,pkp,pkp_ogid,pmid,pmid_ogid,pmc,pmc_ogid,doi,doi_ogid,total
count,17.0,17.0,0.0,0.0,1.0,1.0,16.0,16.0,0.0
mean,0.705882,889641100000000.0,,,0.0,1150652000000000.0,1.0,1075145000000000.0,
std,2.417765,426251300000000.0,,,,,2.607681,504292300000000.0,
min,0.0,128677000000000.0,,,0.0,1150652000000000.0,0.0,128677000000000.0,
25%,0.0,628486600000000.0,,,0.0,1150652000000000.0,0.0,683580900000000.0,
50%,0.0,828524300000000.0,,,0.0,1150652000000000.0,0.0,1092632000000000.0,
75%,0.0,1111781000000000.0,,,0.0,1150652000000000.0,0.25,1546874000000000.0,
max,10.0,1745181000000000.0,,,0.0,1150652000000000.0,10.0,1852334000000000.0,


## Collect Altmetric engagement

In [113]:
# Query Altmetric once per entry of article_with_urls' DOI index and persist
# the raw responses.
am_queries = collect_am_engagement(article_with_urls)
am_queries.to_csv(data_folder / "am_responses.csv")




## Playground

In [89]:
# Reload the previously saved raw responses from disk.
am_results = pd.read_csv(data_folder / "am_responses.csv", parse_dates=['am_ts'])
# NOTE(review): collect_fb_engagement names its timestamp column 'og_ts', not
# 'ts' — confirm that the CSV on disk really has a 'ts' column, otherwise this
# read_csv call cannot parse it.
fb_results = pd.read_csv(data_folder / "fb_responses.csv",
                         parse_dates=['ts'],
                         dtype={'doi': str, 'type': str, 'url': str, 'og_obj': str, 'og_eng': str, 'og_err': str})
# collect_am_engagement stores a missing response as json.dumps(None), i.e. the
# string "null"; turn any such strings into real missing values.
am_results = am_results.replace("null", np.nan)

In [95]:
# Number of distinct DOIs left in df_urls vs. number of stored Altmetric responses.
len(set(df_urls.doi)), len(am_results)

(281607, 278496)

In [93]:
# 117861 is the per-type count printed by the duplicate-URL sanity check
# (URLs shared by a pkp_url row and a doi_url row).
# NOTE(review): presumably subtracted so shared URLs are counted once — confirm.
len(df_urls) - 117861, len(fb_results)

(365181, 373780)

In [105]:
# For each URL type, keep only the stored Facebook responses whose DOI still
# has a URL of that type in df_urls, and report how many rows are dropped.
frames = []
for url_type in ['doi_url', 'pkp_url', 'pmid_url', 'pmc_url']:
    responses = fb_results.loc[fb_results.type == url_type].copy()
    valid_dois = df_urls.loc[df_urls.type == url_type, 'doi'].tolist()
    kept = responses[responses.doi.isin(valid_dois)]
    frames.append(kept)
    print("{}: need to remove {}".format(url_type, len(responses) - len(kept)))

doi_url: need to remove 14544
pkp_url: need to remove 1104
pmid_url: need to remove 1
pmc_url: need to remove 1


In [107]:
# Persist the per-type filtered Facebook responses as a single file.
pd.concat(frames).to_csv(data_folder / "fb_responses_filtered.csv")

In [114]:
# Keep only Altmetric responses whose DOI is still present in df_urls and
# save them indexed by DOI.
am_results[am_results.doi.isin(df_urls.doi)].set_index('doi').to_csv(data_folder / "am_responses_filtered.csv")

In [111]:
# Number of Altmetric responses before and after filtering by df_urls' DOIs.
len(am_results), len(am_results[am_results.doi.isin(df_urls.doi)])

(278496, 274547)