# PKP crawler

Collect altmetric data for PKP publications from two sources:

1. Collect FB shares from Altmetric.com via DOI
2. Collect FB shares from FB directly via URLs
    - Resolved DOI
    - Original PKP URL
    - (opt) PMID
    - (opt) PMCID

In [1]:
import datetime
import time
import sys
import re
import requests
import json
import urllib
from dateutil.parser import parse
from random import shuffle

import pandas as pd
import numpy as np
import lxml.etree as ET
from pathlib import Path
import configparser
from ATB.ATB.Facebook import Facebook
from ATB.ATB.Altmetric import Altmetric

from tqdm import tqdm, tqdm_notebook
# Enable tqdm's pandas integration (progress_apply etc.).
# NOTE(review): none of the cells visible here call progress_apply — confirm it is still needed.
tqdm_notebook().pandas()




In [2]:
# Load API credentials from the local config file and build the API clients.
Config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means the config is missing.
if not Config.read('config.cnf'):
    raise FileNotFoundError("Could not read config file 'config.cnf'")
FACEBOOK_APP_ID = Config.get('facebook', 'app_id')
FACEBOOK_APP_SECRET = Config.get('facebook', 'app_secret')
ALTMETRIC_KEY = Config.get('altmetric', 'key')

# Clients used by the collection functions below
fb_graph = Facebook(app_id=FACEBOOK_APP_ID, app_secret=FACEBOOK_APP_SECRET)
altmetric = Altmetric(api_key = ALTMETRIC_KEY)

Generated access token: <redacted>


In [94]:
# Folder that holds the CSV files read and written by the cells below
data_folder = Path("data/clean_pkp_sample/")
# NOTE(review): input_file is not referenced by any cell visible here
input_file = data_folder / "PKP_20171220.csv"

In [112]:
## Functions
def load_dataset(ids_file, resolv_dois_file):
    """Join the NCBI identifier table with the resolved DOI URLs.

    Both CSV inputs are read with their ``doi`` column as index. Only DOIs
    present in both inputs are kept (inner join on the index), and the
    ``doi_url`` column of the resolved-DOI table is appended to the NCBI
    columns. Rows that are complete duplicates are dropped from the result.
    """
    ids_table = pd.read_csv(ids_file, index_col="doi", parse_dates=['ncbi_ts'])
    doi_table = pd.read_csv(
        resolv_dois_file, index_col="doi", parse_dates=['doi_resolve_ts'])

    # Only the resolved URL is carried over from the second table
    url_column = doi_table[['doi_url']]
    joined = pd.merge(ids_table, url_column,
                      how="inner", left_index=True, right_index=True)
    return joined.drop_duplicates()

# Facebook
def fb_query(url):
    """Look up the Open Graph data Facebook holds for a URL.

    Args:
        url: URL to look up. It is percent-encoded with ``quote_plus`` and
            passed as the ``id`` of the Graph API object request.

    Returns:
        Tuple ``(og_object, og_engagement, og_error)``. The first two are the
        ``og_object`` and ``engagement`` entries of the response, or ``None``
        when the response does not contain them. ``og_error`` is the
        exception raised during the lookup, or ``None`` when no exception
        occurred. Exceptions derived from ``Exception`` are returned, not
        raised, so a single failing URL does not abort a crawl.
    """
    # A bare `import urllib` does not guarantee that the `urllib.parse`
    # submodule is loaded, so import the function explicitly.
    from urllib.parse import quote_plus

    og_object = None
    og_engagement = None
    og_error = None

    try:
        fb_response = fb_graph.get_object(
            id=quote_plus(url),
            fields="engagement, og_object"
        )

        if 'og_object' in fb_response:
            og_object = fb_response['og_object']
        if 'engagement' in fb_response:
            og_engagement = fb_response['engagement']
    except Exception as e:
        # Deliberate best-effort: hand the error back to the caller
        og_error = e

    return (og_object, og_engagement, og_error)

def collect_fb_engagement(df):
    """Query Facebook for every URL in *df* and attach the raw results.

    Args:
        df: DataFrame with a ``url`` column, one row per URL to query.

    Returns:
        A copy of *df* with four added columns:
        ``og_obj`` / ``og_eng`` hold the JSON-encoded ``og_object`` and
        engagement data returned by ``fb_query`` (``None`` when empty),
        ``og_err`` holds the string form of the error (``None`` when there
        was none), and ``og_ts`` holds the string timestamp taken just
        before the query was sent.
    """
    df = df.copy()

    # Results are gathered row by row and assigned as whole columns at the
    # end. This avoids a full boolean-mask scan of the frame per row and
    # keeps each result on the row whose URL produced it.
    og_objs, og_engs, og_errs, og_tss = [], [], [], []
    for url in tqdm_notebook(df['url'].tolist(), total=len(df)):
        now = datetime.datetime.now()
        og_object, og_engagement, og_error = fb_query(url)

        og_objs.append(json.dumps(og_object) if og_object else None)
        og_engs.append(json.dumps(og_engagement) if og_engagement else None)
        og_errs.append(str(og_error) if og_error else None)
        og_tss.append(str(now))

    df['og_obj'] = og_objs
    df['og_eng'] = og_engs
    df['og_err'] = og_errs
    df['og_ts'] = og_tss

    return df

def extract_fb_shares(df):
    """Pivot the raw Facebook responses into one row of share counts per DOI.

    Args:
        df: DataFrame with columns ``doi``, ``type`` (e.g. ``"pkp_url"``),
            ``og_obj`` and ``og_eng`` (JSON strings or null).

    Returns:
        DataFrame indexed by DOI, in order of first appearance in *df*.
        For each row of *df* with a non-null ``og_obj``, the prefix of
        ``type`` before the first underscore selects the target columns:
        ``<prefix>_ogid`` receives the Open Graph object id and ``<prefix>``
        receives the integer ``share_count``. The share count is only
        filled in when ``og_eng`` is present as well. The ``total`` column
        is created but not populated here.
    """
    result_cols = ['pkp', 'pkp_ogid',
                   'pmid', 'pmid_ogid',
                   'pmc', 'pmc_ogid',
                   'doi', 'doi_ogid',
                   'total']
    # unique() keeps first-appearance order, so the row order is
    # reproducible between runs (iteration order of a set is not).
    shares = pd.DataFrame(columns=result_cols, index=df.doi.unique())

    for row in df.itertuples():
        if pd.isnull(row.og_obj):
            continue

        source = row.type.split("_")[0]
        shares.loc[row.doi, source + "_ogid"] = json.loads(row.og_obj)['id']
        # An og_object can be present without engagement data; skip the
        # count in that case rather than failing on json.loads(NaN).
        if pd.notnull(row.og_eng):
            shares.loc[row.doi, source] = int(json.loads(row.og_eng)['share_count'])

    return shares
        
# Altmetric
def collect_am_engagement(df):
    """Query Altmetric for every DOI in the index of *df*.

    Args:
        df: DataFrame indexed by DOI. Only the index is used; all columns
            are discarded.

    Returns:
        DataFrame with the same index and three columns: ``am_resp``
        (JSON-encoded response), ``am_err`` (string form of the exception
        raised by the lookup) and ``am_ts`` (string timestamp taken just
        before each query). ``am_resp`` and ``am_err`` are ``None`` when
        there is no response or no error respectively, matching the
        convention used by ``collect_fb_engagement``, instead of the
        literal strings ``"null"`` / ``"None"``.
    """
    df = df[[]].copy()

    am_resps, am_errs, am_tss = [], [], []
    for doi in tqdm_notebook(df.index.tolist(), total=len(df)):
        # Timestamp each query individually rather than once for the batch
        now = datetime.datetime.now()
        try:
            am_resp = altmetric.doi(doi=doi, fetch=True)
            am_err = None
        except Exception as e:
            am_resp = None
            am_err = e

        am_resps.append(json.dumps(am_resp) if am_resp is not None else None)
        am_errs.append(str(am_err) if am_err is not None else None)
        am_tss.append(str(now))

    df['am_resp'] = am_resps
    df['am_err'] = am_errs
    df['am_ts'] = am_tss

    return df

In [105]:
# One row per article, indexed by DOI
article_with_urls = pd.read_csv(data_folder / "articles_with_urls.csv", index_col="doi")

# Long format: one row per (doi, URL type) pair; rows with missing values are dropped
df_urls = article_with_urls.reset_index().melt(
    id_vars='doi',
    value_vars=['pmc_url', 'pmid_url', 'pkp_url', 'doi_url'],
    var_name="type",
    value_name="url")
df_urls = df_urls.dropna()

In [90]:
# Count rows per URL type after treating the placeholder strings as missing.
# dropna() only accepts how="any" or how="all"; "any" drops every row that
# has at least one missing value.
df_urls.replace(["None", "", "null"], np.nan).dropna(how="any").type.value_counts().sort_values()

pmid_url      6925
pmc_url       7102
doi_url     201284
pkp_url     284766
Name: type, dtype: int64

In [88]:
# Rows per URL type in the combined table `t`.
# `t` is assigned in a cell further down (the pd.concat of pmid/pmc/pkp/doi); run that cell first.
t.type.value_counts().sort_values()

pmid_url      6887
pmc_url       7064
pkp_url     159741
doi_url     200088
Name: type, dtype: int64

In [80]:
# Inspect the Python type of the og_obj value stored under index label 3 of the doi_url rows
type(t[t.type=="doi_url"].og_obj[3])

str

## Collect FB engagement

In [96]:
# Query Facebook for every URL in df_urls and save the raw responses to the data folder
fb_results = collect_fb_engagement(df_urls)
fb_results.to_csv(data_folder / "fb_responses.csv")




In [97]:
# Reduce the raw responses to one row of share counts per DOI and save it
fb_shares = extract_fb_shares(fb_results)
fb_shares.to_csv(data_folder / "fb_shares.csv")
# Summary statistics; the cast to float also covers the *_ogid columns, whose
# statistics are object ids treated as numbers and carry no meaning.
fb_shares.astype(float).describe()

Unnamed: 0,pkp,pkp_ogid,pmid,pmid_ogid,pmc,pmc_ogid,doi,doi_ogid,total
count,17.0,17.0,0.0,0.0,1.0,1.0,16.0,16.0,0.0
mean,0.705882,889641100000000.0,,,0.0,1150652000000000.0,1.0,1075145000000000.0,
std,2.417765,426251300000000.0,,,,,2.607681,504292300000000.0,
min,0.0,128677000000000.0,,,0.0,1150652000000000.0,0.0,128677000000000.0,
25%,0.0,628486600000000.0,,,0.0,1150652000000000.0,0.0,683580900000000.0,
50%,0.0,828524300000000.0,,,0.0,1150652000000000.0,0.0,1092632000000000.0,
75%,0.0,1111781000000000.0,,,0.0,1150652000000000.0,0.25,1546874000000000.0,
max,10.0,1745181000000000.0,,,0.0,1150652000000000.0,10.0,1852334000000000.0,


In [8]:
# Load result files from the data folder, each indexed by DOI.
# The cells below read pmid_*/pmc_* columns from ncbi_res, pkp_* columns from
# pkp_res, and the DOI/Altmetric columns from doi_res.
ncbi_res = pd.read_csv(data_folder / "next_try.csv", index_col="doi")
pkp_res = pd.read_csv(data_folder / "full.csv", index_col="doi")
doi_res = pd.read_csv(data_folder / "responses.csv", index_col="doi")

In [36]:
# Common column layout shared by the per-type frames (pmid, pmc, pkp, doi) built below
url_cols = ['doi', 'type', 'url', 'og_obj', 'og_eng', 'og_err', 'ts']

In [50]:
# PubMed (pmid) rows of the NCBI results, reshaped to the common url_cols layout.
# These results carry no timestamp, so `ts` is left empty.
pmid = (
    ncbi_res.loc[ncbi_res.pmid_url.notnull(),
                 ['pmid_og_eng', 'pmid_og_err', 'pmid_og_obj', 'pmid_url']]
    .rename(columns={'pmid_og_eng': 'og_eng',
                     'pmid_og_err': 'og_err',
                     'pmid_og_obj': 'og_obj',
                     'pmid_url': 'url'})
    .assign(ts=None, type="pmid_url")
    .reset_index()[url_cols]
)

In [51]:
# PMC rows of the NCBI results, reshaped to the common url_cols layout.
# The rows are filtered on pmc_url, so the pmc_* columns must be selected
# (selecting the pmid_* columns here would label PubMed data as "pmc_url").
# NOTE(review): assumes next_try.csv has pmc_og_eng / pmc_og_err / pmc_og_obj
# columns parallel to the pmid_* ones — confirm against the file.
pmc = ncbi_res[ncbi_res.pmc_url.notnull()][['pmc_og_eng', 'pmc_og_err', 'pmc_og_obj', 'pmc_url']].copy()
# These results carry no timestamp, so `ts` is left empty
pmc['timestamp'] = None
pmc['type'] = "pmc_url"
pmc.columns = ['og_eng', 'og_err', 'og_obj', 'url', 'ts', 'type']
pmc = pmc.reset_index()
pmc = pmc[url_cols]

In [52]:
# PKP rows, reshaped to the common url_cols layout.
# These results carry no timestamp, so `ts` is left empty.
pkp_renames = {'pkp_og_eng': 'og_eng',
               'pkp_og_err': 'og_err',
               'pkp_og_obj': 'og_obj',
               'pkp_url': 'url'}
pkp = pkp_res[list(pkp_renames)].rename(columns=pkp_renames)
pkp = pkp.assign(ts=None, type="pkp_url")
pkp = pkp.reset_index()[url_cols]

In [69]:
# Resolved-DOI rows, reshaped to the common url_cols layout.
# Rows whose doi_url is the placeholder string "None" are excluded.
# .copy() makes `doi` an independent frame, so adding the `type` column below
# is not a chained assignment on a slice of doi_res.
# NOTE(review): og_err is filled from doi_resolve_error, which by its name is a
# DOI-resolution error rather than a Facebook error — confirm this is intended.
doi = doi_res[doi_res.doi_url != "None"][['fb_engagement', 'doi_resolve_error', 'fb_og_object', 'doi_url', 'timestamp']].copy()
doi['type'] = "doi_url"
doi.columns = ['og_eng', 'og_err', 'og_obj', 'url', 'ts', 'type']
doi = doi.reset_index()
doi = doi[url_cols]

In [125]:
# Show the PKP rows for which an og_obj value is present
pkp[pkp.og_obj.notnull()]

Unnamed: 0,doi,type,url,og_obj,og_eng,og_err,ts
45,10.1016/j.antro.2016.05.002,pkp_url,http://www.revistas.unam.mx/index.php/antropol...,"{""id"": ""1148610278554284"", ""description"": ""Afr...","{""comment_plugin_count"": 0, ""comment_count"": 1...",,
49,10.1016/j.antro.2016.09.001,pkp_url,http://www.revistas.unam.mx/index.php/antropol...,"{""id"": ""1710929332311666"", ""description"": ""Pro...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,
104,10.1016/j.rai.2016.02.003,pkp_url,http://www.revistas.usp.br/rai/article/view/11...,"{""id"": ""985962438154420"", ""description"": ""COOP...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,
106,10.1016/j.rai.2016.04.002,pkp_url,http://www.revistas.usp.br/rai/article/view/11...,"{""id"": ""1049925738376216"", ""description"": ""WHY...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,
121,10.11156/104,pkp_url,http://www.aibr.org/OJ/index.php/aibr/article/...,"{""id"": ""1055939894495430"", ""description"": ""La ...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,
126,10.11156/109,pkp_url,http://www.aibr.org/OJ/index.php/aibr/article/...,"{""id"": ""854424711271850"", ""description"": ""Apor...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,
127,10.11156/11,pkp_url,http://www.aibr.org/OJ/index.php/aibr/article/...,"{""id"": ""840635409357415"", ""description"": ""Inve...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,
137,10.11156/119,pkp_url,http://www.aibr.org/OJ/index.php/aibr/article/...,"{""id"": ""1082797975094465"", ""description"": ""Len...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,
158,10.11156/138,pkp_url,http://www.aibr.org/OJ/index.php/aibr/article/...,"{""id"": ""727825353934128"", ""description"": ""Cons...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,
163,10.11156/142,pkp_url,http://www.aibr.org/OJ/index.php/aibr/article/...,"{""id"": ""775167982554263"", ""description"": ""Las ...","{""comment_plugin_count"": 0, ""comment_count"": 0...",,


In [122]:
# Stack the four per-type frames into one table; the row labels of the
# individual frames are kept, so labels repeat across types.
frames = [pmid, pmc, pkp, doi]
t = pd.concat(frames)
# Normalise placeholder strings to real missing values
t = t.replace(["None", "null", ""], np.nan)

In [123]:
# Save the combined table without its index.
# NOTE(review): this writes to the current working directory, not data_folder — confirm the target path.
t.to_csv("fb_responses.csv", index=False)

## Collect Altmetric engagement

In [113]:
# Query Altmetric for every DOI in the index of article_with_urls and save the raw responses
am_queries = collect_am_engagement(article_with_urls)
am_queries.to_csv(data_folder / "am_responses.csv")




In [121]:
# Export the Altmetric columns of doi_res under the am_resp / am_err / am_ts names
x = doi_res[['am_response', 'am_response_error', 'timestamp']]
x.columns = ['am_resp', 'am_err', 'am_ts']
# NOTE(review): this writes to the current working directory, not data_folder — confirm the target path.
x.to_csv("am_responses.csv")

In [83]:
# Load next_try.csv with the default integer index (doi stays a regular column)
ncbi_results = pd.read_csv(data_folder / "next_try.csv")

In [68]:
# Load responses.csv indexed by DOI
df = pd.read_csv(data_folder / "responses.csv", index_col="doi")