In [102]:
import datetime, time, os, sys, re
import json
import ast
from pprint import pprint
from time import sleep
import urllib
import requests
import pandas as pd

# API wrappers
import facebook

import sqlite3 as lite

import argparse, configparser
Config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail loudly here instead of with an opaque
# NoSectionError on the first Config.get() below.
if not Config.read('config.cnf'):
    raise FileNotFoundError("Could not read configuration file 'config.cnf'")

# Load config: the API credentials live in config.cnf rather than in the notebook
FACEBOOK_APP_ID = Config.get('facebook', 'app_id')
FACEBOOK_APP_SECRET = Config.get('facebook', 'app_secret')
ALTMETRIC_KEY = Config.get('altmetric', 'key')

In [103]:
def init_db(db_path = 'data/metrics.db'):
    """
    Open the SQLite results database and make sure its schema exists.

    Creates the ``wos`` table and a unique index on its ``doi`` column when
    they are missing, so calling this repeatedly on the same file is harmless.

    Parameters
    ----------
    db_path : str, optional
        Path of the SQLite file to open. Defaults to 'data/metrics.db';
        ':memory:' gives a throw-away in-memory database.

    Returns
    -------
    sqlite3.Connection
        The open connection to the database.
    """
    litecon = lite.connect(db_path)

    cols = ("doi TEXT, url TEXT, og_id INTEGER, og_obj TEXT, "
            "fb_comments INTEGER, fb_comment_plugins INTEGER, fb_reactions INTEGER, fb_shares INTEGER, "
            "am_posts INTEGER, am_response TEXT, "
            "timestamp TEXT")

    # Used as a context manager, the connection commits on success and rolls
    # back if one of the statements raises.
    with litecon:
        litecur = litecon.cursor()

        # Result table: Facebook and Altmetric columns, keyed by DOI
        litecur.execute("CREATE TABLE IF NOT EXISTS wos ({})".format(cols))

        # At most one row per DOI
        litecur.execute("CREATE UNIQUE INDEX IF NOT EXISTS wos_doi ON wos(doi)")

    return litecon

In [104]:
# Retrieve App Access Token
def init_fb(app_id, app_secret):
    """
    Request a Facebook app access token and wrap it in a Graph API client.

    Parameters
    ----------
    app_id : str
        Facebook app id, sent as ``client_id``.
    app_secret : str
        Facebook app secret, sent as ``client_secret``.

    Returns
    -------
    facebook.GraphAPI or bool
        A client initialised with the token, or ``False`` when the request
        fails or the response does not contain an ``access_token``.
    """
    payload = {'grant_type': 'client_credentials', 'client_id': app_id, 'client_secret': app_secret}
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever;
        # requests.exceptions.Timeout is a RequestException and is handled below.
        r = requests.post('https://graph.facebook.com/oauth/access_token', params = payload, timeout = 10)
    except requests.exceptions.RequestException as e:
        print(e)
        return False

    # The endpoint answers with JSON; parse it as JSON rather than as a Python
    # literal (ast.literal_eval rejects JSON's true/false/null).
    try:
        token = r.json()['access_token']
    except (ValueError, KeyError, TypeError):
        print("Could not obtain an access token (HTTP {}): {}".format(r.status_code, r.text))
        return False

    # Never echo the full token: notebook outputs get saved and shared.
    print("Generated access token: " + token[:6] + "...")

    return facebook.GraphAPI(token)

In [111]:
# Adapted from PyAltmetric (https://github.com/CenterForOpenScience/PyAltmetric)
# and also pyAltmetric (https://github.com/wearp/pyaltmetric)

class AltmetricException(Exception):
    """Common ancestor of every error raised by this Altmetric wrapper."""

class JSONParseException(AltmetricException):
    """
    The body of an HTTP response could not be decoded as JSON,
    most likely because the site answered in an unexpected format.
    """

class AltmetricHTTPException(AltmetricException):
    """
    The Altmetric API answered with an HTTP status treated as an error.

    The exception message is a readable explanation for the known status
    codes and the bare status code otherwise. The numeric code is also kept on
    ``status_code`` so callers can branch on it (e.g. back off when rate limited).
    """
    def __init__(self, status_code):
        response_codes = {
            403:"You are not authorized for this call.",
            420:"Rate Limit Reached",
            # NOTE(review): 429 is the standard "Too Many Requests" status;
            # 420 is kept for backward compatibility.
            429:"Rate Limit Reached",
            502:"API is down.",
        }
        self.status_code = status_code
        super(AltmetricHTTPException, self).__init__(
            response_codes.get(status_code, status_code)
        )

class Altmetric(object):
    """
    Minimal client for the Altmetric HTTP API.

    ``doi()`` and ``uri()`` look up a single research output; with
    ``fetch=True`` they use the ``fetch/`` variant of the endpoint.
    Extra keyword arguments are passed on as query-string parameters.
    """

    def __init__(self, api_key = None, api_version = 'v1'):
        """
        Cache API key and version.

        Parameters
        ----------
        api_key : str, optional
            Sent as the ``key`` query parameter on every request when given.
        api_version : str, optional
            Path component of the API root; a warning is issued for anything
            other than 'v1'.
        """
        self._api_version = api_version
        if self._api_version != 'v1':
            # Imported here because the notebook's import cell does not provide it.
            import warnings
            warnings.warn("This wrapper has only been tested with API v1. "
                          "If you try another version it will probably break.")

        # https, so the API key in the query string is not sent in clear text.
        self._api_url = "https://api.altmetric.com/{}/".format(self._api_version)

        self._api_key = {'key': api_key} if api_key else {}

    def doi(self, doi, fetch = False, **kwargs):
        """Look up an output by DOI; ``fetch=True`` uses the ``fetch/`` endpoint."""
        if fetch:
            return self._get_altmetrics_detailed("doi", doi, **kwargs)
        return self._get_altmetrics("doi", doi, **kwargs)

    def uri(self, uri, fetch = False, **kwargs):
        """Look up an output by URI; ``fetch=True`` uses the ``fetch/`` endpoint."""
        if fetch:
            return self._get_altmetrics_detailed("uri", uri, **kwargs)
        return self._get_altmetrics("uri", uri, **kwargs)

    def _get_altmetrics(self, method, *args, **kwargs):
        """
        Request information from Altmetric. Return a dictionary.
        """
        return self._request(method, args, kwargs)

    def _get_altmetrics_detailed(self, method, *args, **kwargs):
        """
        Request information from Altmetric via the ``fetch/`` endpoint.
        Return a dictionary.
        """
        return self._request("fetch/" + method, args, kwargs)

    def _request(self, endpoint, args, params):
        """
        Perform one GET request and decode the answer.

        Parameters
        ----------
        endpoint : str
            Path below the API root, e.g. 'doi' or 'fetch/doi'.
        args : iterable of str
            Further path components, joined with '/'.
        params : dict
            Query-string parameters; the API key is added to a copy.

        Returns
        -------
        dict
            The decoded JSON body for status 200, ``{}`` for 404 and 400.

        Raises
        ------
        JSONParseException
            If a 200 response does not contain valid JSON.
        AltmetricHTTPException
            For every other status code.
        """
        request_url = self._api_url + endpoint + "/" + "/".join(args)
        query = dict(params)
        query.update(self._api_key)
        response = requests.get(request_url, params = query)

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                # Python 3 exceptions have no .message attribute
                raise JSONParseException(str(e))
        if response.status_code in (404, 400):
            return {}
        raise AltmetricHTTPException(response.status_code)

In [109]:
def __save_row(doi, url, timestamp, fb_response, altmetric_response, fb_error = None, altmetric_error = None, con = None):
    '''
    Store the metrics collected for one DOI as a row of the ``wos`` table.

    The row is written with INSERT OR IGNORE, so it is skipped when the
    insert would violate a uniqueness constraint on the table.

    Parameters
    ----------
    doi, url, timestamp : str
        Stored as given.
    fb_response : dict or None
        Facebook Graph API answer. The Facebook columns are only filled when
        it contains an 'og_object' entry; otherwise they are stored as NULL.
    altmetric_response : dict or None
        Altmetric answer. Stored as JSON; ``counts.facebook.posts_count`` is
        copied to its own column when present.
    fb_error, altmetric_error : Exception, optional
        Errors met while collecting. The table has no column for them, so
        they are reported on stdout and the row is still written.
    con : sqlite3.Connection, optional
        Connection to write to; defaults to the notebook-global ``litecon``.
    '''
    if con is None:
        con = litecon

    fb_response = fb_response or {}
    if 'og_object' in fb_response:
        og = fb_response['og_object'] or {}
        engagement = fb_response.get('engagement') or {}
        og_id = og.get('id')
        og_object = json.dumps(fb_response['og_object'])
        fb_comment_count = engagement.get('comment_count')
        fb_comment_plugin_count = engagement.get('comment_plugin_count')
        fb_reaction_count = engagement.get('reaction_count')
        fb_share_count = engagement.get('share_count')
    else:
        og_id = None
        og_object = None
        fb_comment_count = None
        fb_comment_plugin_count = None
        fb_reaction_count = None
        fb_share_count = None

    # The response may be None or lack any of the nested keys.
    try:
        am_posts_count = altmetric_response['counts']['facebook']['posts_count']
    except (KeyError, TypeError, IndexError):
        am_posts_count = None

    for source, error in (("Facebook", fb_error), ("Altmetric", altmetric_error)):
        if error is not None:
            message = getattr(error, 'reason', None) or str(error)
            print("{} error for {}: {}".format(source, doi, message))

    row = (doi, url, og_id, og_object,
           fb_comment_count,
           fb_comment_plugin_count,
           fb_reaction_count,
           fb_share_count,
           am_posts_count,
           json.dumps(altmetric_response),
           timestamp)

    # The connection context manager commits on success and rolls back on error.
    with con:
        con.execute('''INSERT OR IGNORE INTO wos VALUES (?,?,?,?,?,?,?,?,?,?,?)''', row)

In [124]:
# Open the results database and create the two API clients
litecon = init_db()
fb_graph = init_fb(app_id=FACEBOOK_APP_ID, app_secret=FACEBOOK_APP_SECRET)
altmetric = Altmetric(ALTMETRIC_KEY)

# Read the DOI sample and keep its `doi` column for the collection loop
df = pd.read_csv("data/wos_100k.csv", encoding='utf8')
dois = df["doi"]

Generated access token: 287299458433880|<redacted>


In [151]:
# Collect Facebook and Altmetric data for the first 300 DOIs, at most one DOI per second
for doi in dois.head(300):
    now = datetime.datetime.now()

    # Follow the doi.org redirects to find the landing page URL. When this
    # fails, a short description of the failure is stored in place of the URL.
    resolved = True
    try:
        r = requests.head('https://doi.org/{}'.format(doi), allow_redirects=True, timeout=3)
        url = r.url
    except requests.exceptions.Timeout:
        url = "timeout"
        resolved = False
    except requests.exceptions.TooManyRedirects:
        url = "too_many_redirects"
        resolved = False
    except requests.exceptions.RequestException as e:
        url = str(e)
        resolved = False

    print(doi, url)

    # Only ask Facebook about real URLs: a placeholder such as "timeout" is not
    # the article and must not be looked up as an object id.
    fb_response = {}
    if resolved:
        # `except Exception` (not a bare except) so that interrupting the
        # kernel still stops the loop.
        try:
            fb_response = fb_graph.get_object(id=url, fields="engagement, og_object")
        except Exception as e:
            print("  Facebook lookup failed: {}".format(e))

    try:
        am_response = altmetric.doi(doi=doi, fetch=True)
    except Exception as e:
        print("  Altmetric lookup failed: {}".format(e))
        am_response = None

    __save_row(doi, url, now.strftime("%Y-%m-%d %H:%M:%S"), fb_response, am_response)

    # Throttle: every iteration takes at least one second
    elapsed = (datetime.datetime.now() - now).total_seconds()
    if elapsed < 1:
        time.sleep(1 - elapsed)

10.1063/1.4898570 http://aip.scitation.org/doi/10.1063/1.4898570
10.1103/physrevlett.104.215902 https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.104.215902
10.1016/j.chc.2009.03.002 http://linkinghub.elsevier.com/retrieve/pii/S105649930900025X
10.1016/j.tele.2013.08.003 http://linkinghub.elsevier.com/retrieve/pii/S0736585313000415
10.1371/journal.pone.0007695 http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007695
10.1007/s11442-011-0878-x https://link.springer.com/article/10.1007%2Fs11442-011-0878-x
10.1111/j.1365-2486.2011.02512.x http://onlinelibrary.wiley.com/resolve/doi?DOI=10.1111/j.1365-2486.2011.02512.x
10.1103/physrevlett.112.155702 https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.112.155702
10.1103/physrevx.4.011019 https://journals.aps.org/prx/abstract/10.1103/PhysRevX.4.011019
10.1128/mcb.01197-14 http://mcb.asm.org/content/35/7/1157
10.1016/j.jglr.2009.12.001 http://linkinghub.elsevier.com/retrieve/pii/S0380133009002196
10.1016/j.jorganch

10.2353/ajpath.2010.100377 http://linkinghub.elsevier.com/retrieve/pii/S0002944010629033
10.1016/j.scient.2011.08.008 http://linkinghub.elsevier.com/retrieve/pii/S1026309811001581
10.1016/j.technovation.2012.03.003 http://linkinghub.elsevier.com/retrieve/pii/S0166497212000326
10.1016/j.jpba.2010.01.012 http://linkinghub.elsevier.com/retrieve/pii/S0731708510000154
10.1016/j.poly.2011.10.040 http://linkinghub.elsevier.com/retrieve/pii/S0277538711006814
10.1088/0957-0233/25/7/075011 http://iopscience.iop.org/article/10.1088/0957-0233/25/7/075011/meta;jsessionid=999716E87F70A8451491F2E13FDA66DB.ip-10-40-1-105
10.1007/s10646-009-0319-0 https://link.springer.com/article/10.1007%2Fs10646-009-0319-0
10.1021/jf401837a http://pubs.acs.org/doi/abs/10.1021/jf401837a
10.1074/jbc.m113.540088 http://www.jbc.org/content/289/19/13232
10.1128/jvi.00728-12 http://jvi.asm.org/content/86/16/8482
10.1051/0004-6361/200913937 timeout
10.1088/1751-8113/42/5/055303 timeout
10.1093/nar/gkp350 ('Connection aborte

10.1080/14772019.2014.936974 http://www.tandfonline.com/doi/full/10.1080/14772019.2014.936974
10.1088/1367-2630/13/4/045017 http://iopscience.iop.org/article/10.1088/1367-2630/13/4/045017/meta;jsessionid=AD3F2A6DEF1ADEE11281A9BE916C7108.c1.iopscience.cld.iop.org
10.1089/ars.2013.5757 http://online.liebertpub.com/doi/abs/10.1089/ars.2013.5757
10.1038/bmt.2009.326 http://www.nature.com/bmt/journal/v45/n7/full/bmt2009326a.html
10.1258/ar.2011.110463 timeout
10.1111/obr.12166 http://onlinelibrary.wiley.com/resolve/doi?DOI=10.1111/obr.12166
10.1103/physrevb.79.235313 https://journals.aps.org/prb/abstract/10.1103/PhysRevB.79.235313
10.1128/aac.00771-15 http://aac.asm.org/content/59/9/5413
10.1016/j.geomphys.2012.07.008 http://linkinghub.elsevier.com/retrieve/pii/S0393044012001428
10.1016/j.toxicon.2012.07.016 http://linkinghub.elsevier.com/retrieve/pii/S0041010112005612
10.1016/j.jaridenv.2014.01.002 timeout
10.1088/0004-637x/693/1/847 timeout
10.1111/j.1364-3703.2009.00583.x http://onlineli

10.1002/pc.21050 http://onlinelibrary.wiley.com/resolve/doi?DOI=10.1002/pc.21050
10.1375/twin.14.2.111 timeout
10.1016/j.jim.2009.07.003 http://linkinghub.elsevier.com/retrieve/pii/S0022175909002002
10.1002/pssb.200983085 http://onlinelibrary.wiley.com/resolve/doi?DOI=10.1002/pssb.200983085
10.1063/1.4733344 http://aip.scitation.org/doi/10.1063/1.4733344
10.1016/j.ijintrel.2013.08.009 http://linkinghub.elsevier.com/retrieve/pii/S0147176713000916
10.1016/j.techfore.2011.01.010 http://linkinghub.elsevier.com/retrieve/pii/S0040162511000138
10.1016/j.tourman.2011.06.011 http://linkinghub.elsevier.com/retrieve/pii/S0261517711001221
10.1080/01431161.2011.576709 http://www.tandfonline.com/doi/abs/10.1080/01431161.2011.576709
10.1159/000342080 https://www.karger.com/Article/Abstract/342080
10.1186/1471-2172-14-16 https://bmcimmunol.biomedcentral.com/articles/10.1186/1471-2172-14-16
10.1007/s10695-012-9701-6 https://link.springer.com/article/10.1007%2Fs10695-012-9701-6
10.1007/s10620-012-2412-0

# Problems

The problems I encountered are documented here.

## 1. Facebook API returning fewer shares than Altmetric.com

Example DOI: [10.1186/1741-7007-10-51](https://doi.org/10.1186/1741-7007-10-51), which resolves to the URL https://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51

Detailed Altmetric results for the DOI: [Altmetric results](https://biomedcentral.altmetric.com/details/799209/facebook)

http://www.biomedcentral.com/1741-7007/10/51/

In [58]:
# Example article used by the Altmetric and Facebook comparison cells below:
# its landing page (http variant) and its DOI.
url = "http://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51"
doi = "10.1186/1741-7007-10-51"

### Results per Altmetric API

In [53]:
# Look the same article up twice in Altmetric: by landing-page URL and by DOI
alt_url = altmetric.uri(url, fetch=True)
alt_doi = altmetric.doi(doi, fetch=True)

# Do both lookups lead to the same Altmetric record?
print("DOI and URL have same altmetric_id: {}".format(alt_url['altmetric_id'] == alt_doi['altmetric_id']))

# Number of Facebook posts Altmetric has recorded for the DOI
print("FB shares: {}".format(alt_doi['counts']['facebook']['posts_count']))

DOI and ULR have same altmetric_id: True
FB shares: 38


### Results per Facebook API

In [67]:
# Look the same article up twice in the Facebook Graph API:
# by landing-page URL and by its dx.doi.org resolver URL
fb_url = fb_graph.get_object(url, fields="engagement, og_object")
fb_doi = fb_graph.get_object("http://dx.doi.org/" + doi, fields="engagement, og_object")

# Do both lookups lead to the same OpenGraph object?
print("DOI and URL have same og_object_id: {}".format(fb_url['og_object']['id'] == fb_doi['og_object']['id']))

print("FB shares for URL: {}".format(fb_url['engagement']['share_count']))
print("FB shares for DOI: {}".format(fb_doi['engagement']['share_count']))

DOI and ULR have same og_object_id: False
FB shares for URL: 1
FB shares for DOI: 0


## 2. URL <-> OpenGraph object ID

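Not cool behaviour: the same article maps to different OpenGraph object IDs depending on the URL scheme and the trailing slash, and some DOI resolver URLs return no OpenGraph object at all (see the outputs below).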

In [99]:
# Nature news example (the BMC Biology example is run in the next cell)
url_base = "www.nature.com/news/the-future-of-dna-sequencing-1.22787"
doi = "10.1038/550179a"

# Every scheme / trailing-slash spelling of the landing page ...
url_variants = [scheme + url_base + suffix
                for scheme in ('http://', 'https://')
                for suffix in ('', '/')]

# ... and every resolver spelling of the DOI. Distinct names are used so the
# `dois` series loaded for the collection loop is not overwritten.
doi_variants = [resolver + doi
                for resolver in ('http://dx.doi.org/', 'https://dx.doi.org/',
                                 'http://doi.org/', 'https://doi.org/')]

url_results = {}
doi_results = {}

# Map each spelling to the OpenGraph object id Facebook reports for it, or to
# None when the lookup fails or the answer has no og_object.
for url_variant in url_variants:
    try:
        url_results[url_variant] = fb_graph.get_object(url_variant, fields="og_object")['og_object']['id']
    except Exception:
        url_results[url_variant] = None

for doi_variant in doi_variants:
    try:
        doi_results[doi_variant] = fb_graph.get_object(doi_variant, fields="og_object")['og_object']['id']
    except Exception:
        doi_results[doi_variant] = None

pprint(url_results)
pprint(doi_results)

{'http://www.nature.com/news/the-future-of-dna-sequencing-1.22787': '1472429859490322',
 'http://www.nature.com/news/the-future-of-dna-sequencing-1.22787/': '1446759318778508',
 'https://www.nature.com/news/the-future-of-dna-sequencing-1.22787': '1472429859490322',
 'https://www.nature.com/news/the-future-of-dna-sequencing-1.22787/': '1649534128454620'}
{'http://doi.org/10.1038/550179a': None,
 'http://dx.doi.org/10.1038/550179a': '1472429859490322',
 'https://doi.org/10.1038/550179a': None,
 'https://dx.doi.org/10.1038/550179a': None}


In [100]:
# BMC Biology example
url_base = "bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51"
doi = "10.1186/1741-7007-10-51"

# Every scheme / trailing-slash spelling of the landing page ...
url_variants = [scheme + url_base + suffix
                for scheme in ('http://', 'https://')
                for suffix in ('', '/')]

# ... and every resolver spelling of the DOI. Distinct names are used so the
# `dois` series loaded for the collection loop is not overwritten.
doi_variants = [resolver + doi
                for resolver in ('http://dx.doi.org/', 'https://dx.doi.org/',
                                 'http://doi.org/', 'https://doi.org/')]

url_results = {}
doi_results = {}

# Map each spelling to the OpenGraph object id Facebook reports for it, or to
# None when the lookup fails or the answer has no og_object.
for url_variant in url_variants:
    try:
        url_results[url_variant] = fb_graph.get_object(url_variant, fields="og_object")['og_object']['id']
    except Exception:
        url_results[url_variant] = None

for doi_variant in doi_variants:
    try:
        doi_results[doi_variant] = fb_graph.get_object(doi_variant, fields="og_object")['og_object']['id']
    except Exception:
        doi_results[doi_variant] = None

pprint(url_results)
pprint(doi_results)

{'http://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51': '1246877015357759',
 'http://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51/': '2058851297473624',
 'https://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51': '2058851297473624',
 'https://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51/': '2058851297473624'}
{'http://doi.org/10.1186/1741-7007-10-51': '2058851297473624',
 'http://dx.doi.org/10.1186/1741-7007-10-51': '2058851297473624',
 'https://doi.org/10.1186/1741-7007-10-51': '2058851297473624',
 'https://dx.doi.org/10.1186/1741-7007-10-51': '2058851297473624'}
