In [25]:
import datetime, time, os, sys, re
import json
from pprint import pprint
from time import sleep
import pandas as pd

# Import Altmetrics tools
from AltmetricsToolBox.Altmetric import Altmetric
from AltmetricsToolBox.Facebook import Facebook
from AltmetricsToolBox.DBConnection import DBConnection

# Load config
import argparse, configparser
Config = configparser.ConfigParser()
Config.read('config.cnf')

FACEBOOK_APP_ID = Config.get('facebook', 'app_id')
FACEBOOK_APP_SECRET = Config.get('facebook', 'app_secret')
ALTMETRIC_KEY = Config.get('altmetric', 'key')

In [1]:
a = {
    'doi': 'TEXT',
    'a': 'INTEGER',
    'b': 'TEXT',
}

In [3]:
["{} {}".format(key,val) for key, val in a.items()] 

['a INTEGER', 'doi TEXT', 'b TEXT']

In [2]:
def init_db():
    litecon = lite.connect('data/scielo.db')
    
    cols = ("doi TEXT, url TEXT, og_id INTEGER, og_obj TEXT, "
            "fb_comments INTEGER, fb_comment_plugins INTEGER, fb_reactions INTEGER, fb_shares INTEGER, "
            "am_posts INTEGER, am_response TEXT, "
            "timestamp TEXT")

    with litecon:
    # set up SQL tables

        litecur = litecon.cursor()
        # the sample, with two columns for either the Tweet itself, or the error in trying to retrieve it
        litecur.execute("CREATE TABLE IF NOT EXISTS wos ({})".format(cols))

        litecur.execute("CREATE UNIQUE INDEX IF NOT EXISTS wos_doi ON wos(doi)")
        
    return litecon

In [3]:
# Retrieve App Access Token
def init_fb(app_id, app_secret):           
    payload = {'grant_type': 'client_credentials', 'client_id': app_id, 'client_secret': app_secret}
    try:
        r = requests.post('https://graph.facebook.com/oauth/access_token?', params = payload)
    except requests.exceptions.RequestException as e:  # This is the correct syntax
        print(e)
        return False
    
    r = ast.literal_eval(r.text)['access_token']
    print("Generated access token: " + r)
    
    return facebook.GraphAPI(r)

In [4]:
# Adopted from PyAltmetric (https://github.com/CenterForOpenScience/PyAltmetric)
# and also pyAltmetric (https://github.com/wearp/pyaltmetric)

class AltmetricException(Exception):
    """Base class for any pyaltmetric error."""
    pass

class JSONParseException(AltmetricException):
    """
    Failed to turn HTTP Response into JSON.
    Site is probably in the wrong format.
    """
    pass

class AltmetricHTTPException(AltmetricException):
    """A query argument or setting was formatted incorrectly."""
    def __init__(self, status_code):
        response_codes = {
            403:"You are not authorized for this call.",
            420:"Rate Limit Reached",
            502:"API is down.",
        }
        super(AltmetricHTTPException, self).__init__(
            response_codes.get(status_code, status_code)
        )

class Altmetric(object):
    def __init__(self, api_key = None, api_version = 'v1'):
        """Cache API key and version."""
        self._api_version = api_version
        if self._api_version != 'v1':
            warnings.warn("This wrapper has only been tested with API v1."
                          "If you try another version it will probably break.")

        self._api_url = "http://api.altmetric.com/{}/".format(self._api_version)

        self._api_key = {}
        if api_key:
            self._api_key = {'key': api_key}
            
    def doi(self, doi, fetch = False, **kwargs):
        if fetch:
            return self._get_altmetrics_detailed("doi", doi, **kwargs)
        else:
            return self._get_altmetrics("doi", doi, **kwargs)
        
    def uri(self, uri, fetch = False, **kwargs):
        if fetch:
            return self._get_altmetrics_detailed("uri", uri, **kwargs)
        else:
            return self._get_altmetrics("uri", uri, **kwargs)
        
    def _get_altmetrics(self, method, *args, **kwargs):
        """
        Request information from Altmetric. Return a dictionary.
        """
        request_url = self._api_url + method + "/" + "/".join([a for a in args])
        params = kwargs or {}
        params.update(self._api_key)
        response = requests.get(request_url, params = params)
        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                raise JSONParseException(e.message)
        elif response.status_code in (404, 400):
            return {}
        else:
            raise AltmetricHTTPException(response.status_code)
            
    def _get_altmetrics_detailed(self, method, *args, **kwargs):
        """
        Request information from Altmetric. Return a dictionary.
        """
        request_url = self._api_url + "fetch/" + method + "/" + "/".join([a for a in args])
        params = kwargs or {}
        params.update(self._api_key)
        response = requests.get(request_url, params = params)
        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                raise JSONParseException(e.message)
        elif response.status_code in (404, 400):
            return {}
        else:
            raise AltmetricHTTPException(response.status_code)

In [5]:
def __save_row(doi, url, timestamp, fb_response, altmetric_response, fb_error = None, altmetric_error = None):
    '''
    Do the actual SQLite update with the info collected
    '''    
    if 'og_object' in fb_response:
        og_id = fb_response['og_object']['id']
        og_object = json.dumps(fb_response['og_object'])
        fb_comment_count = fb_response['engagement']['comment_count']
        fb_comment_plugin_count = fb_response['engagement']['comment_plugin_count']
        fb_reaction_count = fb_response['engagement']['reaction_count']
        fb_share_count = fb_response['engagement']['share_count']
    else:
        eng = None
        og_id = None
        og_object = None
        fb_comment_count = None
        fb_comment_plugin_count = None
        fb_reaction_count = None
        fb_share_count = None
    
    try:
        am_posts_count = altmetric_response['counts']['facebook']['posts_count']
    except:
        am_posts_count = None
    
    row = (doi, url, og_id, og_object,
           fb_comment_count,
           fb_comment_plugin_count,
           fb_reaction_count,
           fb_share_count,
           am_posts_count,
           json.dumps(altmetric_response),
           timestamp)
    
    with litecon:
        litecur = litecon.cursor()

        if fb_error: 
            try: 
                m = fb_error.reason
            except:
                m = str(fb_error)
            litecur.execute('UPDATE wos SET error = ?, modified = ? WHERE tweet_id = ?', (m, now, tweet_id))    

        else:
            litecur.execute('''INSERT OR IGNORE INTO wos VALUES (?,?,?,?,?,?,?,?,?,?,?)''', row)

In [43]:
litecon = init_db()
fb_graph = init_fb(FACEBOOK_APP_ID, FACEBOOK_APP_SECRET)
altmetric = Altmetric(api_key = ALTMETRIC_KEY)

df = pd.read_csv("data/scielo.csv", encoding = 'utf8')
df = df.replace(r'\s+', "", regex=True).fillna('')
dois = df.DOI

Generated access token: 287299458433880|6Y_ml710QWnU7HBYLWjaneoWVKU


In [44]:
for doi in dois:
    if doi:
        now = datetime.datetime.now()
        if doi[:3] == '10.':

            try:
                r = requests.head('https://doi.org/{}'.format(doi), allow_redirects=True, timeout=3)
                url = r.url
            except requests.exceptions.Timeout:
                url = "timeout"
            except requests.exceptions.TooManyRedirects:
                url = "too_many_redirects"
            except requests.exceptions.RequestException as e:
                url = str(e)

            print(doi, url)

            try:
                fb_response = fb_graph.get_object(id=url, fields="engagement, og_object")
            except:
                fb_response = {}

            try:
                am_response = altmetric.doi(doi=doi, fetch=True)
            except:
                am_response = None

        else:
            url = doi
            print(doi, url)

            try:
                fb_response = fb_graph.get_object(id=doi, fields="engagement, og_object")
            except:
                fb_response = {}

            try:
                am_response = altmetric.uri(uri=doi, fetch=True)
            except:
                am_response = None

        __save_row(doi, url, now.strftime("%Y-%m-%d %H:%M:%S"), fb_response, am_response)
        new = datetime.datetime.now()
        delta = new - now
        if delta.seconds < 1:
            time.sleep(1 - delta.total_seconds())

10.1590/0103-8478cr20160846 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0103-84782017000400503&lng=en&tlng=en
10.1590/0103-8478cr20170140 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0103-84782017000700603&lng=en&tlng=en
10.1590/1679-395155900 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-39512016000400891&lng=pt&tlng=pt
10.1590/2175-3369.009.003.ao03 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S2175-33692017000300414&lng=pt&tlng=pt
10.1590/2175-623656967 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S2175-62362017000300981&lng=pt&tlng=pt
10.1590/s0034-759020170409 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400401&lng=pt&tlng=pt
10.1590/s0034-759020170409 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400401&lng=pt&tlng=pt
http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1981-812220160002&lng=en&nrm=iso http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1981-81222

10.1590/18094449201700500005 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-83332017000200306&lng=es&tlng=es
10.1590/18094449201700500006 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-83332017000200307&lng=pt&tlng=pt
10.1590/18094449201700500007 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-83332017000200308&lng=pt&tlng=pt
10.1590/18094449201700500008 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-83332017000200309&lng=pt&tlng=pt
10.1590/18094449201700500009 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-83332017000200501&lng=es&tlng=es
10.1590/1981-6723.3516 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1981-67232017000100404&lng=en&tlng=en
10.1590/1981-8637201600030000103074 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1981-86372016000400429&lng=en&tlng=en
10.1590/1981-8637201600030000113044 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1981-86372016000400434&lng=en&tlng=en
10.1590/

10.1590/S0034-759020160301 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902016000300265&lng=pt&tlng=pt
10.1590/S0034-759020160304 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902016000300302&lng=en&tlng=en
10.1590/S0034-759020160305 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902016000300315&lng=en&tlng=en
10.1590/s0034-759020170101 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000100008&lng=pt&tlng=pt
10.1590/s0034-759020170201 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000200120&lng=pt&tlng=pt
10.1590/s0034-759020170201 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000200120&lng=pt&tlng=pt
10.1590/s0034-759020170401 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400300&lng=en&tlng=en
10.1590/s0034-759020170401 http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400300&lng=en&tlng=en
10.1590/s0034-759020170406 http:

http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1983-804220160003&lng=en&nrm=iso http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1983-804220160003&lng=en&nrm=iso
http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1983-804220170001&lng=en&nrm=iso http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1983-804220170001&lng=en&nrm=iso
http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-336920160003&lng=en&nrm=iso http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-336920160003&lng=en&nrm=iso
http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-336920170003&lng=en&nrm=iso http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-336920170003&lng=en&nrm=iso
http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2316-401820170001&lng=pt&nrm=iso http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2316-401820170001&lng=pt&nrm=iso
http://www.scielo.br/scielo.php?script=sci_serial&pid=0034-7329&lng=en&nrm=iso http://www.scielo.br/scielo.php

# Problems

Problems that I encountered are documented here

## 1. Facebook API returning less shares than Altmetric.com

Example DOI: [10.1186/1741-7007-10-51](10.1186/1741-7007-10-51) which resolves to this URL https://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51

Detailed Altmetric results for the DOI: [Altmetric results](https://biomedcentral.altmetric.com/details/799209/facebook)

http://www.biomedcentral.com/1741-7007/10/51/

In [103]:
url = 'https://www.scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400401&lng=pt&tlng=pt/'
doi = '10.1590/2175-623656967'

In [99]:
url = "http://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51"
doi = "10.1186/1741-7007-10-51"

### Results per Altmetric API

In [104]:
alt_url = altmetric.uri(url, fetch=True)
alt_doi = altmetric.doi(doi, fetch=True)

print("DOI and ULR have same altmetric_id: {}".format(alt_url['altmetric_id'] == alt_doi['altmetric_id']))

print("FB shares: {}".format(alt_doi['counts']['facebook']['posts_count']))

KeyError: 'altmetric_id'

### Results per Facebook API

In [94]:
fb_url = fb_graph.get_object(url, fields="engagement, og_object")
fb_doi = fb_graph.get_object("http://dx.doi.org/" + doi, fields="engagement, og_object")

print("DOI and ULR have same og_object_id: {}".format(fb_url['og_object']['id'] == fb_doi['og_object']['id']))

print("FB shares for URL: {}".format(fb_url['engagement']['share_count']))
print("FB shares for DOI: {}".format(fb_doi['engagement']['share_count']))

DOI and ULR have same og_object_id: False
FB shares for URL: 0
FB shares for DOI: 0


## 2. URL <-> OpenGraph object ID

Not cool behaviour...

In [108]:
url_base = "bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51"
doi = "10.1186/1741-7007-10-51"

url_base = "www.nature.com/news/the-future-of-dna-sequencing-1.22787"
doi = "10.1038/550179a"

url_base = 'scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400401&lng=pt&tlng=pt'
doi = '10.1186/1741-7007-10-51'

urls = ['http://' + url_base,
        'http://' + url_base + '/',
        'https://' + url_base,
        'https://' + url_base + '/']

dois = ['http://dx.doi.org/' + doi,
        'https://dx.doi.org/' + doi,
        'http://doi.org/' + doi,
        'https://doi.org/' + doi]

url_results = {}
doi_results = {}

for url in urls:
    try:
        url_results[url] = fb_graph.get_object(url, fields="og_object, engagement")['id']
    except:
        url_results[url] = None
        
for doi in dois:
    try:
        doi_results[doi] = fb_graph.get_object(doi, fields="og_object, engagement")['engagement']
    except:
        doi_results[doi] = None
        
pprint(url_results)
pprint(doi_results)

{'http://scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400401&lng=pt&tlng=pt': 'http://scielo.br/scielo.php',
 'http://scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400401&lng=pt&tlng=pt/': 'http://scielo.br/scielo.php',
 'https://scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400401&lng=pt&tlng=pt': 'https://scielo.br/scielo.php',
 'https://scielo.br/scielo.php?script=sci_arttext&pid=S0034-75902017000400401&lng=pt&tlng=pt/': 'https://scielo.br/scielo.php'}
{'http://doi.org/10.1186/1741-7007-10-51': {'comment_count': 0,
                                            'comment_plugin_count': 0,
                                            'reaction_count': 0,
                                            'share_count': 0},
 'http://dx.doi.org/10.1186/1741-7007-10-51': {'comment_count': 0,
                                               'comment_plugin_count': 0,
                                               'reaction_count': 0,
                      

In [100]:
url_base = "bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51"
doi = "10.1186/1741-7007-10-51"

urls = ['http://' + url_base,
        'http://' + url_base + '/',
        'https://' + url_base,
        'https://' + url_base + '/']

dois = ['http://dx.doi.org/' + doi,
        'https://dx.doi.org/' + doi,
        'http://doi.org/' + doi,
        'https://doi.org/' + doi]

url_results = {}
doi_results = {}

for url in urls:
    try:
        url_results[url] = fb_graph.get_object(url, fields="og_object")['og_object']['id']
    except:
        url_results[url] = None
        
for doi in dois:
    try:
        doi_results[doi] = fb_graph.get_object(doi, fields="og_object")['og_object']['id']
    except:
        doi_results[doi] = None
        
pprint(url_results)
pprint(doi_results)

{'http://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51': '1246877015357759',
 'http://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51/': '2058851297473624',
 'https://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51': '2058851297473624',
 'https://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-10-51/': '2058851297473624'}
{'http://doi.org/10.1186/1741-7007-10-51': '2058851297473624',
 'http://dx.doi.org/10.1186/1741-7007-10-51': '2058851297473624',
 'https://doi.org/10.1186/1741-7007-10-51': '2058851297473624',
 'https://dx.doi.org/10.1186/1741-7007-10-51': '2058851297473624'}
