In [293]:
import datetime
import configparser
import json
import math
import requests
import queue

import pandas as pd
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from facebook import GraphAPI, GraphAPIError

## Step 0 - Define helpers, load configs, etc

In [291]:
tqdm.pandas()

input_csv = "../data/plos2016.csv"
urls_csv = "../data/urls.csv"
query_csv = "../data/queries.csv"
og_csv = "../data/og_objects.csv"

batchsize = 50

In [291]:
# Load config
Config = configparser.ConfigParser()
Config.read('../config.cnf')
FACEBOOK_APP_ID = Config.get('facebook', 'app_id')
FACEBOOK_APP_SECRET = Config.get('facebook', 'app_secret')

In [325]:
access_token = get_fb_access_token(FACEBOOK_APP_ID, FACEBOOK_APP_SECRET)
temp_token = "EAACEdEose0cBALqZB4Nfwk1HGrLYNkgT7MiaKcgUdbEaoWOtYX2VzjlIRANN6Q46isNuVjyhsgPuhE81sYAc0hFxS7TyCwy8pTf0u8XcJgrb5GaRkFBmC8npkDBam6qYK8K7DTNasTQN2u1MsZAPlD5Wd2oFzL7Nl3s9md9pDLkqVuSnuFoiOxARb1MDVqTZAk3pjq5KwZDZD"
fb_graph = GraphAPI(temp_token, version="2.10")

Generated access token: 287299458433880|6Y_ml710QWnU7HBYLWjaneoWVKU


In [3]:
def get_fb_access_token(app_id, app_secret):
    payload = {'grant_type': 'client_credentials',
               'client_id': app_id,
               'client_secret': app_secret}

    try:
        response = requests.post('https://graph.facebook.com/oauth/access_token?', params = payload)
    except requests.exceptions.RequestException:
        raise Exception()

    access_token = json.loads(response.text)['access_token']
    print("Generated access token: " + access_token)

In [114]:
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(1, len(seq), size))

In [426]:
def query_url(url):
    result = {}
    try:
        r = fb_graph.get_object(id=url.strip(), fields="engagement,og_object")
    except Exception as e:
        result['received'] = datetime.datetime.now()
        result['err_msg'] = str(e)
        return result
        
    result['received'] = datetime.datetime.now()
    result['err_msg'] = None
    result['fb_url'] = r['id']
    
    if 'og_object' in r:
        result["og_obj"] = r['og_object']
        result["og_eng"]  = r['engagement']
    
    return result

In [427]:
def query_urls(urls):
    results = {}
    
    try:
        responses = fb_graph.get_objects(
            ids=[url.strip() for url in urls],
            fields="engagement,og_object")
    except Exception as e:
        raise

    received = datetime.datetime.now()

    for url, r in responses.items():        
        result = {}

        result['received'] = received
        result['err_msg'] = None
        result['fb_url'] = r['id']

        if 'og_object' in r:
            result["og_obj"] = r['og_object']
            result["og_eng"]  = r['engagement']

        results[url] = result
    return results 

## Step 1 - Load input data

In [4]:
plos2016 = pd.read_csv(input_csv)
plos2016 = plos2016.set_index("doi")
plos2016['publication_date'] = plos2016['publication_date'].map(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ"))
plos2016['created_on'] = datetime.datetime.now()

In [477]:
sample = plos2016.sample(625)

## Step 2 - Prepare URLs

In [478]:
# Create DF for URLs
urls = pd.DataFrame(columns=['doi', 'url', 'type', 'added_on'])

In [479]:
templates = {"doi": "https://doi.org/{}",
             "doi_old": "http://dx.doi.org/{}",
             "landing": "http://journals.plos.org/plosone/article?id={}",
             "authors": "http://journals.plos.org/plosone/article/authors?id={}",
             "metrics": "http://journals.plos.org/plosone/article/metrics?id={}",
             "comments": "http://journals.plos.org/plosone/article/comments?id={}",
             "related": "http://journals.plos.org/plosone/article/related?id={}",
             "pdf": "http://journals.plos.org/plosone/article/file?id={}&type=printable"}

In [480]:
i = 0
for doi in tqdm(sample.index.tolist()):
    for type, template in templates.items():
        urls.loc[i] = [doi, template.format(doi), type, datetime.datetime.now()]
        i = i + 1

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))




## Step 3 - Run queries

In [481]:
# Create DF for queries
query_columns = ["url_id", "error_msg", "queried_at"]
queries = pd.DataFrame(columns=query_columns)

# Create DF for graph objects
og_columns = ["og_id", "query_id", "received_at", "fb_url",
              "og_description", "og_title", "og_type", "og_updated_time",
              "reactions", "shares", "comments", "plugin_comments"]
og_objects = pd.DataFrame(columns=og_columns)

In [483]:
def process_result(url_id, result, queries, og_objects, query_f, og_f):
    query_id = queries.shape[0]
    queries.loc[query_id] = [url_id, result['err_msg'], str(now)]
    queries.loc[[query_id]][query_columns].to_csv(query_f, header=False, )
    
    # query_f.write([query_id, url_id, result['err_msg'], str(now)])
                             
    # if result, record og object
    if 'og_obj' in result:
        i = og_objects.shape[0]

        og_id = result['og_obj']['id']
        reactions = int(result['og_eng']['reaction_count'])
        shares = int(result['og_eng']['share_count'])
        comments = int(result['og_eng']['comment_count'])
        plugin_comments = int(result['og_eng']['comment_plugin_count'])

        for field in ['description', 'title', 'type', 'updated_time']:
            try:
                og_objects.loc[i, "og_{}".format(field)] = result['og_obj'][field]
            except:
                og_objects.loc[i, "og_{}".format(field)] = None
        
        og_objects.loc[i, "fb_url"] = result["fb_url"]
        og_objects.loc[i, "og_id"] = og_id
        og_objects.loc[i, "query_id"] = query_id
        og_objects.loc[i, "received_at"] = str(result['received'])
        og_objects.loc[i, ["reactions", "shares", "comments", "plugin_comments"]] = [reactions, shares, comments, plugin_comments]
        
        og_objects.loc[[i]][og_columns].to_csv(og_f, header=False)
        
        # og_f.write([i]+og_objects.loc[i].tolist())

In [484]:
def process_url(batch, queries, og_objects, query_f, og_f):
    """"""
    now = datetime.datetime.now()
    result = query_url(batch.url)
    process_result(batch.name, result, queries, og_objects, query_f, og_f)

In [485]:
def process_batch(batch, queries, og_objects, query_f, og_f, failed_batches):
    """"""
    try:
        now = datetime.datetime.now()
        results = query_urls(batch.url.tolist())

        # successful batch query
        for url, result in results.items():
            url_id = batch[batch.url == url].index[0]
            process_result(url_id, result, queries, og_objects, query_f, og_f) 
        
    # failed batch query
    except GraphAPIError as e: 
        failed_batches.put((e, batch_ind))

In [486]:
with open(query_csv, "w") as query_f, open(og_csv, "w") as og_f:
    # Write column labels 
    queries.loc[[]].to_csv(query_f)
    og_objects.loc[[]].to_csv(og_f)

    # Keep track of indices that failed during batchmode
    failed_batches = queue.Queue()

    # Initialise indices for batches
    batch_indices = chunker(urls.index, batchsize)

    # Keep appending in batches of 50
    for batch_ind in tqdm(batch_indices,
                          total=len(urls)//batchsize,
                          desc="Batches"):
        batch = urls.loc[batch_ind] 
        process_batch(batch, queries, og_objects, query_f, og_f, failed_batches)
    
    # Process failed batches
    pbar = tqdm(total=failed_batches.qsize()*batchsize,
                desc="Failed batches")
    while not failed_batches.empty():
        e, bad_batch = failed_batches.get()
        if len(bad_batch) > 4:
            batch_indices = chunker(bad_batch, math.ceil(len(bad_batch)/2))
                
            for batch_ind in batch_indices:
                batch = urls.iloc[batch_ind]
                
                q_len = failed_batches.qsize()
                process_batch(batch, queries, og_objects, query_f, og_f, failed_batches)
                if failed_batches.qsize() == q_len:
                    pbar.update(len(batch_ind))
                    
        else:
            for i in bad_batch:
                process_url(urls.loc[i], queries, og_objects, query_f, og_f)
                pbar.update(1)
    pbar.close()

HBox(children=(IntProgress(value=0, description='Batches'), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', description='Failed batches', max=1), HTML(value='')))




In [487]:
og_objects[["reactions", "shares", "comments", "plugin_comments"]] = og_objects[["reactions", "shares", "comments", "plugin_comments"]].astype(int)

## Step 4 - Analysis

In [489]:
a = og_objects.merge(queries, left_on="query_id", right_index=True, how="left")
b = a.merge(urls, left_on="url_id", right_index=True, how="left")
c = b.merge(sample, left_on="doi", right_index=True, how="left")

In [500]:
dupl_queries = og_objects[og_objects.og_id.duplicated(keep=False)].query_id

In [502]:
mean = b[~b.query_id.isin(dupl_queries)].groupby("type")[['shares']].mean()
count = b[~b.query_id.isin(dupl_queries)].groupby("type")[['shares']].count()

mean['size'] = b.groupby("type").size()
mean

Unnamed: 0_level_0,shares,size
type,Unnamed: 1_level_1,Unnamed: 2_level_1
landing,1.659574,517
pdf,0.102564,39


In [373]:
b.groupby("type")[['reactions', 'shares', 'comments', 'plugin_comments']].mean()

Unnamed: 0_level_0,reactions,shares,comments,plugin_comments
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
authors,68.0,17.0,18.0,0.0
doi,68.0,17.0,18.0,0.0
doi_old,92.666667,8.0,8.333333,0.0
landing,30.888889,2.777778,2.777778,0.0
pdf,0.0,0.0,0.0,0.0


In [498]:
c[c.doi=="10.1371/journal.pone.0146621"][["type", "shares", "fb_url"]]

Unnamed: 0,type,shares,fb_url
343,doi_old,100,http://dx.doi.org/10.1371/journal.pone.0146621
347,landing,100,http://journals.plos.org/plosone/article?id=10...
349,metrics,100,http://journals.plos.org/plosone/article/metri...


In [493]:
# Total shares by article
c.groupby(["doi"])[['shares']].sum().sort_values("shares", ascending=False).head()

Unnamed: 0_level_0,shares
doi,Unnamed: 1_level_1
10.1371/journal.pone.0164733,381
10.1371/journal.pone.0146621,300
10.1371/journal.pone.0163477,260
10.1371/journal.pone.0154218,124
10.1371/journal.pone.0156752,123
