# Collect PLOS2016 - Facebook Engagement

In [1]:
import datetime
import configparser
import json
import math
import requests
import queue
import csv
import pickle

import pandas as pd
from ratelimit import limits, RateLimitException, sleep_and_retry
from facebook import GraphAPI, GraphAPIError

try: 
    # for notebook
    get_ipython
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm  
except: 
    # for commandline
    from tqdm import tqdm 
tqdm.pandas()

## Constants

In [2]:
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [3]:
input_csv = "../data/plos2016.csv"
urls_csv = "../data/urls.csv"
query_csv = "../data/queries.csv"
og_csv = "../data/og_objects.csv"

config_file = "../config.cnf"

batchsize = 50
sample_size = None
continue_crawl = True

In [4]:
# Load config
Config = configparser.ConfigParser()
Config.read(config_file)

FACEBOOK_APP_ID = Config.get('facebook', 'app_id')
FACEBOOK_APP_SECRET = Config.get('facebook', 'app_secret')
FACEBOOK_USER_TOKEN = Config.get('facebook', 'user_token')

RATELIMIT = int(Config.get('ratelimit', 'period'))

## Step 1 - Load FB credentials

In [5]:
def get_app_access(app_id, app_secret, version="2.10"):
    """Exchange a short-lived user token for a long-lived one"""
    payload = {'grant_type': 'client_credentials',
               'client_id': app_id,
               'client_secret': app_secret}

    try:
        response = requests.post(
            'https://graph.facebook.com/oauth/access_token?', params=payload)
    except requests.exceptions.RequestException:
        raise Exception()

    token = json.loads(response.text)
    token['created'] = str(datetime.datetime.now())
    return token


def extend_user_access(user_token, app_id, app_secret, version="2.10"):
    """Uses a short-lived user token to create a long lived one"""
    payload = {'grant_type': 'fb_exchange_token',
               'client_id': app_id,
               'client_secret': app_secret,
               'fb_exchange_token': user_token}

    try:
        response = requests.post(
            'https://graph.facebook.com/oauth/access_token?', params=payload)
    except requests.exceptions.RequestException:
        raise Exception()

    token = json.loads(response.text)
    token['created'] = str(datetime.datetime.now())
    return token


def token_expiry(token):
    remain = datetime.timedelta(seconds=token['expires_in'])
    created = datetime.datetime.strptime(token['created'], "%Y-%m-%d %H:%M:%S.%f")
    print("Token expires {}\n{} left".format(str(created+remain), str(remain)))


def expires_soon(token, tolerance=1):
    remain = datetime.timedelta(seconds=token['expires_in'])
    created = datetime.datetime.strptime(token['created'], "%Y-%m-%d %H:%M:%S.%f")
    now = datetime.datetime.now()
    
    if (now - created+remain).days < tolerance:
        return True
    else:
        return False

In [6]:
# fb_graph, token = get_app_access(FACEBOOK_APP_ID, FACEBOOK_APP_SECRET)
try:
    with open("token.pkl", "rb") as pkl:
        token = pickle.load(pkl)
        print("Found pickled token")
    
    if expires_soon(token):
        token = extend_user_access(FACEBOOK_USER_TOKEN, FACEBOOK_APP_ID, FACEBOOK_APP_SECRET)
        print("Created new token, because of soon expiry")
except FileNotFoundError:
    print("No token found. Creating new one...")
    token = extend_user_access(FACEBOOK_USER_TOKEN, FACEBOOK_APP_ID, FACEBOOK_APP_SECRET)
    
print("Saving token")
token_expiry(token)
with open("token.pkl", "wb") as pkl:
    pickle.dump(token, pkl)

Found pickled token
Saving token
Token expires 2018-09-06 18:44:28.689354
59 days, 23:52:41 left


In [7]:
fb_graph = GraphAPI(token['access_token'], version="2.10")

## Step 2 - Load and prepare URLs

In [13]:
raw = pd.read_csv(urls_csv, index_col="url_id")
urls = raw

query_columns = ["url_id", "error_msg", "queried_at"]
queries = pd.DataFrame(columns=query_columns)

if continue_crawl:
    queries = pd.read_csv(query_csv, index_col="query_id")
    urls = urls.drop(queries[old_queries.error_msg.isnull()].url_id)
    
if sample_size:
    urls = urls.sample(sample_size)
    
    
og_columns = ["og_id", "query_id", "received_at", "fb_url",
              "og_description", "og_title", "og_type", "og_updated_time",
              "reactions", "shares", "comments", "plugin_comments"]
og_objects = pd.DataFrame(columns=og_columns)

## Step 3 - Run queries

In [None]:
def process_result(url_id, result, queries, og_objects, query_f, og_f, now):
    query_id = queries.shape[0]
    queries.loc[query_id] = [url_id, result['err_msg'], str(now)]
    
    query_f.writerow([query_id, url_id, result['err_msg'], str(now)])
                             
    # if result, record og object
    if 'og_obj' in result:
        i = og_objects.shape[0]

        og_id = result['og_obj']['id']
        reactions = int(result['og_eng']['reaction_count'])
        shares = int(result['og_eng']['share_count'])
        comments = int(result['og_eng']['comment_count'])
        plugin_comments = int(result['og_eng']['comment_plugin_count'])

        for field in ['description', 'title', 'type', 'updated_time']:
            try:
                og_objects.loc[i, "og_{}".format(field)] = result['og_obj'][field]
            except:
                og_objects.loc[i, "og_{}".format(field)] = None
        
        og_objects.loc[i, "fb_url"] = result["fb_url"]
        og_objects.loc[i, "og_id"] = og_id
        og_objects.loc[i, "query_id"] = query_id
        og_objects.loc[i, "received_at"] = str(result['received'])
        og_objects.loc[i, ["reactions", "shares", "comments", "plugin_comments"]] = [reactions, shares, comments, plugin_comments]
         
        og_f.writerow(og_objects.loc[i][og_columns].tolist())

In [26]:
def query_url(url):
    result = {}
    try:
        r = fb_graph.get_object(id=url.strip(), fields="engagement,og_object")
    except Exception as e:
        result['received'] = datetime.datetime.now()
        result['err_msg'] = str(e)
        print(e)
        return result
        
    result['received'] = datetime.datetime.now()
    result['err_msg'] = None
    result['fb_url'] = r['id']
    
    if 'og_object' in r:
        result["og_obj"] = r['og_object']
        result["og_eng"]  = r['engagement']
    
    return result


def query_urls(urls):
    results = {}
    
    try:
        responses = fb_graph.get_objects(
            ids=[url.strip() for url in urls],
            fields="engagement,og_object")
    except Exception as e:
        print(e)
        raise

    received = datetime.datetime.now()

    for url, r in responses.items():        
        result = {}

        result['received'] = received
        result['err_msg'] = None
        result['fb_url'] = r['id']

        if 'og_object' in r:
            result["og_obj"] = r['og_object']
            result["og_eng"]  = r['engagement']

        results[url] = result
    return results 

In [87]:
@sleep_and_retry
@limits(calls=1, period=RATELIMIT)
def process_url(batch, queries, og_objects, query_f, og_f):
    """"""
    try:
        now = datetime.datetime.now()
        result = query_url(batch.url)
        process_result(batch.name, result, queries, og_objects, query_f, og_f, now)
    except GraphAPIError as e: 
        query_f.writerow([queries.shape[0], batch.name, e, str(now)])

        
@sleep_and_retry
@limits(calls=1, period=RATELIMIT)
def process_batch(batch, queries, og_objects, query_f, og_f, failed_batches):
    """"""
    try:
        now = datetime.datetime.now()
        results = query_urls(batch.url.tolist())

        # successful batch query
        for url, result in results.items():
            url_id = batch[batch.url == url].index[0]
            process_result(url_id, result, queries, og_objects, query_f, og_f, now) 
        
    # failed batch query
    except GraphAPIError as e: 
        failed_batches.put((e, batch.index))
        
        # Process failed batches
        pbar = tqdm(total=failed_batches.qsize()*batchsize, desc="Failed batches")
        while not failed_batches.empty():
            e, bad_batch = failed_batches.get()
            if len(bad_batch) > 4:
                batch_indices = chunker(bad_batch, math.ceil(len(bad_batch)/2))

                for batch_ind in batch_indices:
                    batch = urls.loc[batch_ind]

                    q_len = failed_batches.qsize()
                    process_batch(batch, queries, og_objects, query_writer, og_writer, failed_batches)
                    if failed_batches.qsize() == q_len:
                        pbar.update(len(batch_ind))

            else:
                for i in bad_batch:
                    process_url(urls.loc[i], queries, og_objects, query_writer, og_writer,)
                    pbar.update(1)
        pbar.close()

In [None]:
if continue_crawl:
    write_mode = "a"
else:
    write_mode = "w"
    
with open(query_csv, write_mode) as query_f, open(og_csv, write_mode) as og_f:
    query_writer = csv.writer(query_f, delimiter=",")
    og_writer = csv.writer(og_f, delimiter=",")
    
    # Write column labels 
    if not continue_crawl:
        query_writer.writerow(["query_id"] + queries.columns.tolist())
        og_writer.writerow(og_objects.columns.tolist())

    # Keep track of indices that failed during batchmode
    failed_batches = queue.Queue()

    # Initialise indices for batches
    if len(urls) < batchsize:
        batchsize = len(urls)
    batch_indices = chunker(urls.index, batchsize)

    # Keep appending in batches of 50
    for batch_ind in tqdm(batch_indices, total=len(urls)//batchsize, desc="Batches"):
        batch = urls.loc[batch_ind] 
        process_batch(batch, queries, og_objects, query_writer, og_writer, failed_batches)
    
    # Process failed batches
    pbar = tqdm(total=failed_batches.qsize()*batchsize, desc="Failed batches")
    while not failed_batches.empty():
        e, bad_batch = failed_batches.get()
        if len(bad_batch) > 4:
            batch_indices = chunker(bad_batch, math.ceil(len(bad_batch)/2))
            
            for batch_ind in batch_indices:
                batch = urls.loc[batch_ind]
                
                q_len = failed_batches.qsize()
                process_batch(batch, queries, og_objects, query_writer, og_writer, failed_batches)
                if failed_batches.qsize() == q_len:
                    pbar.update(len(batch_ind))
                    
        else:
            for i in bad_batch:
                process_url(urls.loc[i], queries, og_objects, query_writer, og_writer,)
                pbar.update(1)
    pbar.close()

HBox(children=(IntProgress(value=0, description='Batches', max=3683), HTML(value='')))