In [1]:
# Install Facebook API SDK

#! pip install -e git+https://github.com/mobolic/facebook-sdk.git#egg=facebook-sdk

# If the command above does not work, copy it (without the leading '#!') and paste it into a command prompt

In [2]:
import csv
import json
import os

import facebook

# Set Up Graph Model

In [3]:

# Generate a temporary token at https://developers.facebook.com/tools/explorer/ (needs a Facebook account).
# Prefer exporting it as the FB_ACCESS_TOKEN environment variable before starting Jupyter, so the token
# is never saved inside the notebook. Alternatively, paste it in place of the '' fallback below
# (and remember to clear it again before sharing the notebook).

temporary_token = os.environ.get('FB_ACCESS_TOKEN', '')
graph = facebook.GraphAPI(access_token=temporary_token, version='2.9')

## Define Article functions

### Get formatted IDs for use in API

In [4]:
def get_fb_id(pub_id, post_id):
    '''
    Return a well-formatted Facebook ID for posts, to be used in other functions.

    Args:
        pub_id: string or int for the publisher ID on Facebook
        post_id: string or int for the post ID on Facebook
    
    Returns:
        fb_id: Facebook ID string in the format "<pub_id>_<post_id>", or None
            (after printing an error message) when either ID cannot be
            converted to an integer.
    '''
    # Both IDs must be convertible to int. Only the conversion errors are
    # caught, so unrelated exceptions (e.g. KeyboardInterrupt) are not swallowed.
    try:
        int(pub_id)
        int(post_id)
    except (TypeError, ValueError, OverflowError):
        print('Input error: Facebook ID must be an integer or a string integer.')
        return None
    
    # str() leaves string IDs untouched and converts numeric IDs
    return "_".join([str(pub_id), str(post_id)])

### Get News Story from Publisher

In [5]:
def get_article(fb_id):
    '''
    Fetch a single news story post from the Graph API.
    
    Args:
        fb_id: string ID composed of publisher and post ID, as returned by get_fb_id()
    
    Returns:
        article: the object returned by graph.get_object() for the requested fields
                 (id, message, created_time, permalink_url, type, updated_time)
    '''
    # Only the listed fields are requested from the API
    return graph.get_object(
        id=fb_id,
        fields='id, message, created_time, permalink_url, type, updated_time',
    )

### Add total counts

In [6]:
def article_stats(fb_id, article=None):
    '''
    Get total counts of likes, shares and comments, return dict with these fields.
    
    Args:
        fb_id: string ID composed of publisher and post ID, as returned by get_fb_id()
        article: optional dict. If given, the count fields are added to it in place;
                 if omitted, a fresh dict is created for every call.
    
    Returns:
        article: dict with the keys 'likes_count', 'shares_count' and 'comments_count'
                 added, or None (after printing an error message) if 'article' is
                 given but is not a dict.
    '''
    # A new dict per call: a mutable default argument would be shared between
    # calls and leak counts from one article into the next
    if article is None:
        article = {}
    
    # Assure that 'article' is dict
    if not isinstance(article, dict):
        print('Input error: Second argument, if specified, must be a dictonary.')
        return None
    
    # Likes (limit=0 with summary=True asks for the totals only, no individual entries)
    obj = graph.get_connections(id=fb_id, connection_name='likes', summary=True, limit=0)
    article['likes_count'] = obj['summary']['total_count']

    # Shares
    # NOTE(review): the Graph API appears to omit the 'shares' field for posts that
    # have not been shared; treat a missing field as zero instead of raising KeyError.
    obj = graph.get_object(id=fb_id, fields='shares')
    article['shares_count'] = obj.get('shares', {}).get('count', 0)
    
    # Comments
    obj = graph.get_connections(id=fb_id, connection_name='comments', summary=True, limit=0)
    article['comments_count'] = obj['summary']['total_count']
    
    return article

### Save Article data to disk

In [7]:
def save_article_json(fb_id, article, topic, publisher):
    '''
    Add metadata to the article and write it as JSON to 'topic_publisher_article.json'.
    
    The 'article' dict is modified in place: the keys 'topic', 'publisher',
    'reactions_file' and 'comments_file' are added before it is written.
    The 'fb_id' argument is accepted but not used.
    '''
    # Common 'topic_publisher' prefix shared by all three output files
    prefix = '{}_{}'.format(topic, publisher)
    
    # Topic/publisher metadata plus the names of the companion data files
    article.update({
        'topic': topic,
        'publisher': publisher,
        'reactions_file': prefix + '_reactions.csv',
        'comments_file': prefix + '_comments.json',
    })
    
    # Write the annotated article to disk (non-ASCII characters are escaped)
    with open(prefix + '_article.json', 'w') as out_file:
        json.dump(article, out_file, ensure_ascii=True)


### Fetch and save Article data


In [8]:
def get_article_data(fb_id, topic, publisher):
    '''
    Fetch one article, add its like/share/comment counts and save it as JSON.
    
    The saved object has the following keys:
    'id', 'message', 'created_time', 'permalink_url', 'type', 'updated_time', 
    'likes_count', 'shares_count', 'comments_count', 
    'topic', 'publisher', 'reactions_file', 'comments_file'
    
    If you are having trouble reading the file, try pasting the content here: https://jsonlint.com/
    '''
    # Download the post, then let article_stats() add the counts to it in place
    article = get_article(fb_id)
    article_stats(fb_id, article)
    
    # Progress message
    print('  Processed article metadata')

    # Attach topic/publisher metadata and write the result to disk
    save_article_json(fb_id, article, topic, publisher)


## Define Reaction functions

In [9]:
def get_reactions(fb_id):
    '''
    Collect every entry of the post's 'reactions' connection, return list of dicts.
    
    Pages of up to 1000 entries are requested until the API returns an empty page.
    '''
    collected = []
    cursor = ''
    
    while True:
        page = graph.get_connections(id=fb_id, connection_name='reactions', limit=1000, after=cursor)
        batch = page['data']
        
        # An empty page marks the end of the results
        if not batch:
            break
        
        collected.extend(batch)
        # Continue after the last entry of this page
        cursor = page['paging']['cursors']['after']
    
    return collected


In [10]:
def save_reactions_csv(fb_id, reactions, topic, publisher):
    '''
    Save Reactions data in CSV format into file named 'topic_publisher_reactions.csv'
    
    Args:
        fb_id: ID of the article, written to the 'parent_id' column of every row
        reactions: list of dicts, each with at least the keys 'id' and 'type'
        topic, publisher: strings used to build the output file name
    
    Writes a header row ('parent_id', 'id', 'type') followed by one row per reaction.
    '''
    
    # ToDo: Some names give errors, suspect encoding problem. If time and possible, feel free to fix name.
    #       For now, have excluded name (and possible gender classification) from Reactions.
    list_react = [[fb_id, d['id'], d['type']] for d in reactions]
    
    # Save Reactions in CSV format to disk.
    # newline='' lets the csv module control line endings; without it every row
    # is followed by a blank line on platforms that translate '\n' (e.g. Windows).
    with open('{}_{}_reactions.csv'.format(topic,publisher), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['parent_id','id','type'])
        writer.writerows(list_react)


In [11]:
def get_reactions_data(fb_id, topic, publisher):
    '''
    Download all reactions to a news article and store them in a CSV file.
    
    The file has the following headers:
    'parent_id': ID of the article
    'id': Anonymised user ID
    'type': Kind of reaction. One of LIKE, LOVE, HAHA, WOW, SAD, ANGRY
    '''
    # Download the full list of reactions for this post
    reactions = get_reactions(fb_id)
    
    # Progress message with the number of reactions found
    n_reactions = len(reactions)
    print('  Processed {} reactions'.format(n_reactions))
    
    # Write them to 'topic_publisher_reactions.csv'
    save_reactions_csv(fb_id, reactions, topic, publisher)


## Define Comment functions

### Get comments 

In [26]:
def get_comments(fb_id):
    '''
    Steps through and retrieves comments and replies (only first level replies). Returns list of dicts.
    
    Root comments are requested one at a time (limit=1); each root comment is
    followed in the returned list by its replies, if it has any.
    
    Besides the fields requested from the API, every dict gets these keys:
    'parent_id': fb_id for root comments, the root comment's ID for replies
    'type': 'root' or 'reply'
    'likes': the 'data' list of the comment's 'likes' connection, or [] if its like_count is 0
    
    Args:
        fb_id: string ID composed of publisher and post ID, as returned by get_fb_id()
    
    Returns:
        comments: list of dicts
    '''
    args = {'fields' : 'id, created_time, from, message, comment_count, like_count'}
    comments = list()
    paging_after = ''

    def fetch_likes(comment):
        # Only query the API when the comment itself reports at least one like
        if not comment['like_count']:
            return []
        # NOTE(review): a single request with limit=10000, no pagination -- confirm this
        # is enough if the API caps the page size
        obj_likes = graph.get_connections(id=comment['id'], connection_name='likes', limit=10000)
        return obj_likes['data']

    # Loop through root comments one-by-one and get their replies
    while True:
        obj_parent = graph.get_connections(id=fb_id, connection_name='comments', limit=1, after=paging_after, **args)
        
        # An empty page marks the end of the root comments
        if not obj_parent['data']:
            break
        
        # Add root comment and add fields to improve data handling
        root = obj_parent['data'][0]
        root.update({'parent_id':fb_id, 'type':'root'})
        root['likes'] = fetch_likes(root)
        comments.append(root)
        
        # Add replies to root comment (if any) and add fields to help data handling
        if root['comment_count']:
            # NOTE(review): replies are fetched in one request as well, not paginated
            obj_child = graph.get_connections(id=root['id'], connection_name='comments', limit=10000, after='', **args)
            for d in obj_child['data']:
                d.update({'parent_id':root['id'], 'type':'reply'})
                # Use this reply's own like_count (previously the first reply's
                # like_count decided the outcome for every reply)
                d['likes'] = fetch_likes(d)
            
            comments.extend(obj_child['data'])

        # Get next root object pointer
        paging_after = obj_parent['paging']['cursors']['after']
    
    return comments


In [13]:
def save_comments_json(fb_id, comments, topic, publisher):
    '''
    Write the comments as JSON to a file named 'topic_publisher_comments.json'.
    
    The 'fb_id' argument is accepted but not used.
    '''
    out_name = '{}_{}_comments.json'.format(topic, publisher)
    
    # Non-ASCII characters are escaped in the output
    with open(out_name, 'w') as out_file:
        json.dump(comments, out_file, ensure_ascii=True)


In [14]:
def get_comments_data(fb_id, topic, publisher):
    '''
    Download the comments (and first level replies) to a news article and store them in a JSON file.
    
    The comments are fetched with get_comments() and written by save_comments_json()
    to 'topic_publisher_comments.json'.
    '''
    # Download root comments together with their replies
    comments = get_comments(fb_id)
    
    # Progress message with the number of comments found
    n_comments = len(comments)
    print('  Processed {} comments'.format(n_comments))

    # Write them to disk in JSON format
    save_comments_json(fb_id, comments, topic, publisher)


## Iterate through list of news articles

In [15]:
def read_news_stories(filename):
    '''
    Read a CSV file and return all of its rows, including any header row.
    
    Args:
        filename: path of the CSV file to read
    
    Returns:
        articles: list of rows, each row being a list of strings
                  (an empty list for an empty file)
    '''
    # newline='' hands line endings to the csv module untouched, so newlines
    # embedded inside quoted fields are read back exactly as they were written
    with open(filename, newline='') as csvfile:
        return list(csv.reader(csvfile))

### Run! 

In [None]:
# Load the list of news stories, dropping the first line (column headers)
articles = read_news_stories('topic_publisher_list.csv')[1:]

for i, row in enumerate(articles):
    # Each row holds: publisher, topic, publisher ID, post ID, permalink
    publisher, topic, pub_id, post_id, permalink_url = row
    fb_id = get_fb_id(pub_id, post_id)

    print('Processing file {} of {}: {} {}'.format(i+1, len(articles), topic, publisher))

    # Fetch and save article metadata, reactions and comments (in that order)
    get_article_data(fb_id, topic, publisher)
    get_reactions_data(fb_id, topic, publisher)
    get_comments_data(fb_id, topic, publisher)

print('Done!')

Processing file 1 of 44: Grenfell HuffingtonPost
  Processed article metadata
  Processed 9883 reactions
  Processed 458 comments
Processing file 2 of 44: Grenfell DailyMail
  Processed article metadata
  Processed 6118 reactions
  Processed 501 comments
Processing file 3 of 44: Grenfell Standard
  Processed article metadata
  Processed 37969 reactions
  Processed 573 comments
Processing file 4 of 44: Grenfell Guardian
  Processed article metadata
  Processed 28298 reactions
  Processed 1374 comments
Processing file 5 of 44: Grenfell Independent
  Processed article metadata
  Processed 24522 reactions
  Processed 1419 comments
Processing file 6 of 44: Grenfell Telegraph
  Processed article metadata
  Processed 16900 reactions
  Processed 699 comments
Processing file 7 of 44: Grenfell TheSun
  Processed article metadata
  Processed 2365 reactions
  Processed 403 comments
Processing file 8 of 44: Grenfell BBC
  Processed article metadata
  Processed 21939 reactions
  Processed 874 commen