https://github.com/minimaxir/facebook-page-post-scraper/blob/master/examples/how_to_build_facebook_scraper.ipynb

In [2]:
import csv
import datetime
import json
import os
import re
import time
from urllib.request import urlopen

import dateutil.parser

# Facebook Graph API credentials are read from the environment so that the
# app secret is never stored in the notebook source.
APP_ID = os.environ.get("FACEBOOK_APP_ID", "")
APP_SECRET = os.environ.get("FACEBOOK_APP_SECRET", "")

if not (APP_ID and APP_SECRET):
    print("WARNING: FACEBOOK_APP_ID / FACEBOOK_APP_SECRET are not set; "
          "Graph API requests will most likely fail.")

# The token sent with every request is the app id and app secret joined by "|".
access_token = APP_ID + "|" + APP_SECRET

# Page used by the single-page cells of this notebook.
page_id = 'merckcap'

In [3]:
def request_until_succeed(url, max_retries=None, retry_delay=5):
    """Fetch ``url`` and return the raw response body, retrying on failure.

    Args:
        url: Fully-formed URL passed to ``urlopen``.
        max_retries: Number of retries allowed after the first failed
            attempt. ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The response body as returned by ``response.read()``.

    Raises:
        Exception: once ``max_retries`` is exhausted, the last error raised
            while requesting/reading, or a ``RuntimeError`` if the last
            attempt answered with a status other than 200.
    """
    # Keep the access token out of notebook output and logs.
    safe_url = re.sub(r"(access_token=)[^&]*", r"\1<redacted>", url)
    failures = 0

    while True:
        try:
            response = urlopen(url)
            try:
                status_code = response.getcode()
                if status_code == 200:
                    return response.read()
            finally:
                # Release the connection whether or not the body was read.
                response.close()
            error = RuntimeError("Unexpected HTTP status %s" % status_code)
        except Exception as e:
            # Bind to another name: ``e`` is cleared when the except block ends.
            error = e

        print(error)
        print("Error for URL %s: %s" % (safe_url, datetime.datetime.now()))

        failures += 1
        if max_retries is not None and failures > max_retries:
            raise error

        # Back off before every retry, including after a non-200 response.
        time.sleep(retry_delay)

In [4]:
def testFacebookPageData(page_id, access_token):
    
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters
    
    #  print(url)
    
    # retrieve data
    data = json.loads(request_until_succeed(url))
    print(json.dumps(data, indent=4, sort_keys=True))
    

# Look up the configured page; the JSON response is printed by the call.
facebookpagedata = testFacebookPageData(page_id, access_token)

{
    "id": "157141811314444",
    "name": "Merck Capacity Advancement Program"
}


In [5]:
def testFacebookPageFeedData(page_id, access_token):
    
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id + "/feed" # changed
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters
    
    # retrieve data
    data = json.loads(request_until_succeed(url))
        
    print(json.dumps(data, indent=4, sort_keys=True))
    

# Request the configured page's feed; the JSON response is printed by the call.
facebookpagefeeddata = testFacebookPageFeedData(page_id, access_token)

{
    "data": [
        {
            "created_time": "2017-04-18T05:35:03+0000",
            "id": "157141811314444_422573441437945",
            "message": "A message from H. E. First Lady of C.A.R.-Madame Brigitte Touadera- Champion of Merck More Than a Mother http://ow.ly/ozEf30aVZ7c \nFollow Merck more than a Mother on Twitter - www.twitter.com/merckandmothers"
        },
        {
            "created_time": "2017-04-17T12:00:20+0000",
            "id": "157141811314444_422185258143430",
            "message": "Merck Capacity Advancement Program - Merck Diabetes And Hypertension 2016 Awards http://ow.ly/pNsa30aUoH7"
        },
        {
            "created_time": "2017-04-17T09:00:15+0000",
            "id": "157141811314444_422102074818415",
            "message": "Merck More Than A Mother - Aisha Tasila (Sierra Leone) tells us her story of suffering. Watch more - http://ow.ly/6tdu30agOxe"
        },
        {
            "created_time": "2017-04-17T06:00:08+0000",
            

In [6]:
def getFacebookPageFeedData(page_id, access_token, num_statuses):
    """Request one page of a Facebook page's feed and return the decoded JSON.

    Args:
        page_id: Page name or id whose ``/feed`` edge is requested.
        access_token: Token passed as the ``access_token`` query parameter.
        num_statuses: Value sent as the ``limit`` query parameter.

    Returns:
        The decoded JSON response.
    """
    # Fields requested for every status. An earlier variant of this request
    # used likes.limit(1).summary(true) and comments.limit(1).summary(true).
    requested_fields = ",".join([
        "message", "link", "created_time", "type", "name", "id",
        "likes.summary(true)", "comments.summary(true)", "shares",
    ])

    feed_url = "https://graph.facebook.com/{page}/feed/?fields={fields}&limit={limit}&access_token={token}".format(
        page=page_id,
        fields=requested_fields,
        limit=num_statuses,
        token=access_token,
    )

    return json.loads(request_until_succeed(feed_url))

# Fetch a feed page with limit=1, keep its first status and pretty-print it
# to inspect the requested fields.
test_status = getFacebookPageFeedData(page_id, access_token, 1)["data"][0]
print(json.dumps(test_status, indent=4, sort_keys=True))

{
    "comments": {
        "data": [],
        "summary": {
            "can_comment": false,
            "order": "ranked",
            "total_count": 0
        }
    },
    "created_time": "2017-04-18T05:35:03+0000",
    "id": "157141811314444_422573441437945",
    "likes": {
        "data": [],
        "summary": {
            "can_like": false,
            "has_liked": false,
            "total_count": 0
        }
    },
    "link": "http://ow.ly/ozEf30aVZ7c",
    "message": "A message from H. E. First Lady of C.A.R.-Madame Brigitte Touadera- Champion of Merck More Than a Mother http://ow.ly/ozEf30aVZ7c \nFollow Merck more than a Mother on Twitter - www.twitter.com/merckandmothers",
    "name": "H. E. First Lady of C.A.R.-Madame Brigitte Touadera- Champion of Merck...",
    "type": "video"
}


In [7]:
 # Request one feed page with limit=100 and keep the decoded response.
 test = getFacebookPageFeedData(page_id, access_token, 100)

In [8]:
# Show how many statuses the page contains, then the likes payload of the first.
print(len(test['data']))
test['data'][0]['likes']

100


{'data': [],
 'summary': {'can_like': False, 'has_liked': False, 'total_count': 0}}

In [9]:
def processFacebookPageFeedStatus(status):
    """Flatten one feed status dictionary into a tuple of CSV-ready values.

    Args:
        status: Dictionary for a single status. ``id``, ``type`` and
            ``created_time`` are required; ``message``, ``name``, ``link``,
            ``likes``, ``comments`` and ``shares`` are optional.

    Returns:
        Tuple ``(status_id, status_message, link_name, status_type,
        status_link, status_published, num_likes, num_comments, num_shares)``.
        Text fields are ``str``; ``status_published`` is formatted as
        ``YYYY-MM-DD HH:MM:SS``.

    Raises:
        KeyError: if a required key is missing.
        ValueError: if ``created_time`` is not ``%Y-%m-%dT%H:%M:%S+0000``.
    """
    # Top-level items; optional ones fall back to an empty string.
    # Text is kept as str: encoding to bytes here would make csv.writer emit
    # the bytes repr (b'...') instead of the text itself.
    status_id = status['id']
    status_message = status.get('message', '')
    link_name = status.get('name', '')
    status_type = status['type']
    status_link = status.get('link', '')

    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    status_published = datetime.datetime.strptime(status['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published + datetime.timedelta(hours=-5)  # EST (fixed offset)
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S')  # best time format for spreadsheet programs

    # Nested items require chaining dictionary keys.
    num_likes = status['likes']['summary']['total_count'] if 'likes' in status else 0
    num_comments = status['comments']['summary']['total_count'] if 'comments' in status else 0
    num_shares = status['shares']['count'] if 'shares' in status else 0

    # return a tuple of all processed data
    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_likes, num_comments, num_shares)

# Process the sample status, then parse its published timestamp (index 5 of
# the returned tuple) and print the date part.
processed_test_status = processFacebookPageFeedStatus(test_status)
print(dateutil.parser.parse(processed_test_status[5]).date())

2017-04-18


In [10]:
def scrapeFacebookPageFeedStatus(page_id, access_token):
    """Scrape one page's feed into ``<page_id>_facebook_statuses.csv``.

    The first page of results comes from ``getFacebookPageFeedData``; later
    pages are fetched by following each response's ``paging.next`` URL until
    a response has none.

    Args:
        page_id: Page name or id; also used as the CSV file name prefix.
        access_token: Token forwarded to ``getFacebookPageFeedData``.

    Returns:
        None. Rows are written to the CSV file and progress is printed.
    """
    # newline='' is what the csv module expects (avoids blank rows on
    # Windows); an explicit utf-8 encoding keeps non-ASCII text writable
    # regardless of the platform's default encoding.
    with open('%s_facebook_statuses.csv' % page_id, 'w', newline='', encoding='utf-8') as csv_file:
        w = csv.writer(csv_file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_likes", "num_comments", "num_shares"])

        num_processed = 0   # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime))

        statuses = getFacebookPageFeedData(page_id, access_token, 100)

        while True:
            for status in statuses['data']:
                w.writerow(processFacebookPageFeedStatus(status))

                # output progress occasionally to make sure code is not stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print("%s Statuses Processed: %s" % (num_processed, datetime.datetime.now()))

            # A response can carry a 'paging' object without a 'next' link;
            # either way, no next link means we're done.
            next_url = statuses.get('paging', {}).get('next')
            if not next_url:
                break
            statuses = json.loads(request_until_succeed(next_url))

        print("\nDone!\n%s Statuses Processed in %s" % (num_processed, datetime.datetime.now() - scrape_starttime))


# Run the scraper for the single page configured in page_id.
scrapeFacebookPageFeedStatus(page_id, access_token)

Scraping merckcap Facebook Page: 2017-04-18 12:11:32.275485

<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>
Error for URL https://graph.facebook.com/merckcap/feed/?fields=message,link,created_time,type,name,id,likes.summary(true),comments.summary(true),shares&limit=100&access_token=<redacted>: 2017-04-18 12:11:58.538320
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>
Error for URL https://graph.facebook.com/v2.8/157141811314444/feed?fields=message,link,created_time,type,name,id,likes.summary%28true%29,comments.summary%28true%29,shares&limit=100&__paging_token=enc_AdDRrV6moTZCGuQBnByJTsvuD6CY1E5NufCLMT6Eztnx5sFvOyMponl

In [79]:
list_companies = ["merckcap","GrantThorntonIndiaLLP"]
# list_companies = ["walmart", "cisco", "pepsi", "facebook"]
def scrapeFacebookPageFeedStatus(list_companie, access_token):
    """Scrape each listed page's feed into ``<page_id>_facebook_statuses.csv``.

    For every page the first results come from ``getFacebookPageFeedData``;
    later pages are fetched by following each response's ``paging.next`` URL
    until a response has none.

    Args:
        list_companie: Iterable of page names/ids. A single string is treated
            as one page id.
        access_token: Token forwarded to ``getFacebookPageFeedData``.

    Returns:
        None. One CSV file is written per page and progress is printed.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(list_companie, str):
        list_companie = [list_companie]

    for page_id in list_companie:
        # newline='' is what the csv module expects (avoids blank rows on
        # Windows); an explicit utf-8 encoding keeps non-ASCII text writable
        # regardless of the platform's default encoding.
        with open('%s_facebook_statuses.csv' % page_id, 'w', newline='', encoding='utf-8') as csv_file:
            w = csv.writer(csv_file)
            w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                        "status_published", "num_likes", "num_comments", "num_shares"])

            num_processed = 0   # keep a count on how many we've processed
            scrape_starttime = datetime.datetime.now()

            print("Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime))

            statuses = getFacebookPageFeedData(page_id, access_token, 100)

            while True:
                for status in statuses['data']:
                    w.writerow(processFacebookPageFeedStatus(status))

                    # output progress occasionally to make sure code is not stalling
                    num_processed += 1
                    if num_processed % 1000 == 0:
                        print("%s Statuses Processed: %s" % (num_processed, datetime.datetime.now()))

                # A response can carry a 'paging' object without a 'next'
                # link; either way, no next link means we're done.
                next_url = statuses.get('paging', {}).get('next')
                if not next_url:
                    break
                statuses = json.loads(request_until_succeed(next_url))

            print("\nDone!\n%s Statuses Processed in %s" % (num_processed, datetime.datetime.now() - scrape_starttime))
        
# Scrape every page listed in list_companies.
scrapeFacebookPageFeedStatus(list_companies, access_token)

Scraping merckcap Facebook Page: 2017-04-17 16:12:35.789214

1000 Statuses Processed: 2017-04-17 16:13:11.101583

Done!
1507 Statuses Processed in 0:00:55.360347
Scraping GrantThorntonIndiaLLP Facebook Page: 2017-04-17 16:13:31.156058


Done!
451 Statuses Processed in 0:00:20.530785


In [11]:
#list_companies = ["walmart", "cisco", "pepsi", "facebook"]
list_companies = ["merckcap","GrantThorntonIndiaLLP","deloitte","KPMG","pwcus"]
def scrapeFacebookPageFeedStatus(list_companie, access_token):
    """Scrape each listed page's feed into ``<page_id>_facebook_statuses.csv``.

    For every page the first results come from ``getFacebookPageFeedData``;
    later pages are fetched by following each response's ``paging.next`` URL
    until a response has none.

    Args:
        list_companie: Iterable of page names/ids. A single string is treated
            as one page id.
        access_token: Token forwarded to ``getFacebookPageFeedData``.

    Returns:
        None. One CSV file is written per page and progress is printed.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(list_companie, str):
        list_companie = [list_companie]

    for page_id in list_companie:
        # newline='' is what the csv module expects (avoids blank rows on
        # Windows); an explicit utf-8 encoding keeps non-ASCII text writable
        # regardless of the platform's default encoding.
        with open('%s_facebook_statuses.csv' % page_id, 'w', newline='', encoding='utf-8') as csv_file:
            w = csv.writer(csv_file)
            w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                        "status_published", "num_likes", "num_comments", "num_shares"])

            num_processed = 0   # keep a count on how many we've processed
            scrape_starttime = datetime.datetime.now()

            print("Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime))

            statuses = getFacebookPageFeedData(page_id, access_token, 100)

            while True:
                for status in statuses['data']:
                    w.writerow(processFacebookPageFeedStatus(status))

                    # output progress occasionally to make sure code is not stalling
                    num_processed += 1
                    if num_processed % 1000 == 0:
                        print("%s Statuses Processed: %s" % (num_processed, datetime.datetime.now()))

                # A response can carry a 'paging' object without a 'next'
                # link; either way, no next link means we're done.
                next_url = statuses.get('paging', {}).get('next')
                if not next_url:
                    break
                statuses = json.loads(request_until_succeed(next_url))

            print("\nDone!\n%s Statuses Processed in %s" % (num_processed, datetime.datetime.now() - scrape_starttime))
        
# Scrape every page listed in list_companies.
scrapeFacebookPageFeedStatus(list_companies, access_token)

Scraping merckcap Facebook Page: 2017-04-18 12:17:27.762127

1000 Statuses Processed: 2017-04-18 12:18:06.458040
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>
Error for URL https://graph.facebook.com/v2.8/157141811314444/feed?fields=message,link,created_time,type,name,id,likes.summary%28true%29,comments.summary%28true%29,shares&limit=100&__paging_token=enc_AdCKkJwOPhOei8ChlxYSv2rZBZAml4WmqBewov8e4HdAs6HOqsJZBfrNrvPOrDHYZBeZAaqsB3ZBPxyO2kCzRxbURDZCgNnsztGZBZBYuZCwKjDKbGwNXOhAZDZD&access_token=<redacted>&until=1450680192: 2017-04-18 12:18:53.700539

Done!
1509 Statuses Processed in 0:01:27.550117
Scraping GrantThorntonIndiaLLP Facebook Page: 2017-04-18 12:18:55.318745


Done!
451 Statuses Processed in 0:00:20.984164
Scraping deloitte Facebook Page: 2017-04-18 12:19:16.308910

1000 St

###### Date Comparison

In [103]:
#list_companies = ["walmart", "cisco", "pepsi", "facebook","EY-195665063800329"]
list_companies = ["merckcap","GrantThorntonIndiaLLP","deloitte","KPMG","pwcus"]
def scrapeFacebookPageFeedStatus(list_companie, access_token, cutoff_date=datetime.date(2016, 8, 25)):
    """Scrape each listed page's feed into ``<page_id>_facebook_statuses_time.csv``,
    stopping once a page of results reaches back before ``cutoff_date``.

    The cutoff is applied per page of results: every status of a fetched
    page is written, and paging stops when the last status written from that
    page was published before ``cutoff_date``.

    Args:
        list_companie: Iterable of page names/ids. A single string is treated
            as one page id.
        access_token: Token forwarded to ``getFacebookPageFeedData``.
        cutoff_date: ``datetime.date`` before which no further pages are
            requested. Defaults to 2016-08-25.

    Returns:
        None. One CSV file is written per page and progress is printed.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(list_companie, str):
        list_companie = [list_companie]

    for page_id in list_companie:
        # newline='' is what the csv module expects (avoids blank rows on
        # Windows); an explicit utf-8 encoding keeps non-ASCII text writable
        # regardless of the platform's default encoding.
        with open('%s_facebook_statuses_time.csv' % page_id, 'w', newline='', encoding='utf-8') as csv_file:
            w = csv.writer(csv_file)
            w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                        "status_published", "num_likes", "num_comments", "num_shares"])

            num_processed = 0   # keep a count on how many we've processed
            scrape_starttime = datetime.datetime.now()

            print("Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime))

            statuses = getFacebookPageFeedData(page_id, access_token, 100)

            while True:
                # Published timestamp of the last status written from this
                # page; stays None when the page has no statuses.
                last_published = None
                for status in statuses['data']:
                    processed = processFacebookPageFeedStatus(status)
                    w.writerow(processed)
                    last_published = processed[5]

                    # output progress occasionally to make sure code is not stalling
                    num_processed += 1
                    if num_processed % 1000 == 0:
                        print("%s Statuses Processed: %s" % (num_processed, datetime.datetime.now()))

                # A response can carry a 'paging' object without a 'next'
                # link; either way, no next link means we're done.
                next_url = statuses.get('paging', {}).get('next')
                if not next_url:
                    break

                # Check the cutoff before fetching, so no request is made for
                # a page that would be discarded.
                # NOTE(review): relies on the feed being returned newest
                # first - confirm against the API.
                if (last_published is not None
                        and dateutil.parser.parse(last_published).date() < cutoff_date):
                    break

                statuses = json.loads(request_until_succeed(next_url))

            print("\nDone!\n%s Statuses Processed in %s" % (num_processed, datetime.datetime.now() - scrape_starttime))
        
# Scrape every page listed in list_companies with the date-limited scraper.
scrapeFacebookPageFeedStatus(list_companies, access_token)

Scraping merckcap Facebook Page: 2017-04-18 11:39:46.952040

1000 Statuses Processed: 2017-04-18 11:40:22.745085

Done!
1000 Statuses Processed in 0:00:38.802927
Scraping GrantThorntonIndiaLLP Facebook Page: 2017-04-18 11:40:25.767969


Done!
100 Statuses Processed in 0:00:07.241920
Scraping deloitte Facebook Page: 2017-04-18 11:40:33.018890


Done!
200 Statuses Processed in 0:00:27.591003
Scraping KPMG Facebook Page: 2017-04-18 11:41:00.617895


Done!
200 Statuses Processed in 0:00:29.932800
Scraping pwcus Facebook Page: 2017-04-18 11:41:30.558697


Done!
300 Statuses Processed in 0:00:24.123063


In [84]:
# Scratch check: the 2016-08-25 cutoff expressed as a date object.
datetime.datetime(2016, 8, 25).date()

datetime.date(2016, 8, 25)

In [86]:
# Scratch check: is the sample status's published date later than 2015-08-25?
dateutil.parser.parse(processed_test_status[5]).date() > datetime.datetime(2015, 8, 25).date()

True

In [95]:
# Scratch check: subtracting two dates yields a timedelta.
dateutil.parser.parse(processed_test_status[5]).date() - datetime.datetime(2015, 8, 25).date()

datetime.timedelta(601)

In [94]:
# Scratch check: published date of the sample status.
dateutil.parser.parse(processed_test_status[5]).date()

datetime.date(2017, 4, 17)