In [26]:
# import lib
import pandas as pd
import numpy as np
from facebook_scraper import get_posts
import datetime

In [27]:
# Facebook scraper 
list_of_eateries = ['primadeli', 'Polar.Puffs.Cakes', 'oldchangkee1956','MrBeanSingapore','yakunkayatoastsg','toastboxsingapore','breadtalksingapore','DominosSG','singaporecanadianpizza','JollibeeSG','coffeebean.sg','StarbucksSingapore','lihosg','eachacupsingapore','KFC.SG','LongJohnSilvers.Sg','SubwaySingapore','mosburgersg','BurgerKingSG','PizzaHutSingapore','fishandco.sg'] # just insert Facebook page name into list
list_of_post = []

for eatery in list_of_eateries:
    for post in get_posts(eatery, pages= 1): # pages = number of pages to scrape
        list_of_post.append(post)

print("Total number of post: ", len(list_of_post))

Total number of post:  42


In [2]:
# list_of_eateries = ['primadeli', 'Polar.Puffs.Cakes']
# list_of_post = []

# # append FB post into list_of_post
# for eatery in list_of_eateries:
#     for post in get_posts(eatery, pages=2):
#         list_of_post.append(post)

In [50]:
# data cleaning
# convert time column from series to datetime
df = pd.DataFrame(list_of_post)
df['time'] = pd.to_datetime(df['time'])

In [51]:
today = datetime.datetime.now().date()
week_ago = today - datetime.timedelta(days=7) # date from a week ago
print("Date now: ",today)
print("Date week ago: ",week_ago)

Date now:  2020-12-01
Date week ago:  2020-11-24


In [52]:
delta = today - week_ago
date_range = []
for i in range(delta.days + 1):
    day = week_ago + datetime.timedelta(days=i)
    date_range.append(day)
print("Date that are considered: ",date_range)

Date that are considered:  [datetime.date(2020, 11, 24), datetime.date(2020, 11, 25), datetime.date(2020, 11, 26), datetime.date(2020, 11, 27), datetime.date(2020, 11, 28), datetime.date(2020, 11, 29), datetime.date(2020, 11, 30), datetime.date(2020, 12, 1)]


In [53]:
# drop dates not within 1 week
in_range = []
for date in df['time']:
    in_range.append(date in date_range)
in_range = np.array(in_range)
df = df[in_range]

In [54]:
# remove all blank post
df = df[df['text'] != ""]
print("number of empty rows: ", sum(df['text'] == ""))

number of empty rows:  0


In [55]:
# remove all emoji, ignore warning
import re

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

# deEmojify every post
for i in range(len(df['text'])):
    df['text'][i] = deEmojify(df['text'][i])

KeyError: 8

In [56]:
# remove all links from post
count = 0
for message in df['text']:
    df['text'][count]= re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', "", message)
    count += 1

In [57]:
# remove all punctuations from post
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

stored_message = ""
count = 0
for message in df['text']:
    for letter in message:
        if letter not in punctuations:
            stored_message = stored_message + letter # store all non-punc alphanumeric in stored_message
    df['text'][count] = stored_message # input stored_message into corresponding post
    stored_message = ''
    count += 1

In [58]:
# remove all \n (new line formatting)
count = 0
for message in df['text']:
    df['text'][count] = message.replace('\n',' ')
    count += 1

In [59]:
# drop unncessary columns
df = df.drop(['post_id','shared_text','video_thumbnail','video_id','likes','comments','shares','user_id','images','post_text'],axis=1) 
df['label'] = ''
df['classif_text'] = ''

In [60]:
# import best model from classif_model.ipynb
import pickle

filename = 'deals_classif.sav'
loaded_model = pickle.load(open(filename, 'rb'))

In [61]:
# predictions of model
predictions= loaded_model.predict(df['text'])
df['label'] = predictions

In [62]:
# drop non-deals entries
deals_list = df[~df['label'].isin(['Non'])]
deals_list = deals_list.drop(['label','classif_text'],axis=1)

In [63]:
deals_list

Unnamed: 0,text,time,image,video,video_thumbnail,post_url,link
6,Celebrate Mr Bean’s 25th anniversary!\n\nWith ...,2020-11-30 19:10:29,https://scontent.fsin9-2.fna.fbcdn.net/v/t1.0-...,,,https://facebook.com/story.php?story_fbid=2331...,
7,Celebrate Mr Bean’s 25th Birthday with our Ann...,2020-11-26 09:51:54,,https://scontent.fsin9-2.fna.fbcdn.net/v/t42.9...,https://scontent.fsin9-1.fna.fbcdn.net/v/t15.5...,https://facebook.com/watch?v=133475114932784,
13,"Food, friends, and family. Celebrate the most ...",2020-11-28 12:00:27,,https://scontent.fsin9-1.fna.fbcdn.net/v/t42.1...,https://scontent.fsin9-1.fna.fbcdn.net/v/t15.1...,https://facebook.com/watch?v=2075944449206116,https://www.breadtalk.com.sg/shop


In [64]:
# export predictions and post to excel 
deals_list.to_excel("deals_list.xlsx")  

In [28]:
# import tweepy
# from tweepy import OAuthHandler
# import pandas as pd

In [29]:
# access_token = 'XXXXXX'
# access_token_secret = 'XXXXXX'
# consumer_key = 'XXXXXX'
# consumer_secret = 'XXXXXX'


In [30]:
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# auth.set_access_token(access_token, access_token_secret)

In [31]:
# api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# tweets = []

# count = 1

In [32]:
# for tweet in tweepy.Cursor(api.search, q="@KFC_SG", count=450, since='2020-11-20').items(50000):
	
# 	print(count)
# 	count += 1

# 	try: 
# 		data = [tweet.created_at, tweet.id, tweet.text, tweet.user._json['screen_name'], tweet.user._json['name'], tweet.user._json['created_at'], tweet.entities['urls']]
# 		data = tuple(data)
# 		tweets.append(data)

# 	except tweepy.TweepError as e:
# 		print(e.reason)
# 		continue

# 	except StopIteration:
# 		break

# df = pd.DataFrame(tweets, columns = ['created_at','tweet_id', 'tweet_text', 'screen_name', 'name', 'account_creation_date', 'urls'])

1
2


In [None]:
# https://github.com/Nonnecke/ScrapingTweets/blob/master/python_Twitter_scrape
# https://medium.com/citrispolicylab/a-simple-guide-to-scrape-tweets-using-python-ba7c691b6efa