In [8]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import re

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    


In [2]:
xl = pd.read_csv('leader.csv')

In [3]:
xl

Unnamed: 0,Name,Facebook,Twitter,Instagram
0,HANNAH YEOH,https://www.facebook.com/hannahyeoh/,https://twitter.com/hannahyeoh/,https://www.instagram.com/hannahyeoh/
1,NIK NAZMI BIN NIK AHMAD,https://www.facebook.com/niknazminikahmad/,https://twitter.com/niknazmi/,https://www.instagram.com/niknazminikahmad/
2,RINA MOHD HARUN,https://www.facebook.com/rinaharunsrikandi/,https://twitter.com/rinamohdharun/,https://www.instagram.com/rinamohdharun/
3,SHAHIDAN BIN KASSIM,https://www.facebook.com/ybdssk/,https://twitter.com/dshahidankassim,
4,DATO' DR. HAJI ABD RAHMAN BIN DAUD,https://www.facebook.com/CikguRahman.PakatanHa...,https://twitter.com/arda11557,
5,USTAZ HASHIM JASIN,https://www.facebook.com/hashimjasinakil/,https://twitter.com/hashimjasinakil,
6,DR MAHATHIR BIN MOHAMAD,https://www.facebook.com/TunDrMahathir/,https://twitter.com/chedetofficial,
7,DATO' IR. NAWAWI BIN AHMAD,https://www.facebook.com/Yb-Datoir-Hj-Nawawi-A...,https://twitter.com/nawawikawi,
8,MUKHRIZ MAHATHIR,https://www.facebook.com/MukhrizMahathirFC/,https://twitter.com/mukhrizmahathir,
9,ABD. GHANI BIN AHMAD,https://www.facebook.com/ustazghani4jerlun/,https://twitter.com/abdghani_57,


In [4]:
len(xl.index)

154

In [5]:
def ObtainFb(xl):
    """
    Obtain the name of the leader and his/her facebook follower count
    """
    FollowerDf = pd.DataFrame()
    for i in list(range(0,len(xl.index))): 
        raw_html = simple_get(xl.iloc[i,1])
        if raw_html == None:
            numberOfFollowers = None
        else:
            html = BeautifulSoup(raw_html, 'html.parser')
            listOfTab = [item['data-key'] for item in html.find_all('div', attrs={'data-key' : True})]

            tab = html.find('div',attrs = {'data-key' : 'tab_community'})
            if tab == None:
                numberOfFollowers = None
            else:
                nextURL = tab.a["href"]
                nextURL = "https://www.facebook.com/" + nextURL

                next_raw_html = simple_get(nextURL)
                next_html = BeautifulSoup(next_raw_html, 'html.parser')

                numberOfFollowers = next_html.select("div._3xom")[1].text

                temp = pd.DataFrame({'name':xl.iloc[i,0], 'FB':numberOfFollowers}, index=[i])
                FollowerDf = FollowerDf.append(temp)
    return FollowerDf
    

def ObtainTwit(xl):
    """
    Obtain the Twitter follower and post counts from Social Blade 
    """
    FollowerDf1 = pd.DataFrame()
    for i in list(range(0,len(xl.index))): 
            raw_html = xl.iloc[i,2]
            if raw_html == None:
                numberOfFollowers = None
            else:
                regex1 = re.compile(r"(com/)([a-zA-Z]+)")
                match = regex1.search(raw_html)
                #For twitter
                socialBladeURL = "https://socialblade.com/twitter/user/"+match[2]+"/monthly"

                raw_html2 = simple_get(socialBladeURL)
                html = BeautifulSoup(raw_html2, 'html.parser')
                k =html.find_all("script", attrs = {"type":"text/javascript"})
                
                
                
                regex2 = re.compile(r"(2018-05-08,)(\d+)")
                
                #Follower count
                match1 = regex2.search(k[6].text)
                if match1 is not None:
                    numberOfFollowers1 = match1[2]
                else:
                    numberOfFollowers1 = None
                    
                #Tweet count
                match2 = regex2.search(k[10].text)
                if match2 is not None:
                    numberOfFollowers2 = match2[2]
                else:
                    numberOfFollowers2 = None
            temp = pd.DataFrame({'TWFollower':numberOfFollowers1, "TWTweet":numberOfFollowers2}, index=[i])
            FollowerDf1 = FollowerDf1.append(temp)
            
    return FollowerDf1

def joinColumns(*dataframes):
    return pd.concat(dataframes,1)

In [6]:
ObtainFb(xl)

Unnamed: 0,name,FB
0,HANNAH YEOH,295K
1,NIK NAZMI BIN NIK AHMAD,113K
2,RINA MOHD HARUN,9101
3,SHAHIDAN BIN KASSIM,8545
4,DATO' DR. HAJI ABD RAHMAN BIN DAUD,750
5,USTAZ HASHIM JASIN,22536
6,DR MAHATHIR BIN MOHAMAD,3.5J
7,DATO' IR. NAWAWI BIN AHMAD,676
8,MUKHRIZ MAHATHIR,515K
9,ABD. GHANI BIN AHMAD,2153


In [9]:
ObtainTwit(xl)

Unnamed: 0,TWFollower,TWTweet
0,180908,20065
1,81456,37129
2,,
3,,
4,222,0
5,,
6,457407,1513
7,,
8,,
9,,


In [90]:
raw_html = "https://twitter.com/guanenglim"
if raw_html == None:
    numberOfFollowers = None
else:
    regex1 = re.compile(r"(com/)([a-zA-Z]+)")
    match = regex1.search(raw_html)
    #For twitter
    socialBladeURL = "https://socialblade.com/twitter/user/"+match[2]+"/monthly"

    raw_html2 = simple_get(socialBladeURL)
    html = BeautifulSoup(raw_html2, 'html.parser')
    k =html.find_all("script", attrs = {"type":"text/javascript"})

In [91]:
k[10]

<script type="text/javascript">
		g = new Dygraph(

		// containing div
		document.getElementById('TotalTweetsGained'),
		// CSV or path to a CSV file.
			"Date,Total Tweets\n" + "2018-05-12,15907\n" +"2018-05-13,15912\n" +"2018-05-14,15916\n" +"2018-05-15,15916\n" +"2018-05-16,15916\n" +"2018-05-17,15916\n" +"2018-05-18,15919\n" +"2018-05-19,15926\n" +"2018-05-20,15929\n" +"2018-05-21,15930\n" +"2018-05-22,15932\n" +"2018-05-23,15932\n" +"2018-05-24,15932\n" +"2018-05-25,15934\n" +"2018-05-26,15936\n" +"2018-05-27,15937\n" +"2018-05-28,15938\n" +"2018-05-29,15938\n" +"2018-05-30,15938\n" +"2018-05-31,15938\n" +"2018-06-01,15939\n" +"2018-06-02,15941\n" +"2018-06-03,15943\n" +"2018-06-04,15943\n" +"2018-06-05,15943\n" +"2018-06-06,15943\n" +"2018-06-07,15943\n" +"2018-06-08,15946\n" +"2018-06-09,15946\n" +"2018-06-10,15947\n" +"2018-06-11,15948\n" +"2018-06-12,15953\n" +"2018-06-13,15956\n" +"2018-06-14,15958\n" +"2018-06-15,15958\n" +"2018-06-16,15958\n" +"2018-06-17,15958\n" +"2018-0

In [43]:
raw_html = simple_get(socialBladeURL1)
html = BeautifulSoup(raw_html, 'html.parser')

TypeError: object of type 'NoneType' has no len()

In [89]:
k[10]

<script type="text/javascript">
		g = new Dygraph(

		// containing div
		document.getElementById('TotalTweetsGained'),
		// CSV or path to a CSV file.
			"Date,Total Tweets\n" + "2018-07-12,629\n" , {
				title: 'Total Tweets Posted for RinaMohdHarun',
				legend: 'always',
				ylabel: false,
				titleHeight: 20,
				labelsDivStyles: { 'background': 'none', 'margin-top': '-10px', 'text-align': 'right', },
				strokeWidth: 1,
				colors: ["#dd2323", "#dd2323", "#dd2323", "#dd2323"],
				labelsKMB: true,
				maxNumberWidth: 10
			}
		);
		</script>

In [67]:
import re
#First assume that the twitter accounts exist for the users
for i in list(range(0,len(xl.index))):


HANNAH YEOH
2
180908
NIK NAZMI BIN NIK AHMAD
2
81456


In [53]:
raw_html = xl.iloc[1,2]

In [64]:
socialBladeURL

'https://socialblade.com/instagram/user/hannahyeoh/monthly'

In [43]:
raw_html = simple_get(socialBladeURL1)
html = BeautifulSoup(raw_html, 'html.parser')

TypeError: object of type 'NoneType' has no len()

In [40]:
k[6].text
regex2 = re.compile(r"(2018-05-08,)(\d+)")
match1 = regex2.search(k[6].text)
numberOfFollowers = match1[2]

In [41]:
numberOfFollowers

'81456'