## Web scraping using Python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

> Import libraries

In [3]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys
import re
# import fire

from dotenv import load_dotenv
load_dotenv()

True

### Gather Data

In [4]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Fetch `url` with an HTTP GET request and hand back the response body.

    The body (``resp.content``) is returned only when `is_good_response`
    accepts the response; otherwise the result is None. A
    `RequestException` raised while fetching is reported through
    `log_error` and also results in None.
    """
    try:
        with closing(get(url, stream=True)) as response:
            # Anything that fails the check is treated as "no content".
            return response.content if is_good_response(response) else None
    except RequestException as err:
        log_error('Error during requests to {0} : {1}'.format(url, str(err)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its
    ``Content-Type`` header contains ``html`` (case-insensitive).
    The header is read with ``.get`` so that a response without a
    ``Content-Type`` header yields False instead of raising ``KeyError``.
    """
    # Fall back to '' so a missing (or None) header simply fails the check.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error.

    The error is only printed to standard output here; this is the
    single place to change if errors should go somewhere else.
    """
    message = str(e)
    print(message)
    
def get_elements(url, tag='', search=None, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of the elements selected by `tag` and/or `search`.

    Parameters
    ----------
    url : str or markup
        A str is treated as a URL and fetched with `simple_get`; any other
        value is taken to be an already-loaded HTML page and parsed as is.
    tag : str
        CSS selector. The text of every match is split on newlines and
        each non-empty line is added, stripped, to the result.
    search : dict, optional
        May hold a 'find' and/or a 'find_all' key, each mapping to the
        keyword arguments for the BeautifulSoup method of that name.
        'find' runs first and narrows the scope that 'find_all' searches.
        Every non-empty item of the final match is unpacked into the
        result with ``list.extend``.
    fname : optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    list
        Items collected for `tag` first, then those collected for `search`.

    Raises
    ------
    Exception
        If no content could be obtained for `url`.
    """
    # A str is a URL to download; anything else is already-loaded markup.
    response = simple_get(url) if isinstance(url, str) else url

    # Raise an exception if we failed to get any data from the url
    if response is None:
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    res = []

    if tag:
        for element in html.select(tag):
            for line in element.text.split('\n'):
                if len(line) > 0:
                    res.append(line.strip())

    if search:
        scope = html
        matches = ''

        if 'find' in search:
            print('finding', search['find'])
            scope = scope.find(**search['find'])
            matches = scope

        if 'find_all' in search:
            print('findaing all of', search['find_all'])
            # `find` yields None when nothing matched; there is then no
            # element to search within, so the result is simply empty.
            if scope is not None:
                matches = scope.find_all(**search['find_all'])
            else:
                matches = None

        if matches:
            for item in list(matches):
                if len(item) > 0:
                    res.extend(item)

    return res
    
    
# NOTE(review): this guard compares the class name of the object returned by
# get_ipython() with '__main__'; it looks like a `__name__ == '__main__'`
# check was intended — confirm.
# NOTE(review): `fire` is only imported in a commented-out line and no
# `get_tag_elements` is defined in the cells shown here, so the call below
# would raise NameError if the guard ever passed — presumably `get_elements`
# was meant; confirm before enabling.
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

> Scrape data from [africafreak.com](https://africafreak.com/100-most-influential-twitter-users-in-africa)

In [5]:
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa', tag='h2')

In [6]:
non_govt_influencers = res

In [7]:
non_govt_influencers = pd.Series(non_govt_influencers)

In [8]:
# For each scraped heading keep the text after its last '(' and strip any
# leading/trailing ')' from it; a heading without '(' is kept whole.
afriq_users_handle = [i.split('(')[-1].strip(')') for i in non_govt_influencers]
# Keep only the first 100 entries.
afriq_users_handle=afriq_users_handle[:100]

In [9]:
afriq_users_handle = pd.DataFrame(afriq_users_handle, columns=['handles'])

In [10]:
afriq_users_handle

Unnamed: 0,handles
0,@gettleman
1,@a24media
2,@andiMakinana
3,@AfricaCheck
4,@JamesCopnall
...,...
95,@Julius_S_Malema
96,@News24
97,@SAPresident
98,@GarethCliff


In [11]:
 afriq_users_handle.to_csv('scraped_handles/top_100_influencers.csv')

> Scrape data from [atlanticcouncil.org](https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa)

In [12]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = get(url).content
res = get_elements(response, tag='blockquote')
res[:2]

["The Deputy Prime Minister Themba Masuku has today met representatives of the private sector and employees' unions to map a collaborative effort in the fight against #COVID19. pic.twitter.com/EIYNGOEKRN— Eswatini Government (@EswatiniGovern1) March 20, 2020",
 'GUIDELINES FOR SCHOOLS IN #MALAWI ON THE PREVENTION AND MANAGEMENT OF #COVID19 #CORONAVIRUS pic.twitter.com/PL9R4XvGV3— Malawi Government (@MalawiGovt) March 18, 2020']

In [13]:
# Build (name, handle) pairs from the blockquote texts in `res`.
# Judging by the sample output of the previous cell, each text ends with an
# attribution such as "— Eswatini Government (@EswatiniGovern1) March 20, 2020".
afriq_govt = []
afriq_govt_handle = []
for r in res:
    # Take everything after the first '— ', then split at the last '(' into
    # the name part and the "handle) date" part.
    # NOTE(review): a text without '— ' or without '(' raises IndexError.
    split_data = r.split('— ',maxsplit=1)[1].rsplit('(',maxsplit=1)
    # Name: the text before the first comma, without surrounding whitespace.
    name = split_data[0].split(',')[0].strip()
    # Handle: everything before the last ')'.
    handle =  split_data[1].rsplit(')',maxsplit=1)[0]
    user = str(name), str(handle)
    afriq_govt.append(user)
    afriq_govt_handle.append(handle)

In [14]:
res_ = simple_get(url)
res = get_elements(res_, search={'find_all':{'class_':'wp-block-embed__wrapper'}})

findaing all of {'class_': 'wp-block-embed__wrapper'}


In [15]:
# Put the scraped items in a one-column frame and keep those containing
# "twitter.com".
x= pd.DataFrame({'names':res})
# The filtered frame is assigned back to the 'names' column.
# NOTE(review): presumably rows that failed the filter become NaN here so that
# the dropna() below removes them — confirm; `x = x[mask]` may be clearer.
# The lambda parameter `x` shadows the frame `x` only inside the lambda.
x['names'] = x[x['names'].apply(lambda x: "twitter.com" in x)]
x.dropna(inplace=True)
links = x.names.values

In [16]:
# Derive a handle from each link and add it to the lists built earlier.
for link in links:
    # The 4th '/'-separated piece is used as the account name; for
    # "https://twitter.com/<name>/..." that piece is <name>.
    # Assumes every link has that shape — TODO confirm.
    name = link.split('/')[3]
    handle = '@'+name
    user= str(name), str(handle)
    afriq_govt.append(user)
    afriq_govt_handle.append(handle)

In [26]:
afriq_govt_handle[:1]

Unnamed: 0,handles
0,@EswatiniGovern1


In [27]:
afriq_govt_handle = pd.DataFrame(afriq_govt_handle, columns=['handles'])

In [28]:
afriq_govt_handle.to_csv('scraped_handles/africa_govt_covid_resp.csv')

#### Get Data From Twitter

> Importing libraries & preparing API keys

In [29]:
import tweepy
from tweepy import OAuthHandler
from tweepy import API
from tweepy import Cursor
from datetime import datetime, date, time, timedelta
from collections import Counter
import sys
import csv

In [30]:
API_key="API_key"
API_secret_key="API_secret_key"
Access_token="Access_token"
Access_token_secret="Access_token_secret"
print(API_key, API_secret_key, Access_token, Access_token_secret)

API_key API_secret_key Access_token Access_token_secret


In [31]:
# Each variable holds the NAME of an environment variable (assigned in the
# previous cell); replace it with that variable's value.
# os.environ.get returns None for a variable that is not set.
# NOTE(review): re-running this cell alone looks up the value itself as a
# variable name; re-run the previous cell first.
API_key = os.environ.get(API_key)
API_secret_key = os.environ.get(API_secret_key)
Access_token = os.environ.get(Access_token)
Access_token_secret=os.environ.get(Access_token_secret)

In [32]:
# Authenticate with the loaded keys and build the API clients.
auth = OAuthHandler(API_key, API_secret_key)
auth.set_access_token(Access_token, Access_token_secret)
# `api` is created with wait_on_rate_limit=True.
api = tweepy.API(auth, wait_on_rate_limit=True)
# Second client on the same auth, without that option; it is not used in the
# cells shown here.
auth_api = API(auth)

> Testing Api

In [33]:
search_words = "#wildfires"
date_since= "2018-11-16"

In [34]:
# Collect tweets: search for `search_words` in English, passing `date_since`
# as the `since` argument; `.items(2)` limits the cursor to 2 items.
tweets = tweepy.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since).items(2)
# Iterate and print the text of each tweet
for tweet in tweets:
    print(tweet.text)

RT @thetoonguy: On the #Arctic #wildfires from my @Gocomics page: https://t.co/Yo761wIu0x
#arcticheatwave #ArcticSeaIceDay #polarbear #Glob…
RT @EthosLifestyle: ARCTIC WILDFIRES: 5 REASONS WHY “ALARM BELLS SHOULD BE RINGING"

1. #Carbonemissions
2. #Permafrost loss
3. The sufferi…


> influential African Twitter Data

In [27]:
#start_time = timer()
#print(end_time-start_time)

In [28]:
def get_tweets(handles, csvfile=None, cols=None):
    """
    Collect tweet metadata for every handle into one DataFrame.

    For each handle the user timeline is paged through and one row is
    recorded per page, taken from the last status on that page.

    Parameters
    ----------
    handles : iterable
        Account identifiers, each passed to ``api.user_timeline`` as ``id``.
    csvfile : str, optional
        Path of an existing CSV whose rows seed the result. The file is
        only read, never written.
    cols : list, optional
        Column names for the new rows. Defaults to id, created_at,
        favorite_count, retweet_count, screen_name, hashtags and
        user_mentions; a custom list must also have 7 names.

    Returns
    -------
    pandas.DataFrame
        Rows loaded from `csvfile` (if any) followed by the new rows.
        An empty frame with the `cols` columns when nothing was collected.
    """
    if cols is None:
        cols = ['id', 'created_at',
                'favorite_count', 'retweet_count', 'screen_name', 'hashtags',
                'user_mentions']

    df = pd.DataFrame(columns=cols)

    # If the file exists, then read the existing data from the CSV file.
    if csvfile is not None and os.path.exists(csvfile):
        df = pd.read_csv(csvfile, header=0)

    rows = []
    for handle in handles:
        pages = Cursor(api.user_timeline, id=handle, include_rts=False, count=200).pages()
        print(pages)

        for page in pages:
            # An empty page has no last status to read.
            if not page:
                continue

            # NOTE(review): only the LAST status of each page is recorded,
            # not every status on it — confirm this is intended.
            status = page[-1]._json

            hashtags = ", ".join(
                hashtag_item['text'] for hashtag_item in status['entities']['hashtags'])
            mentions = ", ".join(
                mention['screen_name'] for mention in status['entities']['user_mentions'])

            rows.append([status['id'], status['created_at'],
                         status['favorite_count'], status['retweet_count'],
                         status['user']['screen_name'],
                         hashtags,
                         mentions])

    # Rows are added in one go after ALL handles were processed
    # (DataFrame.append no longer exists in pandas 2.x).
    if rows:
        df = pd.concat([df, pd.DataFrame(rows, columns=cols)], ignore_index=True)
    return df

In [50]:
def get_tweets(screen_name):
    """
    Download a user's timeline and save it to ``<screen_name>_tweets.csv``.

    Tweets are requested in pages of up to 200, walking backwards with
    ``max_id`` until a request returns no tweets. The CSV is written
    relative to the current working directory with the columns id,
    created_at and text. A timeline without tweets produces a CSV that
    holds only the header row.

    Parameters
    ----------
    screen_name : str
        Account whose timeline is downloaded; also used in the file name.

    Returns
    -------
    None
    """
    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, include_rts=False, count=200)
    alltweets.extend(new_tweets)

    # keep grabbing tweets until a request comes back empty
    # NOTE(review): the first request passes include_rts=False but the
    # follow-up requests do not — confirm which behaviour is wanted.
    while len(new_tweets) > 0:
        # id of the oldest tweet so far, minus one; computed inside the loop
        # so an empty timeline never indexes into an empty list
        oldest = alltweets[-1].id - 1
        print("getting tweets before %s" % (oldest))

        # all subsequent requests use the max_id arg to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)

        print("...%s tweets downloaded so far" % (len(alltweets)))

    # transform the tweets into a 2D array that will populate the csv;
    # the text stays a str so the file holds the text itself, not a bytes repr
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text] for tweet in alltweets]

    # write the csv; newline='' is what the csv module expects for its files
    with open('%s_tweets.csv' % screen_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text"])
        writer.writerows(outtweets)

In [52]:
# # fetch the unique handles from the top_100 and leaders dataframes
# # convert to list the merge them to be one list.
# top_100 = afriq_users_handle.handles.unique()
# africa_govt_response = afriq_govt_handle.handles.unique()
# l1 = top_100.astype(str).tolist() 
# l2 = africa_govt_response.astype(str).tolist()
# accounts = l1 + l2

# if __name__ == '__main__':
#     #loop through the handles in the list
#     for i,name in enumerate(accounts):
#         get_tweets(name)