## Web scrapping using python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

> Import libraries

In [3]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys
import re
# import fire

from dotenv import load_dotenv
load_dotenv()

True

### Gather Data

In [4]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  #.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('finding',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))    
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

> Scrape data from [africafreak.com](https://africafreak.com/100-most-influential-twitter-users-in-africa)

In [7]:
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa', tag='h2')

In [8]:
non_govt_influencers = res

In [9]:
non_govt_influencers = pd.Series(non_govt_influencers)

In [10]:
afriq_users_handle = [i.split('(')[-1].strip(')') for i in non_govt_influencers]
afriq_users_handle=afriq_users_handle[:100]

> correct incorrect tweets

In [44]:
afriq_users_handle[12] = '@beyondsafari'

> pars to DataFrame

In [39]:
df_afriq_users_handle = pd.DataFrame(afriq_users_handle, columns=['handles'])

In [40]:
df_afriq_users_handle

Unnamed: 0,handles
0,@gettleman
1,@a24media
2,@andiMakinana
3,@AfricaCheck
4,@JamesCopnall
...,...
95,@Julius_S_Malema
96,@News24
97,@SAPresident
98,@GarethCliff


In [14]:
df_afriq_users_handle.to_csv('scraped_handles/top_100_influencers.csv')

> Scrape data from [atlanticcouncil.org](https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa)

In [15]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = get(url).content
res = get_elements(response, tag='blockquote')
res[:2]

["The Deputy Prime Minister Themba Masuku has today met representatives of the private sector and employees' unions to map a collaborative effort in the fight against #COVID19. pic.twitter.com/EIYNGOEKRN— Eswatini Government (@EswatiniGovern1) March 20, 2020",
 'GUIDELINES FOR SCHOOLS IN #MALAWI ON THE PREVENTION AND MANAGEMENT OF #COVID19 #CORONAVIRUS pic.twitter.com/PL9R4XvGV3— Malawi Government (@MalawiGovt) March 18, 2020']

In [16]:
afriq_govt = []
afriq_govt_handle = []
for r in res:
    split_data = r.split('— ',maxsplit=1)[1].rsplit('(',maxsplit=1)
    name = split_data[0].split(',')[0].strip()
    handle =  split_data[1].rsplit(')',maxsplit=1)[0]
    user = str(name), str(handle)
    afriq_govt.append(user)
    afriq_govt_handle.append(handle)

In [17]:
res_ = simple_get(url)
res = get_elements(res_, search={'find_all':{'class_':'wp-block-embed__wrapper'}})

findaing all of {'class_': 'wp-block-embed__wrapper'}


In [18]:
x= pd.DataFrame({'names':res})
x['names'] = x[x['names'].apply(lambda x: "twitter.com" in x)]
x.dropna(inplace=True)
links = x.names.values

In [19]:
for link in links:
    name = link.split('/')[3]
    handle = '@'+name
    user= str(name), str(handle)
    afriq_govt.append(user)
    afriq_govt_handle.append(handle)

In [20]:
afriq_govt_handle[:1]

['@EswatiniGovern1']

In [21]:
afriq_govt_handle = pd.DataFrame(afriq_govt_handle, columns=['handles'])

In [22]:
afriq_govt_handle.to_csv('scraped_handles/africa_govt_covid_resp.csv')

#### Get Data From Twitter

> Importing libraies & preparing api-keys

In [23]:
import tweepy
from tweepy import OAuthHandler
from tweepy import API
from tweepy import Cursor
from datetime import datetime, date, time, timedelta
from collections import Counter
import sys
import csv

In [24]:
API_key="API_key"
API_secret_key="API_secret_key"
Access_token="Access_token"
Access_token_secret="Access_token_secret"
print(API_key, API_secret_key, Access_token, Access_token_secret)

API_key API_secret_key Access_token Access_token_secret


In [25]:
API_key = os.environ.get(API_key)
API_secret_key = os.environ.get(API_secret_key)
Access_token = os.environ.get(Access_token)
Access_token_secret=os.environ.get(Access_token_secret)

In [26]:
auth = OAuthHandler(API_key, API_secret_key)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
auth_api = API(auth)

> Testing Api

In [27]:
search_words = "#wildfires"
date_since= "2018-11-16"

In [28]:
# Collect tweets
tweets = tweepy.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since).items(2)
# Iterate and print tweets
for tweet in tweets:
    print(tweet.text)

RT @m_parrington: Mid-July view of #Siberia/#ArcticCircle #wildfires with #CopernicusAtmosphere Monitoring Service GFAS #opendata. Arctic d…
#napa #santarosa #grimesfire #ventura #fresnocounty #riversidecounty #veyowestfire #sonomacounty #benfire… https://t.co/TkKznU3P5l


> influential African Twitter Data

>> Define functions

In [48]:
def get_tweets(handles):
    
    cols = ['id', 'name', 'screen_name', 'description', 
            'statuses_count', 'friends_count', 'followers_count', 
            'account_age_days', 'avg_daily_tweets', 'hashtags',
            'user_mentions','favorite_count', 'retweet_count',]
    
    # dataframe that would be returned at the end
    #df = pd.DataFrame(columns=cols)
    
    value_list = []
    handle_data = []
    off_users = []
            
    if len(handles) > 0:    
        for handle in handles:
            print("Getting data for " + handle)
            try:
                item = auth_api.get_user(handle)
            except tweepy.TweepError as e:
                continue
            value_list+= item.id_str, item.name, item.screen_name,\
            item.description, item.statuses_count, item.friends_count, item.followers_count
            
            #get average daily tweets
            
            no_tweets = item.statuses_count
            account_created_date = item.created_at
            delta = datetime.utcnow() - account_created_date
            account_age_days = delta.days
            value_list.append(str(account_age_days))
            #print(str(account_age_days))
            if account_age_days > 0:
                   value_list.append(int(float(no_tweets)/float(account_age_days)))
                    
                    
                    
            hashtags = []
            mentions = []
            favorite_count =[]
            retweet_count=[]
            tweet_count = 0
            end_date = datetime.utcnow() - timedelta(days=30)
            
#             try:
#                 items =  
#             except:
                

            for status in Cursor(auth_api.user_timeline, id=handle).items():
                tweet_count+= 1
                if hasattr(status, "entities"):
                    entities = status.entities

                # get hashtags
                if "hashtags" in entities:
                    for ent in entities["hashtags"]:
                        if ent is not None:
                            if "text" in ent:
                                hashtag = ent["text"]
                                if hashtag is not None:
                                    hashtags.append(hashtag)
                # get usermentions
                if "user_mentions" in entities:
                    for ent in entities["user_mentions"]:
                        if ent is not None:
                            if "screen_name" in ent:
                                name = ent["screen_name"]
                                if name is not None:
                                    mentions.append(name)

                # get retweets    
                if hasattr(status, "retweet_count"):
                    retweets = status.retweet_count
                    if retweets is not None:
                        retweet_count.append(retweets)
                        
                # favorite count     
                if hasattr(status, "favorite_count"):
                    likes = status.favorite_count 
                    if likes is not None:
                        favorite_count.append(likes)
                if status.created_at < end_date:
                    break
                    
            
            value_list.append(len(hashtags))
            value_list.append(len(mentions))
            value_list.append(sum(favorite_count))
            value_list.append(sum(retweet_count))
            #handle_data.extend(value_list)
            #print(value_list)
            #break
    df= pd.DataFrame([value_list], columns=cols)
    return df

In [49]:
# if len(account_list) > 0:
#   for target in account_list:
#     print("Getting data for " + target)
#     item = auth_api.get_user(target)
#     print("name: " + item.name)
#     print("screen_name: " + item.screen_name)
#     print("description: " + item.description)
#     print("statuses_count: " + str(item.statuses_count))
#     print("friends_count: " + str(item.friends_count))
#     print("followers_count: " + str(item.followers_count))
    
    
# tweets = item.statuses_count
# account_created_date = item.created_at
# delta = datetime.utcnow() - account_created_date
# account_age_days = delta.days
# print("Account age (in days): " + str(account_age_days))
# if account_age_days > 0:
# print("Average tweets per day: " + "%.2f"%(float(tweets)/float(account_age_days)))
    
# hashtags = []
# mentions = []
# favorite_count =[]
# retweet_count=[]
# tweet_count = 0
# end_date = datetime.utcnow() - timedelta(days=30)

#     for status in Cursor(auth_api.user_timeline, id=target).items():
#         tweet_count += 1
#         if hasattr(status, "entities"):
#         entities = status.entities
        
#         # get hashtags
#         if "hashtags" in entities:
#             for ent in entities["hashtags"]:
#             if ent is not None:
#                 if "text" in ent:
#                     hashtag = ent["text"]
#                 if hashtag is not None:
#                     hashtags.append(hashtag)
#         value_list+=len(hashtags)
#         # get usermentions
#         if "user_mentions" in entities:
#             for ent in entities["user_mentions"]:
#                 if ent is not None:
#                     if "screen_name" in ent:
#                         name = ent["screen_name"]
#                         if name is not None:
#                             mentions.append(name)
#         value_list+=len(mentions)
                                              
#         # get retweets    
#         if hasattr(status, "retweet_count"):
#             retweets = status.retweet_count
#             if retweets is not None:
#                 retweet_count.append(retweets)
#         value_list+=sum(retweet_count)

#         # favorite count     
#         if hasattr(status, "favorite_count"):
#             likes = status.favorite_count 
#             if likes is not None:
#                 favorite_count.append(likes)
#         value_list+=sum(retweet_count)
#         if status.created_at < end_date:
#         break

In [60]:
test =afriq_users_handle[:5]
test

['@gettleman', '@a24media', '@andiMakinana', '@AfricaCheck', '@JamesCopnall']

In [61]:
test_=get_tweets(test)

Getting data for @gettleman
Getting data for @a24media
Getting data for @andiMakinana
Getting data for @AfricaCheck
Getting data for @JamesCopnall


ValueError: 13 columns passed, passed data had 65 columns

In [63]:
test_

NameError: name 'test_' is not defined

In [54]:
x.names.values

array(['\nhttps://twitter.com/TsholetsaDomi/status/1238324860536922112\n',
       '\nhttps://twitter.com/Azali_officiel/status/1239649350747332613\n',
       '\nhttps://twitter.com/SE_Rajoelina/status/1241101811647500288\n',
       '\nhttps://twitter.com/PKJugnauth/status/1240740484714319872\n',
       '\nhttps://twitter.com/AbiyAhmedAli/status/1240291553056260099\n',
       '\nhttps://twitter.com/PR_Paul_BIYA/status/1239988020763398147\n',
       '\nhttps://twitter.com/MinistereComCG/status/1239695479476293632\n'],
      dtype=object)