## Web scraping using Python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [36]:
# $ python3 -m venv venv
# $ . ./venv/bin/activate

> Import libraries

In [287]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys
import re
# import fire

from dotenv import load_dotenv
load_dotenv()

# twitter api
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

In [288]:
# These strings are the *names* of environment variables, not secret values,
# so echoing them below does not leak anything.
API_key, API_secret_key = "API_key", "API_secret_key"
Access_token, Access_token_secret = "Access_token", "Access_token_secret"
print(API_key, API_secret_key, Access_token, Access_token_secret)

API_key API_secret_key Access_token Access_token_secret


In [289]:
# Read the credentials from the process environment. Each value is None when
# the corresponding variable is not set.
#
# The variable names are written as literals instead of being taken from the
# variables being overwritten: that keeps the cell idempotent, because a second
# run no longer uses the previously fetched value (or None) as the lookup key.
API_key = os.environ.get("API_key")
API_secret_key = os.environ.get("API_secret_key")
Access_token = os.environ.get("Access_token")
Access_token_secret = os.environ.get("Access_token_secret")

### Gather Data

In [62]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Issue an HTTP GET for `url` and hand back the response body.

    The body (`resp.content`) is returned only when `is_good_response`
    accepts the response; otherwise the result is None. A
    `RequestException` raised while fetching is reported through
    `log_error` and also results in None.
    """
    try:
        # `closing` makes sure the streamed response is released on exit.
        with closing(get(url, stream=True)) as resp:
            return resp.content if is_good_response(resp) else None
    except RequestException as err:
        log_error('Error during requests to {0} : {1}'.format(url, str(err)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (case-insensitive). A response that carries no
    Content-Type header at all is reported as False instead of raising
    KeyError.
    """
    # .get() keeps a missing header from blowing up before the None check.
    content_type = resp.headers.get('Content-Type')
    if resp.status_code != 200 or content_type is None:
        return False
    return 'html' in content_type.lower()


def log_error(e):
    """
    Report an error message.

    Currently the message is simply printed to standard output; swap the
    body for real logging if something more durable is needed.
    """
    print(e)
    
def get_elements(url, tag='', search=None, fname=None):
    """
    Parse a page and collect text lines and/or elements from it.

    Parameters
    ----------
    url : str or already-downloaded markup
        A `str` is treated as an address and fetched with `simple_get`;
        any other value is passed to BeautifulSoup unchanged.
    tag : str, optional
        CSS selector given to `select`. The text of every match is split
        on newlines; each piece is stripped and blank pieces are dropped.
    search : dict, optional
        May contain the keys 'find' and/or 'find_all', each mapping to a
        dict of keyword arguments for the BeautifulSoup method of the same
        name. 'find' runs first and narrows the scope 'find_all' works on.
        The children of each resulting element are added to the output.
    fname : optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list
        Items gathered via `tag` followed by items gathered via `search`.

    Raises
    ------
    Exception
        If no page content could be obtained.
    """
    if isinstance(url, str):
        response = simple_get(url)
    else:
        # Caller already holds the page content.
        response = url

    # Raise an exception if we failed to get any data from the url
    if response is None:
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    res = []

    if tag:
        for li in html.select(tag):
            for name in li.text.split('\n'):
                # Strip before testing so whitespace-only lines do not end
                # up in the result as empty strings.
                name = name.strip()
                if name:
                    res.append(name)

    if search:
        soup = html
        r = None

        if 'find' in search:
            print('finding', search['find'])
            soup = soup.find(**search['find'])
            r = soup

        # find() yields None when nothing matches; there is then nothing to
        # run find_all() on, so the search simply contributes no items.
        if 'find_all' in search and soup is not None:
            print('findaing all of', search['find_all'])
            r = soup.find_all(**search['find_all'])

        if r:
            for x in list(r):
                if len(x) > 0:
                    res.extend(x)

    return res
    
    
# Command-line entry point for when this cell is exported as a script (see the
# `%%writefile` line at the top of the cell). Inside IPython/Jupyter
# `__name__` is also '__main__', so the CLI is additionally skipped whenever
# `get_ipython` exists.
try:
    get_ipython
except NameError:
    _inside_ipython = False
else:
    _inside_ipython = True

if __name__ == '__main__' and not _inside_ipython:
    # Imported here because only the CLI needs it; the notebook-level import
    # is commented out.
    import fire
    fire.Fire(get_elements)

> Scrape data from [africafreak.com](https://africafreak.com/100-most-influential-twitter-users-in-africa)

In [63]:
# Collect the text of every <h2> heading on the page and preview the first five.
non_govt_influencers = res = get_elements(
    'https://africafreak.com/100-most-influential-twitter-users-in-africa',
    tag='h2',
)
non_govt_influencers[:5]

['100. Jeffrey Gettleman (@gettleman)',
 '99. Africa24 Media (@a24media)',
 '98. Scapegoat (@andiMakinana)',
 '97. Africa Check (@AfricaCheck)',
 '96. James Copnall (@JamesCopnall)']

> Scrape data from [atlanticcouncil.org](https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa)

In [237]:
# Download the article once, then pull the text of every <blockquote> out of it.
url = 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = get(url).content
res = get_elements(url=response, tag='blockquote')
res[:2]

["The Deputy Prime Minister Themba Masuku has today met representatives of the private sector and employees' unions to map a collaborative effort in the fight against #COVID19. pic.twitter.com/EIYNGOEKRN— Eswatini Government (@EswatiniGovern1) March 20, 2020",
 'GUIDELINES FOR SCHOOLS IN #MALAWI ON THE PREVENTION AND MANAGEMENT OF #COVID19 #CORONAVIRUS pic.twitter.com/PL9R4XvGV3— Malawi Government (@MalawiGovt) March 18, 2020']

In [242]:
# Each quote ends with an attribution of the form "— <name> (<handle>) <date>".
afriq_govt, afriq_govt_handle = [], []
for r in res:
    # Keep what follows the dash, then cut at the last "(" to separate the
    # display name from "<handle>) <date>".
    split_data = r.split('— ', maxsplit=1)[1].rsplit('(', maxsplit=1)
    name = split_data[0].split(',')[0].strip()
    handle = split_data[1].rsplit(')', maxsplit=1)[0]
    user = (str(name), str(handle))
    afriq_govt.append(user)
    afriq_govt_handle.append(handle)

In [244]:
# Fetch the same page again and gather the children of every element whose
# class is 'wp-block-embed__wrapper'.
res_ = simple_get(url)
res = get_elements(
    res_,
    search={'find_all': {'class_': 'wp-block-embed__wrapper'}},
)

findaing all of {'class_': 'wp-block-embed__wrapper'}


In [245]:
# Keep only the entries that mention twitter.com; `links` is the surviving
# 'names' column as an array.
x = pd.DataFrame({'names': res})
x = x[x['names'].apply(lambda entry: "twitter.com" in entry)]
links = x.names.values

In [246]:
# The account name is the first path segment of each link, i.e. index 3
# after splitting on '/'.
for link in links:
    name = link.split('/')[3]
    handle = '@{}'.format(name)
    user = (str(name), str(handle))
    afriq_govt.append(user)
    afriq_govt_handle.append(handle)

In [248]:
# Show every handle collected so far (blockquote attributions plus embedded links).
afriq_govt_handle

['@EswatiniGovern1',
 '@MalawiGovt',
 '@hagegeingob',
 '@FinanceSC',
 '@PresidencyZA',
 '@mohzambia',
 '@edmnangagwa',
 '@MinSantedj',
 '@hawelti',
 '@StateHouseKenya',
 '@PaulKagame',
 '@M_Farmaajo',
 '@SouthSudanGov',
 '@SudanPMHamdok',
 '@TZSpokesperson',
 '@KagutaMuseveni',
 '@angola_Mirex',
 '@willynyamitwe',
 '@Cherif_MZ',
 '@Presidence_RDC',
 '@PresidentABO',
 '@PresidenceBenin',
 '@rochkaborepf',
 '@PresidenciaCV',
 '@AOuattara_PRCI',
 '@Presidency_GMB',
 '@NAkufoAddo',
 '@President_GN',
 '@USEmbalo',
 '@PresidenceMali',
 '@CheikhGhazouani',
 '@IssoufouMhm',
 '@MBuhari',
 '@Macky_Sall',
 '@PresidentBio',
 '@MSPS_Togo',
 '@TsholetsaDomi',
 '@Azali_officiel',
 '@SE_Rajoelina',
 '@PKJugnauth',
 '@AbiyAhmedAli',
 '@PR_Paul_BIYA',
 '@MinistereComCG']

> Scrape data from [enitiate.solutions](https://enitiate.solutions/top-18-african-heads-of-states-on-twitter/)

In [578]:
# Download the raw page body; parsing happens in the next cell.
url = 'https://enitiate.solutions/top-18-african-heads-of-states-on-twitter/'
response = get(url=url).content

In [609]:
# Parse the downloaded page with the lxml parser.
soup = BeautifulSoup(response, features='lxml')

In [622]:
# rows = soup.find_all('div')
# rows

In [588]:
# soup.prettify

#### Get Data From Twitter