In [2]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys
import fire
import json
import re
import string
import matplotlib.pyplot as plt

In [3]:
def simple_get(url):
    
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):

    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res

    
    raise Exception('Error retrieving contents at {}'.format(url))    
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

In [4]:
url = "https://africafreak.com/100-most-influential-twitter-users-in-africa"
response = simple_get(url)
res = get_elements(response, tag='h2')
    

In [5]:
influencers = res[:len(res)-5]
positions = []
names = []
handles = []

influencer_split = [influencer.split("(") for influencer in influencers]
for i in range(len(influencer_split)):
    id_name = influencer_split[i][0].split(".")
    position = id_name[0]
    fullname = id_name[1]
    names.append(fullname)
    positions.append(position)
    handle = influencer_split[i][1]
    handles.append(handle.replace(')',''))
    
positions = positions[::-1]
names = names[::-1]
handles = handles[::-1]

influencers_list = pd.DataFrame({'position':positions, 'name':names, 'handle':handles})
influencers_list.to_csv('100_influencers_africa.csv')



In [6]:
#Assignnment 1: 10 african Influencers from url
top_10_african_influencers = influencers_list[:10]
top_10_african_influencers.to_csv('10_african_influencers.csv', index=False)

In [7]:
top_10_african_influencers

Unnamed: 0,position,name,handle
0,1,Trevor Noah,@Trevornoah
1,2,Gareth Cliff,@GarethCliff
2,3,Jacob G,@SAPresident
3,4,News24,@News24
4,5,Julius Sello Malema,@Julius_S_Malema
5,6,Helen Zille,@helenzille
6,7,mailandguardian,@mailandguardian
7,8,5FM,@5FM
8,9,loyiso gola,@loyisogola
9,10,Computicket,@Computicket


In [9]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = simple_get(url)
res_gov = get_elements(response, tag='blockquote')
res_countries = get_elements(response, tag='strong')


In [10]:
unwanted_countries = {'Botswana','Comoros','Lesotho','Madagascar','Mauritius','Mozambique','Ethiopia','Cameroon','Central African Republic','Congo-Brazzaville','Equatorial Guinea','São Tomé\xa0and Príncipe','Liberia'} 
  
countries = [country for country in res_countries if country not in unwanted_countries] 



In [11]:
handles = []
for res in res_gov:
    handle = re.findall(r"@\w+", res)
    if type(handle) is list:
        hand = handle[-1]
    handles.append(hand)
    
presidents_list = pd.DataFrame({'handles':handles, 'country':countries})
presidents_list.to_csv('twitterhandles_africanpresidents.csv')

ten_presidents_list= presidents_list[:10]
ten_presidents_list.to_csv('10_african_leaders.csv', index=False)


In [12]:
#Assignnment 2: 10 african leaders from url
ten_presidents_list

Unnamed: 0,handles,country
0,@EswatiniGovern1,Eswatini
1,@MalawiGovt,Malawi
2,@hagegeingob,Namibia
3,@FinanceSC,Seychelles
4,@PresidencyZA,South Africa
5,@mohzambia,Zambia
6,@edmnangagwa,Zimbabwe
7,@MinSantedj,Djibouti
8,@hawelti,Eritrea
9,@StateHouseKenya,Kenya
