##### import libraries

In [41]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys
import re

### A

##### functions for search with with bs4

In [158]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content#.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('finding',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))    
    
    
# if get_ipython().__class__.__name__ == '__main__':
#     fire(get_tag_elements)

#### Web Scrape https://www.socialbakers.com/statistics/twitter/profiles/nigeria/ for influential handles

In [86]:
#PATH = os.path.join("C:\\","Users","frevert","Documents","py")
def table_to_df(table):
    print(table)
    return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.findAll('tr')])

def next_page(soup):
	return "http:" + soup.find('a', attrs={'rel':'next'}).get('href')
res = pd.DataFrame()
url = "https://www.socialbakers.com/statistics/twitter/profiles/nigeria"

counter = 0
while True:
	print(counter)
	page = get(url)#, print(page)
	soup = BeautifulSoup(page.content, 'lxml')#,print(soup) 
	table = soup.find(name='table', attrs={'class':'brand-table-list'}), print(type(table))
	#res = res.append(table_to_df(table))
	#res.to_csv('project_files/datasets/scraped_hanldes', index=None, sep=';', encoding='iso-8859-1')
	url = next_page(soup)
	counter += 1

`NB:` The website serves only 10 names per page... hence to get the top 100, we have to loop 10 times 

In [182]:
def get_pages (func= get_elements, country = 'nigeria', pages = 2):
    influencers = []
    url = f'https://www.socialbakers.com/statistics/twitter/profiles/{country}'
    while pages:
        print(url)
        names = func(url,search={'find_all':{'class_':'brand-table-list'}}, tag='h2')#[4:14]
        influencers.append(names)
        page = get(url)
        soup = BeautifulSoup(page.content, 'lxml')
        url = "https://www.socialbakers.com" + soup.find('a', attrs={'rel':'next'}).get('href')
        pages-=1
    return influencers

In [None]:
x = get_pages(country='nigeria', pages=3)

https://www.socialbakers.com/statistics/twitter/profiles/nigeria


In [181]:
x

[['Largest Audience',
  'Fastest-Growing Profiles in Nigeria',
  'Nigeria Social Marketing Reports',
  'Twitter Profiles Stats in Nigeria',
  'Wizkid (@wizkidayo)',
  'ITS DONJAZZY AGAIN😉 (@DONJAZZY)',
  'Tiwa Savage ⚔️ (@TiwaSavage)',
  'Channels Television (@channelstv)',
  'Burna Boy (@burnaboy)',
  'Punch Newspapers (@MobilePunch)',
  'Muhammadu Buhari (@MBuhari)',
  "D'banj D Kokomaster (@iamdbanj)",
  'Banky Wellington (@BankyW)',
  'Atiku Abubakar (@atiku)',
  'Glossary & Metrics Overview',
  'Current Twitter statistics',
  'What is a Twitter Account?',
  'What is Twitter Monitoring?',
  'What is the Process of Twitter Monitoring?',
  'Twitter Profiles Monitoring',
  'Solutions',
  'Solutions',
  'Platform',
  'Platform',
  'Free Tools',
  'Free Tools',
  'News & Insight',
  'News & Insight',
  'Statistics',
  'Statistics',
  'Company',
  'Company',
  '\n',
  <tr>
  <td class="item-count-td brand-table-first-nr">
  <div class="item item-count">
  					1
  				</div>
  </td>
  <t

In [154]:
urls = 'https://www.socialbakers.com/statistics/twitter/profiles/nigeria/page-6-10'

In [165]:
'@Wazobia_FM' in get(urls).text

False

In [161]:
get_elements(urls, search={'find_all':{'class_':'brand-table-list'}}, tag='h2')

findaing all of {'class_': 'brand-table-list'}


['Largest Audience',
 'Fastest-Growing Profiles in Nigeria',
 'Nigeria Social Marketing Reports',
 'Twitter Profiles Stats in Nigeria',
 'Wizkid (@wizkidayo)',
 'ITS DONJAZZY AGAIN😉 (@DONJAZZY)',
 'Tiwa Savage ⚔️ (@TiwaSavage)',
 'Channels Television (@channelstv)',
 'Burna Boy (@burnaboy)',
 'Punch Newspapers (@MobilePunch)',
 'Muhammadu Buhari (@MBuhari)',
 "D'banj D Kokomaster (@iamdbanj)",
 'Banky Wellington (@BankyW)',
 'Atiku Abubakar (@atiku)',
 'Glossary & Metrics Overview',
 'Current Twitter statistics',
 'What is a Twitter Account?',
 'What is Twitter Monitoring?',
 'What is the Process of Twitter Monitoring?',
 'Twitter Profiles Monitoring',
 'Solutions',
 'Solutions',
 'Platform',
 'Platform',
 'Free Tools',
 'Free Tools',
 'News & Insight',
 'News & Insight',
 'Statistics',
 'Statistics',
 'Company',
 'Company',
 '\n',
 <tr>
 <td class="item-count-td brand-table-first-nr">
 <div class="item item-count">
 					1
 				</div>
 </td>
 <td class="name">
 <div class="item">
 <a 

In [160]:
# get_elements(urls, tag='h2')
x

['Largest Audience',
 'Fastest-Growing Profiles in Nigeria',
 'Nigeria Social Marketing Reports',
 'Twitter Profiles Stats in Nigeria',
 'Wizkid (@wizkidayo)',
 'ITS DONJAZZY AGAIN😉 (@DONJAZZY)',
 'Tiwa Savage ⚔️ (@TiwaSavage)',
 'Channels Television (@channelstv)',
 'Burna Boy (@burnaboy)',
 'Punch Newspapers (@MobilePunch)',
 'Muhammadu Buhari (@MBuhari)',
 "D'banj D Kokomaster (@iamdbanj)",
 'Banky Wellington (@BankyW)',
 'Atiku Abubakar (@atiku)',
 'Glossary & Metrics Overview',
 'Current Twitter statistics',
 'What is a Twitter Account?',
 'What is Twitter Monitoring?',
 'What is the Process of Twitter Monitoring?',
 'Twitter Profiles Monitoring',
 'Solutions',
 'Solutions',
 'Platform',
 'Platform',
 'Free Tools',
 'Free Tools',
 'News & Insight',
 'News & Insight',
 'Statistics',
 'Statistics',
 'Company',
 'Company',
 '\n',
 <tr>
 <td class="item-count-td brand-table-first-nr">
 <div class="item item-count">
 					1
 				</div>
 </td>
 <td class="name">
 <div class="item">
 <a 

### B

##### function for search with  block for scrapy

In [3]:
import logging
import re
import scrapy

In [4]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from googlesearch import search

In [5]:
def get_urls(tags, n, language):
    urls = []
    for tag in tags:
        print('searching google... for '+tag)
        tag_url = [url for url in 
               search(tag+' twitter', stop=n, lang=language, country='Nigeria')][:n]
        urls.extend(tag_url)
    print('done searching', '\ncollecting tweets only')
    for idx, i in enumerate(urls):
        if 'hashtag' in i:
            urls.pop(idx)
    print('done') 
    return urls

##### Gather Tweet Links (using hashtags & popular figures search) across the following doamins

- a. Economy

- b. Social values (sport, education, human rights, etc.)

- c. Cultural (entertainment, fashion, art, etc)

- d. Public health

#### _______________________________________________________________________________________________________________________________

- Economy hashtags/keyowrds

     #CBN

In [85]:
economic_tags = ['#CBN', '#nigerianbanks', 'money', 'capital']

In [86]:
economic = get_urls( economic_tags, 5, 'en')
economic

searching google... for #CBN
searching google... for #nigerianbanks
searching google... for money
searching google... for capital
done searching 
collecting tweets only
done


['https://twitter.com/cenbank?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor',
 'https://twitter.com/cenbank/status/1285256112938287104?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/cenbank/status/1285241320139829248?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/cenbank/status/1285237002124890112?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/cenbank/status/1177266807276744720?lang=en',
 'https://twitter.com/nigerianbank',
 'https://twitter.com/nairametrics/status/1179716119126511616',
 'https://twitter.com/TrafficChiefNG/status/1091090035175153664',
 'https://twitter.com/cenbank/status/1220391725388259329',
 'https://optinmonster.com/make-money-on-twitter/',
 'https://twitter.com/money?lang=en',
 'https://www.lifehack.org/articles/money/7-creative-and-effective-ways-make-money-twitter.html',
 'https://www.lifehack.org/articles/money/7-creative-and-effective-ways-make-money-twitter.ht

- Social values (sport, education, human rights, etc.) hashtags

        #FAAN, 

In [40]:
social_tags = ['#FAAN', '#AuditMoneyTrail', ' #NDDCProbe']

In [41]:
social = get_urls(social_tags, 5, 'en')
social

['https://twitter.com/FAAN_Official?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor',
 'https://twitter.com/FAAN_Official/status/1285903613198569472?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/FAAN_Official/status/1285539167108321280?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/FAAN_Official/status/1285129614252220418?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/faannig?lang=en',
 'https://twitter.com/hashtag/auditmoneytrail',
 'https://twitter.com/Connected_dev/status/1267505124181762049',
 'https://twitter.com/Magarya/statuses/1276042280240119808',
 'https://twitter.com/UzoHans',
 'https://twitter.com/Magarya/statuses/1285532536945483776',
 'https://twitter.com/hashtag/NDDCProbe?src=hashtag_click',
 'https://twitter.com/TosinOlugbenga/status/1285242363884052482',
 'https://twitter.com/ayemojubar?lang=en',
 'https://twitter.com/ARISEtv/status/1285186570790875138',
 'https://twitt

- Cultural (entertainment, fashion, art, etc) hashtags

 #bbnajia2020, #MercyEkeBrands

In [42]:
cultural_tags = ['#LayconVerified', '#bbnajia2020', '#MercyEkeBrands']

In [43]:
culture = get_urls( cultural_tags, 5, 'en')
culture

['https://twitter.com/Itslaycon/status/1285845805698551809?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/Al_ameen_Yabo/status/1285886012527382529?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/adeyefa_peter/status/1285898200046612482?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/search?q=%23LayconVerified+twitter&ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Esearch',
 'https://twitter.com/hashtag/LayconVerified?src=hash',
 'https://twitter.com/Official_mohnyc/status/1285970534837166080?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/Al_ameen_Yabo/status/1285886012527382529?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/MRtim__1911/status/1285890988372897792?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Etweet',
 'https://twitter.com/search?q=%23bbnajia2020+twitter&ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Esearch',
 'https:/

- Public health hashtags

In [52]:
publicHealth_tags = ['#covid19 nigeria ', '#corona nigeria ', '#coronavirus nigeria ', '#healthcare nigeria']

In [53]:
publicHealth = get_urls(publicHealth_tags, 5, 'en')
publicHealth

['https://twitter.com/ncdcgov?lang=en',
 'https://twitter.com/digicommsng?lang=en',
 'https://twitter.com/hashtag/covid19nigeria?lang=en',
 'https://twitter.com/epidalert?lang=en',
 'https://twitter.com/ncdcgov/media',
 'https://twitter.com/ncdcgov?lang=en',
 'https://twitter.com/ukinnigeria?lang=en',
 'https://twitter.com/ncdcgov/media',
 'https://twitter.com/ncdcgov/status/1245788324415049729?lang=en',
 'https://twitter.com/whonigeria?lang=en',
 'https://twitter.com/ncdcgov?lang=en',
 'https://twitter.com/hashtag/coronavirusnigeria?lang=en',
 'https://twitter.com/ncdcgov/media',
 'https://twitter.com/chikwe_i?lang=en',
 'https://twitter.com/hashtag/coronavirusinnigeria?lang=en',
 'https://twitter.com/nighealthwatch?lang=en',
 'https://twitter.com/fmohnigeria',
 'https://twitter.com/nmanigeria?lang=en',
 'https://twitter.com/dreoehanire?lang=en',
 'https://twitter.com/lsmoh?lang=en']

In [67]:
# for idx, i in enumerate(publicHealth):
#     print(idx, i)