### Web Scrape for influential Twitter handles & tweets of popular #hashtags

sites:
- [socialbakers](https://www.socialbakers.com/statistics/twitter/profiles)

- [google](https://www.google.com)

##### import libraries

In [1]:
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from googlesearch import search
import pandas as pd
import os, sys
from time import sleep

A.

Get the handles of a country's top influencers, from top to bottom

> Using selenium

In [4]:
def get_pages(country = 'nigeria',
              driver_path = r'/home/patrick/geckodriver', 
              name_= 'New Lover.', work_email='love.wins@dummy.com',
              phone = 12345, job_title = 'Lover',
              company = 'Love', pages=70):
    """
    Scrape the Twitter handles of a country's top profiles from socialbakers.com
    and save them, de-duplicated, to ``handles.csv`` in the current directory.

    The website serves only 10 names per page, hence ``pages * 10`` is roughly
    the number of handles to be gotten, but this is no guarantee that you will
    get exactly ``pages * 10`` handles.

    This script assumes the following to work:

    1. You have a vpn on if your ip is not supported by socialbakers.com

    2. You are strictly looking to filter the top influencers by your
       specified country. The tag and class name used are for this purpose;
       you can however modify them to implement your own version.

    3. You have selenium installed and a geckodriver binary at ``driver_path``.
       If you are using chrome, change Firefox to Chrome.

    Parameters
    ----------
    country : str
        Country slug appended to the socialbakers profile-statistics URL.
    driver_path : str
        Path to the WebDriver executable on your machine.
    name_, work_email, phone, job_title, company
        Values typed into the form that appears after the first
        "show more" click.
    pages : int
        Number of "show more" iterations to perform after the form is sent.

    Returns
    -------
    The return value of ``pandas.Series.to_csv`` (``None`` when a path is
    given); the useful result is the ``handles.csv`` file.
    """

    base_url =f'https://www.socialbakers.com/statistics/twitter/profiles/{country}/'
    row_class = 'acc-placeholder-img'
    form_id = 'frm-showShowMoreMarketoForm-mktoForm-mktoForm'
    count = 0
    names = []

    driver = webdriver.Firefox(executable_path=driver_path)

    # utility functions
    def collect_handles(rows):
        """Append every '@handle' found as the last word of a row's text."""
        for row in rows:
            text = row.text
            if text is None:
                continue
            name = text.split(' ')[-1]
            if '@' in name:
                names.append(name.lstrip('(').rstrip(')'))

    def next_page():
        """Click the "show more" link; scroll to the bottom if it is covered."""
        try:
            next_btn_class = driver.find_elements_by_class_name('more-center-link')[0]
            next_btn = next_btn_class.find_elements_by_tag_name('a')
            return next_btn[0].click()
        except ElementClickInterceptedException:
            # The click is not retried here; scrolling exposes the link so
            # that the next call can reach it.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # The try/finally guarantees the browser is closed even when the scrape
    # fails part-way; previously every call left a browser process behind.
    try:
        driver.get(base_url)
        sleep(15)  # give the page time to load

        # first page
        table = driver.find_elements_by_class_name(row_class)
        index = len(table)
        print(f"@ page {count+1}")
        collect_handles(table)

        next_page()
        sleep(15)

        # The first "show more" click brings up a form: fill it and submit.
        form = driver.find_element_by_id(form_id)
        payload = (
            ('FullName', name_),
            ('Email', work_email),
            ('Phone', phone),
            ('Job_Title__c', job_title),
            ('Company', company),
        )
        for suffix, value in payload:
            field = form.find_element_by_id(f'{form_id}-{suffix}')
            field.send_keys(value)
        # `field` is now the Company input; submitting it submits the form.
        field.submit()

        # return to main page and parse the rows past the first page
        table = driver.find_elements_by_class_name(row_class)
        count += 1
        print(f"@ page {count+1}")
        collect_handles(table[index:])

        for _ in range(pages):
            table = driver.find_elements_by_class_name(row_class)
            count += 1
            if count <= pages:
                print(f"@ page - {count+1}")
                print('table length @',len(table))
                print('index @',index)
                # Only rows added since the previous iteration are new.
                collect_handles(table[index:])
            index = len(table)
            next_page()
            sleep(15)
    finally:
        driver.quit()

    path = os.getcwd()
    return pd.Series(names).drop_duplicates().to_csv(os.path.join(path, "handles.csv"))

In [5]:
# Run the scraper with its default arguments (country='nigeria', pages=70).
get_pages()

@ page 1
@ page 2
@ page - 3
table length @ 28
index @ 28
@ page - 4
table length @ 38
index @ 28
@ page - 5
table length @ 48
index @ 38
@ page - 6
table length @ 58
index @ 48
@ page - 7
table length @ 68
index @ 58
@ page - 8
table length @ 78
index @ 68
@ page - 9
table length @ 88
index @ 78
@ page - 10
table length @ 98
index @ 88
@ page - 11
table length @ 108
index @ 98
@ page - 12
table length @ 118
index @ 108
@ page - 13
table length @ 128
index @ 118
@ page - 14
table length @ 138
index @ 128
@ page - 15
table length @ 148
index @ 138
@ page - 16
table length @ 158
index @ 148
@ page - 17
table length @ 168
index @ 158
@ page - 18
table length @ 178
index @ 168
@ page - 19
table length @ 188
index @ 178
@ page - 20
table length @ 198
index @ 188
@ page - 21
table length @ 208
index @ 198
@ page - 22
table length @ 218
index @ 208
@ page - 23
table length @ 228
index @ 218
@ page - 24
table length @ 238
index @ 228
@ page - 25
table length @ 248
index @ 238
@ page - 26
table

B.

Find popular tweets for certain hashtags

> Using googlesearch

In [None]:
def get_urls(tags, num, language):
    
    """
    Search Google for tweets about each tag and save the URLs to ``tweets.csv``
    in the current working directory.

    tags is a list of tags to search tweets for
    num is the number of tweets to search for per tag
    language is the language of the tweets to search for

    URLs containing 'hashtag' are dropped so that only the remaining
    (tweet) links are kept.

    Returns the return value of ``pandas.Series.to_csv`` (``None`` when a
    path is given).
    """
    tweets = []
    for tag in tags:
        print('searching google... for '+tag)
        tag_url = [url for url in 
               search(tag+' twitter', stop=num, lang=language, country='Nigeria')][:num]
        tweets.extend(tag_url)
    print('done searching', '\ncollecting tweets only')
    # Build a new list instead of popping while iterating, which skips the
    # element following every removal.
    tweets = [url for url in tweets if 'hashtag' not in url]
    print('done') 
    path = os.getcwd()
    return pd.Series(tweets).to_csv(os.path.join(path, "tweets.csv"))

In [157]:
# Scratch cell: build a Series that contains duplicate values.
# NOTE(review): the output recorded below (index 0, 1, 2, 3, 5) looks like the
# result of `.drop_duplicates()` rather than of this bare constructor — confirm.
pd.Series([1,2,3,4,4,5,5])

0    1
1    2
2    3
3    4
5    5
dtype: int64