In [1]:
# Run this for followers
import json
import os
import time
from datetime import datetime
from getpass import getpass

import numpy as np
import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Reference: https://allofyourbases.com/2018/01/16/mining-twitter-with-selenium/

In [2]:
# Run these function definitions
def scrape_followers(driver, timeout=10):
    """Scroll the current user-list page to its end and return the page HTML.

    The driver must already be showing the page to scrape; this function does
    not navigate. It repeatedly scrolls the last ``div[data-user-id]`` element
    into view until no additional matching elements appear within ``timeout``.

    Args:
        driver: Selenium WebDriver positioned on the page to scrape.
        timeout: Seconds to wait for the first matching element and, after
            each scroll, for more elements to appear. Defaults to 10.

    Returns:
        The full page source once scrolling stops yielding new elements, or
        None if a TimeoutException escapes the scrolling logic (in practice:
        no ``div[data-user-id]`` element became visible in time).
    """
    user_entry = (By.CSS_SELECTOR, "div[data-user-id]")
    wait = WebDriverWait(driver, timeout)

    try:
        # Block until at least one entry is rendered; times out on empty pages.
        wait.until(EC.visibility_of_element_located(user_entry))

        while True:
            # find_elements(by, value) replaces the deprecated
            # find_elements_by_css_selector helper.
            entries = driver.find_elements(*user_entry)

            # Entries can vanish between the wait above and this lookup;
            # stop instead of raising IndexError on entries[-1].
            if not entries:
                break

            # Scrolling the last entry into view triggers loading of more.
            driver.execute_script(
                "arguments[0].scrollIntoView();", entries[-1])

            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    user_entry, len(entries)))
            except TimeoutException:
                # Nothing new appeared within the timeout: end of the list.
                break

        page_source = driver.page_source

    except TimeoutException:
        page_source = None

    return page_source


def extract_followers(page_source):
    """Parse user entries out of a followers/following page.

    Every ``<a class="fullname">`` element in ``page_source`` is turned into
    a dict with:
        'screen_name': the anchor's href without its first character
        'full_name':   the anchor's text, stripped of surrounding whitespace

    Args:
        page_source: HTML text handed to BeautifulSoup with the 'lxml' parser.

    Returns:
        List of the dicts described above, in document order, with the first
        match left out (presumably the profile owner's own name link; verify
        against the page markup).
    """
    name_links = bs(page_source, 'lxml').find_all("a", class_='fullname')

    profiles = [
        {
            'screen_name': link['href'][1:],
            'full_name': link.get_text().strip()
        }
        for link in name_links
    ]

    # The first match is always discarded.
    return profiles[1:]


def init_driver(chromedriver_path=None):
    """Start a Chrome WebDriver with image loading disabled.

    Args:
        chromedriver_path: Path to the chromedriver executable. When omitted,
            the CHROMEDRIVER_PATH environment variable is used; if that is
            unset too, the location originally hard-coded in this notebook
            is used so existing setups keep working.

    Returns:
        The WebDriver, with an extra ``wait`` attribute holding a
        ``WebDriverWait`` with a 5 second timeout.
    """
    if chromedriver_path is None:
        chromedriver_path = os.environ.get(
            'CHROMEDRIVER_PATH',
            '/Users/fernandojavier/Desktop/MSDS/Portfolio/Protest Spread in Twitter/chromedriver')

    # do not load images
    chrome_options = webdriver.ChromeOptions()
    prefs = {'profile.managed_default_content_settings.images': 2}
    chrome_options.add_experimental_option("prefs", prefs)

    # `options=` is the supported keyword; `chrome_options=` is deprecated.
    driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

    # default explicit wait for this browser [5 seconds here]
    driver.wait = WebDriverWait(driver, 5)

    return driver


def close_driver(driver):
    """Shut down a WebDriver session.

    Uses ``driver.quit()`` rather than ``driver.close()``: ``close()`` only
    closes the current window, while ``quit()`` ends the whole session, so
    repeated init/close cycles do not leave driver processes behind.

    Args:
        driver: The WebDriver to shut down. ``None`` is accepted and ignored
            so cleanup code can call this unconditionally.

    Returns:
        None.
    """
    if driver is None:
        return

    driver.quit()

    return


def login_twitter(driver, username, password):
    """Open the Twitter login page and submit the given credentials.

    Args:
        driver: Selenium WebDriver used to load and fill in the page.
        username: Text typed into the element with class 'js-username-field'.
        password: Text typed into the element with class 'js-password-field'.

    Returns:
        None. The function does not check whether the login succeeded.
    """
    driver.get("https://twitter.com/login")

    # Both inputs are located up front, username first.
    user_box = driver.find_element_by_class_name("js-username-field")
    pass_box = driver.find_element_by_class_name("js-password-field")

    for box, text in ((user_box, username), (pass_box, password)):
        box.send_keys(text)
        # Note: implicitly_wait configures the driver's implicit-wait timeout
        # (1 second); it is not a pause between keystrokes.
        driver.implicitly_wait(1)

    # NOTE(review): "EdgeButtom" looks misspelled but presumably mirrors the
    # class name in the page markup - confirm before "correcting" it.
    submit_button = driver.find_element_by_class_name("EdgeButtom--medium")
    submit_button.click()


class wait_for_more_than_n_elements_to_be_present(object):
    """Wait condition that is true once more than ``count`` elements match.

    Instances are callables meant to be passed to ``WebDriverWait.until``.

    Args:
        locator: ``(by, value)`` tuple, unpacked into ``driver.find_elements``.
        count: The condition holds when strictly more than this many
            elements are found.
    """

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        """Return True if more than ``self.count`` elements currently match.

        Returns False when the lookup raises StaleElementReferenceException,
        so the surrounding wait simply polls again.
        """
        try:
            # Public API call instead of the private
            # expected_conditions._find_elements helper.
            elements = driver.find_elements(*self.locator)
        except StaleElementReferenceException:
            return False
        return len(elements) > self.count

def search_twitter(driver, query):
    """Submit a search query and scroll through all results.

    Args:
        driver: Selenium WebDriver showing a page with a search box named
            "q". It must carry a ``wait`` attribute (a WebDriverWait), as
            set by ``init_driver``.
        query: Search string typed into the search box.

    Returns:
        The full page source after scrolling stops producing new
        ``li[data-item-id]`` elements, or None if a TimeoutException escapes
        the scrolling logic (in practice: no result became visible within
        5 seconds).
    """
    result_item = (By.CSS_SELECTOR, "li[data-item-id]")

    # wait until the search box has loaded:
    box = driver.wait.until(EC.presence_of_element_located((By.NAME, "q")))

    # Reuse the element we just waited for instead of looking it up again.
    box.clear()

    # enter the search string and submit it (like hitting return):
    box.send_keys(query)
    box.submit()

    # wait used for the first result and after every scroll
    wait = WebDriverWait(driver, 5)

    try:
        # Block until the first result is rendered; times out if none.
        wait.until(EC.visibility_of_element_located(result_item))

        while True:
            # find_elements(by, value) replaces the deprecated
            # find_elements_by_css_selector helper.
            tweets = driver.find_elements(*result_item)

            # Guard against the list emptying between wait and lookup,
            # which would otherwise raise IndexError on tweets[-1].
            if not tweets:
                break

            # Scrolling the last result into view triggers loading of more.
            driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])

            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    result_item, len(tweets)))
            except TimeoutException:
                # Nothing new appeared within the timeout: end of results.
                break

        page_source = driver.page_source

    except TimeoutException:
        page_source = None

    return page_source


def _new_tweet_record(tweet_id, parent_tweet_id=None, reply=0, retweet=0):
    """Return an output record with every field set to its default value."""
    return {
        'tweet_id': tweet_id,
        'parent_tweet_id': parent_tweet_id,
        'text': None,
        'user_id': None,
        'user_screen_name': None,
        'user_name': None,
        'retweets': 0,
        'likes': 0,
        'replies': 0,
        'timestamp': 0,
        'date': '',
        'reply': reply,
        'retweet': retweet
    }


def _set_timestamp(record, node):
    """Fill 'timestamp' and 'date' from node's js-short-timestamp span, if any."""
    date_span = node.find("span", class_="js-short-timestamp")
    if date_span is not None:
        record['timestamp'] = date_span['data-time']
        record['date'] = datetime.utcfromtimestamp(
            int(record['timestamp'])
        ).strftime('%Y-%m-%d %H:%M:%S')


def _parse_search_result(li):
    """Build the record for one search-result ``<li>`` (a top-level tweet)."""
    tweet = _new_tweet_record(li['data-item-id'])

    # Tweet Text
    text_p = li.find("p", class_="tweet-text")
    if text_p is not None:
        tweet['text'] = text_p.get_text()

    # Tweet User ID, User Screen Name, User Name
    user_details_div = li.find("div", class_="tweet")
    if user_details_div is not None:
        tweet['user_id'] = user_details_div['data-user-id']
        tweet['user_screen_name'] = user_details_div['data-screen-name']
        tweet['user_name'] = user_details_div['data-name']

    # Retweet / like / reply counters are read from a data attribute.
    for field, action in (('retweets', 'retweet'),
                          ('likes', 'favorite'),
                          ('replies', 'reply')):
        spans = li.select(
            "span.ProfileTweet-action--" + action +
            " > span.ProfileTweet-actionCount")
        if spans:
            tweet[field] = int(spans[0]['data-tweet-stat-count'])

    _set_timestamp(tweet, li)
    return tweet


def _parse_reply(div, parent_tweet_id):
    """Build the record for one reply ``<div>`` under an opened tweet."""
    reply = _new_tweet_record(
        div['data-item-id'], parent_tweet_id=parent_tweet_id, reply=1)
    reply['user_id'] = div['data-user-id']
    reply['user_screen_name'] = div['data-screen-name']
    reply['user_name'] = div['data-name']

    # Tweet Text
    text_p_reply = div.find("p", class_="tweet-text")
    if text_p_reply is not None:
        reply['text'] = text_p_reply.get_text()

    # Counters here come from the displayed text, which may be empty.
    # NOTE(review): int() raises ValueError on non-integer display text
    # (e.g. abbreviated counts, if the page ever shows them) - confirm.
    for field, action in (('retweets', 'retweet'),
                          ('likes', 'favorite'),
                          ('replies', 'reply')):
        spans = div.select(
            "div.ProfileTweet-action--" + action +
            " span.ProfileTweet-actionCountForPresentation")
        if spans and spans[0].get_text().strip() != '':
            reply[field] = int(spans[0].get_text())

    _set_timestamp(reply, div)
    return reply


def _parse_retweeter(li2, parent_tweet_id):
    """Build the record for one user entry in the retweeters popup."""
    retweet = _new_tweet_record(
        None, parent_tweet_id=parent_tweet_id, retweet=1)

    # The 'text' field holds the user's bio paragraph for these records.
    text_p_retweet = li2.find("p", class_="bio")
    if text_p_retweet is not None:
        retweet['text'] = text_p_retweet.get_text()

    # User ID, User Screen Name, User Name
    user_details_div_retweet = li2.find("div", class_="account")
    if user_details_div_retweet is not None:
        retweet['user_id'] = user_details_div_retweet['data-user-id']
        retweet['user_screen_name'] = user_details_div_retweet['data-screen-name']
        retweet['user_name'] = user_details_div_retweet['data-name']

    return retweet


def _load_replies_html(driver, timeout=2):
    """Scroll through the replies currently shown and return the page HTML.

    Returns None when a TimeoutException escapes the scrolling logic (in
    practice: no ``div[data-has-parent-tweet]`` element became visible
    within ``timeout`` seconds).
    """
    reply_locator = (By.CSS_SELECTOR, "div[data-has-parent-tweet]")
    wait = WebDriverWait(driver, timeout)

    try:
        wait.until(EC.visibility_of_element_located(reply_locator))

        while True:
            replies = driver.find_elements(*reply_locator)
            if not replies:
                break

            try:
                driver.execute_script(
                    "arguments[0].scrollIntoView();", replies[-1])
            except Exception:
                # Best effort: any scrolling failure ends the reply loading.
                break

            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    reply_locator, len(replies)))
            except TimeoutException:
                # No further replies appeared within the timeout.
                break

        return driver.page_source

    except TimeoutException:
        return None


def extract_tweets(page_source, driver):
    """Extract tweets, their replies and their retweeters.

    For every ``li.js-stream-item`` with a ``data-item-id`` in
    ``page_source`` this:
      1. records the tweet itself,
      2. clicks the tweet in the live browser and records each
         ``div.permalink-descendant-tweet`` as a reply,
      3. clicks ``.request-retweeted-popup`` and records each element with
         ``data-item-type="user"`` as a retweet.

    If no reply becomes visible within the timeout, the reply step is skipped
    for that tweet (previously ``None`` was handed to the HTML parser, which
    aborted the whole extraction).

    Args:
        page_source: HTML of the search results page.
        driver: Selenium WebDriver still showing that page; it is used to
            click into each tweet.

    Returns:
        List of dicts sharing one schema (tweet_id, parent_tweet_id, text,
        user_id, user_screen_name, user_name, retweets, likes, replies,
        timestamp, date, reply, retweet). ``reply`` / ``retweet`` are 1 for
        reply / retweet records respectively and 0 otherwise.
    """
    soup = bs(page_source, 'lxml')

    tweets = []
    for li in soup.find_all("li", class_='js-stream-item'):

        # Items without a tweet id are not tweets.
        if 'data-item-id' not in li.attrs:
            continue

        parent_tweet_id = li['data-item-id']
        tweets.append(_parse_search_result(li))

        # Open the tweet. The element id is passed as a script argument
        # rather than spliced into the JavaScript source.
        driver.execute_script(
            "document.getElementById(arguments[0])"
            ".querySelector('.tweet').click()",
            li['id'])

        # --- replies ---
        page_source_replies = _load_replies_html(driver)
        if page_source_replies is not None:
            soup_replies = bs(page_source_replies, 'lxml')
            for div in soup_replies.find_all(
                    "div", class_='permalink-descendant-tweet'):
                if 'data-item-id' not in div.attrs:
                    continue
                tweets.append(_parse_reply(div, parent_tweet_id))

        # --- retweeters ---
        try:
            driver.execute_script(
                "document.querySelector('.request-retweeted-popup').click()")
        except Exception:
            # No retweeters popup could be opened: move on to the next tweet.
            continue

        # Fixed pause before reading the page, presumably to let the popup
        # render - TODO confirm / replace with an explicit wait.
        time.sleep(3)

        soup_retweets = bs(driver.page_source, 'lxml')

        # find_all already filters on data-item-type, so no further
        # attribute check is needed per element.
        for li2 in soup_retweets.find_all(attrs={"data-item-type": "user"}):
            tweets.append(_parse_retweeter(li2, parent_tweet_id))

    return tweets


def deduplicate_list_of_dicts(dct):
    """Return the distinct 'user_screen_name' values, in first-seen order.

    Despite the name, the result is a list of screen names, not of dicts:
    a user who appears in several records is listed once.

    Args:
        dct: Iterable of dicts, each having a 'user_screen_name' key
            (values must be hashable, e.g. str or None).

    Returns:
        List of the unique 'user_screen_name' values in order of first
        appearance.

    Raises:
        KeyError: if a record has no 'user_screen_name' key.
    """
    # A set gives O(1) membership checks; the list preserves ordering.
    seen = set()
    user_list = []
    for user in dct:
        screen_name = user['user_screen_name']
        if screen_name not in seen:
            seen.add(screen_name)
            user_list.append(screen_name)
    return user_list

In [None]:
# # Do not run this if you only want the user_following
# # This will scrape the keyword, replace it with millionpeoplemarch, JanetNapoles or PDAFScam
# # Replace email and password with your own
# driver = init_driver()

# login_twitter(driver, '<email>', '<password>')

# hmtl = search_twitter(driver, '<keyword>')

# tweets_mpm = extract_tweets(hmtl, driver)

# close_driver(driver)

In [65]:
# Load the saved tweets, then reduce them to the distinct screen names.
with open('data/new_tweets_mpm.json', 'r') as infile:
    tweets_mpm_from_file = json.load(infile)

# The file is no longer needed once parsed, so this runs after it is closed.
user_list_mpm = deduplicate_list_of_dicts(tweets_mpm_from_file)

In [None]:
# # Run this for FOLLOWERS not for FOLLOWING
# # Run this only once ever as this only builds the tracker file
# # If you run this again you will lose your progress tracker
# with open('data/tweets_mpm_list.json', 'w') as outfile:
#     json.dump(user_list_mpm, outfile)

In [None]:
# # Run this to get FOLLOWERS so don't run if you want FOLLOWING users
# with open('user_followers.json', 'r') as infile:
#     user_followers = json.load(infile)
# print(len(user_followers))
    
# with open('tweets_mpm_list.json', 'r') as infile:
#     track_list = json.load(infile)
# print(len(track_list))

# for user in track_list:
#     while True:
#         try:
#             driver = init_driver()
#             login_twitter(driver, '<email>', '<password>')
#             break
#         except Exception:
#             close_driver(driver)
#             pass
#     # open the web page in the browser:
#     driver.get("https://twitter.com/" +
#                user + '/followers')
#     followers_html = scrape_followers(driver)
#     followers = extract_followers(followers_html)
#     user_followers[user] = followers
#     track_list = track_list[1:]
#     with open('tweets_mpm_list.json', 'w') as outfile:
#         json.dump(track_list, outfile)
#     with open('user_followers.json', 'w') as outfile:
#         json.dump(user_followers, outfile)
#     close_driver(driver)

In [61]:
# Load the saved following map, creating an empty one if none exists yet.
try:
    with open('data/user_following.json', 'r') as infile:
        user_following = json.load(infile)
except FileNotFoundError:
    # Only a missing file means "no progress yet". Any other failure (for
    # example a corrupt file) is left to surface, so existing progress is
    # never silently overwritten with an empty map.
    user_following = {}
    with open('data/user_following.json', 'w') as outfile:
        json.dump(user_following, outfile)

In [72]:
# Run this to scrape who each user is FOLLOWING (not their FOLLOWERS).
# Needs `user_list_mpm` from the cell that loads data/new_tweets_mpm.json.

# RUN TRACKER
with open('data/user_following.json', 'r') as infile:
    following_mpm_from_file = json.load(infile)
    user_following_unique = list(set(following_mpm_from_file.keys()))

# Set lookup keeps the filter linear. Empty/None screen names are skipped
# because they cannot be turned into a profile URL below.
already_scraped = set(user_following_unique)
to_scrape = [user for user in user_list_mpm
             if user and user not in already_scraped]

with open('data/tweets_mpm_list_following.json', 'w') as outfile:
    json.dump(to_scrape, outfile)


# SCRAPER

# Working copy of the saved results; extended and re-saved after each user.
user_following = dict(following_mpm_from_file)
print(len(set(user_following)))

track_list = list(to_scrape)
print(len(track_list))

# Credentials are read from the environment (or prompted for) instead of
# being stored in the notebook.
# NOTE: the account passwords previously hard-coded in this cell should be
# treated as exposed and changed.
twitter_username = (os.environ.get('TWITTER_USERNAME')
                    or input('Twitter username or email: '))
twitter_password = (os.environ.get('TWITTER_PASSWORD')
                    or getpass('Twitter password: '))

driver = init_driver()
try:
    login_twitter(driver, twitter_username, twitter_password)

    for position, user in enumerate(to_scrape, start=1):
        # open the web page in the browser:
        driver.get("https://twitter.com/" +
                   user + '/following')

        # `following` is reset for every user, so a failed scrape can never
        # store the previous user's list under this user's name.
        following = None
        try:
            following_html = scrape_followers(driver)
            if following_html is not None:
                following = extract_followers(following_html)
        except Exception as error:
            print('skipping', user, '-', repr(error))

        if following is not None:
            user_following[user] = following
            with open('data/user_following.json', 'w') as outfile:
                json.dump(user_following, outfile)

        # Users still to be visited. Skipped users are not saved, so the
        # tracker rebuilt at the top of this cell picks them up next run.
        track_list = to_scrape[position:]
        with open('data/tweets_mpm_list_following.json', 'w') as outfile:
            json.dump(track_list, outfile)
finally:
    # Also runs on KeyboardInterrupt, so the browser is not left open.
    close_driver(driver)

5455
1008


KeyboardInterrupt: 

In [None]:
# 6463 - all users