In [9]:
###############
### Imports ###
###############

## Might need to install modules below
#!pip install selenium
#!pip install textblob
#!pip install dataframe_image
#!pip install wordcloud #or in anaconda prompt type: conda install -c https://conda.anaconda.org/conda-forge wordcloud

import os    
import re
import csv
import time
import getpass
import pandas as pd
import datetime as dt
import dataframe_image as dfi
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from time import sleep
from textblob import TextBlob
from selenium import webdriver
from wordcloud import WordCloud, STOPWORDS
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

In [10]:
#################
### FUNCTIONS ###
#################

###########################
# Scrape data from tweets #
###########################

def scrape_tweet(tweet, term):
    
    '''Extract the data from one tweet related to the searched term.

    Parameters:
        tweet: element exposing find_element(By.XPATH, ...); the caller passes
               the <article> elements found on the results page.
        term:  the searched term; a hashtag equal to it (case-insensitive) is
               left out of the associated hashtags.

    Returns:
        (username, handle, postdate, text, associated_hashtags, replies,
        retweets, likes), or None when the tweet has no text element (such a
        tweet can neither be de-duplicated nor analysed by the caller).
        handle and postdate are None when their element is missing. Each count
        is the text shown next to the button, or the int 0 when it is empty.

    Raises:
        NoSuchElementException: if the username span or one of the
        reply/retweet/like elements cannot be found.
    '''

    def optional(xpath, read):
        #value read from the first element matching xpath, or None if there is no match
        try:
            return read(tweet.find_element(By.XPATH, xpath))
        except NoSuchElementException:
            return None

    def count(testid):
        #text of the counter identified by its data-testid; Twitter shows nothing for zero
        shown = tweet.find_element(By.XPATH, './/div[@data-testid="' + testid + '"]').text
        return 0 if shown == '' else shown

    #find username
    username = tweet.find_element(By.XPATH, './/span').text

    #find twitter handle
    handle = optional('.//span[contains(text(), "@")]', lambda element: element.text)

    #find datetime of tweet
    postdate = optional('.//time', lambda element: element.get_attribute('datetime'))

    #get the post's text
    text = optional('.//div[2]/div[2]/div[2]', lambda element: element.text)
    if text is None:
        #previously this crashed on text.split(); the caller skips falsy results
        return None

    #get associated #'s (every word containing '#', except the searched term itself)
    associated_hashtags = [word for word in text.split()
                           if '#' in word and word.upper() != term.upper()]

    #replies, retweets and likes counts
    replies = count('reply')
    retweets = count('retweet')
    likes = count('like')

    #making tuple of the extracted tweet data
    data = (username, handle, postdate, text, associated_hashtags, replies, retweets, likes)
    return data

#######################################################
# Ask user to start scraping when prompted with 'yes' #
#######################################################

def initiate_scraping(term):
    '''Block until the user types "yes" to start scraping.

    The answer is compared case-insensitively and ignoring surrounding
    whitespace. "no" re-prompts with a waiting message; anything else
    re-prompts with an error message.

    Parameters:
        term: the search term, shown in the first prompt.
    '''
    #the space after "on" was missing, which glued the term to the sentence
    prompt = 'Ready to start scraping on ' + str(term) + '? Please enter "yes" or "no": '

    while True:
        answer = input(prompt).strip().upper()
        if answer == 'YES':
            break
        if answer == 'NO':
            prompt = 'No worries, take your time. Please enter "yes" when you are ready: '
        else:
            prompt = 'Not an available option. Please enter "yes" or "no": '
    print("OK, it's scraping time!")

#####################################################################
# Confirm the number of data is equal to max_data (1000 by default) #
#####################################################################

def max_data_reached(data):
    '''Print whether the number of scraped tweets matches the requested amount.

    Reads the module-level variables max_data (requested number of tweets)
    and term (search term); both must be defined before this is called.

    Parameters:
        data: the collection of scraped tweets; only its length is used.
    '''
    scraped = len(data)
    if scraped == max_data:
        #report the configured amount instead of a hard-coded 1000
        print('Latest ' + str(max_data) + ' tweets on ' + str(term)
              + ' has been successfully scraped!\n\nMoving on to the data analysis...')
    #the program can still carry on if the number of scraped data is more or less than max_data, it would simply be unexpected
    else:
        print('There were actually ' + str(scraped) + ' tweets that got scraped instead of ' + str(max_data) + 
              '.\nYou may want to restart the bot if that is not what you wanted. Moving on to the analysis...')

#################################
# Save the data into a csv file #
#################################

def make_csv(data):
    '''Write the scraped tweets to "scraped_tweets.csv" in the working directory.

    The file is (over)written as UTF-8 with a header row followed by one row
    per item of data, and a confirmation message is printed.

    Parameters:
        data: iterable of rows, each holding the eight values named in the header.
    '''
    column_names = ['Username', 'Handle', 'Timestamp', 'Text', 'Associated Hashtags', 'Replies', 'Retweets', 'Likes']
    with open('scraped_tweets.csv', mode = 'w', newline = '', encoding = 'utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        #header first, then every scraped tweet
        for row in [column_names, *data]:
            csv_writer.writerow(row)
        print('Data are now stored in an csv file named "scraped_tweets.csv".')

########################
# Analysis of the data #
########################

##Sentiment analysis of the tweets' text using Textblob
def sentiment_analysis(tweet):
    '''Add TextBlob sentiment columns to a dataframe of tweets.

    The dataframe is modified in place (callers rely on this) and also
    returned. Three columns are added:
        TextBlob_Subjectivity - subjectivity score of the 'Text' column
        TextBlob_Polarity     - polarity score of the 'Text' column
        TextBlob_Analysis     - 'Negative' (polarity < 0), 'Neutral' (== 0)
                                or 'Positive' (> 0)

    Parameters:
        tweet: dataframe with a 'Text' column.

    Returns:
        The same dataframe object.
    '''
    #an empty tweet text is read back from the csv as NaN, which TextBlob rejects;
    #score such rows as an empty string without touching the 'Text' column itself
    texts = tweet['Text'].fillna('').astype(str)

    #parse every text once and read both scores from the same result
    sentiments = [TextBlob(text).sentiment for text in texts]

    #Create two new columns ‘Subjectivity’ & ‘Polarity’
    tweet['TextBlob_Subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]
    tweet['TextBlob_Polarity'] = [sentiment.polarity for sentiment in sentiments]

    #Create a column with string values of the sentiment analysis
    def get_Analysis(score):
        if score < 0:
            return 'Negative'
        elif score == 0:
            return 'Neutral'
        else:
            return 'Positive'
    tweet['TextBlob_Analysis'] = tweet['TextBlob_Polarity'].apply(get_Analysis)

    return tweet

##Generate word clouds (will be applied to text and associated hashtags columns of df)
def make_WordCloud(column):
    '''Draw, save and show a word cloud built from every value of a column.

    Each value is converted to a string, split on whitespace and lower-cased.
    When the column is named 'Text' the cloud has a white background and is
    saved as 'wordcloud_text.png'; otherwise it has a black background and is
    saved as 'wordcloud_hashtags.png'.

    Parameters:
        column: the values to draw, e.g. a dataframe column.
    '''
    stopwords = set(STOPWORDS)

    #lower-cased tokens of every value, each value followed by one space
    comment_words = ''.join(
        ' '.join(token.lower() for token in str(value).split()) + ' '
        for value in column
    )

    #everything that differs between the two clouds depends on the column name
    if pd.DataFrame(column).columns[0] == 'Text':
        background = 'white'
        title = "Word Cloud of the Tweets' text section"
        filename = 'wordcloud_text.png'
    else:
        background = 'black'
        title = "Word Cloud of Tweets' Associated Hashtags(#)"
        filename = 'wordcloud_hashtags.png'

    wordcloud = WordCloud(width = 800, height = 800,
                background_color = background,
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)

    # plot the word cloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title(title)

    #save each word cloud as a png to tweet out
    plt.savefig(filename, format = 'png')

    plt.show()

##Function to make pie chart of the sentiment analysis
def make_pieChart(df, term):
    '''Draw, save and show a pie chart of the sentiment classes.

    The chart is saved as 'piechart_sentiment.png' in the working directory.

    Parameters:
        df:   dataframe with a 'TextBlob_Analysis' column holding 'Positive',
              'Neutral' or 'Negative'.
        term: the search term, shown in the title.
    '''
    #retrieve number of positive, neutral and negative tweets
    analysis = df['TextBlob_Analysis']
    pos = int((analysis == 'Positive').sum())
    neu = int((analysis == 'Neutral').sum())
    neg = int((analysis == 'Negative').sum())
    sentiments = [pos, neu, neg]
    labels = 'Positive (' + str(pos) + ')', 'Neutral (' + str(neu) + ')', 'Negative (' + str(neg) + ')'

    #setting up the pie chart plot
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.pie(sentiments, labels = labels, colors = ['cyan','yellow','violet'], autopct = '%1.1f%%',
            shadow = False, startangle = 90)
    ax.axis('equal')  #equal aspect ratio ensures that pie is drawn as a circle.
    #the title states how many tweets were actually analysed rather than a fixed 1000
    ax.set_title('Pie Chart of the Sentiment Analysis on the latest ' + str(len(df)) + ' tweets for ' + str(term))
    ax.legend(title = 'Sentiment:', loc = 'best')

    #saving pie chart to be tweeted out
    fig.savefig('piechart_sentiment.png', format = 'png')

    plt.show()

##Generate a table showing the total count of replies, retweets and likes grouped by sentiment (named as the variable RRL_table)
def make_RRL_table(df):
    '''Build, save and print the total replies, retweets and likes per sentiment.

    The 'Replies', 'Retweets' and 'Likes' columns of df are converted to
    integers in place (see _parse_count), then summed per value of
    'TextBlob_Analysis'. The table is exported as 'RRL_table.png' with a
    background gradient and printed.

    Parameters:
        df: dataframe with 'Replies', 'Retweets', 'Likes' and
            'TextBlob_Analysis' columns.

    Raises:
        ValueError: if a count cannot be understood as a number.
    '''
    count_columns = ['Replies', 'Retweets', 'Likes']

    #every counter can be shown abbreviated (e.g. "1.2K"), not only the likes
    for name in count_columns:
        df[name] = df[name].map(_parse_count).astype(int)

    #making the RRL_table; the columns are selected by name so that the result does
    #not depend on which other columns happen to be summable
    RRL_table = df.groupby(by = 'TextBlob_Analysis')[count_columns].sum()
    RRL_table = RRL_table.rename(columns = {'Replies':'Total Replies',
                                            'Retweets':'Total Retweets', 'Likes':'Total Likes'})
    RRL_table.index.name = None
    #store the table as an image to be tweeted out
    RRL_table_styled = RRL_table.style.background_gradient() #add gradient base on the value of the cells
    #saving table as png
    dfi.export(RRL_table_styled,'RRL_table.png')
    #viewing the table
    print(RRL_table)


def _parse_count(value):
    '''Convert one scraped counter ("", "12", "1,234", "1.2K", "3M", 7, NaN) to an int.'''
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}
    if isinstance(value, str):
        cleaned = value.strip().replace(',', '').upper()
        if cleaned == '':
            return 0
        factor = multipliers.get(cleaned[-1])
        if factor is None:
            return int(round(float(cleaned)))
        #round instead of truncating: 2.3 * 1000 is 2299.9999999999995 in floating point
        return int(round(float(cleaned[:-1]) * factor))
    if pd.isna(value):
        return 0
    return int(value)

####################
# Show the results #
####################

def show_results():
    '''Load the scraped tweets and produce every analysis output.

    Reads 'scraped_tweets.csv' from the working directory, adds the sentiment
    columns, then generates the two word clouds, the pie chart and the
    replies/retweets/likes table. Reads the module-level variable term for
    the pie chart title.
    '''

    ##Making the scraped_tweets.csv file a dataframe
    #same relative path that make_csv writes to; the previous '../TwitterBot/' prefix
    #only worked when the notebook was run from a folder named TwitterBot
    df = pd.read_csv('scraped_tweets.csv')

    ##Using sentiment analysis on df (adds the columns in place)
    sentiment_analysis(df)

    ##Generate word clouds 
    make_WordCloud(df.Text)
    make_WordCloud(df['Associated Hashtags'])

    ##Generate pie chart
    make_pieChart(df, term)

    ##Generate RRL_table
    make_RRL_table(df)

#########################################################
# Tweet out the results obtained from the data analysis #
#########################################################

def send_tweet_or_not():
    '''Ask whether to tweet out the analysis results and act on the answer.

    "yes": composes a tweet with a fixed message, attaches the four images
    generated by the analysis, adds a summary of the sentiment scores as the
    description of the pie chart, sends the tweet, then waits for the user to
    type "close" before quitting the web driver.
    "no": quits the web driver.
    Anything else: asks again.

    Reads the module-level variables driver, term and handle_or_phone.
    '''
    yes = 'yes'
    no = 'no'
    
    ##Showing the average value of the subjectivity & polarity analysis results obtained from the sentiment analysis with a 
    #brief description on how to interpret the results; attached as a description to the pie chart 
    def small_desc():
        #same relative path that make_csv writes to
        df = pd.read_csv('scraped_tweets.csv')
        sentiment_analysis(df)

        sub_mean = df.TextBlob_Subjectivity.mean()
        pol_mean = df.TextBlob_Polarity.mean()

        print('\n')
        sub_pol_mean = ('Additional information on the sentiment analysis: the average value for the subjectivity analysis is ' 
                       + str(sub_mean) 
                       + ' and for the polarity analysis it is ' 
                       + str(pol_mean) 
                       + '. Subjectivity is a float between [0,1]; where 1 means that the statement is more of a personal opinion '
                         'and where 0 means that it is more objective or factual. Polarity is a float between [-1,1]; where a float '
                         'closer to -1 means a negative statement, around 0 is considered neutral, and 1 is more positive.')
        return sub_pol_mean

    #prompting user to type close to close the web driver if a tweet was sent out
    def close_webDriver():
        close = 'close'
        prompt = 'Please type "close" in order to close the web driver when you are finished: '

        #loop instead of recursing so that repeated wrong answers cannot exhaust the stack
        confirm_close = input(prompt)
        while confirm_close.upper() != close.upper():
            print('Please type "close" in order to close the web driver.')
            confirm_close = input(prompt)

        #quit() ends the whole driver session; close() only closes the current window
        driver.quit()
        print('Web driver is now closing. Take care and see you again for another round of scraping!')

    prompt = 'The data are now stored away. If you want to tweet out your results, please enter "yes" to send a tweet or "no" to exit: '
    tweet_or_not = input(prompt)
    while tweet_or_not.upper() not in (yes.upper(), no.upper()):
        print('Not an available option. Please enter either "yes" or "no".')
        tweet_or_not = input(prompt)

    #when user does not want to tweet out results; web driver closes and exits program
    if tweet_or_not.upper() == no.upper():
        print('Thank you for using this Twitter Bot Scraper. Web driver is now closing. Take care and see you again for another round of scraping!')
        driver.quit()
        return

    #when user wants to tweet out the results
    additional_info = small_desc()

    ##Link to compose a tweet; should not sign you out of the Twitter account
    driver.get('http://www.twitter.com/compose/tweet')
    sleep(3)

    ##Sending out the following automated message
    tweet_text = driver.find_element(By.XPATH, '//div[@aria-label="Tweet text"][1]')
    message = ("A descriptive statistics on the latest 1000 tweets on " + str(term) + ". 2 word clouds (tweets' text & associated "
               "hashtags; white & black background), a pie chart of a sentiment analysis done on the tweets' text and a table with "
               "total number of replies, retweets and likes.")
    tweet_text.send_keys(message)

    ##Get directory path of all the images generated by the bot
    images = ['wordcloud_text.png', 'wordcloud_hashtags.png', 'piechart_sentiment.png', 'RRL_table.png']
    dir_path = [os.path.abspath(png_file) for png_file in images]

    ##Upload the images generated by the bot
    tweet_img = driver.find_element(By.XPATH, '//div[@dir="auto"]')
    wait = WebDriverWait(tweet_img, 5)
    input_xpath = '//input[@type="file"]'

    for image_path in dir_path:
        #the file input is looked up again for every image
        input_element = wait.until(EC.presence_of_element_located((By.XPATH, input_xpath)))
        input_element.send_keys(image_path)

    ##Add summary to the description of the pie chart of the sentiment analysis
    #find the 'Add description' and click it
    driver.find_element(By.XPATH, '//span[contains(text(),"Add descriptions")]').click()
    sleep(1)
    #moving to the pie chart image (third image, hence two clicks on "Next image")
    driver.find_element(By.XPATH, '//div[@aria-label="Next image"]//div[@dir="auto"]').click()
    sleep(1)
    driver.find_element(By.XPATH, '//div[@aria-label="Next image"]//div[@dir="auto"]').click()
    sleep(1)
    #add summary to pie chart description as an additional information
    description = driver.find_element(By.XPATH, '//textarea[@name="altTextInput"]')
    description.send_keys(additional_info)
    sleep(2)
    #find the 'Save' button and click it
    driver.find_element(By.XPATH, '(//div[@role="button"])[4]').click()
    sleep(1)

    ##Tweet out everything
    driver.find_element(By.XPATH, '(//div[@role="button"])[19]').click()
    sleep(3)
    print('A tweet about the latest 1000 tweets on ' + str(term) + ' has been made and sent out to be viewed by everyone using the '
           'Twitter account @' + handle_or_phone + '. Thank you for using this Twitter Bot Scraper.')

    ##Ask user to close the web driver if they are done
    close_webDriver()

In [None]:
###################################
### TWITTER BOT SCRAPER PROGRAM ###
###################################

###################
# Welcome message #
###################

print('Hello there and welcome to the Twitter Bot Scraper program! In order to ensure that the bot works smoothly, please make'
      ' sure that the information provided are all correct. If you think that you made a mistake while providing certain '
      'information, you can type "hard reset" at any time to restart everything at any point. Thank you and happy scraping!\n')

###############################################
# Prompt for user's Twitter login information #
###############################################

#initialize variables that will be the id used to login into the Twitter account
login_id = handle_or_phone = term = password = ''
yes = 'yes'
no = 'no'

def _is_hard_reset(answer):
    '''Tell whether an answer asks to start over ("hard reset" or "hard_reset", any case).'''
    return answer.strip().lower().replace('_', ' ') == 'hard reset'

def _ask_password():
    '''Prompt for the password using getpass() to not openly reveal it.'''
    print('Please enter your password: ')
    return getpass.getpass()

#prompt user to confirm if the information provided are all correct 
def confirm():
    '''Ask until the user answers "yes" or "no" (a hard reset counts as "no").

    Returns:
        True when the information is confirmed, False when it must be entered again.
    '''
    while True:
        confirmation = input('Are you sure that all the information provided is correct? Please enter "yes" or "no": ')
        if confirmation.upper() == yes.upper():
            print('Twitter Bot is ready to scrape!')
            return True
        if confirmation.upper() == no.upper() or _is_hard_reset(confirmation):
            print('No worries, you can now start entering your information again.\n ')
            return False
        print('Not an available option. Please enter "yes" or "no".')

def Twitter_login():
    '''Collect and confirm the login information, starting over on "hard reset".

    This function was called but never defined, so a hard reset or a "no" at
    the confirmation used to end in a NameError.

    Returns:
        (login_id, handle_or_phone, term, password) as entered by the user.
    '''
    questions = (
        #getting login id
        lambda: input('Please enter phone, email or username linked to your Twitter account:'),
        #getting Twitter handle/phone in case Twitter thinks there is unusual activity
        lambda: input('Please enter your Twitter handle or phone: '),
        #getting the term that will be searched
        lambda: input('Please enter the desired search term: '),
        _ask_password,
    )
    while True:
        answers = []
        for ask in questions:
            answer = ask()
            if _is_hard_reset(answer):
                print('Hard reset requested, starting over.\n')
                break
            answers.append(answer)
        else:
            #every question was answered; only leave once the user confirms
            if confirm():
                return tuple(answers)

login_id, handle_or_phone, term, password = Twitter_login()

all_ids = [login_id, handle_or_phone, term, password]

############################################################
# Using user's information to login Twitter via Web driver #
############################################################

##Opening Chrome driver downloaded via https://chromedriver.chromium.org/downloads and selecting corresponding Chrome 
 #version (in my case I downloaded the chromedriver_win32.zip for Chrome version 96). The exe file was then added to the 
 #'final_project' --> 'TwitterBot' folder so it could be used
#Opens Chrome, logs into Twitter with the information collected above, searches for
#term sorted by "Latest" and then waits for the user's go-ahead to start scraping.
#Elements are located with find_element(By..., ...): the find_element_by_* helpers
#were removed from Selenium in version 4.3.
driver = webdriver.Chrome()
driver.maximize_window() #window is maximized in order to have access to Twitter's search query 
driver.get('http://www.twitter.com/login')
sleep(3)

##Login in with information provided above
##For username
username = driver.find_element(By.XPATH, '//input[@name="text"]')
username.send_keys(login_id)
username.send_keys(Keys.RETURN)
sleep(3)

##For password & login in the Twitter account
try:
    password_field = driver.find_element(By.XPATH, '//input[@name="password"]')
#Sometimes Twitter might think there is some unusual activity and demands to input the account's username or phone
except NoSuchElementException:
    challenge_field = driver.find_element(By.XPATH, '//input[@name="text"]')
    challenge_field.send_keys(handle_or_phone)
    challenge_field.send_keys(Keys.RETURN)
    sleep(3)
    #the password input should be available now
    password_field = driver.find_element(By.XPATH, '//input[@name="password"]')

#entering the password is the same whether or not the extra check was shown
password_field.send_keys(password)
password_field.send_keys(Keys.RETURN)
sleep(3)

##Finding the term in the search query of Twitter
driver.get('http://www.twitter.com/search/')
sleep(4)
search_term = driver.find_element(By.XPATH, '(//input[@placeholder="Search Twitter"])[1]')
search_term.send_keys(term)
search_term.send_keys(Keys.RETURN)
sleep(2)

##Sorting by latest
driver.find_element(By.LINK_TEXT, 'Latest').click()

##User authorization to begin scraping process
initiate_scraping(term)

######################################################################
### Scrolling Twitter website to scrape 1000 latest tweets on term ###
######################################################################

start = time.time()

##Get the desired number of tweets by scrolling the page to scrape new data
data = [] #create an empty list to store scraped tweets
unique_ids = set() #make a set of unique ids to make sure no same tweets are scraped
last_position = driver.execute_script("return window.pageYOffset;")
scrolling = True
max_data = 1000 #max number of data that will be scraped
complete = '' #complete message for visual cue of the bot's scraping progress (will be updated as more data is being gathered)

while scrolling and len(data) < max_data:
    tweets = driver.find_elements(By.XPATH, '//article[@role="article"]')
    for t in tweets[-15:]:
        #stop as soon as enough tweets were gathered instead of collecting extra ones
        #(which also printed the 100% message several times)
        if len(data) >= max_data:
            break

        tweet = scrape_tweet(t, term)
        if not tweet:
            continue

        unique_id = tweet[3] or '' #the text of the tweet is its unique id
        if unique_id in unique_ids:
            continue
        unique_ids.add(unique_id)
        data.append(tweet)

        #adding visual indication of how much data has been scraped at each 20% completion mark
        percent = min(len(data) * 5 // max_data, 5) * 20
        if percent >= 100:
            print('100% complete. Almost Done...')
        else:
            milestone = str(percent) + '% complete...'
            if milestone != complete: #each mark is announced once
                complete = milestone
                if percent == 0:
                    print('Scraping Starts... Now!\n' + complete)
                else:
                    print(complete)

    #nothing more to load once enough tweets were gathered
    if len(data) >= max_data:
        break

    #check scroll position; making sure it isnt stuck or if it ever reach the end        
    scroll_attempt = 0
    while True:       
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1)
        curr_position = driver.execute_script("return window.pageYOffset;")
        if last_position == curr_position:
            scroll_attempt += 1

            #end of scroll region
            if scroll_attempt >= 3:
                scrolling = False
                break
            else:
                sleep(2) #attempt another scroll
        else:
            last_position = curr_position
            break

##Making sure that the data list is not higher than the number of scraped data desired
del data[max_data:]

##Visual display of scraper being done along with time taken to scrape the data
print('Scraping is now complete.')
print('\nTime taken to scrape was (in seconds): ')

end = time.time()

print(end - start)

###############################################################################
# Display results along with information about the data that just got scraped #
###############################################################################

#Final stage of the program; the order matters: the csv has to be written by
#make_csv before show_results reads it back, and the images produced by
#show_results are the ones send_tweet_or_not attaches to the tweet.
print('\n')
#tell the user whether the requested number of tweets was reached
max_data_reached(data)
print('\n')
#store the scraped tweets in scraped_tweets.csv
make_csv(data)

#sentiment analysis, word clouds, pie chart and replies/retweets/likes table
show_results()
print('\n')

#################################
# Ask user to send tweet or not #
#################################

#also takes care of closing the web driver, whatever the answer
send_tweet_or_not()