# Scrape Twitter data by inputting a Twitter handle

Code is a modified version of that which can be found here:
- https://www.promptcloud.com/blog/scrape-twitter-data-using-python-r/


In [1]:
import tweepy
import csv
import json
import pandas as pd
import numpy as np

import plotly
import plotly.plotly as py
import plotly.graph_objs as go

### Load Twitter API credentials
Credentials are stored in a non-git-tracked repo in order to (attempt to) hide them from the public

In [2]:
with open('../twitter_credentials.json') as cred_data:
    info = json.load(cred_data)
    consumer_key = info['CONSUMER_KEY']
    consumer_secret = info['CONSUMER_SECRET']
    access_key = info['ACCESS_KEY']
    access_secret = info['ACCESS_SECRET']

### Function to get all tweets for a given handle
- authenticate using Twitter developer account credentials
- determine number of tweets for that user
- loop through tweets and save information about them in a CSV file for later use

NB: Twitter allows access to only 3240 tweets via this method

In [3]:
def get_all_tweets(screen_name):

    # Authorization and initialization
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initialization of a list to hold all Tweets
    all_the_tweets = []

    # We will get the tweets with multiple requests of 200 tweets each
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # saving the most recent tweets
    all_the_tweets.extend(new_tweets)

    # save id of 1 less than the oldest tweet

    oldest_tweet = all_the_tweets[-1].id - 1

    # grabbing tweets till none are left

    while len(new_tweets) > 0:
        # The max_id param will be used subsequently to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name,
        count=200, max_id=oldest_tweet)

        # save most recent tweets
        all_the_tweets.extend(new_tweets)

        # id is updated to oldest tweet - 1 to keep track
        oldest_tweet = all_the_tweets[-1].id - 1
        print ('...%s tweets have been downloaded so far' % len(all_the_tweets))

        # transforming the tweets into a 2D array that will be used to populate the csv
        outtweets = [[tweet.id_str, tweet.created_at, tweet.favorite_count, tweet.retweet_count,
        tweet.text.encode('utf-8')] for tweet in all_the_tweets]

    # writing to the csv file
    with open(screen_name + '_tweets.csv', 'w', encoding='utf8') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'created_at', 'favorites', 'retweets', 'text'])
        writer.writerows(outtweets)
        
    print(screen_name + '_tweets.csv has been created')

### Run the function above and, in doing so, create the CSV file with tweet details

In [4]:
# Enter the twitter handle of the person concerned
# twitter_handle = input("Enter the twitter handle of the person whose tweets you want to download:- ")
twitter_handle = 'MontereyAq'
get_all_tweets(twitter_handle)

...400 tweets have been downloaded so far
...600 tweets have been downloaded so far
...800 tweets have been downloaded so far
...1000 tweets have been downloaded so far
...1200 tweets have been downloaded so far
...1400 tweets have been downloaded so far
...1600 tweets have been downloaded so far
...1800 tweets have been downloaded so far
...2000 tweets have been downloaded so far
...2200 tweets have been downloaded so far
...2400 tweets have been downloaded so far
...2600 tweets have been downloaded so far
...2800 tweets have been downloaded so far
...3000 tweets have been downloaded so far
...3200 tweets have been downloaded so far
...3221 tweets have been downloaded so far
...3221 tweets have been downloaded so far
MontereyAq_tweets.csv has been created


### Read in CSV as a pandas dataframe for subsequent analysis
Could have written all data above to dataframe instead of CSV but seems worthwhile keeping the two pieces separate so that running the analysis does not require running the scraping code

In [5]:
df_tweets = pd.read_csv(twitter_handle + '_tweets.csv', parse_dates = [1])
df_tweets.set_index('created_at', inplace = True, drop = True)
df_tweets.head()

Unnamed: 0_level_0,id,favorites,retweets,text
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-09 00:09:18,1126277931024564224,2,0,b'@ramaham7 @lulu197140 https://t.co/gDy95F5xMd'
2019-05-08 23:59:46,1126275531886870529,0,0,"b""@aimeeetay @RaptorsTooth66 we really shouldn..."
2019-05-08 23:52:05,1126273596127465472,1,0,b'@winterhazelly @franzanth Yes they are! We h...
2019-05-08 22:57:15,1126259800419946497,0,156,b'RT @juliepackard: Happy 93rd birthday to Sir...
2019-05-08 22:31:03,1126253205317230592,26,2,b'@Scripps_Ocean aww lookit the cutethulhu we ...


### Add columns to detect occurrence of keywords

In [52]:
keywords = ['kelp', 'plankton', 'cuttlefish', 'ray', 'shark', 'bass', 'jelly', 'dolphin', 'whale', 'puffin', 'penguin', 'squid', 'mola', 'octopus']

col_name_list = []
# for word in keywords:
#     col_name = 'contains_word_' + word
#     col_name_list.append(col_name)
#     df_tweets[col_name] = df_tweets.apply(lambda row: row.favorites if word in row.text else 0, axis=1)

for word in keywords:
    col_name = 'contains_word_' + word
    col_name_list.append(col_name)
    df_tweets[col_name] = df_tweets.apply(lambda row: word if word in row.text else None, axis=1)
    
df_tweets['url'] = df_tweets.apply(lambda row: 'https://twitter.com/'+twitter_handle+'/status/'+str(row.id), axis=1)
df_tweets['hyperlink'] = df_tweets.apply(lambda row: '<a href=\"'+row.url+'\">'+str(row.favorites)+' favorites for this tweet!</a>', axis = 1)

In [53]:
df_tweets.head()

Unnamed: 0_level_0,id,favorites,retweets,text,contains_word_kelp,contains_word_bass,contains_word_jelly,contains_word_dolphin,contains_word_whale,contains_word_puffin,contains_word_penguin,contains_word_squid,contains_word_mola,contains_word_octopus,url,hyperlink,contains_word_cuttlefish,contains_word_shark
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-05-09 00:09:18,1126277931024564224,2,0,b'@ramaham7 @lulu197140 https://t.co/gDy95F5xMd',,,,,,,,,,,https://twitter.com/MontereyAq/status/11262779...,"<a href=""https://twitter.com/MontereyAq/status...",,
2019-05-08 23:59:46,1126275531886870529,0,0,"b""@aimeeetay @RaptorsTooth66 we really shouldn...",,,,,,,,,,,https://twitter.com/MontereyAq/status/11262755...,"<a href=""https://twitter.com/MontereyAq/status...",,
2019-05-08 23:52:05,1126273596127465472,1,0,b'@winterhazelly @franzanth Yes they are! We h...,,,,,,,,,,,https://twitter.com/MontereyAq/status/11262735...,"<a href=""https://twitter.com/MontereyAq/status...",,
2019-05-08 22:57:15,1126259800419946497,0,156,b'RT @juliepackard: Happy 93rd birthday to Sir...,,,,,,,,,,,https://twitter.com/MontereyAq/status/11262598...,"<a href=""https://twitter.com/MontereyAq/status...",,
2019-05-08 22:31:03,1126253205317230592,26,2,b'@Scripps_Ocean aww lookit the cutethulhu we ...,,,,,,,,,,,https://twitter.com/MontereyAq/status/11262532...,"<a href=""https://twitter.com/MontereyAq/status...",,


In [54]:
col_name_list

['contains_word_kelp',
 'contains_word_cuttlefish',
 'contains_word_shark',
 'contains_word_bass',
 'contains_word_jelly',
 'contains_word_dolphin',
 'contains_word_whale',
 'contains_word_puffin',
 'contains_word_penguin',
 'contains_word_squid',
 'contains_word_mola',
 'contains_word_octopus']

### Create Plotly plot and append new traces with tweet data

In [55]:
plotly_traces = []

for word in keywords:
    col_name = 'contains_word_' + word
    trace = go.Scatter(
        x = df_tweets.index,
        y = df_tweets[col_name],
        mode = 'markers',
        marker = dict(
            size = df_tweets['favorites']/50,
        ),
        text = df_tweets['hyperlink'], 
        hoverinfo = 'text'
    )
    
    plotly_traces.append(trace)
    
# oddly limit setting is not automatic... some hidden NaT, I suppose?
layout = go.Layout(
    xaxis=dict(
        range=[min(df_tweets.index), max(df_tweets.index)] 
    ),
    showlegend = False,
    hovermode = 'closest'
)
    
fig = dict(data = plotly_traces, layout = layout)
plot_url = plotly.offline.plot(fig, filename='tweet_frequency.html')

In [46]:
df_tweets.url

created_at
2019-05-09 00:09:18    https://twitter.com/MontereyAq/112627793102456...
2019-05-08 23:59:46    https://twitter.com/MontereyAq/112627553188687...
2019-05-08 23:52:05    https://twitter.com/MontereyAq/112627359612746...
2019-05-08 22:57:15    https://twitter.com/MontereyAq/112625980041994...
2019-05-08 22:31:03    https://twitter.com/MontereyAq/112625320531723...
2019-05-08 21:32:02    https://twitter.com/MontereyAq/112623835116030...
2019-05-08 21:31:01    https://twitter.com/MontereyAq/112623809580684...
2019-05-08 20:13:47    https://twitter.com/MontereyAq/112621865965799...
2019-05-08 19:37:55    https://twitter.com/MontereyAq/112620963619170...
2019-05-08 19:31:16    https://twitter.com/MontereyAq/112620796176684...
2019-05-08 19:14:58    https://twitter.com/MontereyAq/112620386060154...
2019-05-08 19:08:38    https://twitter.com/MontereyAq/112620226468576...
2019-05-08 19:01:42    https://twitter.com/MontereyAq/112620052140713...
2019-05-08 18:57:14    https://twitter.c