# Scrape Twitter data by inputting a Twitter handle

Code is a modified version of that which can be found here:
- https://www.promptcloud.com/blog/scrape-twitter-data-using-python-r/


In [18]:
import tweepy
import csv
import json
import pandas as pd
import numpy as np

import plotly
import plotly.plotly as py
import plotly.graph_objs as go

### Load Twitter API credentials
Credentials are stored in a non-git-tracked repo in order to (attempt to) hide them from the public

In [19]:
with open('../twitter_credentials.json') as cred_data:
    info = json.load(cred_data)
    consumer_key = info['CONSUMER_KEY']
    consumer_secret = info['CONSUMER_SECRET']
    access_key = info['ACCESS_KEY']
    access_secret = info['ACCESS_SECRET']

### Function to get all tweets for a given handle
- authenticate using Twitter developer account credentials
- determine number of tweets for that user
- loop through tweets and save information about them in a CSV file for later use

NB: Twitter allows access to only 3240 tweets via this method

In [20]:
def get_all_tweets(screen_name):

    # Authorization and initialization
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initialization of a list to hold all Tweets
    all_the_tweets = []

    # We will get the tweets with multiple requests of 200 tweets each
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # saving the most recent tweets
    all_the_tweets.extend(new_tweets)

    # save id of 1 less than the oldest tweet

    oldest_tweet = all_the_tweets[-1].id - 1

    # grabbing tweets till none are left

    while len(new_tweets) > 0:
        # The max_id param will be used subsequently to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name,
        count=200, max_id=oldest_tweet)

        # save most recent tweets
        all_the_tweets.extend(new_tweets)

        # id is updated to oldest tweet - 1 to keep track
        oldest_tweet = all_the_tweets[-1].id - 1
        print ('...%s tweets have been downloaded so far' % len(all_the_tweets))

        # transforming the tweets into a 2D array that will be used to populate the csv
        outtweets = [[tweet.id_str, tweet.created_at, tweet.favorite_count, tweet.retweet_count,
        tweet.text.encode('utf-8')] for tweet in all_the_tweets]

    # writing to the csv file
    with open(screen_name + '_tweets.csv', 'w', encoding='utf8') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'created_at', 'favorites', 'retweets', 'text'])
        writer.writerows(outtweets)
        
    print(screen_name + '_tweets.csv has been created')

### Run the function above and, in doing so, create the CSV file with tweet details

In [21]:
# Enter the twitter handle of the person concerned
# twitter_handle = input("Enter the twitter handle of the person whose tweets you want to download:- ")
twitter_handle = 'MontereyAq'
get_all_tweets(twitter_handle)

...400 tweets have been downloaded so far
...600 tweets have been downloaded so far
...800 tweets have been downloaded so far
...1000 tweets have been downloaded so far
...1200 tweets have been downloaded so far
...1400 tweets have been downloaded so far
...1600 tweets have been downloaded so far
...1800 tweets have been downloaded so far
...2000 tweets have been downloaded so far
...2200 tweets have been downloaded so far
...2400 tweets have been downloaded so far
...2600 tweets have been downloaded so far
...2800 tweets have been downloaded so far
...3000 tweets have been downloaded so far
...3200 tweets have been downloaded so far
...3232 tweets have been downloaded so far
...3232 tweets have been downloaded so far
MontereyAq_tweets.csv has been created


### Read in CSV as a pandas dataframe for subsequent analysis
Could have written all data above to dataframe instead of CSV but seems worthwhile keeping the two pieces separate so that running the analysis does not require running the scraping code

In [22]:
df_tweets = pd.read_csv(twitter_handle + '_tweets.csv', parse_dates = [1])
df_tweets.set_index('created_at', inplace = True, drop = True)
df_tweets.head()

Unnamed: 0_level_0,id,favorites,retweets,text
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-07 22:58:04,1125897614359007233,2,0,b'@Squidpastry ty mr mola fren'
2019-05-07 22:48:33,1125895219847688192,1,0,"b""@Squidpastry Like new rubber when it's new, ..."
2019-05-07 22:47:39,1125894996018679809,2,0,b'@pistrix the wild Monterey Bay has that pate...
2019-05-07 22:25:25,1125889400091463680,1,0,b'@kumaberi Hope to sea you soon!'
2019-05-07 22:25:07,1125889325290233856,4,0,b'@hcf64 1. Think like a kelp\n2. Hard substra...


### Add columns to detect occurrence of keywords

In [23]:
keywords = ['kelp', 'bass', 'jelly', 'dolphin', 'whale', 'puffin', 'penguin', 'squid', 'mola', 'octopus']

col_name_list = []
# for word in keywords:
#     col_name = 'contains_word_' + word
#     col_name_list.append(col_name)
#     df_tweets[col_name] = df_tweets.apply(lambda row: row.favorites if word in row.text else 0, axis=1)

for word in keywords:
    col_name = 'contains_word_' + word
    col_name_list.append(col_name)
    df_tweets[col_name] = df_tweets.apply(lambda row: word if word in row.text else None, axis=1)

In [24]:
df_tweets.head()

Unnamed: 0_level_0,id,favorites,retweets,text,contains_word_kelp,contains_word_bass,contains_word_jelly,contains_word_dolphin,contains_word_whale,contains_word_puffin,contains_word_penguin,contains_word_squid,contains_word_mola,contains_word_octopus
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-05-07 22:58:04,1125897614359007233,2,0,b'@Squidpastry ty mr mola fren',,,,,,,,,mola,
2019-05-07 22:48:33,1125895219847688192,1,0,"b""@Squidpastry Like new rubber when it's new, ...",,,,,,,,,,
2019-05-07 22:47:39,1125894996018679809,2,0,b'@pistrix the wild Monterey Bay has that pate...,,,,,,,,,,
2019-05-07 22:25:25,1125889400091463680,1,0,b'@kumaberi Hope to sea you soon!',,,,,,,,,,
2019-05-07 22:25:07,1125889325290233856,4,0,b'@hcf64 1. Think like a kelp\n2. Hard substra...,kelp,,,,,,,,,


In [25]:
col_name_list

['contains_word_kelp',
 'contains_word_bass',
 'contains_word_jelly',
 'contains_word_dolphin',
 'contains_word_whale',
 'contains_word_puffin',
 'contains_word_penguin',
 'contains_word_squid',
 'contains_word_mola',
 'contains_word_octopus']

### Create Plotly plot and append new traces with tweet data

In [36]:
plotly_traces = []

for word in keywords:
    col_name = 'contains_word_' + word
    trace = go.Scatter(
        x = df_tweets.index,
        y = df_tweets[col_name],
        mode = 'markers',
        marker = dict(
            size = df_tweets['favorites']/50,
            )
        )
    
    plotly_traces.append(trace)
    
fig = dict(data = plotly_traces)#, layout=layout)
plot_url = plotly.offline.plot(fig, filename='tweet_frequency.html')

In [32]:
df_tweets['favorites']

created_at
2019-05-07 22:58:04       2
2019-05-07 22:48:33       1
2019-05-07 22:47:39       2
2019-05-07 22:25:25       1
2019-05-07 22:25:07       4
2019-05-07 22:16:43     166
2019-05-07 22:11:58       5
2019-05-07 22:00:52       1
2019-05-07 21:51:00     931
2019-05-07 20:32:46     117
2019-05-07 20:09:44       3
2019-05-07 20:06:40      83
2019-05-07 20:01:48     563
2019-05-07 18:28:51       2
2019-05-07 16:32:57       1
2019-05-06 21:36:35       6
2019-05-06 21:28:43       0
2019-05-06 20:33:40     425
2019-05-04 16:57:04     777
2019-05-03 21:29:06    1181
2019-05-03 20:55:16       3
2019-05-03 20:45:49       1
2019-05-03 16:35:34     497
2019-05-02 22:00:18       1
2019-05-02 21:27:01       2
2019-05-02 21:24:55       3
2019-05-02 20:49:30      12
2019-05-02 20:48:30      16
2019-05-02 20:40:32       4
2019-05-02 20:38:34      17
                       ... 
2018-08-31 22:36:20       3
2018-08-31 20:52:42     327
2018-08-31 20:39:02      22
2018-08-31 20:35:45       5
2018-08-3

In [33]:
df_tweets.index

DatetimeIndex(['2019-05-07 22:58:04', '2019-05-07 22:48:33',
               '2019-05-07 22:47:39', '2019-05-07 22:25:25',
               '2019-05-07 22:25:07', '2019-05-07 22:16:43',
               '2019-05-07 22:11:58', '2019-05-07 22:00:52',
               '2019-05-07 21:51:00', '2019-05-07 20:32:46',
               ...
               '2018-08-30 18:01:36', '2018-08-30 17:01:13',
               '2018-08-30 16:59:21', '2018-08-30 16:58:30',
               '2018-08-30 16:58:13', '2018-08-30 16:56:26',
               '2018-08-30 02:37:33', '2018-08-30 02:22:41',
               '2018-08-30 00:16:40', '2018-08-30 00:15:05'],
              dtype='datetime64[ns]', name='created_at', length=3232, freq=None)