# Let's collect tweets!

Run the cells below in order, from top to bottom. Collection automatically resumes from the most recent tweet that has already been saved.

### Parameters

In [7]:
# Parameters for saving tweets
tweet_per_file = 1000                # number of tweets written to each JSON file
max_n_files = 100                    # collection stops when the file counter reaches this value
dir_path = '../data/tweets_NY_20km'  # directory holding the saved tweet files

# Parameters of the query (tweepy API.search())
q           ='*' # needs to be * if searching for location
# geocode format: 'latitude,longitude,radius'
# (latitude: south is negative, north is positive; longitude: west is negative, east is positive)
geocode     ='40.7128,-74.0060,20km' # New York
# geocode     ='51.5074,-0.1278,20km'  # London
# NOTE(review): an uncommented `geocode = None` used to follow here. It silently
# discarded the New York filter above although the output directory is
# 'tweets_NY_20km'. Re-enable the line below only to search without a location.
# geocode     = None
tweet_mode  ='extended'  # request the full, untruncated tweet text
lang        ='en'
result_type ='recent'

## Imports

In [8]:
import tweepy
from tweepy import TweepError
import json
import os
import re
import time

from IPython.display import display, clear_output

## Load Twitter credentials, access API

In [9]:
# Read the API credentials from a local JSON file instead of hard-coding them.
# The context manager closes the file handle as soon as the JSON is parsed
# (a bare `json.load(open(...))` leaves it open until garbage collection).
with open('./keys.json', 'r') as keys_file:
    twitter_credentials = json.load(keys_file)['twitter1']

# Application credentials (used by AppAuthHandler below)
CONSUMER_KEY = twitter_credentials['consumer_key']
CONSUMER_SECRET = twitter_credentials['consumer_secret']
# User access token; only referenced by the commented-out `set_access_token` call
token_key    = twitter_credentials['token_key']
token_secret = twitter_credentials['token_secret']

##### OAuthHandler vs. AppAuthHandler

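AppAuthHandler (application-only authentication) is better suited for data retrieval because it has a <b>higher rate limit</b> for search requests. Because the API object below is created with `wait_on_rate_limit=True`, the client also <b>waits automatically</b> when the limit is reached and continues as soon as it can request more data.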

In [10]:
# Application-only authentication: only the consumer key/secret pair is used
auth = tweepy.AppAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)

# The API client is configured to wait (and to notify) when the rate limit is hit
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)
if not api:
    print ("Can't Authenticate")

# Kept for reference (user access token, currently unused):
# auth.set_access_token(token_key, token_secret)

## Get the ID of the last saved tweet

In [11]:
# Make sure the output directory exists, so that the very first run (nothing
# collected yet) starts from an empty state instead of raising FileNotFoundError.
os.makedirs(dir_path, exist_ok=True)

# Every regular file in the output directory (sub-directories are ignored)
file_names = [file for file in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, file))]

# Load all previously saved tweets; each JSON file holds a list of tweet dicts
all_tweets = []
for file_name in file_names:

    # Only the JSON files contain tweets; skip everything else
    # (e.g. 'data_merged.csv'), which would otherwise break json.load
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(dir_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        all_tweets += json.load(file)

# Optional: also take the first tweet of the temporary dump into account
# with open('./tweets_temp.json', 'r', encoding='utf-8') as file:
#     all_tweets.append(json.load(file)[0])

# The largest saved id is later passed to the search as `since_id`;
# both values are None when nothing has been saved yet.
ids = [tweet['id'] for tweet in all_tweets]
last_id  = max(ids, default=None)
first_id = min(ids, default=None)

print('Num. of files :', len(file_names))
print('Num. of tweets:', len(all_tweets))
print('First ID :', first_id)
print('Last ID  :', last_id)

Num. of files : 24
Num. of tweets: 24000
First ID : 1179136856128143360
Last ID  : 1184176474024239105


## Collect tweets

In [12]:
# Retry policy for API errors
max_n_errors = 10   # stop after this many TweepErrors
retry_wait_s = 60   # seconds to wait before retrying after an error

# Index of the next output file: one past the highest index already used by a
# 'tweets_NNN.json' file. Counting the directory entries instead is fragile:
# unrelated files (e.g. 'data_merged.csv') or gaps in the numbering shift the
# count, and a too-small count would overwrite an existing file.
saved_indices = [
    int(match.group(1))
    for match in (re.fullmatch(r'tweets_(\d+)\.json', name) for name in os.listdir(dir_path))
    if match
]
i_file = max(saved_indices) + 1 if saved_indices else 0

# Search cursor; `since_id` restricts the results to tweets newer than the last saved one
c = tweepy.Cursor(api.search,
                  q = q,
                  geocode = geocode,
                  tweet_mode = tweet_mode,
                  lang = lang,
                  since_id = last_id,
#                  max_id=first_id,
                  result_type = result_type
                 )

i = 0          # number of tweets currently held in the `tweets` buffer
n_errors = 0   # number of TweepErrors seen so far
tweets = []    # buffer of collected tweets not yet written to a file

# Optional (presumably to continue filling a buffer saved in './tweets_temp.json'):
# with open('./tweets_temp.json', 'r', encoding='utf-8') as file:
#     tweets = json.load(file)
#     i = len(tweets) - 1

# Plain `and` (not the bitwise `&`) so the conditions short-circuit as booleans
while i_file < max_n_files and n_errors < max_n_errors:
    try:
        for tweet in c.items():

            # skip retweets: they carry a `retweeted_status` attribute
            if hasattr(tweet, 'retweeted_status'):
                continue

            # save certain attributes together with the full text
            tweets.append(
              {
                  'id':tweet.id,
                  'text':tweet.full_text,
                  'created_at':str(tweet.created_at),
                  'author_name':tweet.author.name,
              })

            # save every #tweet_per_file number of tweets to a json
            i += 1
            if i >= tweet_per_file:
                file_path = os.path.join(dir_path, 'tweets_{:03d}.json'.format(i_file))
                with open(file_path, 'w', encoding='utf-8') as file:
                    json.dump(tweets, file, ensure_ascii=False, indent=4)
                i_file += 1
                i = 0
                tweets = []

            # progress indicator: '<file counter>/<tweets in the current buffer>'
            clear_output(wait=True)
            display('{}/{}'.format(i_file, i))

            if i_file >= max_n_files:
                break
        else:
            # The cursor ran out of tweets without an error, so there is nothing
            # more to fetch. Without this `break` the outer loop would call
            # c.items() again and again and never terminate.
            break
    except TweepError as error:
        n_errors += 1
        print('Ooops, there was an error...')
        # show what went wrong so that repeated failures can be diagnosed
        print(repr(error))
        time.sleep(retry_wait_s)

# Tweets still left in `tweets` (fewer than tweet_per_file) are not written here.
print(f'Number of errors: {n_errors}')

'45/658'

Ooops, there was an error...
Number of errors: 10


In [7]:
# Dump the current in-memory `tweets` list to a temporary JSON file
with open('./tweets_temp.json', mode='w', encoding='utf-8') as temp_file:
    json.dump(tweets, temp_file, ensure_ascii=False, indent=4)


In [16]:
from multiprocessing import Pool


def worker(x):
    """Return the square of ``x``; this is the task executed in parallel."""
    return x*x


# The guard keeps the pool from being started again if this code is ever
# imported as a module (for example by worker processes that re-import it).
# In the notebook kernel `__name__` is '__main__', so the demo runs as before.
if __name__ == '__main__':
    # Assuming you want to use 3 processors
    num_processors = 3
    # The context manager shuts the pool down once the work is done; previously
    # the pool was never closed and its worker processes were left running.
    with Pool(processes=num_processors) as p:
        # map() distributes the inputs over the workers and keeps the input order
        output = p.map(worker, range(20))
    print(output)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361]
