## This notebook is used to scrape Twitter via the Twitter API

In [1]:
# import dependencies

import json
import os
import time

import numpy as np
import pandas as pd
import requests
import tweepy

### To get the tweet_ids we will need the `twitter-archive-enhanced.csv` dataset. Let's import it.

In [2]:
# Load the archive CSV into a DataFrame; its `tweet_id` column supplies the
# ids that are requested from the API further down.
dogs = pd.read_csv('twitter-archive-enhanced.csv')
# Show the first rows as a quick sanity check of the load.
dogs.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


### Let's configure the API

Note to self. Remember to take out the keys before submission

In [3]:
# Credentials are read from environment variables so that real keys never
# have to be typed into (or saved with) the notebook. When a variable is not
# set, the original 'SECRET_KEY' placeholder is used: the cell still runs,
# but the placeholder will not authenticate against the API.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'SECRET_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'SECRET_KEY')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'SECRET_KEY')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'SECRET_KEY')

In [4]:
# Build the OAuth handler from the consumer credentials, then attach the
# access token pair defined in the previous cell.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# `api` is the client used by every request below. wait_on_rate_limit=True
# asks tweepy to wait when the rate limit is hit (see the tweepy API docs).
api = tweepy.API(auth, wait_on_rate_limit=True)

In [5]:
# test the api
test_id = dogs.loc[0,'tweet_id']
tweet = api.get_status(test_id, tweet_mode='extended')
tweet._json

{'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
 'id': 892420643555336193,
 'id_str': '892420643555336193',
 'full_text': "This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU",
 'truncated': False,
 'display_text_range': [0, 85],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [],
  'media': [{'id': 892420639486877696,
    'id_str': '892420639486877696',
    'indices': [86, 109],
    'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'url': 'https://t.co/MgUWQ76dJU',
    'display_url': 'pic.twitter.com/MgUWQ76dJU',
    'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
    'type': 'photo',
    'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'medium': {'w': 540, 'h': 528, 'resize': 'fit'},
     'small': {'w': 540, 'h': 528, 'resize': 'fit'},
     'large': {'w': 

### Well... it works. Now let's proceed.

In [6]:
# Download the extended JSON payload for every tweet id in the archive and
# write it to tweet_json.txt, one JSON object per line. Ids for which the API
# call raises tweepy.TweepError are collected in `lost_ids`.
tweet_ids = dogs.tweet_id.values
lost_ids = []
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by system clock adjustments.
START = time.perf_counter()
with open('tweet_json.txt', 'w') as json_file:
    # enumerate() supplies the running number, so it no longer has to be
    # incremented by hand in both the success and the failure branch.
    for count, tweet_id in enumerate(tweet_ids):
        start = time.perf_counter()
        try:
            # Only the API call sits inside the try block: that is the one
            # statement expected to raise TweepError.
            tweet = api.get_status(tweet_id, tweet_mode='extended')
        except tweepy.TweepError as e:
            lost_ids.append(tweet_id)
            print(f'No. {count} tweet with id {tweet_id} not found... {e}')
        else:
            json.dump(tweet._json, json_file)
            json_file.write('\n')
            end = time.perf_counter()
            print(f'No. {count} tweet with id {tweet_id} processed in {end - start} seconds')
# Keep the final value the original cell left behind: the number of ids tried.
count = len(tweet_ids)
END = time.perf_counter()
print(f'Total time taken is {END - START}')

No. 0 tweet with id 892420643555336193 processed in 0.12189936637878418 seconds
No. 1 tweet with id 892177421306343426 processed in 0.107177734375 seconds
No. 2 tweet with id 891815181378084864 processed in 0.11233901977539062 seconds
No. 3 tweet with id 891689557279858688 processed in 0.09692525863647461 seconds
No. 4 tweet with id 891327558926688256 processed in 0.09793233871459961 seconds
No. 5 tweet with id 891087950875897856 processed in 0.0931849479675293 seconds
No. 6 tweet with id 890971913173991426 processed in 0.11676812171936035 seconds
No. 7 tweet with id 890729181411237888 processed in 0.11761116981506348 seconds
No. 8 tweet with id 890609185150312448 processed in 0.10889434814453125 seconds
No. 9 tweet with id 890240255349198849 processed in 0.12727618217468262 seconds
No. 10 tweet with id 890006608113172480 processed in 0.10259532928466797 seconds
No. 11 tweet with id 889880896479866881 processed in 0.1322004795074463 seconds
No. 12 tweet with id 889665388333682689 proce

In [7]:
# Number of tweet ids taken from the archive (one per row of `dogs`).
tweet_ids.shape

(2356,)

### Let's view tweet_ids that couldn't be found

In [10]:
# Print every tweet id the API could not return, one per line, followed by
# the total. The loop variable is not named `id`, so the builtin id() stays
# usable in the rest of the kernel session.
for lost_id in lost_ids:
    print(lost_id)
print(f"\n{len(lost_ids)} tweet_id's seem to have been deleted from the archives")

888202515573088257
873697596434513921
872668790621863937
872261713294495745
869988702071779329
866816280283807744
861769973181624320
856602993587888130
856330835276025856
851953902622658560
851861385021730816
845459076796616705
844704788403113984
842892208864923648
837366284874571778
837012587749474308
829374341691346946
827228250799742977
812747805718642688
802247111496568832
779123168116150273
775096608509886464
771004394259247104
770743923962707968
766864461642756096
759923798737051648
759566828574212096
754011816964026368
680055455951884288

29 tweet_id's seem to have been deleted from the archives
