### Gathering

In [1]:
import pandas as pd
import numpy as np
import requests
import tweepy
import tweepy_credentials as creds
import json
import time
import re

In [2]:
# load the WeRateDogs Twitter archive (CSV) into a pandas DataFrame
archive_csv = 'data/twitter-archive-enhanced.csv'
df_archive = pd.read_csv(archive_csv)

# preview the first three rows
df_archive.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [3]:
## download, save, and create pandas DataFrame for the tweet image predictions
# request the hosted image-predictions TSV file
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# the timeout keeps this cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)

# stop here on a 4xx/5xx answer so an error page is never saved as the data file
response.raise_for_status()

# display the response; <Response [200]> means the request succeeded
response

<Response [200]>

In [4]:
# persist the raw response bytes as data/image_predictions.tsv
with open('data/image_predictions.tsv', mode='wb') as tsv_out:
    tsv_out.write(response.content)

In [5]:
# read the tab-separated predictions file into a pandas DataFrame
df_img = pd.read_csv('data/image_predictions.tsv', delimiter='\t')

# preview the first three rows
df_img.head(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True


In [6]:
# copy the Twitter API developer credentials out of the creds module
consumer_key, consumer_secret = creds.consumer_key, creds.consumer_secret
access_token, access_secret = creds.access_token, creds.access_secret

In [7]:
# authenticate against the Twitter API through tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# API client built on that handler, with both rate-limit flags switched on
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

---
### Test 1: Fetch a sample of tweets and save them as one JSON array

In [8]:
# draw 30 random tweets to try the API calls on; the fixed seed makes a
# fresh kernel run pick the same tweets every time
df_sample = df_archive.sample(30, random_state=42)

In [56]:
# fetch the extended-mode JSON of every sampled tweet;
# data collects the JSON dicts, err collects the ids whose lookup failed
data = []
err = []
for tweet_id in df_sample['tweet_id']:
    try:
        status = api.get_status(tweet_id, tweet_mode='extended')
    except Exception as exc:
        # Exception instead of a bare except, so interrupting the kernel
        # still stops the loop; the reason is printed rather than hidden
        err.append(tweet_id)
        print('{} Not Found ! ({})'.format(tweet_id, exc))
    else:
        data.append(status._json)

In [57]:
# serialise the collected tweet dictionaries into data/tweet_sample.txt
with open('data/tweet_sample.txt', 'w') as sample_out:
    json.dump(data, sample_out)

In [58]:
# load the saved JSON back as a pandas DataFrame
data2 = pd.read_json(path_or_buf='data/tweet_sample.txt')

In [60]:
# check: preview the first three rows
data2.head(3)

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,favorite_count,favorited,full_text,geo,...,lang,place,possibly_sensitive,possibly_sensitive_appealable,retweet_count,retweeted,retweeted_status,source,truncated,user
0,,,2017-04-03 00:16:10,"[0, 119]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 848690539105071104, 'id_str'...",25829,False,Please stop sending in animals other than dogs...,,...,en,,0.0,0.0,4494,False,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1,,,2016-02-10 04:06:43,"[0, 138]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 697270435977818113, 'id_str'...",4848,False,This is Bentley. He got stuck on his 3rd homew...,,...,en,,0.0,0.0,1964,False,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
2,,,2017-07-15 16:51:35,"[27, 105]","{'hashtags': [], 'symbols': [], 'user_mentions...",,116,False,@NonWhiteHat @MayhewMayhem omg hello tanner yo...,,...,en,,,,4,False,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."


---
### Test 2: Using a For-Loop to Write One JSON Object per Line

In [63]:
# ids of the tweets whose lookup failed
err = []
# Store each tweet's entire JSON data in a file, one JSON object per line
with open('data/tweet_sample2.txt', 'w') as outfile:
    for tweet_id in df_sample['tweet_id']:
        try:
            tweet = api.get_status(tweet_id)
        except Exception as exc:
            # report the id that failed (not the `tweet` variable, which is
            # unset on a first-iteration failure and stale afterwards);
            # Exception instead of a bare except keeps kernel interrupts working
            print('ERROR: ID {} Not Found ! ({})'.format(tweet_id, exc))
            # store tweet id with errors
            err.append(tweet_id)
            continue

        # convert to JSON and write; kept outside the try so a file-write
        # problem is not reported as a missing tweet
        json.dump(tweet._json, outfile)
        # separate each tweet's data with a line break
        outfile.write("\n")

In [67]:
# parse the line-delimited JSON file into a list of per-tweet records,
# keeping only the id and the retweet / favorite counts
with open('data/tweet_sample2.txt', 'r') as tweets_in:
    data2 = [
        {
            'tweet_id': record['id_str'],
            'retweets': record['retweet_count'],
            'favorites': record['favorite_count'],
        }
        for record in map(json.loads, tweets_in)
    ]

In [69]:
# build the DataFrame from the parsed records with an explicit column order
df = pd.DataFrame(
    data=data2,
    columns=['tweet_id', 'retweets', 'favorites'],
)

In [72]:
# check: preview the first rows
df.head()

Unnamed: 0,tweet_id,retweets,favorites
0,848690551926992896,4494,25829
1,697270446429966336,1964,4848
2,886267009285017600,4,116
3,860276583193509888,3492,18260
4,850145622816686080,3967,16683
