### Gathering Data

In [1]:
import pandas as pd
import numpy as np
import requests
import tweepy
import tweepy_credentials as creds
import json
import time
import re

In [2]:
# Load the WeRateDogs Twitter archive from the local CSV into a DataFrame.
df_archive = pd.read_csv('data/twitter-archive-enhanced.csv')

# Preview the first three rows to confirm the file parsed as expected.
df_archive.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [3]:
## download, save, and create pandas DataFrame for the tweet image predictions
# Fetch the image-predictions TSV over HTTP with requests.get().
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# The timeout keeps this cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Fail here on a 4xx/5xx reply so an error page is never saved as the data file.
response.raise_for_status()

# Display the response object; <Response [200]> means the request succeeded.
response

<Response [200]>

In [4]:
# Save the downloaded payload as data/image_predictions.tsv.
# Binary mode ('wb') writes response.content (bytes) exactly as received.
with open('data/image_predictions.tsv', 'wb') as file:
    file.write(response.content)

In [5]:
# Load the saved image predictions into a DataFrame; the file is tab-separated.
df_img = pd.read_csv('data/image_predictions.tsv', sep='\t')

# Preview the first three rows to confirm the file parsed as expected.
df_img.head(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True


In [6]:
# Read the Twitter API developer credentials from the local
# `tweepy_credentials` module (imported as `creds`), so the secret values
# themselves never appear in the notebook.
consumer_key = creds.consumer_key
consumer_secret = creds.consumer_secret
access_token = creds.access_token
access_secret = creds.access_secret

In [7]:
# Set up OAuth for the Twitter API: consumer key/secret first, then the
# access token/secret.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# Build the API client. wait_on_rate_limit makes tweepy sleep until the
# rate-limit window resets instead of failing; the notify flag prints the
# "Rate limit reached. Sleeping for: N" message when that happens.
# NOTE(review): wait_on_rate_limit_notify is a Tweepy 3.x argument - confirm
# the pinned tweepy version before upgrading.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [8]:
# Gather additional data from Twitter's API

err = []  # For Tweet IDs with errors

# Store each tweet's entire JSON data in a file, one JSON object per line
with open('data/tweet_json.txt', 'w') as file:
    for i in df_archive['tweet_id']:
        try:
            tweet = api.get_status(i)
        except tweepy.TweepError:
            # Catch API failures only (deleted/protected tweets etc.). A bare
            # `except:` would also swallow KeyboardInterrupt, making the long
            # crawl impossible to stop, and would hide unrelated bugs.
            print('ERROR: ID {} Not Found !'.format(i))
            # store tweet id with errors in err[]
            err.append(i)
        else:
            # Written outside the `try` so a serialisation or disk error is
            # raised as-is instead of being reported as a missing tweet.
            # converting Status Object into JSON and write
            json.dump(tweet._json, file)
            # separate each tweet's data with a line break
            file.write("\n")

ERROR: ID 888202515573088257 Not Found !
ERROR: ID 873697596434513921 Not Found !
ERROR: ID 872668790621863937 Not Found !
ERROR: ID 872261713294495745 Not Found !
ERROR: ID 869988702071779329 Not Found !
ERROR: ID 866816280283807744 Not Found !
ERROR: ID 861769973181624320 Not Found !
ERROR: ID 845459076796616705 Not Found !
ERROR: ID 842892208864923648 Not Found !
ERROR: ID 837012587749474308 Not Found !
ERROR: ID 827228250799742977 Not Found !
ERROR: ID 812747805718642688 Not Found !
ERROR: ID 802247111496568832 Not Found !


Rate limit reached. Sleeping for: 490


ERROR: ID 775096608509886464 Not Found !
ERROR: ID 770743923962707968 Not Found !
ERROR: ID 754011816964026368 Not Found !


Rate limit reached. Sleeping for: 509


ERROR: ID 680055455951884288 Not Found !


In [29]:
# Parse the line-delimited JSON dump (one tweet object per line) and keep
# only the fields needed downstream: id, retweet_count and favorite_count.
with open('data/tweet_json.txt', 'r') as json_file:
    data_json = [
        {
            'tweet_id': tweet['id'],
            'retweets': tweet['retweet_count'],
            'favorites': tweet['favorite_count'],
        }
        for tweet in map(json.loads, json_file)
    ]

# Build the DataFrame with an explicit column order.
df_counts = pd.DataFrame(data_json, columns=['tweet_id', 'retweets', 'favorites'])

In [30]:
# Preview the first five rows of the parsed counts.
df_counts.head()

Unnamed: 0,tweet_id,retweets,favorites
0,892420643555336193,8213,37672
1,892177421306343426,6073,32356
2,891815181378084864,4015,24373
3,891689557279858688,8361,40994
4,891327558926688256,9072,39192


In [31]:
# Number of tweet IDs recorded in `err`, i.e. the IDs whose API lookup failed.
len(err)

17

In [36]:
# Save the counts to data/tweet_counts.csv; index=False leaves the DataFrame
# index out of the file.
df_counts.to_csv('data/tweet_counts.csv', index=False)

### Assessing Data
#### 1. df_counts

In [40]:
# Reload the counts from the CSV saved earlier.
# NOTE(review): presumably so assessment can start from disk without
# repeating the API crawl - confirm.
df_counts = pd.read_csv('data/tweet_counts.csv')

In [45]:
# Shape of df_counts as (rows, columns).
df_counts.shape

(2339, 3)

In [43]:
# Column dtypes, non-null counts and memory usage for df_counts.
df_counts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2339 entries, 0 to 2338
Data columns (total 3 columns):
tweet_id     2339 non-null int64
retweets     2339 non-null int64
favorites    2339 non-null int64
dtypes: int64(3)
memory usage: 54.9 KB


In [48]:
# Number of rows that exactly duplicate an earlier row (all columns equal).
df_counts.duplicated().sum()

0

In [56]:
# Number of missing values in each column.
df_counts.isnull().sum()

tweet_id     0
retweets     0
favorites    0
dtype: int64

In [44]:
# Display the DataFrame itself for visual assessment.
df_counts

Unnamed: 0,tweet_id,retweets,favorites
0,892420643555336193,8213,37672
1,892177421306343426,6073,32356
2,891815181378084864,4015,24373
3,891689557279858688,8361,40994
4,891327558926688256,9072,39192
5,891087950875897856,3007,19706
6,890971913173991426,1989,11520
7,890729181411237888,18243,63542
8,890609185150312448,4132,27093
9,890240255349198849,7132,31053


#### 2. df_archive

In [46]:
# Shape of df_archive as (rows, columns).
df_archive.shape

(2356, 17)

In [None]:
# info


In [None]:
# duplicates

In [None]:
# null

In [None]:
# assess visually

#### 3. df_img

In [None]:
# shape

In [None]:
# info

In [None]:
# duplicates

In [None]:
# null

In [None]:
# assess visually

#### Quality
- 


#### Tidiness
- 


#### Reference
- Convert Tweepy Status object into JSON: https://stackoverflow.com/questions/27900451/convert-tweepy-status-object-into-json
- Reading/Writing JSON to a File in Python: https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/