Python environment

In [3]:
import os
import glob
import pandas as pd
import numpy as np
import requests
import tweepy
import json
from timeit import default_timer as timer

%matplotlib inline

# Introduction

# Part I - Gathering Data

In [2]:
# Load the WeRateDogs Twitter archive from its local CSV export and preview two rows
df_archive = pd.read_csv(filepath_or_buffer='twitter-archive-enhanced.csv')
df_archive.head(n=2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,


In [None]:
# Print the archive's column names, dtypes and non-null counts
df_archive.info()

In [None]:
# The tweet image predictions, i.e., what breed of dog (or other object, animal, etc.) is present in each tweet.
# Created by a neural network; the TSV file is downloaded from Udacity's servers.

URL = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
file_name = URL.split('/')[-1]

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error: silently skipping the write would let read_csv
# below load a stale local copy or die with an unrelated FileNotFoundError.
r.raise_for_status()
with open(file_name, mode='wb') as file:
    file.write(r.content)

# Read flat file
df_predictions = pd.read_csv(file_name, sep='\t')
df_predictions.head(2)

In [None]:
# Print the predictions table's column names, dtypes and non-null counts
df_predictions.info()

In [None]:
# Shape (rows, columns) of each table, in the order: predictions, archive
df_predictions.shape, df_archive.shape

In [None]:
# Count repeated tweet IDs in each table (predictions first, then archive)
df_predictions['tweet_id'].duplicated().sum(), df_archive['tweet_id'].duplicated().sum()

## Connecting to Twitter API
In this step, with the help of the [Tweepy](http://www.tweepy.org/query) Python library, we will query Twitter's API for additional data beyond what is already included in the WeRateDogs Twitter archive file. This additional data includes the retweet count and favorite count. 

[Tweepy API Documentation](http://docs.tweepy.org/en/v3.2.0/api.html#API)

In [5]:
# Set up the connection to Twitter API (requires Twitter account).
# Credentials are read from environment variables so they are never stored in the notebook.
consumer_key = os.getenv('TW_CONSUMER_KEY')
consumer_secret = os.getenv('TW_CONSUMER_SECRET')

access_token = os.getenv('TW_ACCESS_TOKEN')
access_secret = os.getenv('TW_ACCESS_SECRET')

# os.getenv returns None for an unset variable; report that here instead of
# letting it surface later as an opaque authentication failure.
missing_credentials = [
    env_name
    for env_name, env_value in (
        ('TW_CONSUMER_KEY', consumer_key),
        ('TW_CONSUMER_SECRET', consumer_secret),
        ('TW_ACCESS_TOKEN', access_token),
        ('TW_ACCESS_SECRET', access_secret),
    )
    if not env_value
]
if missing_credentials:
    raise RuntimeError(
        'Missing Twitter API credentials; set environment variable(s): '
        + ', '.join(missing_credentials)
    )

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# Note the handling of Twitter rate limit may extend the tweet query time
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [11]:
#api.rate_limit_status()

### Store Retweets and Favorite data in a JSON file

In [None]:
# Trial run: fetch a small slice of archive tweets (positions 10-19) and print their full text

for tweet_id in df_archive.tweet_id.values[10:20]:
    try:
        tweet = api.get_status(tweet_id, tweet_mode='extended')
        print(tweet.full_text)
    # Catch only Tweepy errors: a bare except would also swallow KeyboardInterrupt
    # and hide unrelated bugs behind the "does not exist" message.
    except tweepy.TweepError as e:
        print(f'\nTweet id - {tweet_id} - could not be retrieved: {e}\n')
        

In [6]:
# Peek at the first five tweet IDs as a raw NumPy array
df_archive['tweet_id'].values[:5]

array([892420643555336193, 892177421306343426, 891815181378084864,
       891689557279858688, 891327558926688256], dtype=int64)

In [8]:
# Tweet IDs for which to gather additional data.
# NOTE: only the first 5 archive IDs are selected here; widen the slice to query more of the archive.
tweet_ids = df_archive.tweet_id.values[:5]

# Query Twitter's API for JSON data for each selected tweet ID
count = 0
fails_dict = {}  # tweet_id -> TweepError raised while fetching that tweet
start = timer()

# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as file:
    # Over the full archive this loop will likely take 20-30 minutes because of Twitter's rate limit
    # Rate limits are divided into 15 minute intervals
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, file)
            file.write('\n')
        except tweepy.TweepError as e:
            # Keep the exception so the real failure reason can be inspected afterwards
            print(f'\nTweet id - {tweet_id} - does not exist anymore.\n')
            fails_dict[tweet_id] = e
end = timer()
# The closing quote was missing here, which made the whole cell a SyntaxError
print(f'Total execution time: {end - start}\n')
print(fails_dict)

1: 892420643555336193
Success
2: 892177421306343426
Success
3: 891815181378084864
Success
4: 891689557279858688
Success
5: 891327558926688256
Success
1.5414323999998487
{}


In [None]:
# Look up the 'dog_rates' account through the authenticated API client
api.get_user('dog_rates')

In [None]:
# Display the API's rate-limit status for the current credentials
api.rate_limit_status()

In [None]:
# Returns the authenticated user's information.
# Must be called on the `api` instance created above; the bare name `API` is never defined in this notebook.
api.me()

# Part II - Assessing Data

# Part III - Cleaning Data

## Storing Data

# Part IV - Analyzing Data

## Visualizing Data 

# Conclusions
Reporting on 
  1. your data wrangling efforts
  2. your data analyses and visualizations