# Part I: Data Wrangling
## Data Gathering

In [2]:
# Importing required Libraries
import pandas as pd
import numpy as np
import tweepy
import requests
import re
import json
import matplotlib.pyplot as plt
import datetime as dt
import os
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## 1) WeRateDogs Twitter Archive
Reading a manually downloaded `CSV` file, using pandas `read_csv()`.

In [3]:
# Load the manually downloaded WeRateDogs archive CSV into a DataFrame
archive_csv_path = 'twitter-archive-enhanced.csv'
archive_df = pd.read_csv(archive_csv_path)

In [4]:
# Preview the first five rows to confirm the archive loaded as expected
archive_df.head(n=5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


## 2) Tweet Image Predictions
Programmatically download the tweet image predictions using the `Requests` library.

In [6]:
# Location of the tweet image predictions TSV file
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# requests waits indefinitely by default; the timeout (in seconds) keeps this
# cell from hanging forever if the server stops responding
response = requests.get(url, timeout=30)
# Display the response object so the HTTP status code is visible
response

<Response [200]>

In [8]:
# Create file if it doesn't already exist
file_name = 'image-predictions.tsv'
if not os.path.isfile(file_name):
    # Raise on a 4xx/5xx response instead of saving an error page as the
    # dataset: once the file exists, the guard above would never replace it
    response.raise_for_status()
    # Write the raw response bytes to file
    with open(file_name, mode='wb') as file:
        file.write(response.content)

In [9]:
# Load the tab-separated image predictions file into a DataFrame
img_pred_df = pd.read_csv(file_name, delimiter='\t')
# Preview the first five rows to confirm the file was parsed correctly
img_pred_df.head(n=5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


## 3) Additional Data via the Tweepy API
Getting the **retweet count** and **favorite count** via the Tweepy API.

In [10]:
# Library to read secret credentials.
# load_dotenv() is called before the os.getenv() lookups for the Twitter API
# keys in the next cell — presumably it loads them from a local .env file;
# confirm that file is kept out of version control.
from dotenv import load_dotenv
load_dotenv()

True

In [11]:
# Read the Twitter API credentials from environment variables so that no
# secret is hardcoded in the notebook
consumer_key = os.getenv('MY_CONSUMER_KEY')
consumer_secret = os.getenv('MY_CONSUMER_SECRET')
access_token = os.getenv('MY_ACCESS_TOKEN')
access_secret = os.getenv('MY_ACCESS_SECRET')

# os.getenv() returns None for an unset variable; stop here with a clear
# message instead of failing later with an opaque authentication error
missing_credentials = [
    var_name
    for var_name in ('MY_CONSUMER_KEY', 'MY_CONSUMER_SECRET',
                     'MY_ACCESS_TOKEN', 'MY_ACCESS_SECRET')
    if not os.getenv(var_name)
]
if missing_credentials:
    raise EnvironmentError(
        'Missing Twitter API credentials: ' + ', '.join(missing_credentials)
    )

# Build the OAuth handler from the consumer and access credentials
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# Client used for all subsequent requests; it is configured to wait (and
# print a notice) when the rate limit is reached rather than raising
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Testing the Tweepy API

In [16]:
# Sanity check of the Tweepy API object: fetch the full data of one tweet,
# taking the tweet_id stored at row label 5 of the archive
sample_tweet_id = archive_df.loc[5, 'tweet_id']
test_tweet = api.get_status(sample_tweet_id, tweet_mode='extended')
# Raw JSON payload of the status, as a dict
tweet_content = test_tweet._json
tweet_content

{'created_at': 'Sat Jul 29 00:08:17 +0000 2017',
 'id': 891087950875897856,
 'id_str': '891087950875897856',
 'full_text': "Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek https://t.co/kQ04fDDRmh",
 'truncated': False,
 'display_text_range': [0, 138],
 'entities': {'hashtags': [{'text': 'BarkWeek', 'indices': [129, 138]}],
  'symbols': [],
  'user_mentions': [],
  'urls': [],
  'media': [{'id': 891087942176911360,
    'id_str': '891087942176911360',
    'indices': [139, 162],
    'media_url': 'http://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg',
    'url': 'https://t.co/kQ04fDDRmh',
    'display_url': 'pic.twitter.com/kQ04fDDRmh',
    'expanded_url': 'https://twitter.com/dog_rates/status/891087950875897856/photo/1',
    'type': 'photo',
    'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'small': {'w': 680, 'h':

In [17]:
# Retweet count stored in the sample tweet's JSON
tweet_content['retweet_count']

2731

In [18]:
# Favorite count stored in the sample tweet's JSON
tweet_content['favorite_count']

18481

In [19]:
# List the top-level fields available in the sample tweet's JSON
tweet_content.keys()

dict_keys(['created_at', 'id', 'id_str', 'full_text', 'truncated', 'display_text_range', 'entities', 'extended_entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'possibly_sensitive_appealable', 'lang'])

### Querying Tweepy API to get the retweet count and favorite count

In [20]:
# Tweet IDs whose data could not be retrieved (e.g. deleted tweets, or
# requests that kept failing after all retry attempts)
errors = []
# Attempts per tweet before giving up on a failure that did not come from
# the Twitter API itself (e.g. a connection reset)
max_attempts = 3
tweet_json_path = 'tweet_json.txt'
# The data is first written to a temporary file and renamed only once the
# whole loop has finished, so an interrupted run never leaves behind a
# truncated 'tweet_json.txt' that the guard below would treat as complete
partial_path = tweet_json_path + '.part'

# Create 'tweet_json.txt' if it doesn't exist already
if not os.path.isfile(tweet_json_path):
    # Write the Tweets data to the file, one JSON document per line
    with open(partial_path, mode='w') as file:
        for tweet_id in archive_df['tweet_id']:
            for attempt in range(1, max_attempts + 1):
                try:
                    # Rate-limit waiting is already configured on the `api`
                    # object, so it is not repeated per call
                    status = api.get_status(tweet_id, tweet_mode='extended')
                except Exception as e:
                    # Errors returned by Twitter (such as code 144, "No status
                    # found") are expected to carry an `api_code`; failures
                    # without one (connection aborted, etc.) are treated as
                    # transient and retried.
                    # NOTE(review): relies on tweepy 3.x TweepError.api_code
                    is_transient = getattr(e, 'api_code', None) is None
                    if is_transient and attempt < max_attempts:
                        continue
                    print("Error on tweet id {}".format(tweet_id) + ";" + str(e))
                    errors.append(tweet_id)
                    break
                else:
                    # Serialize first so a line is only written when complete,
                    # followed by a new line after each tweet
                    file.write(json.dumps(status._json) + '\n')
                    break
    # The file is closed at this point; publish it under its final name
    os.replace(partial_path, tweet_json_path)

Error on tweet id 888202515573088257;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 873697596434513921;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 872668790621863937;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 872261713294495745;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 869988702071779329;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 866816280283807744;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 861769973181624320;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 861383897657036800;Failed to send request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error on tweet id 856602993587888130;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 85195390

Rate limit reached. Sleeping for: 370


Error on tweet id 698549713696649216;Failed to send request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error on tweet id 680055455951884288;[{'code': 144, 'message': 'No status found with that ID.'}]
Error on tweet id 674036086168010753;Failed to send request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


Rate limit reached. Sleeping for: 367


Error on tweet id 667152164079423490;Failed to send request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
