
**Load Libraries**

In [None]:
#Load libraries
import tweepy
import requests
from tweepy import OAuthHandler
import os
import json
from timeit import default_timer as timer
import random
import sys
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

: 

## STEP 1: DATA GATHERING

In [None]:
# Read the local twitter-archive-enhanced.csv file into a DataFrame.
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

: 

In [None]:
# Preview the first two rows of the archive.
twitter_archive.head(2)

: 

In [None]:
# Download the image-predictions TSV over HTTPS.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
my_download = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply so an error page is never saved as the dataset.
my_download.raise_for_status()
# Display the response object (shows the HTTP status code).
my_download

: 

In [None]:
# Persist the raw bytes of the downloaded response to a local TSV file.
with open('image-predictions.tsv', mode='wb') as tsv_out:
    tsv_out.write(my_download.content)

: 

In [None]:
# Load the saved predictions file; sep='\t' because the file is tab-separated.
image_prediction = pd.read_csv('image-predictions.tsv', sep='\t')
# Preview the first rows.
image_prediction.head()

: 

In [None]:
# Twitter API credentials. Each value is read from an environment variable so
# real secrets never have to be typed into (or committed with) the notebook.
# The original placeholder strings are kept as fallbacks when a variable is unset.
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', 'YOUR CONSUMER KEY')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', 'YOUR CONSUMER SECRET')
OAUTH_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', 'YOUR ACCESS TOKEN')
OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', 'YOUR ACCESS SECRET')

# Build the OAuth handler from the consumer pair, then attach the access pair.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

# API client used by the download loop further down.
api = tweepy.API(auth, wait_on_rate_limit=True)

: 

In [None]:
# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# The instructor's template calls the archive DataFrame df_1. In this notebook that
# DataFrame is named twitter_archive (loaded from twitter-archive-enhanced.csv), so the
# cell that collects the tweet IDs reads from twitter_archive instead of df_1.
# NOTE TO REVIEWER: this student had mobile verification issues, so the following
# Twitter API code was sent to this student by a Udacity instructor.
# The next cell collects the tweet IDs for which to gather additional data via Twitter's API.

: 

In [None]:
# Pull every tweet ID out of the archive as a NumPy array, then show how many there are.
tweet_ids = twitter_archive['tweet_id'].to_numpy()
len(tweet_ids)


: 

In [None]:
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
# Running tally of tweet IDs attempted by the download loop.
count = 0
# Maps each tweet ID whose lookup failed to the exception that was raised.
fails_dict = {}
# Start time, used to report the total duration once the loop has finished.
start = timer()

: 

In [11]:
# Save each tweet's returned JSON as a new line in a .txt file.
# Opening the file in 'w' mode truncates it immediately, so re-running this cell
# (for example with placeholder credentials, where every request fails) would
# wipe a previously downloaded dataset. The download is therefore skipped when a
# non-empty tweet_json.txt already exists; delete that file to force a fresh run.
if os.path.isfile('tweet_json.txt') and os.path.getsize('tweet_json.txt') > 0:
    print('tweet_json.txt already exists; skipping the Twitter API download.')
else:
    with open('tweet_json.txt', 'w') as outfile:
        # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
        for tweet_id in tweet_ids:
            count += 1
            print(str(count) + ": " + str(tweet_id))
            try:
                tweet = api.get_status(tweet_id, tweet_mode='extended')
                print("Success")
                # One JSON object per line, so the file can be parsed line by line later.
                json.dump(tweet._json, outfile)
                outfile.write('\n')
            except tweepy.errors.TweepyException as e:
                # Record the failure against the tweet ID and carry on with the next one.
                print("Fail")
                fails_dict[tweet_id] = e
# Report the elapsed time and every tweet ID that could not be fetched.
end = timer()
print(end - start)
print(fails_dict)

### The dataset has been downloaded already and written into the tweet_json.txt file. The data will be extracted from this file below

In [None]:
# Parse tweet_json.txt (one JSON object per line) into a list of dicts.
tweets = []
# The context manager closes the file as soon as the loop ends; iterating over a
# bare open(...) call leaves the handle open until it is garbage-collected.
with open('tweet_json.txt', 'r') as json_file:
    for line in json_file:
        # Skip whitespace-only lines so a stray blank line cannot break json.loads.
        if line.strip():
            tweets.append(json.loads(line))
# Print out one JSON object for further reference.
tweets[0]

: 

In [None]:
# Defining the properties that are needed.
# These are the keys copied out of each tweet's JSON object when the per-tweet rows are built.
vital_properties = ['id', 'favorite_count', 'retweet_count']

: 

In [None]:
# Build one trimmed-down record per tweet by walking tweet_json.txt line by line
# and keeping only the keys named in vital_properties.
archived_tweets = []
with open('tweet_json.txt', 'r') as json_file:
    for raw_line in json_file:
        tweet_record = json.loads(raw_line)
        archived_tweets.append({key: tweet_record[key] for key in vital_properties})

: 

In [None]:
# Turn the list of per-tweet dicts into a DataFrame (one row per tweet).
df_tweets = pd.DataFrame(archived_tweets)

: 

In [None]:
# Preview the first rows of the per-tweet DataFrame.
df_tweets.head()

: 

### 2 Assessing the Data

**2.1 Assessing the archive**

In [None]:
# Display the archive DataFrame for a visual assessment.
twitter_archive

: 

### Observation
<li>The columns doggo, floofer, pupper and puppo all represent a single variable.</li>
<li>The source column contains leftover HTML markup.</li>

In [None]:
# (number of rows, number of columns) of the archive.
twitter_archive.shape

: 

In [None]:
# Column names, non-null counts and dtypes of the archive.
twitter_archive.info()

: 

### Observation
The timestamp column is stored as a string rather than as a datetime.

In [None]:

# Report how many entries of a DataFrame column are exactly the string 'None'.
def none_find(column, name):
    """Print the number of values in ``column`` that equal the string 'None'.

    column: iterable of values to scan (the notebook passes DataFrame columns).
    name: label for the column; it is only used in the printed message.

    Nothing is returned; the result is written to standard output.
    """
    none_total = sum(1 for entry in column if entry == 'None')
    print('Name of the column is: ', name, ' and has ', none_total, ' number of None.')

: 

In [None]:
# Run the 'None'-string count over each archive column of interest.
for column_name in ('tweet_id', 'timestamp', 'source', 'expanded_urls',
                    'rating_numerator', 'rating_denominator', 'name',
                    'doggo', 'floofer', 'pupper', 'puppo'):
    none_find(twitter_archive[column_name], column_name)

: 

### Observation
<li>The "name" column has a total of 745 'None' strings present</li>
<li>The "doggo" column has a total of 2259 'None' strings present</li>
<li>The "floofer" column has a total of 2346 'None' strings present</li>
<li>The "pupper" column has a total of 2099 'None' strings present</li>
<li>The "puppo" column has a total of 2326 'None' strings present</li>

In [None]:
# ENGLISH_STOP_WORDS is exposed by the public sklearn.feature_extraction.text module;
# importing it from there avoids depending on the private _stop_words module.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

: 

In [None]:
# Report how many entries of a DataFrame column are English stop words.
def the_stopwords(column, name):
    """Print how many values in ``column`` are members of ENGLISH_STOP_WORDS.

    column: iterable of values to scan (the notebook passes DataFrame columns).
    name: label for the column; it is only used in the printed message.

    Nothing is returned; the result is written to standard output.
    """
    stopword_total = sum(1 for entry in column if entry in ENGLISH_STOP_WORDS)
    print('Name of the column is: ', name, '  and has ', stopword_total, ' stop words.')

: 

In [None]:
# Run the stop-word count over each archive column of interest.
for column_name in ('tweet_id', 'timestamp', 'source', 'expanded_urls',
                    'rating_numerator', 'rating_denominator', 'name',
                    'doggo', 'floofer', 'pupper', 'puppo'):
    the_stopwords(twitter_archive[column_name], column_name)

: 

### Observation
<li>The "name" column has a total of 87 stop words present</li>

In [48]:
# Draw five random rows; no random_state is passed, so the rows shown change on every run.
twitter_archive.expanded_urls.sample(5)

27      https://www.gofundme.com/mingusneedsus,https:/...
1976    https://twitter.com/dog_rates/status/672995267...
957     https://twitter.com/dog_rates/status/751538714...
1207    https://twitter.com/dog_rates/status/715733265...
1477    https://twitter.com/dog_rates/status/693622659...
Name: expanded_urls, dtype: object

In [None]:
# Show the complete expanded_urls value stored at index label 27.
twitter_archive.expanded_urls[27]

: 

### Observation 
Some entries in the "expanded_urls" column contain more than one link, separated by commas.

In [None]:
# Count the number of entries with double URLs.
# An entry is counted when its expanded_urls text contains a comma.
twitter_archive.expanded_urls.str.contains(r',').sum()

: 

### Observation 
A total of 639 entries contain more than one URL.

In [60]:
# Draw five random rows; no random_state is passed, so the rows shown change on every run.
twitter_archive.rating_denominator.sample(5)  

1008    10
2215    10
1662    11
335     10
2286    10
Name: rating_denominator, dtype: int64

In [None]:
# Look up the rating_denominator value stored at index label 1662.
twitter_archive.rating_denominator[1662]

: 