In [1]:
#importing necessary packages
import pandas as pd
import requests
import tweepy
# NOTE(review): this binds the top-level matplotlib package to `plt`, not
# matplotlib.pyplot. If later cells call pyplot functions through `plt`, this
# should read `import matplotlib.pyplot as plt` -- confirm against those cells.
import matplotlib as plt
import seaborn as sns
import os
import json
%matplotlib inline

#### Data wrangling is the process of cleaning data and making it fit for exploratory data analysis. It involves four steps: Gathering, Assessing, Cleaning and Storing Data (optional). We'll go through these steps one by one in this project.

### 1. Gathering Data: This step is used to gather data that is required for the analysis. Data can be gathered through various methods. These methods involve:
<li> Downloading data directly via websites </li>
<li> Downloading data programmatically using HTTP protocol </li>
<li> Accessing data using APIs(Application Programming Interface) </li>

In [2]:
# Directory, relative to the notebook, that holds every input and cached file
data_folder_name = 'data'

#### The following file was downloaded directly from the Udacity website. This is a case of *Data Gathering: Downloading data directly via websites*

In [3]:
# Load the manually downloaded WeRateDogs archive. The path is built with
# os.path.join (as in the other cells) so it is not tied to the Windows
# backslash separator.
df_basic = pd.read_csv(os.path.join(data_folder_name, 'twitter-archive-enhanced-2.csv'))

#### In the following, we'll request the data via the requests library and then save the retrieved contents to a file. This is a case of *Data Gathering: Downloading data via Internet through HTTP protocol*

In [4]:
# Download the image-prediction TSV once and cache it in the data folder;
# later runs find the cached file and skip the request.
dog_predictions_file_name = r'dog_predictions'
if not os.path.exists(os.path.join(data_folder_name, dog_predictions_file_name)):
    print('not exists')
    dog_predictions = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')
    # Stop on an HTTP error instead of caching an error page as if it were data
    # (the exists-check above would then never retry the download).
    dog_predictions.raise_for_status()
    # open() does not create missing directories, so make sure the folder exists.
    os.makedirs(data_folder_name, exist_ok=True)
    with open(os.path.join(data_folder_name, dog_predictions_file_name), mode='wb') as file:
        file.write(dog_predictions.content)

In [5]:
# Read the cached prediction file, which is tab-separated and UTF-8 encoded
df_predictions = pd.read_csv(
    os.path.join(data_folder_name, dog_predictions_file_name),
    encoding='utf-8',
    sep='\t',
)

#### In the following, we'll access "@WeRateDogs" twitter archive and store to a file. This is a case of *Data Gathering: via APIs*

In [6]:
# Twitter API credentials are read from environment variables so that real
# secrets never have to be typed into (and saved with) the notebook.
# Each value falls back to an empty string when its variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

# OAuth handler built from the application keys plus the user access token
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# API client used by the download loop further down
api = tweepy.API(auth)

In [8]:
# Tweet ids from the archive, converted to strings for the API lookups
tweets_id = list(map(str, df_basic.tweet_id))

In [9]:
# Query the Twitter API once per archived tweet and cache each tweet's JSON as
# one line of tweet_json.txt. The download is skipped when the file already
# exists. Ids whose lookup fails are collected in `errors`.
import timeit

tweets_json_data_file_name = r'tweet_json.txt'
errors = []
tweets_count = 0
tweets_json_path = os.path.join(data_folder_name, tweets_json_data_file_name)
if not os.path.exists(tweets_json_path):
    with open(tweets_json_path, 'w') as outfile:
        # default_timer() reads a clock. timeit.timeit() with no arguments
        # instead benchmarks a no-op statement and returns that duration, so
        # subtracting two such values says nothing about this loop.
        start_time = timeit.default_timer()
        for index, each_tweet_id in enumerate(tweets_id):
            try:
                tweet_data = api.get_status(each_tweet_id, tweet_mode='extended', wait_on_rate_limit=True,
                                            wait_on_rate_limit_notify=True)
                # Serialise first so a failure cannot leave a half-written line
                # that would break the line-by-line JSON read later on.
                tweet_line = json.dumps(tweet_data._json)
            except Exception:
                # Best effort: remember the id and move on. `Exception` (unlike
                # a bare except) still lets Ctrl-C / kernel interrupt through.
                errors.append(each_tweet_id)
                continue
            outfile.write(tweet_line)
            outfile.write('\n')
            print('tweet count is', index)
        end_time = timeit.default_timer()
    # Elapsed wall-clock time of the download, in seconds
    print(end_time - start_time)

In [11]:
# Parse the cached file as JSON Lines: every line is one tweet object and
# becomes one row of the resulting frame
tweet_additional_data = pd.read_json(
    os.path.join(data_folder_name, tweets_json_data_file_name),
    lines=True,
)

In [12]:
# Keep only the tweet id and its two engagement counters
required_columns = ['id', 'retweet_count', 'favorite_count']
df_additional = tweet_additional_data[required_columns]

### 2. Assessing data: In this step, observations are made about the data after the data is assessed visually and programmatically

#### Visual Assessment: The intention of visual assessment is to acquaint oneself with the data in its entirety

In [75]:
# Visual check of the archive frame loaded from twitter-archive-enhanced-2.csv
df_basic

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,


> <b> About the data: </b>
<ol>
    <li>There are 2356 rows and 17 columns</li>
    <li>"tweet_id" is the id of the tweet</li>
    <li>"in_reply_to_status_id" is the id of the status being replied to</li>
    <li>"in_reply_to_user_id" is the id of the user being replied to</li>
    <li>"timestamp" is the time at which the tweet was posted</li>
    <li>"source" tells the source of the tweet whether the tweet was posted using android, iphone or web</li>
    <li>"text" field has the contents of the tweet</li>
    <li>"retweeted_status_id" field contains the id of the original tweet that is retweeted</li>
    <li>"retweeted_status_user_id" field contains the id of the user of the original post</li>
    <li>"retweeted_status_timestamp" contains the timestamp of the original tweet</li>
    <li>"expanded_urls" contains urls to the original post</li>
    <li>"name" contains the name of the dog</li>
    <li>"doggo", "floofer", "pupper", "puppo" are various stages of a dog</li>
</ol>

In [19]:
# Visual check of the id / retweet_count / favorite_count frame built from the API data
df_additional

Unnamed: 0,id,retweet_count,favorite_count
0,892420643555336193,7271,34723
1,892177421306343426,5440,30102
2,891815181378084864,3585,22657
3,891689557279858688,7464,37989
4,891327558926688256,8025,36265
...,...,...,...
2325,666049248165822465,37,92
2326,666044226329800704,120,255
2327,666033412701032449,38,105
2328,666029285002620928,41,117


> <b> About the data: </b>
<ol>
    <li>There are 2330 rows and 3 columns</li>
    <li>"id" is the id of the tweet</li>
    <li>"retweet_count" is the count of that tweet being retweeted</li>
    <li>"favorite_count" is the number of likes for that particular tweet</li>
</ol>

In [20]:
# Visual check of the image-prediction frame read from the downloaded TSV
df_predictions

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


> <b> About the data: </b>
<ol>
    <li>There are 2075 rows and 12 columns</li>
    <li>"tweet_id" is the id of the tweet</li>
    <li>"jpg_url" is the url for the image of the dog being predicted</li>
    <li>"img_num" corresponds to the image number which the prediction is most confident of</li>
    <li>"p1" is the algorithm's #1 prediction for the image in the tweet</li>
    <li>"p1_conf" is how confident the algorithm is in its #1 prediction </li>
    <li>"p1_dog" is whether or not the #1 prediction is a breed of dog</li>
    <li>etc..</li>
</ol>

#### Programmatic Assessment: Using code to do anything other than look at the data in its entirety.

In [22]:
# Column dtypes and non-null counts of the archive frame
df_basic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [24]:
# Column dtypes and non-null counts of the image-prediction frame
df_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 119.6+ KB


In [25]:
# Column dtypes and non-null counts of the retweet/favorite frame
df_additional.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2330 entries, 0 to 2329
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   id              2330 non-null   int64
 1   retweet_count   2330 non-null   int64
 2   favorite_count  2330 non-null   int64
dtypes: int64(3)
memory usage: 54.7 KB


<b> Quality Issues: </b>
<br>
<br>
<b>df_basic_dataset: </b>
<br>
<ol>
    <li>"doggo", "floofer", "pupper", "puppo" fields are none for almost all entries</li>
    <li>"retweeted_status_id", "retweeted_status_user_id", "retweeted_status_timestamp" have 181 non-null values which means there are 181 retweets in the data</li>
    <li>"doggo", "floofer", "pupper", "puppo" fields are none for almost all entries, but these none values are not counted as nulls by pandas</li>
    <li>"timestamp" is stored as a string (object dtype) rather than as a datetime</li>
</ol>
<br>
<b>df_predictions: </b>
<br>
<ol>
    <li>The predicted names (p1, p2, p3) are not in a uniform format: some start with lower-case letters and others with upper-case letters.</li>
    <li>Some predictions are wrong, i.e. the p1_dog, p2_dog and p3_dog flags are all False, so none of the three predictions is a dog breed.</li>
</ol>
<br>
<b>df_additional: </b>
<br>
<ol>
    <li>the number of rows in df_additional is less than in df_basic, which means that certain tweets have either been removed or made unavailable for viewing</li>
    <li>the column holding the id of the tweet is named "id", whereas in df_basic and df_predictions this column is named "tweet_id"</li>
</ol>
<br>
   

<b>Tidiness Issues: </b>
<br>
<br>
<b>df_basic_dataset: </b>
<br>
<ol>
    <li>"doggo", "floofer", "pupper", "puppo" fields are various stages of a dog in different columns </li>
</ol>
<br>
<b>df_additional: </b>
<br>
<ol>
    <li>the fields "retweet_count" and "favorite_count" are attributes of a tweet but are stored in a separate table</li>
</ol>
<br>
   