# Project 2
## Investigation of the WeRateDogs Twitter archive
## Table of Contents
<ul>
<li><a href="#intro">Introduction</a></li>
<li><a href="#wrangling">Data Wrangling</a></li>
<li><a href="#eda">Exploratory Data Analysis</a></li>
<li><a href="#conclusions">Conclusions</a></li>
</ul>

<a id='intro'></a>
## Introduction

### **References**:

>The pandas documentation was utilized for reference (https://pandas.pydata.org/docs/) <br>
>The plotly documentation was utilized for plotting reference (https://plotly.com/python/radar-chart/) <br>
>Stack overflow: (https://stackoverflow.com/questions/35371043/use-python-requests-to-download-csv) <br>
>Python tutorial: (https://www.pythontutorial.net/python-basics/python-write-csv-file/) <br>
>Tweepy documentation: (https://docs.tweepy.org/en/stable/) <br>

In [1]:
%pip install tweepy

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Use this cell to set up import statements for all of the packages that you
#   plan to use.

# Remember to include a 'magic word' so that your visualizations are plotted
#   inline with the notebook. See this page for more:
#   http://ipython.readthedocs.io/en/stable/interactive/magics.html

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import requests as rq
import csv
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer
%matplotlib inline

<a id='wrangling'></a>
## Data Wrangling

> **Tip**: In this section of the report, you will load in the data, check for cleanliness, and then trim and clean your dataset for analysis. Make sure that you document your steps carefully and justify your cleaning decisions.

### General Properties

In [3]:
# Loading the relevant files
df = pd.read_csv("files/twitter-archive-enhanced.csv")

In [4]:
# Loading of image data and store it in the image-predictions.tsv file
downloadURL = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
imageData = rq.get(downloadURL).content.decode('utf-8')
newData = csv.reader(imageData.splitlines(), delimiter='\t')
with open('files/image-predictions.tsv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    # write multiple rows
    writer.writerows(newData)
df_i = pd.read_csv('files/image-predictions.tsv', sep='\t')

In [5]:
# Gathering Data
keys = open('keys.json')
data = json.load(keys)

# Twitter developer account API Keys
consumer_key = data['consumerKeys']['ApiKey']
consumer_secret = data['consumerKeys']['ApiKeySecret']
access_token = data['accessKeys']['ApiKey']
access_secret = data['accessKeys']['ApiKeySecret']

# Tweepy Authentication
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# Consumption of Twitter through Tweepy
tweep = tweepy.API(auth, wait_on_rate_limit=True)
tweetIDs = df.tweet_id.values
len(tweetIDs)

pageNotFound = []
minedTweetData = []

with open('files/tweet_json.txt', mode="w") as file:
    for i in list(tweetIDs):
        try:
            tweet = tweep.get_status(str(i), tweet_mode='extended')
            file.write(json.dumps(tweet._json))
            minedTweetData.append({
                "tweet_id" : str(i),
                "retweet_count" : tweet._json['retweet_count'],
                "favorite_count" : tweet._json['favorite_count']
            })
        except:
            pageNotFound.append(i) 

In [162]:
# Confirm the data was mined
len(minedTweetData), len(pageNotFound)

In [None]:
# Udacity tweet-json file
# Assign the mined data to a dataframe
minedData = 'files/tweet-json.txt'
minedData = pd.DataFrame(minedData, 
columns=["tweet_id",'retweet_count', 'favorite_count']).to_csv('datasets/mined_twitter_data.csv')

In [None]:
# Programatically mined data
# Assign the mined data to a dataframe
minedTweetData = pd.DataFrame(minedTweetData, 
columns=["tweet_id",'retweet_count', 'favorite_count']).to_csv('datasets/twitter_mined.csv')

In [164]:
# print(df_tw.tweet_id.dtype)
# df_tw.tweet_id = df_tw.tweet_id.astype('str')
# print(df_tw.tweet_id.dtype)

int64
object


### Assess

#### Visual Assessment

In [165]:
# View Enhanced Twitter Archive Data
df

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,8.924210e+17,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,8.921770e+17,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,8.918150e+17,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,8.916900e+17,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,8.913280e+17,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,6.660490e+17,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,6.660440e+17,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,6.660330e+17,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,6.660290e+17,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,


In [166]:
# View Image Predictions Data
df_i

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


#### Programatic Assessment

In [170]:
# Verify the result is stored in the dataframe
df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,8.92421e+17,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,8.92177e+17,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,8.91815e+17,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,8.9169e+17,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,8.91328e+17,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [171]:
# Verify the result is stored in the dataframe
df_i.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [173]:
# Check the value counts of the dataframe columns
df.rating_numerator.value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
0         2
15        2
75        2
420       2
182       1
204       1
143       1
121       1
99        1
20        1
45        1
27        1
17        1
24        1
26        1
44        1
50        1
60        1
80        1
84        1
88        1
1776      1
960       1
666       1
144       1
165       1
Name: rating_numerator, dtype: int64

In [174]:
# Check the value counts of the dataframe columns
df.rating_denominator.value_counts()

10     2333
11        3
50        3
20        2
80        2
0         1
120       1
7         1
170       1
150       1
130       1
90        1
110       1
2         1
70        1
40        1
16        1
15        1
Name: rating_denominator, dtype: int64

In [175]:
# Check the value counts of the dataframe columns
df.name.value_counts()

None        745
a            55
Charlie      12
Lucy         11
Oliver       11
           ... 
Doobert       1
Jersey        1
Ralphson      1
Willow        1
Bauer         1
Name: name, Length: 957, dtype: int64

In [176]:
# Check the value counts of the dataframe columns
print('Doggo')
print(df.doggo.value_counts())
print(' ')
print('Floofer')
print(df.floofer.value_counts())
print(' ')
print('Pupper')
print(df.pupper.value_counts())
print(' ')
print('Puppo')
print(df.puppo.value_counts())

Doggo
None     2259
doggo      97
Name: doggo, dtype: int64
 
Floofer
None       2346
floofer      10
Name: floofer, dtype: int64
 
Pupper
None      2099
pupper     257
Name: pupper, dtype: int64
 
Puppo
None     2326
puppo      30
Name: puppo, dtype: int64


In [177]:
# Shape of the data
df.shape

(2356, 17)

In [178]:
# Shape of the data
df_i.shape

(2075, 12)

In [180]:
# Description of the data
df.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455128e+17,2.015385e+16,7.720221e+17,1.241437e+16,13.126486,10.455433
std,6.856706e+16,7.583419e+16,1.253546e+17,6.236131e+16,9.597227e+16,45.876648,6.745237
min,6.66021e+17,6.66e+17,11856340.0,6.66e+17,783214.0,0.0,0.0
25%,6.783992e+17,6.76e+17,308637400.0,7.19e+17,4196984000.0,10.0,10.0
50%,7.196275e+17,7.035e+17,4196984000.0,7.8e+17,4196984000.0,11.0,10.0
75%,7.993375e+17,8.26e+17,4196984000.0,8.2e+17,4196984000.0,12.0,10.0
max,8.92421e+17,8.86e+17,8.41e+17,8.87e+17,7.87e+17,1776.0,170.0


In [181]:
# Description of the data
df_i.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [183]:
# Info of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   float64
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [184]:
# Info of the data
df_i.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [186]:
# View the index number and label for each column
def view_index(data):
    for i, v in enumerate(data.columns):
        print(i, v)

In [187]:
# View the index number and label for each column
view_index(df)

0 tweet_id
1 in_reply_to_status_id
2 in_reply_to_user_id
3 timestamp
4 source
5 text
6 retweeted_status_id
7 retweeted_status_user_id
8 retweeted_status_timestamp
9 expanded_urls
10 rating_numerator
11 rating_denominator
12 name
13 doggo
14 floofer
15 pupper
16 puppo


In [188]:
# View the index number and label for each column
view_index(df_i)

0 tweet_id
1 jpg_url
2 img_num
3 p1
4 p1_conf
5 p1_dog
6 p2
7 p2_conf
8 p2_dog
9 p3
10 p3_conf
11 p3_dog


### Issues Identified

Quality Issues
> 1. Not all `rating_numeritor` values are less than 10 given majority of the rating is out of 10
> 2. Not all `rating_denominator` values are equal than 10 given majority of the rating is out of 10
> 3. Under the name column the value `a` isn't a name yet it appears 55 times (`None` can be used in place)
> 4. Multiple columns are missing data within majority of their rows within the twitter archive dataset
> 5. Unecessary retweets rows
> 6. Unecessary "in reply to users' tweet" rows
> 7. Wrong Data types of "timestamp", "tweet_id"
> 8. The `tweet_id`, `source`, `in_reply_to_status_id` and `in_reply_to_user_id` columns are duplicated in all the tables
> 9. A large number of the predictions contain erronious entries in the image prediction dataset

Tidiness Issues
> 1. The `doggo`, `floofer`, `pupper` an `puppo` columns can be represented in a single column known as stage (i.e. stages of dog).
> 2. The `retweet count` and `favorite count` columns are not in the twitter archive dataset where they would ideally fit into


### Data Cleaning

#### Cleaning process:
> 1. Create copy of datasets
> 2. Apply define-code-test framework to handle issues identified
> 3. Prepare tidy master dataset(s)

#### Copy of datasets

In [190]:
# Copy of datasets
Archive = df.copy()
Image = df_i.copy()

### Define - Code - Test

#### Define issue to be solved

> Define: Remove/Drop redundant columns (Multiple columns representing the same result)

In [191]:
# Code solution of issue
Archive['stage']=None
Archive['stage'] = Archive.doggo + Archive.floofer + Archive.pupper + Archive.puppo

In [192]:
# Check value counts to see all merged values represented
Archive['stage'].value_counts()

NoneNoneNoneNone        1976
NoneNonepupperNone       245
doggoNoneNoneNone         83
NoneNoneNonepuppo         29
doggoNonepupperNone       12
NoneflooferNoneNone        9
doggoflooferNoneNone       1
doggoNoneNonepuppo         1
Name: stage, dtype: int64

In [193]:
# Remove the unecessary words none
Archive['stage'] = Archive['stage'].map(lambda x: x.replace("None",""))

In [194]:
# Check value counts to see all merged values represented
Archive['stage'].value_counts()

                1976
pupper           245
doggo             83
puppo             29
doggopupper       12
floofer            9
doggofloofer       1
doggopuppo         1
Name: stage, dtype: int64

In [195]:
# Replace the unecessary merged results and drop the unecessary columns
# Replace values
Archive.loc[Archive.stage == 'doggopupper', 'stage'] = 'doggo, pupper'
Archive.loc[Archive.stage == 'doggopuppo', 'stage'] = 'doggo, puppo'
Archive.loc[Archive.stage == 'doggofloofer', 'stage'] = 'doggo, floofer'
# Drop columns
Archive.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis =1, inplace=True)

In [196]:
# Test solution
Archive.stage.value_counts()

                  1976
pupper             245
doggo               83
puppo               29
doggo, pupper       12
floofer              9
doggo, floofer       1
doggo, puppo         1
Name: stage, dtype: int64

#### Define issue to be solved

> Define: Merge The `retweet count` and `favourite count` with the archive dataset

In [199]:
# Code solution of issue
minedTweetData = pd.DataFrame(minedTweetData, columns=["tweet_id",'retweet_count', 'favorite_count']).to_csv('datasets/twitter_mined.csv')
Archive.tweet_id = Archive.tweet_id.astype('str')
Archive = pd.merge(Archive, minedTweetData,
                on=['tweet_id'], how='left')

float64
object
object
object


In [203]:
Archive

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,stage,retweet_count,favorite_count
0,8.92421e+17,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,
1,8.92177e+17,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,
2,8.91815e+17,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,
3,8.9169e+17,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,
4,8.91328e+17,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,6.66049e+17,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,
2352,6.66044e+17,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,
2353,6.66033e+17,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,
2354,6.66029e+17,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,


In [201]:
# Test solution
Archive.retweet_count.describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: retweet_count, dtype: float64

In [204]:
# Test solution
Archive.favorite_count.describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: favorite_count, dtype: float64

#### Define issue to be solved

> Define: Find the index of retweets, then remove retweets rows and [ '`retweeted_status_id`', '`retweeted_status_user_id`', '`retweeted_status_timestamp`'] columns

In [205]:
# Code solution of issue
retweets_index = list(Archive[Archive["retweeted_status_id"].isnull()==False].index)
Archive.drop(axis=0, index=retweets_index, inplace=True)

In [206]:
# Drop columns
Archive.drop(['retweeted_status_id','retweeted_status_user_id','retweeted_status_timestamp'],axis=1,inplace=True)

In [207]:
# Test solution
for retweet in retweets_index:
    if retweet in list(Archive.index):
        print('Retweet found')

#### Define the issue to be solved

> Define: Identify the index for `reply`, then remove them and the `in_reply_to_user_id`,`in_reply_to_status_id` columns

In [208]:
# Code solution for the issue
replys_index = list(Archive[Archive["in_reply_to_status_id"].isnull()==False].index)
Archive.drop(axis=0, index=replys_index, inplace=True)
Archive.drop(['in_reply_to_user_id','in_reply_to_status_id'],axis=1,inplace=True)

In [209]:
# Test the solution
for reply in replys_index:
    if reply in list(Archive.index):
        print('Found a reply')

#### Define the issue to be solved

> Define: Find the wrong values, and save them into 'temp' column temporarily , then convert the rating_ numerator column to float data type and add the right float number into it.

In [210]:
# Code the solution
Archive['temp'] = Archive.text.str.extract(r"([0-9]+[.][0-9]+/[0-9]+)")
Archive.rating_numerator = Archive.rating_numerator.astype(float)
wrong_numerator_index = list(Archive[Archive['temp'].isnull()==False].index)
for i in wrong_numerator_index:
    Archive.rating_numerator[i] = float(Archive.temp[i][:-3])
Archive.drop("temp", axis=1, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Archive.rating_numerator[i] = float(Archive.temp[i][:-3])


In [211]:
# Test the solution
for i in wrong_numerator_index:
    print(Archive.rating_numerator[i])

13.5
9.75
11.27
11.26


#### Define the issue to be solved

> Define: Find the list of wrong names and use for loop to give indvidual name "None".

In [212]:
# Code the solution
wrongNames = list(Archive.query('name == "a" or name == "an"').index)
for i in wrongNames:
    Archive.name[i] = "None"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Archive.name[i] = "None"


In [213]:
# Test the solution
for i in wrongNames:
    if Archive.name[i] != "None":
        print("wrong name")

#### Define the issue to be solved

> Define: Find erroneous list of rating_denominator != 10, then drop those rows

In [214]:
# Code the solution
wrongDenominator = list(Archive.query("rating_denominator != 10").index)
Archive.drop(index=wrongDenominator, inplace=True)

In [215]:
# Test the solution
Archive.query("rating_denominator != 10").index

Int64Index([], dtype='int64')

#### Define the issue to be solved

> Define: Change the data types for timestamp and tweet id

In [216]:
# Code the solution
Archive.timestamp = Archive.timestamp.astype('datetime64')
Archive.tweet_id = Archive.tweet_id.astype(str)

In [217]:
# Test the solution
Archive.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2080 entries, 0 to 2355
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            2080 non-null   object        
 1   timestamp           2080 non-null   datetime64[ns]
 2   source              2080 non-null   object        
 3   text                2080 non-null   object        
 4   expanded_urls       2077 non-null   object        
 5   rating_numerator    2080 non-null   float64       
 6   rating_denominator  2080 non-null   int64         
 7   name                2080 non-null   object        
 8   stage               2080 non-null   object        
 9   retweet_count       0 non-null      float64       
 10  favorite_count      0 non-null      float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(6)
memory usage: 195.0+ KB


#### Define the issue to be solved

> Define: Drop source column due to duplicates

In [218]:
# Code the solution
Archive.drop("source",axis=1,inplace=True)

In [219]:
# Test the solution
Archive.columns

Index(['tweet_id', 'timestamp', 'text', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'stage', 'retweet_count',
       'favorite_count'],
      dtype='object')

#### Define the issue to be solved

> Define: Identify rows that have three false values and drop them.

In [220]:
# Code the solution
falseValues = list(Image.query("p1_dog==False and p2_dog==False and p3_dog == False").index)
Image.drop(index=falseValues,inplace=True)

In [221]:
# Test the solution
Image.query("p1_dog==False and p2_dog==False and p3_dog == False").index

Int64Index([], dtype='int64')

#### Store the final Data

In [225]:
# save this for later
Image.to_csv('datasets/twitter_image.csv')

Archive.reset_index(drop=True)
Archive.to_csv('datasets/twitter_archive_master.csv')

df_Archive = pd.read_csv('datasets/twitter_archive_master.csv')
df_Image = pd.read_csv('datasets/twitter_image.csv')
df_Archive.drop('Unnamed: 0',axis=1, inplace=True)
df_Archive.head()

Unnamed: 0,tweet_id,timestamp,text,expanded_urls,rating_numerator,rating_denominator,name,stage,retweet_count,favorite_count
0,8.92421e+17,2017-08-01 16:23:56,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13.0,10,Phineas,,,
1,8.92177e+17,2017-08-01 00:17:27,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13.0,10,Tilly,,,
2,8.91815e+17,2017-07-31 00:18:03,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12.0,10,Archie,,,
3,8.9169e+17,2017-07-30 15:58:51,This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13.0,10,Darla,,,
4,8.91328e+17,2017-07-29 16:00:24,This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12.0,10,Franklin,,,


#### Analysis and Visualization

##### Check for columns with null values

In [None]:
# Check for null values
Archive.isnull().sum()

In [None]:
# Check for null values
Image.isnull().sum()

### Confirm the dropping of columns worked

In [None]:
# Check if null values were replaced
Archive.isnull().sum()

### Confirmation of the columns present in the new dataset

In [None]:
# View the index number and label for each column
view_index(Archive)

In [None]:
# View the index number and label for each column
view_index(Image)

In [None]:
Archive.describe()

In [None]:
Archive.info()

In [None]:
Image.describe()

In [None]:
Image.info()

<a id='eda'></a>
## Exploratory Data Analysis

In [None]:
# Load data to plot


### Univariate Visualizations

### Bivariate Visualizations

### Line graph plots for the data

In [None]:
# Line plot function

def line_plot(p1, p2, p3,
              q1, q2, q3,
              title, xlabel, ylabel,
              mainLabel1, mainLabel2, mainLabel3,
              label_1, label_2, label_3,):
    x1 = q1
    x2 = q2
    x3 = q3
    
    y1 = p1
    y2 = p2
    y3 = p3
    
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    
    line1 = plt.plot(x1, y1, mainLabel1, marker='o', color='blue')
    line2 = plt.plot(x2, y2, mainLabel2, marker='o', color='red')
    line3 = plt.plot(x3, y3, mainLabel3, marker='o', color='green')
    
    # Point Labels
    # e.g plt.text(x,y,'name')
    label_1
    label_2
    label_3
    
    leg = plt.legend(loc='lower left')
    plt.show()

### Histogram Plots for the data

In [None]:
# Bar plot function
def bar_plot(p1, p2, p3,
             q1, q2, q3,
             title, xlabel, ylabel,
             label_1, label_2, label_3,
             legend1, legend2, legend3):
    N = 6
    ind = np.arange(N) 
    width = 0.25

    x1 = q1
    x2 = q2
    x3 = q3
    
    y1 = p1
    y2 = p2
    y3 = p3
    
    bar1 = plt.bar(ind, x1, width, color = 'b')
    bar2 = plt.bar(ind+width, x2, width, color='r')
    bar3 = plt.bar(ind+width*2, x3, width, color = 'g')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    
    # Point Labels
    # e.g plt.text(x,y,'name')
    label_1
    label_2
    label_3

    plt.xticks(ind+width,[y1, y2, y3])
    plt.legend((bar1, bar2, bar3), (legend1, legend2, legend3))
    plt.show()

<a id='conclusions'></a>
## Conclusions