#### Imports 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import requests

# Phase 1: **Data Wrangling**

#### Reading and Collecting Files

##### 1. `twitter-archive-enhanced.csv` 

In [3]:
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

##### 2. `image-predictions.csv`

In [4]:
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"

req = requests.get(url)

with open(url.split('/')[-1], mode = 'wb') as file:

    file.write(req.content)

In [5]:
image_predictions = pd.read_csv('image-predictions.tsv', sep="\t")

##### 3. `tweet-json.json`

In [6]:
tweet_json = pd.read_json('tweet-json.json', lines=True)

----

# Phase 2: **Data Assessment**

## **Assesment Report** ⏩

### **Quality issues:**
1. `image-predictions` have images that don't represent dogs, they should all be dropped 

2. Columns containing NaN Values

3. Inaccurate data in `names` column in `twitter_archived` as 55 dogs are named 'a'

4.

5.

6.

7.

8.


### **Tidiness issues:**
1. `text` column in `twitter-archive` violated 'Column-Variable Principle

2. Unnecessary columns in `twitter-archive` and `tweet_json`

### 1. `twitter_archive`

##### **Tidiness Issue 1**: `text` column contains two variables: `text-content` and `tweet-link` they should be split up using 'string split' due to the Tidiness Principle 'Column-Variable Principle'

In [7]:
list(twitter_archive)

['tweet_id',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'timestamp',
 'source',
 'text',
 'retweeted_status_id',
 'retweeted_status_user_id',
 'retweeted_status_timestamp',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo']

In [8]:
twitter_archive['in_reply_to_status_id'].isna().value_counts()

in_reply_to_status_id
True     2278
False      78
Name: count, dtype: int64

In [9]:
twitter_archive['text'].iloc[1]

"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV"

In [10]:
twitter_archive.sample(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1625,684830982659280897,,,2016-01-06 20:16:44 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",This little fella really hates stairs. Prefers...,,,,https://vine.co/v/eEZXZI1rqxX,13,10,,,,pupper,
1434,697270446429966336,,,2016-02-10 04:06:43 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bentley. He got stuck on his 3rd homew...,,,,https://twitter.com/dog_rates/status/697270446...,10,10,Bentley,,,,
1789,677547928504967168,,,2015-12-17 17:56:29 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Not much to say here. I just think everyone ne...,,,,https://twitter.com/dog_rates/status/677547928...,12,10,,,,,
1878,675047298674663426,,,2015-12-10 20:19:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a fluffy albino Bacardi Columbia mix. ...,,,,https://twitter.com/dog_rates/status/675047298...,11,10,a,,,,
36,885311592912609280,,,2017-07-13 01:35:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Lilly. She just paralle...,8.305833e+17,4196984000.0,2017-02-12 01:04:29 +0000,https://twitter.com/dog_rates/status/830583320...,13,10,Lilly,,,,
180,857062103051644929,,,2017-04-26 02:41:43 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @AaronChewning: First time wearing my @dog_...,8.570611e+17,58709720.0,2017-04-26 02:37:47 +0000,https://twitter.com/AaronChewning/status/85706...,13,10,,,,,
1332,705475953783398401,,,2016-03-03 19:32:29 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Zara. She found a sandal and coul...,,,,https://twitter.com/dog_rates/status/705475953...,12,10,Zara,,,,
1778,677895101218201600,,,2015-12-18 16:56:01 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Guys this was terrifying. Really spooked me up...,,,,https://twitter.com/dog_rates/status/677895101...,9,10,,,,,
292,838083903487373313,,,2017-03-04 17:49:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Daisy. She's puppears to be rare as al...,,,,https://twitter.com/dog_rates/status/838083903...,13,10,Daisy,,,,
1549,689255633275777024,,,2016-01-19 01:18:43 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",This is Ferg. He swallowed a chainsaw. 1 like ...,,,,https://vine.co/v/iOL792n5hz2,10,10,Ferg,,,,


In [11]:
twitter_archive = twitter_archive.drop(columns=['in_reply_to_status_id', 'in_reply_to_user_id'])

In [14]:
twitter_archive['retweeted_status_id'].isna().value_counts()

retweeted_status_id
True     2175
False     181
Name: count, dtype: int64

In [15]:
list(twitter_archive)

['tweet_id',
 'timestamp',
 'source',
 'text',
 'retweeted_status_id',
 'retweeted_status_user_id',
 'retweeted_status_timestamp',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo']

In [16]:
list(image_predictions)

['tweet_id',
 'jpg_url',
 'img_num',
 'p1',
 'p1_conf',
 'p1_dog',
 'p2',
 'p2_conf',
 'p2_dog',
 'p3',
 'p3_conf',
 'p3_dog']

In [17]:
list(tweet_json)

['created_at',
 'id',
 'id_str',
 'full_text',
 'truncated',
 'display_text_range',
 'entities',
 'extended_entities',
 'source',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'in_reply_to_screen_name',
 'user',
 'geo',
 'coordinates',
 'place',
 'contributors',
 'is_quote_status',
 'retweet_count',
 'favorite_count',
 'favorited',
 'retweeted',
 'possibly_sensitive',
 'possibly_sensitive_appealable',
 'lang',
 'retweeted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'quoted_status']

In [20]:
image_predictions['img_num'].value_counts()

img_num
1    1780
2     198
3      66
4      31
Name: count, dtype: int64

In [22]:
tweet_json = tweet_json.drop(columns=['retweeted_status', 'quoted_status_id', 'quoted_status', 'quoted_status_id_str'])

In [40]:
tweet_json.columns

Index(['created_at', 'id', 'id_str', 'full_text', 'truncated',
       'display_text_range', 'entities', 'extended_entities', 'source', 'user',
       'is_quote_status', 'retweet_count', 'favorite_count', 'favorited',
       'retweeted', 'possibly_sensitive', 'possibly_sensitive_appealable',
       'lang'],
      dtype='object')

In [31]:
tweet_json.drop(columns=['geo'], inplace=True)

In [39]:
tweet_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   created_at                     2354 non-null   datetime64[ns, UTC]
 1   id                             2354 non-null   int64              
 2   id_str                         2354 non-null   int64              
 3   full_text                      2354 non-null   object             
 4   truncated                      2354 non-null   bool               
 5   display_text_range             2354 non-null   object             
 6   entities                       2354 non-null   object             
 7   extended_entities              2073 non-null   object             
 8   source                         2354 non-null   object             
 9   user                           2354 non-null   object             
 10  is_quote_status         

In [35]:
tweet_json.drop(columns=['coordinates', 'place', 'contributors'], inplace=True)

In [38]:
tweet_json.drop(columns=['in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name'], inplace=True)

In [41]:
image_predictions

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [68]:
tweet_json.isna().sum()

created_at            0
id                    0
id_str                0
full_text             0
display_text_range    0
entities              0
source                0
user                  0
is_quote_status       0
retweet_count         0
favorite_count        0
favorited             0
retweeted             0
lang                  0
dtype: int64

In [55]:
tweet_json.drop(columns=['possibly_sensitive', 'possibly_sensitive_appealable'], inplace=True)

In [62]:
for column in list(tweet_json):
    print(tweet_json[column].value_counts())
    print('-' * 30)

created_at
2017-08-01 16:23:56+00:00    1
2016-01-13 02:43:46+00:00    1
2016-01-15 02:41:12+00:00    1
2016-01-15 02:08:05+00:00    1
2016-01-15 01:25:33+00:00    1
                            ..
2016-09-10 23:54:11+00:00    1
2016-09-10 16:03:16+00:00    1
2016-09-09 18:31:54+00:00    1
2016-09-08 20:45:53+00:00    1
2015-11-15 22:32:08+00:00    1
Name: count, Length: 2354, dtype: int64
------------------------------
id
892420643555336193    1
687102708889812993    1
687826841265172480    1
687818504314159109    1
687807801670897665    1
                     ..
774757898236878852    1
774639387460112384    1
774314403806253056    1
773985732834758656    1
666020888022790149    1
Name: count, Length: 2354, dtype: int64
------------------------------
id_str
892420643555336192    1
687102708889812992    1
687826841265172480    1
687818504314159104    1
687807801670897664    1
                     ..
774757898236878848    1
774639387460112384    1
774314403806253056    1
7739857328347586

In [63]:
tweet_json.drop(columns='truncated', inplace=True)

In [65]:
list(tweet_json)

['created_at',
 'id',
 'id_str',
 'full_text',
 'display_text_range',
 'entities',
 'extended_entities',
 'source',
 'user',
 'is_quote_status',
 'retweet_count',
 'favorite_count',
 'favorited',
 'retweeted',
 'lang']

In [67]:
tweet_json.drop(columns='extended_entities', inplace=True)

In [69]:
tweet_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   created_at          2354 non-null   datetime64[ns, UTC]
 1   id                  2354 non-null   int64              
 2   id_str              2354 non-null   int64              
 3   full_text           2354 non-null   object             
 4   display_text_range  2354 non-null   object             
 5   entities            2354 non-null   object             
 6   source              2354 non-null   object             
 7   user                2354 non-null   object             
 8   is_quote_status     2354 non-null   bool               
 9   retweet_count       2354 non-null   int64              
 10  favorite_count      2354 non-null   int64              
 11  favorited           2354 non-null   bool               
 12  retweeted           2354 non-null 

In [70]:
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   timestamp                   2356 non-null   object 
 2   source                      2356 non-null   object 
 3   text                        2356 non-null   object 
 4   retweeted_status_id         181 non-null    float64
 5   retweeted_status_user_id    181 non-null    float64
 6   retweeted_status_timestamp  181 non-null    object 
 7   expanded_urls               2297 non-null   object 
 8   rating_numerator            2356 non-null   int64  
 9   rating_denominator          2356 non-null   int64  
 10  name                        1611 non-null   object 
 11  doggo                       97 non-null     object 
 12  floofer                     10 non-null     object 
 13  pupper                      257 n

In [71]:
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [76]:
twitter_archive['name'].value_counts()

name
a             55
Charlie       12
Oliver        11
Cooper        11
Lucy          11
              ..
Aqua           1
Chase          1
Meatball       1
Rorie          1
Christoper     1
Name: count, Length: 956, dtype: int64

In [81]:
twitter_archive['name'].sample(10)

736            NaN
97          Sierra
750         Reggie
23          Canela
2066             a
6              Jax
950          Brody
2315    Christoper
1428           NaN
1115       Aldrick
Name: name, dtype: object

In [83]:
twitter_archive['text'].iloc[2066]

"This is a Helvetica Listerine named Rufus. This time Rufus will be ready for the UPS guy. He'll never expect it 9/10 https://t.co/34OhVhMkVr"