In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
%matplotlib inline
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer


# Gather Data 
- The dataset that you will be wrangling (and analyzing and visualizing) is the tweet archive of Twitter user @dog_rates, also known as WeRateDogs. WeRateDogs is a Twitter account that rates people's dogs with a humorous comment about the dog.

- Twitter archive enchaned.csv Twitter archive contains basic tweet data for all 5000+ of their tweets, but not everything.

- image predictions a table full of image predictions (the top three only) alongside each tweet ID, image URL, and the image number that corresponded to the most confident prediction

- twitter Api this additional data can be gathered by anyone from Twitter's API.
----

In [2]:
df_twitter_archive = pd.read_csv("twitter-archive-enhanced.csv")
df_image_prediction = pd.read_csv('image-predictions.tsv', sep='\t')

In [3]:
# read twitter api from text contain json data 
twitter_api= []
with open('tweet_json.txt') as file:
    for line in file:
        tweet = json.loads(line)
        try:
            tweet_id = tweet['id']
            retweet_count = tweet['retweet_count']
            favourites_count = tweet['favorite_count']
            followers_count = tweet['user']['followers_count']
            created_at = tweet['user']['created_at']
            twitter_api.append({"tweet_id":tweet_id,
                               "retweet_count":retweet_count,
                               "favourites_count":favourites_count,
                               "followers_count":followers_count,
                                "created_at":created_at
                               })
        except KeyError:
            print("Columns not here")

In [4]:
df_json = pd.DataFrame(twitter_api,columns=['tweet_id','followers_count','favourites_count','retweet_count','created_at'])

## Make A Copy OF Data
- Three New Data Frame Contain a copy of the orignal Data

In [5]:
clean_twitter_archive = df_twitter_archive.copy()
clean_image_prediction = df_image_prediction.copy()
clean_json= df_json.copy()

In [6]:
# This method return information about clean_twitter_archive
clean_twitter_archive.head(20)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,,,,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,,,,https://twitter.com/dog_rates/status/890609185...,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,14,10,Cassie,doggo,,,


In [7]:
clean_twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [8]:
# denominator should be 10
clean_twitter_archive['rating_denominator'].value_counts().sort_values(ascending=False)

10     2333
11        3
50        3
80        2
20        2
7         1
2         1
16        1
40        1
0         1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
70        1
Name: rating_denominator, dtype: int64

In [9]:
#should be from 0 to 10 sometimes people will give higher value than 10 
clean_twitter_archive['rating_numerator'].value_counts().sort_values(ascending=False)

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
121       1
80        1
204       1
24        1
143       1
99        1
45        1
27        1
17        1
1776      1
960       1
666       1
182       1
26        1
144       1
88        1
84        1
165       1
60        1
50        1
44        1
20        1
Name: rating_numerator, dtype: int64

In [10]:
clean_twitter_archive['name'].value_counts()

None         745
a             55
Charlie       12
Cooper        11
Lucy          11
Oliver        11
Lola          10
Penny         10
Tucker        10
Winston        9
Bo             9
Sadie          8
the            8
Bailey         7
Toby           7
Buddy          7
Daisy          7
an             7
Leo            6
Rusty          6
Oscar          6
Stanley        6
Koda           6
Jax            6
Bella          6
Dave           6
Jack           6
Scout          6
Milo           6
Alfie          5
            ... 
Donny          1
Malikai        1
Blipson        1
Mingus         1
Todo           1
Lorelei        1
Moofasa        1
Kulet          1
Jerome         1
Hamrick        1
Dante          1
Brat           1
Siba           1
Snoopy         1
Kellogg        1
Mya            1
Chadrick       1
Ralphé         1
Liam           1
Barry          1
Stephanus      1
Terrenth       1
Roscoe         1
Mike           1
Flurpson       1
Rudy           1
Pumpkin        1
Jessiga       

In [11]:
clean_twitter_archive['name'].isna().sum()

0

In [12]:
clean_twitter_archive.columns


Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')

In [13]:
# This Methods Return Counts value for col
col = ['doggo', 'floofer', 'pupper', 'puppo']
for columns in col:
    print(clean_twitter_archive[columns].value_counts())

None     2259
doggo      97
Name: doggo, dtype: int64
None       2346
floofer      10
Name: floofer, dtype: int64
None      2099
pupper     257
Name: pupper, dtype: int64
None     2326
puppo      30
Name: puppo, dtype: int64


In [14]:
# Nulls/None Values Represented as String value IN
print("sum of Nan Value: name columns ",clean_twitter_archive['name'].isna().sum())
print("sum of Nan Value doggo columns: ",clean_twitter_archive['doggo'].isna().sum())
print("sum of Nan Value floofer columns: ",clean_twitter_archive['floofer'].isna().sum())
print("sum of Nan Value pupper columns: ",clean_twitter_archive['pupper'].isna().sum())
print("sum of Nan Value puppo columns: ",clean_twitter_archive['puppo'].isna().sum())



sum of Nan Value: name columns  0
sum of Nan Value doggo columns:  0
sum of Nan Value floofer columns:  0
sum of Nan Value pupper columns:  0
sum of Nan Value puppo columns:  0


# Assess And Clean Data
## Data Quality Problem In clean_twitter_archive

- Invalid data Type. (type of `timestamp and retweeted_status_timestamp` are object should be Date Type) 
- `rating_numerator has many extremly value` I'll replace IT to 20 as Extremely Value Or more lovely dog just for analysis
- `rating_denominator` should not exceed 10 I'll replace all values to 10
- `name` has many Missing and Invalid Name like None,an,a,the I'll replace it with `NoT Found` 
- Missing Value In (```in_reply_to_status_id ,in_reply_to_user_id,retweeted_status_id,retweeted_status_timestamp,expanded_urls```)
- Nulls Represented as "None or Nan" String Value In `name ,doggo ,floofer ,pupper ,puppo,Most Columns` Should be None Not Text
- Inaccurate data : Like Nan and None are object data type Instead Of Null Type In Most columns

## Messey OR UnTidy Data In clean_twitter_archive
- kind of dogs(`doggo, floofer,pupper,puppo`)Should be in one column and each row should be an observation

-------------------------



In [15]:
# Invalid data Type. (type of timestamp and retweeted_status_timestamp are object should be Date Type)
clean_twitter_archive['timestamp'] =pd.to_datetime(clean_twitter_archive['timestamp']) 
clean_twitter_archive['retweeted_status_timestamp'] =pd.to_datetime(clean_twitter_archive['retweeted_status_timestamp']) 

In [16]:
# Test
print("timestamp type: ",clean_twitter_archive['timestamp'].dtype)
print("retweeted_status_timestamp type: ",clean_twitter_archive['retweeted_status_timestamp'].dtype)

timestamp type:  datetime64[ns, UTC]
retweeted_status_timestamp type:  datetime64[ns, UTC]


In [17]:
# rating_numerator has many extremly value I'll replace IT to 20 as Extremely Value Or more lovely dog just for analysis

In [18]:
clean_twitter_archive['rating_numerator'].value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [19]:
# df.loc[df['First Season'] > 1990, 'First Season'] = 1
clean_twitter_archive.loc[clean_twitter_archive['rating_numerator'] > 20, 'rating_numerator'] = 20

In [20]:
# Test Now we see that we got 25 result from 20 rating_result as Extrem Value
clean_twitter_archive['rating_numerator'].value_counts()

12    558
11    464
10    461
13    351
9     158
8     102
7      55
14     54
5      37
6      32
20     25
3      19
4      17
1       9
2       9
0       2
15      2
17      1
Name: rating_numerator, dtype: int64

In [21]:
# rating_denominator should not exceed 10 I'll replace all values to 10 all values should be 10

clean_twitter_archive['rating_denominator']=10

In [22]:
# Test
clean_twitter_archive['rating_denominator'].value_counts()

10    2356
Name: rating_denominator, dtype: int64

In [23]:
# Melt the the columns into values
clean_twitter_archive = pd.melt(clean_twitter_archive, id_vars = ['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name'], value_vars = ['doggo', 'floofer', 'pupper', 'puppo'], \
        var_name = 's', value_name = 'dog_kind')
 


In [24]:
# delete s columns
clean_twitter_archive = clean_twitter_archive.drop('s', 1)

In [25]:
# Test
clean_twitter_archive['dog_kind'].value_counts()

None       9030
pupper      257
doggo        97
puppo        30
floofer      10
Name: dog_kind, dtype: int64

In [26]:
# deleted duplicated value
clean_twitter_archive = clean_twitter_archive.drop_duplicates()

In [27]:
# Test dog_kind
clean_twitter_archive['dog_kind'].value_counts()

None       2356
pupper      257
doggo        97
puppo        30
floofer      10
Name: dog_kind, dtype: int64

In [28]:
# Test Duplicated
clean_twitter_archive.duplicated().sum()

0

-------------------------
## Now It's Time To Discover Clean_Json
---------------------

In [29]:
clean_json.head()

Unnamed: 0,tweet_id,followers_count,favourites_count,retweet_count,created_at
0,892420643555336193,3200889,39467,8853,Sun Nov 15 21:41:29 +0000 2015
1,892177421306343426,3200889,33819,6514,Sun Nov 15 21:41:29 +0000 2015
2,891815181378084864,3200889,25461,4328,Sun Nov 15 21:41:29 +0000 2015
3,891689557279858688,3200889,42908,8964,Sun Nov 15 21:41:29 +0000 2015
4,891327558926688256,3200889,41048,9774,Sun Nov 15 21:41:29 +0000 2015


In [30]:
clean_json.tail()

Unnamed: 0,tweet_id,followers_count,favourites_count,retweet_count,created_at
2349,666049248165822465,3201018,111,41,Sun Nov 15 21:41:29 +0000 2015
2350,666044226329800704,3201018,311,147,Sun Nov 15 21:41:29 +0000 2015
2351,666033412701032449,3201018,128,47,Sun Nov 15 21:41:29 +0000 2015
2352,666029285002620928,3201018,132,48,Sun Nov 15 21:41:29 +0000 2015
2353,666020888022790149,3201018,2535,532,Sun Nov 15 21:41:29 +0000 2015


In [31]:
clean_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 5 columns):
tweet_id            2354 non-null int64
followers_count     2354 non-null int64
favourites_count    2354 non-null int64
retweet_count       2354 non-null int64
created_at          2354 non-null object
dtypes: int64(4), object(1)
memory usage: 82.8+ KB


In [32]:
clean_json.describe()

Unnamed: 0,tweet_id,followers_count,favourites_count,retweet_count
count,2354.0,2354.0,2354.0,2354.0
mean,7.426978e+17,3200942.0,8080.968564,3164.797366
std,6.852812e+16,44.57302,11814.771334,5284.770364
min,6.660209e+17,3200799.0,0.0,0.0
25%,6.783975e+17,3200898.0,1415.0,624.5
50%,7.194596e+17,3200945.0,3603.5,1473.5
75%,7.993058e+17,3200953.0,10122.25,3652.0
max,8.924206e+17,3201018.0,132810.0,79515.0


In [33]:
# duplicated values
clean_json.duplicated().sum()

0

In [34]:
# return null values
clean_json.isna().sum()

tweet_id            0
followers_count     0
favourites_count    0
retweet_count       0
created_at          0
dtype: int64

In [35]:
clean_json.sample(5)

Unnamed: 0,tweet_id,followers_count,favourites_count,retweet_count,created_at
2285,667177989038297088,3201017,200,58,Sun Nov 15 21:41:29 +0000 2015
2287,667174963120574464,3201017,262,88,Sun Nov 15 21:41:29 +0000 2015
597,798686750113755136,3200898,0,2702,Sun Nov 15 21:41:29 +0000 2015
2035,671561002136281088,3201006,13679,7931,Sun Nov 15 21:41:29 +0000 2015
1644,683834909291606017,3200952,2880,1265,Sun Nov 15 21:41:29 +0000 2015


In [36]:
clean_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 5 columns):
tweet_id            2354 non-null int64
followers_count     2354 non-null int64
favourites_count    2354 non-null int64
retweet_count       2354 non-null int64
created_at          2354 non-null object
dtypes: int64(4), object(1)
memory usage: 82.8+ KB


-----------------

## Data Quality Problem In Clean_Json
- created_at should be datatime type
- I'll keep all tweet_id and follower and favourits ,retweet Int For analysis purpose 

## Messy UnTidy 
- We don't need Twitter Id because I have one on archived csv

----------------------

In [37]:
clean_json.created_at = pd.to_datetime(clean_json.created_at)

In [38]:
# Test created_at should be time now
clean_json['created_at'].dtypes

datetime64[ns, UTC]


# Now It's Time To Discover Clean_Image_Prediction

In [39]:
clean_image_prediction.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [40]:
clean_image_prediction.tail()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.22577,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True
2074,892420643555336193,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1,orange,0.097049,False,bagel,0.085851,False,banana,0.07611,False


In [41]:
clean_image_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 119.6+ KB


In [42]:
clean_image_prediction.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [43]:
# duplicated  url_jpg
clean_image_prediction['jpg_url'].duplicated().sum()

66

In [44]:
clean_image_prediction.isna().sum()

tweet_id    0
jpg_url     0
img_num     0
p1          0
p1_conf     0
p1_dog      0
p2          0
p2_conf     0
p2_dog      0
p3          0
p3_conf     0
p3_dog      0
dtype: int64

In [45]:
clean_image_prediction['p1'].value_counts()

golden_retriever             150
Labrador_retriever           100
Pembroke                      89
Chihuahua                     83
pug                           57
chow                          44
Samoyed                       43
toy_poodle                    39
Pomeranian                    38
cocker_spaniel                30
malamute                      30
French_bulldog                26
Chesapeake_Bay_retriever      23
miniature_pinscher            23
seat_belt                     22
German_shepherd               20
Siberian_husky                20
Staffordshire_bullterrier     20
web_site                      19
Cardigan                      19
beagle                        18
Shetland_sheepdog             18
Maltese_dog                   18
Eskimo_dog                    18
teddy                         18
Rottweiler                    17
Lakeland_terrier              17
Shih-Tzu                      17
Italian_greyhound             16
kuvasz                        16
          

In [46]:
clean_image_prediction['p1_dog'].value_counts()

True     1532
False     543
Name: p1_dog, dtype: int64

In [47]:
clean_image_prediction['p2'].value_counts()

Labrador_retriever                104
golden_retriever                   92
Cardigan                           73
Chihuahua                          44
Pomeranian                         42
French_bulldog                     41
Chesapeake_Bay_retriever           41
toy_poodle                         37
cocker_spaniel                     34
miniature_poodle                   33
Siberian_husky                     33
beagle                             28
collie                             27
Pembroke                           27
Eskimo_dog                         27
kuvasz                             26
Italian_greyhound                  22
Pekinese                           21
American_Staffordshire_terrier     21
Samoyed                            20
chow                               20
miniature_pinscher                 20
toy_terrier                        20
malinois                           20
Norwegian_elkhound                 19
Boston_bull                        19
Staffordshir

In [48]:
clean_image_prediction['p2_dog'].value_counts()

True     1553
False     522
Name: p2_dog, dtype: int64

In [49]:
clean_image_prediction['p3'].value_counts()

Labrador_retriever                79
Chihuahua                         58
golden_retriever                  48
Eskimo_dog                        38
kelpie                            35
kuvasz                            34
chow                              32
Staffordshire_bullterrier         32
cocker_spaniel                    31
beagle                            31
Pekinese                          29
toy_poodle                        29
Pomeranian                        29
Great_Pyrenees                    27
Chesapeake_Bay_retriever          27
Pembroke                          27
malamute                          26
French_bulldog                    26
American_Staffordshire_terrier    24
Cardigan                          23
pug                               23
basenji                           21
toy_terrier                       20
bull_mastiff                      20
Siberian_husky                    19
Boston_bull                       17
Shetland_sheepdog                 17
d

In [50]:
clean_image_prediction['p3_dog'].value_counts()

True     1499
False     576
Name: p3_dog, dtype: int64

### Data Quality Problem In Clean_Image_Prediction
- Data Types: There's no action needs here
- Duplicated Values In jpg_url 

### Data Untidy Problem In Clean_Image_Prediction
- unneccessary columns `jpg_url and img_num`
- We Don't need Three column to represnted algorithms p1_dog:p3_dog should be In one Columns

In [51]:
clean_image_prediction['jpg_url'].duplicated().sum()

66

In [52]:
clean_image_prediction['jpg_url'].drop_duplicates(inplace=True)

In [53]:
# Test drop_duplocated jpg
clean_image_prediction['jpg_url'].duplicated().sum()


0

In [54]:
# we don't need jpg at all
clean_image_prediction.drop('jpg_url', axis=1, inplace=True)

In [55]:
# Test 
try:
    print(clean_image_prediction['jpg_url'])
except KeyError:
    print("Not Here jpg_url")

Not Here jpg_url


In [56]:
# we don't need img_num 
clean_image_prediction.drop('img_num', axis=1, inplace=True)

In [57]:
# Test 
try:
    print(clean_image_prediction['img_num'])
except KeyError:
    print("Not Here img_num")

Not Here img_num


In [58]:
clean_image_prediction['p1_dog'] = clean_image_prediction['p1_dog'].replace(True, "p1_algo")

In [59]:
# Test to check If True changed
clean_image_prediction['p1_dog'].value_counts() 

p1_algo    1532
False       543
Name: p1_dog, dtype: int64

In [60]:
clean_image_prediction['p2_dog'] = clean_image_prediction['p2_dog'].replace(True, "p2_algo")
clean_image_prediction['p3_dog'] = clean_image_prediction['p3_dog'].replace(True, "p3_algo")

In [61]:
print(clean_image_prediction['p2_dog'].value_counts())
print("--"*35)
print(clean_image_prediction['p3_dog'].value_counts())

p2_algo    1553
False       522
Name: p2_dog, dtype: int64
----------------------------------------------------------------------
p3_algo    1499
False       576
Name: p3_dog, dtype: int64


In [62]:
clean_image_prediction = pd.melt(clean_image_prediction, id_vars = ['tweet_id','p1','p1_conf', 'p2','p2_conf',
       'p3','p3_conf'], value_vars = ['p1_dog', 'p2_dog', 'p3_dog'], var_name = 'w', value_name = 'type_algorithm')

In [63]:
clean_image_prediction.head()

Unnamed: 0,tweet_id,p1,p1_conf,p2,p2_conf,p3,p3_conf,w,type_algorithm
0,666020888022790149,Welsh_springer_spaniel,0.465074,collie,0.156665,Shetland_sheepdog,0.061428,p1_dog,p1_algo
1,666029285002620928,redbone,0.506826,miniature_pinscher,0.074192,Rhodesian_ridgeback,0.07201,p1_dog,p1_algo
2,666033412701032449,German_shepherd,0.596461,malinois,0.138584,bloodhound,0.116197,p1_dog,p1_algo
3,666044226329800704,Rhodesian_ridgeback,0.408143,redbone,0.360687,miniature_pinscher,0.222752,p1_dog,p1_algo
4,666049248165822465,miniature_pinscher,0.560311,Rottweiler,0.243682,Doberman,0.154629,p1_dog,p1_algo


In [64]:
# drop unneccessary columns w
clean_image_prediction.drop('w', axis=1, inplace=True)

In [65]:
# test
clean_image_prediction.head()

Unnamed: 0,tweet_id,p1,p1_conf,p2,p2_conf,p3,p3_conf,type_algorithm
0,666020888022790149,Welsh_springer_spaniel,0.465074,collie,0.156665,Shetland_sheepdog,0.061428,p1_algo
1,666029285002620928,redbone,0.506826,miniature_pinscher,0.074192,Rhodesian_ridgeback,0.07201,p1_algo
2,666033412701032449,German_shepherd,0.596461,malinois,0.138584,bloodhound,0.116197,p1_algo
3,666044226329800704,Rhodesian_ridgeback,0.408143,redbone,0.360687,miniature_pinscher,0.222752,p1_algo
4,666049248165822465,miniature_pinscher,0.560311,Rottweiler,0.243682,Doberman,0.154629,p1_algo


In [66]:
clean_image_prediction['type_algorithm'].value_counts()

False      1641
p2_algo    1553
p1_algo    1532
p3_algo    1499
Name: type_algorithm, dtype: int64

In [67]:
clean_image_prediction.duplicated().sum()

809

In [68]:
#delete duplicated Values


In [69]:
clean_image_prediction.drop_duplicates(keep=False,inplace=True)

In [70]:
clean_image_prediction.duplicated().sum()

0

In [71]:
clean_image_prediction['tweet_id'].drop_duplicates(keep=False,inplace=True)

In [72]:
clean_image_prediction['tweet_id'].duplicated().sum()

0

-----------------------------
## Now Data Is Ready  To Combine Them All For Visualizing  
--------------------

In [75]:
# save clean data to visualize CSV
clean_image_prediction.to_csv("image_visualize.csv", index=False)
clean_json.to_csv("twitter_visualize.csv", index=False)
clean_twitter_archive.to_csv("archive_visualize.csv", index=False)
