# Tweets data wrangling
#### Imports

In [28]:
import pandas as pd
import numpy as np
import requests
import os
import yaml
import json
import pprint
import tweepy
import time

## Gather
### Twitter Archive

In [29]:
# Load the enhanced Twitter archive from the local CSV file
# (relative path, resolved against the current working directory).
df_twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
# Show the first rows as a quick sanity check of the loaded columns.
df_twitter_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


### Tweets image prediction

In [30]:
# Download the image-prediction TSV once and cache it under its remote file name.
image_pred_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
file_name = image_pred_url.split('/')[-1]

if not os.path.isfile(file_name):
    # Retrieve the file content from the web; the timeout keeps the cell from hanging forever
    response = requests.get(image_pred_url, timeout=30)
    # requests does not raise on 4xx/5xx by itself: fail here so an error page is never
    # written to disk and then silently reused as the "cached" data on the next run
    response.raise_for_status()
    # And save it in a tsv file
    with open(file_name, mode='wb') as file:
        file.write(response.content)

# Create tweet image prediction DataFrame from the downloaded tsv file
df_tweets_img_pred = pd.read_csv(file_name, sep='\t')
df_tweets_img_pred.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


### Twitter API

In [31]:
# Read the Twitter credentials from a local yaml file so they never appear in the notebook
with open('twitter_api_keys.yaml', mode='r') as file:
    twitter_cred = yaml.load(file, Loader=yaml.FullLoader)

# Pull the four credentials out of the mapping in one pass
consumer_key, consumer_secret, access_token, access_secret = (
    twitter_cred[key]
    for key in ('APIKEY', 'APIKEYSECRET', 'ACCESSTOKEN', 'ACCESSTOKENSECRET')
)

# Build the authenticated tweepy client; it is configured to wait when the rate limit is hit
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Query the Twitter API for every archived tweet and cache the raw JSON, one tweet per line.
# The download only runs when the cache file is missing or overwrite_json_file is set.
tweet_json_path = 'tweet_json.txt'
overwrite_json_file = False
if overwrite_json_file or not os.path.isfile(tweet_json_path):
    with open(tweet_json_path, 'w') as outfile:
        # Iterate the ids directly: the row index is not needed, and Series.iteritems()
        # no longer exists in pandas >= 2.0
        for tweet_id in df_twitter_archive.tweet_id:
            start_time = time.time()
            try:
                tweet_status = api.get_status(tweet_id, tweet_mode='extended')
            except tweepy.TweepError as e:
                # Best effort: report tweets that can't be fetched and move on
                print("Tweet {} couldn\'t be retrieved from Twitter API with error: \"{}\"".format(tweet_id, e))
                continue

            # Terminate every record with its own newline. Writing the separator *before*
            # each record except index 0 produced a leading blank line whenever the first
            # tweet failed, which then broke json.loads() when reading the file back.
            outfile.write(json.dumps(tweet_status._json) + "\n")

            print("Tweet {} retrieved from Twitter API in {:.3f} seconds".format(tweet_id, time.time() - start_time))

In [32]:
# Read the cached tweet JSON (one object per line) and add the retweet / favorite counts
# to the Twitter archive DataFrame. Nothing happens when the cache file is missing.
if os.path.isfile(tweet_json_path):
    retweets_by_id = {}
    favorites_by_id = {}
    with open(tweet_json_path, 'r') as json_file:
        for json_line in json_file:
            # Tolerate blank lines (e.g. a leading or trailing newline in the cache file)
            if not json_line.strip():
                continue
            tweet_json_data = json.loads(json_line)
            tweet_id = tweet_json_data['id']
            # If an id appears more than once, the last line wins
            retweets_by_id[tweet_id] = tweet_json_data['retweet_count']
            favorites_by_id[tweet_id] = tweet_json_data['favorite_count']

    # One vectorised lookup per column instead of two full-frame boolean scans per JSON line.
    # Tweets missing from the cache map to NaN; going through float64 first lets the
    # nullable Int64 dtype keep them as <NA> while the counts stay integers.
    df_twitter_archive['retweet_count'] = (
        df_twitter_archive.tweet_id.map(retweets_by_id).astype('float64').astype('Int64')
    )
    df_twitter_archive['favorite_count'] = (
        df_twitter_archive.tweet_id.map(favorites_by_id).astype('float64').astype('Int64')
    )

In [33]:
# Check dtypes and non-null counts now that retweet_count / favorite_count have been added
df_twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

## Assess
### Data Quality
#### df_twitter_archive

- source column should be a human-readable category instead of raw HTML code
- timestamp and retweeted_status_timestamp should be datetimes instead of strings
- rating_denominator is not always 10
- Some rating_numerator values don't follow the Dog Rates convention
- \*_status_id and \*_user_id should be int instead of float
- Some expanded_urls are missing
- Some names are missing ('None')
- p1 (dog breed name) strings should be formatted consistently
- Many dogs have 'a' as name
- Some dogs have multiple dog stages (doggo, floofer, pupper or puppo)
- Some tweets have missing retweet_count and favorite_count
- There are fewer tweet rows in df_tweets_img_pred than in df_twitter_archive

#### df_tweets_img_pred


### Data tidiness

- doggo, floofer, pupper and puppo should be one categorical variable
- Dog breed with the highest probability and the dog flag variable from df_tweets_img_pred should be part of df_twitter_archive

In [34]:
# Look at 10 random rows for visual assessment (unseeded, so the rows differ on every run)
df_twitter_archive.sample(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweet_count,favorite_count
709,785170936622350336,,,2016-10-09 17:31:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Hero. He was enjoying the car ride unt...,,,,https://twitter.com/dog_rates/status/785170936...,11,10,Hero,,,,,4751,11892
1942,673711475735838725,,,2015-12-07 03:51:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...",🎶 HELLO FROM THE OTHER SIIIIIIIIDE 🎶 10/10 htt...,,,,https://twitter.com/dog_rates/status/673711475...,10,10,,,,,,278,982
747,779834332596887552,,,2016-09-25 00:06:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Scout. He really wants to kiss himself...,,,,https://twitter.com/dog_rates/status/779834332...,11,10,Scout,,,,,6900,18597
888,759846353224826880,,,2016-07-31 20:21:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Kirby. He's a Beneblip Cumberpat. Pret...,,,,https://twitter.com/dog_rates/status/759846353...,11,10,Kirby,,,,,1916,6584
1185,718613305783398402,,,2016-04-09 01:35:37 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Carper. He's a Tortellini Angiosperm. ...,,,,https://twitter.com/dog_rates/status/718613305...,11,10,Carper,,,,,460,2352
928,754747087846248448,,,2016-07-17 18:38:22 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Keith. He's pursuing a more 2D lifesty...,,,,https://twitter.com/dog_rates/status/754747087...,12,10,Keith,,,,,502,2531
71,878776093423087618,,,2017-06-25 00:45:22 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Snoopy. He's a proud #PrideMonthPuppo....,,,,https://twitter.com/dog_rates/status/878776093...,13,10,Snoopy,,,,puppo,3616,17741
2208,668625577880875008,,,2015-11-23 03:02:14 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Maks. Maks just noticed something wasn...,,,,https://twitter.com/dog_rates/status/668625577...,10,10,Maks,,,,,119,352
1258,710283270106132480,,,2016-03-17 01:55:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Gunner. He's a Figamus Newton. King of...,,,,https://twitter.com/dog_rates/status/710283270...,11,10,Gunner,,,,,501,2085
470,816816676327063552,,,2017-01-05 01:20:46 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Timber. He misses Christmas. Specifica...,,,,https://twitter.com/dog_rates/status/816816676...,12,10,Timber,,,,,2004,9879


In [35]:
# Re-check dtypes and non-null counts for the assessment (same call as in the Gather section)
df_twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [36]:
# Summary statistics of the numeric columns, used to spot odd rating values and count ranges
df_twitter_archive.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator,retweet_count,favorite_count
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0,2331.0,2331.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433,2609.335049,7353.651652
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237,4414.651029,11421.088202
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0,1.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0,529.0,1277.5
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0,1219.0,3192.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0,3028.5,8999.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0,74969.0,151551.0


In [37]:
# Frequency of each dog name; surfaces placeholder values such as 'None' and 'a'
df_twitter_archive.name.value_counts()

None       745
a           55
Charlie     12
Lucy        11
Oliver      11
          ... 
Goliath      1
Combo        1
Mingus       1
Grey         1
Lolo         1
Name: name, Length: 957, dtype: int64

In [38]:
# Distinct values of the source column (stored as raw HTML anchors) and their frequency
df_twitter_archive.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

In [39]:
# Count the combinations of stage flags to find tweets tagged with more than one stage
# NOTE(review): the floofer column is not part of this check
df_twitter_archive[['doggo', 'pupper', 'puppo']].value_counts()

doggo  pupper  puppo
None   None    None     1985
       pupper  None      245
doggo  None    None       84
None   None    puppo      29
doggo  pupper  None       12
       None    puppo       1
dtype: int64

In [40]:
# Look at 10 random prediction rows for visual assessment (unseeded, so rows differ on every run)
df_tweets_img_pred.sample(10)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
334,672169685991993344,https://pbs.twimg.com/media/CVQGv-vUwAEUjCj.jpg,1,cocker_spaniel,0.991011,True,Sussex_spaniel,0.004032,True,miniature_poodle,0.001276,True
1541,791026214425268224,https://pbs.twimg.com/media/CpmyNumW8AAAJGj.jpg,1,malamute,0.375098,True,jean,0.069362,False,keeshond,0.050528,True
1684,814153002265309185,https://pbs.twimg.com/media/C0xz04SVIAAeyDb.jpg,1,golden_retriever,0.490068,True,Labrador_retriever,0.291956,True,chow,0.072475,True
745,687494652870668288,https://pbs.twimg.com/media/CYp4vFrVAAEs9AX.jpg,1,Rottweiler,0.391471,True,miniature_pinscher,0.273595,True,Tibetan_mastiff,0.041692,True
230,670408998013820928,https://pbs.twimg.com/media/CU3FbQgVAAACdCQ.jpg,1,ping-pong_ball,0.999945,False,tennis_ball,1.8e-05,False,racket,1.5e-05,False
679,683742671509258241,https://pbs.twimg.com/media/CX0kVRxWYAAWWZi.jpg,1,Pembroke,0.895279,True,Cardigan,0.022385,True,cocker_spaniel,0.017045,True
43,666776908487630848,https://pbs.twimg.com/media/CUDeDoWUYAAD-EM.jpg,1,seat_belt,0.375057,False,miniature_pinscher,0.167175,True,Chihuahua,0.086951,True
1382,765395769549590528,https://pbs.twimg.com/media/Cp87Y0jXYAQyjuV.jpg,1,Pembroke,0.509491,True,Cardigan,0.330401,True,Shetland_sheepdog,0.038875,True
71,667200525029539841,https://pbs.twimg.com/media/CUJfVMPXIAAgbue.jpg,1,Siberian_husky,0.694904,True,malamute,0.232006,True,Eskimo_dog,0.050635,True


In [41]:
# Check dtypes, non-null counts and the number of rows of the image predictions
df_tweets_img_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [42]:
# Summary statistics of the numeric prediction columns (image number and confidences)
df_tweets_img_pred.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [43]:
# Frequency of the top-1 predicted label (p1); shows inconsistent capitalisation and non-dog labels
df_tweets_img_pred.p1.value_counts()

golden_retriever      150
Labrador_retriever    100
Pembroke               89
Chihuahua              83
pug                    57
                     ... 
conch                   1
ping-pong_ball          1
fiddler_crab            1
candle                  1
quilt                   1
Name: p1, Length: 378, dtype: int64

## Clean
### Save a copy of the original data

In [48]:
# Clean on deep copies so the gathered DataFrames stay untouched for reference
df_twitter_archive_clean, df_tweets_img_pred_clean = (
    df_twitter_archive.copy(deep=True),
    df_tweets_img_pred.copy(deep=True),
)

In [49]:
# Inspect the working copy before any cleaning step is applied
df_twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

### Dog breed with the highest probability and the dog flag variable from df_tweets_img_pred should be part of df_twitter_archive
This also resolves the row-count mismatch between df_tweets_img_pred and df_twitter_archive.
#### Define
Merge p1 and p1_dog from df_tweets_img_pred_clean into df_twitter_archive_clean on the tweet_id column and rename the merged columns. Keep only the rows present in df_tweets_img_pred, since it contains fewer tweets than df_twitter_archive. This way, each tweet will have a dog prediction.
#### Code

In [50]:
# Attach the top-1 image prediction (p1, p1_dog) to every tweet.
# The right join keeps exactly the tweets that have an image prediction.
df_twitter_archive_clean = (
    df_twitter_archive_clean
    .merge(df_tweets_img_pred_clean[['tweet_id', 'p1', 'p1_dog']],
           how='right', on='tweet_id')
    # Store the dog flag with the nullable boolean dtype
    .astype({'p1_dog': 'boolean'})
    .rename(columns={'p1': 'pred_dog_breed', 'p1_dog': 'pred_is_dog'})
)

#### Test

In [51]:
# The row count should now match df_tweets_img_pred and the two pred_* columns should be listed
df_twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2074
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2075 non-null   int64  
 1   in_reply_to_status_id       23 non-null     float64
 2   in_reply_to_user_id         23 non-null     float64
 3   timestamp                   2075 non-null   object 
 4   source                      2075 non-null   object 
 5   text                        2075 non-null   object 
 6   retweeted_status_id         81 non-null     float64
 7   retweeted_status_user_id    81 non-null     float64
 8   retweeted_status_timestamp  81 non-null     object 
 9   expanded_urls               2075 non-null   object 
 10  rating_numerator            2075 non-null   int64  
 11  rating_denominator          2075 non-null   int64  
 12  name                        2075 non-null   object 
 13  doggo                       2075 

### doggo, floofer, pupper and puppo should be one categorical variable
This also resolves the issue of some dogs having multiple dog stages (doggo, floofer, pupper or puppo).
#### Define
Merge doggo, floofer, pupper and puppo into one dog_stage categorical variable. If a dog is classified with multiple stages, give priority to the older stage (doggo > pupper > puppo).
#### Code

In [52]:
# Collapse the doggo / pupper / puppo flag columns into a single dog_stage category.
# Start from an object Series so the stage strings can be stored without a dtype change
# (assigning strings cell by cell into a float64 NaN column is slow and deprecated).
# NOTE(review): the floofer column is not folded into dog_stage here - confirm this is intended.
dog_stage = pd.Series(np.nan, index=df_twitter_archive_clean.index, dtype=object)

# Apply the stages from lowest to highest priority so that a later (older) stage
# overwrites an earlier one: the result follows doggo > pupper > puppo.
for stage in ('puppo', 'pupper', 'doggo'):
    dog_stage.loc[df_twitter_archive_clean[stage] == stage] = stage

df_twitter_archive_clean['dog_stage'] = dog_stage.astype('category')
df_twitter_archive_clean[['doggo', 'pupper', 'puppo', 'dog_stage']].sample(10)

Unnamed: 0,doggo,pupper,puppo,dog_stage
211,,,,
1136,,,,
372,,,,
1322,doggo,,,doggo
1921,,,,
998,,,,
1342,,,,
905,,pupper,,pupper
1501,,,,
291,,,,


In [53]:
# The three flag columns are now redundant with dog_stage, so remove them
df_twitter_archive_clean = df_twitter_archive_clean.drop(['doggo', 'pupper', 'puppo'], axis=1)

#### Test

In [54]:
# Number of tweets per stage; tweets without a stage are NaN and are not counted here
df_twitter_archive_clean.dog_stage.value_counts()

pupper    211
doggo      80
puppo      23
Name: dog_stage, dtype: int64

In [55]:
# doggo, pupper and puppo should be gone and dog_stage should be listed instead
df_twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2074
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   tweet_id                    2075 non-null   int64   
 1   in_reply_to_status_id       23 non-null     float64 
 2   in_reply_to_user_id         23 non-null     float64 
 3   timestamp                   2075 non-null   object  
 4   source                      2075 non-null   object  
 5   text                        2075 non-null   object  
 6   retweeted_status_id         81 non-null     float64 
 7   retweeted_status_user_id    81 non-null     float64 
 8   retweeted_status_timestamp  81 non-null     object  
 9   expanded_urls               2075 non-null   object  
 10  rating_numerator            2075 non-null   int64   
 11  rating_denominator          2075 non-null   int64   
 12  name                        2075 non-null   object  
 13  floofer           

### Some tweets have missing retweet_count and favorite_count
#### Define
Since some counts are already equal or close to 0, replace all the missing counts with 0.

#### Code

In [56]:
# Replace the missing retweet / favorite counts by 0.
# Assign the result back to the frame: calling fillna(..., inplace=True) on a column
# selected from the frame is a chained in-place update, which no longer modifies the
# DataFrame once pandas Copy-on-Write is active.
count_columns = ['retweet_count', 'favorite_count']
df_twitter_archive_clean[count_columns] = df_twitter_archive_clean[count_columns].fillna(0)

#### Test

In [57]:
# retweet_count and favorite_count should no longer contain missing values
df_twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2074
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   tweet_id                    2075 non-null   int64   
 1   in_reply_to_status_id       23 non-null     float64 
 2   in_reply_to_user_id         23 non-null     float64 
 3   timestamp                   2075 non-null   object  
 4   source                      2075 non-null   object  
 5   text                        2075 non-null   object  
 6   retweeted_status_id         81 non-null     float64 
 7   retweeted_status_user_id    81 non-null     float64 
 8   retweeted_status_timestamp  81 non-null     object  
 9   expanded_urls               2075 non-null   object  
 10  rating_numerator            2075 non-null   int64   
 11  rating_denominator          2075 non-null   int64   
 12  name                        2075 non-null   object  
 13  floofer           

In [58]:
# After the fill, count should equal the number of rows and the minimum of both columns is 0
df_twitter_archive_clean[['retweet_count', 'favorite_count']].describe()

Unnamed: 0,retweet_count,favorite_count
count,2075.0,2075.0
mean,2475.830843,7705.85012
std,4348.555332,11711.884463
min,0.0,0.0
25%,523.5,1434.5
50%,1170.0,3380.0
75%,2850.5,9556.0
max,74969.0,151551.0


## Analyze