# Project: Wrangling and Analyze Data

In [1]:
# import installed packages
import pandas as pd
import numpy as np
import requests
import json
import tweepy
from tweepy import OAuthHandler
from keys import *
from bs4 import BeautifulSoup
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# setup tweepy
import tweepy

consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth)

In [3]:
# function that is reusable
def get_image_predictions_tsv():
    url = " https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
    response = requests.get(url)
    with open('image-predictions.tsv', 'wb') as file:
        file.write(response.content)

## Data Gathering


In [4]:
# Gather data from memory
tweets_df = pd.read_csv("twitter-archive-enhanced.csv")
images_df = pd.read_csv("image-predictions.tsv", sep="\t")

In [5]:
# Get each tweet's status string using Tweepy 
with open('tweet_json.txt', mode='a') as file:
        for tweet_id in tweets_df['tweet_id']:
            try:
                tweet = api.get_status(tweet_id, tweet_mode='extended')
                json.dump(tweet._json, file)
                file.write('\n')
            except:
                continue

# Create a DataFrame with tweet_id, retweet_count and favorite_count for each tweet
twitter_data_list = []

for line in open('tweet_json.txt', 'r'):
    twitter_data = json.loads(line)
    twitter_data_list.append({
        'tweet_id': twitter_data['id_str'],
        'retweet_count': twitter_data['retweet_count'],
        'favorite_count': twitter_data['favorite_count']
    })

In [6]:
tweets_meta_df = pd.DataFrame(twitter_data_list, columns=['tweet_id', 'retweet_count', 'favorite_count'])

## Assessing Data


In [7]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [8]:
tweets_df.head(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,,,,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,,,,https://twitter.com/dog_rates/status/890609185...,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,14,10,Cassie,doggo,,,


In [9]:
images_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [10]:
images_df.head(10)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
5,666050758794694657,https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg,1,Bernese_mountain_dog,0.651137,True,English_springer,0.263788,True,Greater_Swiss_Mountain_dog,0.016199,True
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,0.045885,False,terrapin,0.017885,False
7,666055525042405380,https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg,1,chow,0.692517,True,Tibetan_mastiff,0.058279,True,fur_coat,0.054449,False
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
9,666058600524156928,https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg,1,miniature_poodle,0.201493,True,komondor,0.192305,True,soft-coated_wheaten_terrier,0.082086,True


In [11]:
tweets_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4605 entries, 0 to 4604
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        4605 non-null   object
 1   retweet_count   4605 non-null   int64 
 2   favorite_count  4605 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 108.1+ KB


In [12]:
tweets_meta_df.head(10)

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,6955,33654
1,892177421306343426,5263,29182
2,891815181378084864,3463,21950
3,891689557279858688,7178,36726
4,891327558926688256,7706,35126
5,891087950875897856,2579,17723
6,890971913173991426,1643,10314
7,890729181411237888,15660,56578
8,890609185150312448,3598,24389
9,890240255349198849,6056,27806


In [13]:
all_columns = pd.Series(list(tweets_df) + list(images_df) + list(tweets_meta_df))
all_columns[all_columns.duplicated()]

17    tweet_id
29    tweet_id
dtype: object

In [14]:
tweets_df.head(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,,,,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,,,,https://twitter.com/dog_rates/status/890609185...,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,14,10,Cassie,doggo,,,


In [15]:
tweets_df.tail(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2346,666058600524156928,,,2015-11-16 01:01:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is the Rand Paul of retrievers folks! He'...,,,,https://twitter.com/dog_rates/status/666058600...,8,10,the,,,,
2347,666057090499244032,,,2015-11-16 00:55:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",My oh my. This is a rare blond Canadian terrie...,,,,https://twitter.com/dog_rates/status/666057090...,9,10,a,,,,
2348,666055525042405380,,,2015-11-16 00:49:46 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a Siberian heavily armored polar bear ...,,,,https://twitter.com/dog_rates/status/666055525...,10,10,a,,,,
2349,666051853826850816,,,2015-11-16 00:35:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is an odd dog. Hard on the outside but lo...,,,,https://twitter.com/dog_rates/status/666051853...,2,10,an,,,,
2350,666050758794694657,,,2015-11-16 00:30:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a truly beautiful English Wilson Staff...,,,,https://twitter.com/dog_rates/status/666050758...,10,10,a,,,,
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,
2355,666020888022790149,,,2015-11-15 22:32:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a Japanese Irish Setter. Lost eye...,,,,https://twitter.com/dog_rates/status/666020888...,8,10,,,,,


#### QUALITY ISSUES 
##### Twitter Archive Enhanced Dataset 
- `in_reply_to_status_id, in_reply_to_user_id, retweeted_status_user_id column, retweeted_status_timestamp column, retweeted_status_timestamp:` these are not needed for analysis, and should therefore be dropped.
- `retweeted_status_user_id`: should be deleted.
- `rating_numerator, tweet_ids, time_stamp column, rating_denominator:` wrong data types.
- `source column:` values contain HTML.

##### Images Prediction Dataset
- `missing values:` 2075 of 2356 rows available
- `tweet_id column:` wrong datatype

##### Tweets Meta Dataset
- `tweet_id column:` wrong datatype 

#### TIDINESS ISSUES
##### Twitter Archive Enhanced Dataset
- `img_num`: we drop tweets with no dog images
- `text column:` we should split it into `tweet_text` and `tweet_url`
- `doggo, floofer, pupper and puppo columns:` should be merged into a single column `dog_stage` 

##### Tweets Meta Dataset
- `retweet_count and favorite_count columns:` should be merged into the tweets_df 
##### Image Prediction Dataset
- `images_df` should also be merged into the tweets_df dataframe

## Cleaning Data


In [16]:
# Make copies of original pieces of data
clean_tweets = tweets_df.copy()
clean_images = images_df.copy()
clean_tweets_meta = tweets_meta_df.copy()

### Issue #1:

#### Define: Wrong data types

#### Code:

In [17]:
clean_tweets.tweet_id = clean_tweets.tweet_id.astype(str)
clean_tweets.timestamp = pd.to_datetime(clean_tweets.timestamp)
clean_images.tweet_id = clean_images.tweet_id.astype(str)
clean_tweets_meta.tweet_id = clean_tweets_meta.tweet_id.astype(str)

#### Test

In [18]:
clean_tweets.info()
clean_images.info()
clean_tweets_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2356 non-null   object             
 1   in_reply_to_status_id       78 non-null     float64            
 2   in_reply_to_user_id         78 non-null     float64            
 3   timestamp                   2356 non-null   datetime64[ns, UTC]
 4   source                      2356 non-null   object             
 5   text                        2356 non-null   object             
 6   retweeted_status_id         181 non-null    float64            
 7   retweeted_status_user_id    181 non-null    float64            
 8   retweeted_status_timestamp  181 non-null    object             
 9   expanded_urls               2297 non-null   object             
 10  rating_numerator            2356 non-null   int64           

### Issue #2:

#### Define: Delete Retweets from the `clean_tweets` dataframe

#### Code

In [19]:
clean_tweets = clean_tweets[pd.isnull(clean_tweets['retweeted_status_user_id'])]

#### Test

In [20]:
clean_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2175 non-null   object             
 1   in_reply_to_status_id       78 non-null     float64            
 2   in_reply_to_user_id         78 non-null     float64            
 3   timestamp                   2175 non-null   datetime64[ns, UTC]
 4   source                      2175 non-null   object             
 5   text                        2175 non-null   object             
 6   retweeted_status_id         0 non-null      float64            
 7   retweeted_status_user_id    0 non-null      float64            
 8   retweeted_status_timestamp  0 non-null      object             
 9   expanded_urls               2117 non-null   object             
 10  rating_numerator            2175 non-null   int64           

### Issue #3:

#### Define: Incorrect names for dogs. 
##### `a` names will be taken as None,  else the name will be capitalized.

<b>Code</b>

In [21]:
import string


for index, row in clean_tweets.iterrows():
    try:
        if clean_tweets.loc[index, 'name'] == 'a':
            clean_tweets.loc[index, 'name'] = 'None'
        else:
            clean_tweets.loc[index, 'name'] = string.capwords(clean_tweets.loc[index, 'name'], sep=None)
    except:
        pass

<b>Test</b>

In [22]:
clean_tweets['name'].value_counts()

None          735
Charlie        11
Lucy           11
Oliver         10
Cooper         10
             ... 
Wishes          1
Rose            1
Theo            1
Fido            1
Christoper      1
Name: name, Length: 955, dtype: int64

### Issue #4:

#### <b>Define</b>: Drop unnecessary columns

<b>Code</b>

In [23]:
clean_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2175 non-null   object             
 1   in_reply_to_status_id       78 non-null     float64            
 2   in_reply_to_user_id         78 non-null     float64            
 3   timestamp                   2175 non-null   datetime64[ns, UTC]
 4   source                      2175 non-null   object             
 5   text                        2175 non-null   object             
 6   retweeted_status_id         0 non-null      float64            
 7   retweeted_status_user_id    0 non-null      float64            
 8   retweeted_status_timestamp  0 non-null      object             
 9   expanded_urls               2117 non-null   object             
 10  rating_numerator            2175 non-null   int64           

In [24]:
clean_tweets = clean_tweets.drop(clean_tweets.columns[1:2],1)
clean_tweets = clean_tweets.drop(clean_tweets.columns[6:9],1)

  clean_tweets = clean_tweets.drop(clean_tweets.columns[1:2],1)
  clean_tweets = clean_tweets.drop(clean_tweets.columns[6:9],1)


<b>Test</b>

In [25]:
clean_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   tweet_id             2175 non-null   object             
 1   in_reply_to_user_id  78 non-null     float64            
 2   timestamp            2175 non-null   datetime64[ns, UTC]
 3   source               2175 non-null   object             
 4   text                 2175 non-null   object             
 5   retweeted_status_id  0 non-null      float64            
 6   rating_numerator     2175 non-null   int64              
 7   rating_denominator   2175 non-null   int64              
 8   name                 2175 non-null   object             
 9   doggo                2175 non-null   object             
 10  floofer              2175 non-null   object             
 11  pupper               2175 non-null   object             
 12  puppo               

### Issue #5:

#### Define: Source column contains HTML. 
##### we cut out the HTML from the column value

<b>Code</b>

In [42]:
for index, row in clean_tweets.iterrows():
    if (clean_tweets.loc[index, 'source']):
        soup = BeautifulSoup(clean_tweets.loc[index, 'source'])
        clean_tweets.loc[index, 'source'] = soup.find('a').text
    else:
        pass

<b>Test</b>

In [44]:
clean_tweets.source.value_counts()

Series([], Name: source, dtype: int64)

### Issue #6:

#### <b>Define</b> : Drop tweets in `clean_tweets` with no images

<b>Code</b>

In [45]:
clean_tweets.dropna(axis=0, inplace=True)

<b>Test</b>

In [46]:
clean_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   in_reply_to_user_id  0 non-null      float64            
 1   timestamp            0 non-null      datetime64[ns, UTC]
 2   source               0 non-null      object             
 3   text                 0 non-null      object             
 4   retweeted_status_id  0 non-null      float64            
 5   rating_numerator     0 non-null      int64              
 6   rating_denominator   0 non-null      int64              
 7   name                 0 non-null      object             
 8   retweet_count_x      0 non-null      int64              
 9   favorite_count_x     0 non-null      int64              
 10  jpg_url_x            0 non-null      object             
 11  img_num_x            0 non-null      int64              
 12  p1_x                 0 non-null      ob

### Issue #7:

#### <b>Define</b> : Merge the `images_df` and the `tweets_meta_df` into the `tweets_df`


<b>Code</b>

In [30]:
clean_tweets = pd.merge(left=clean_tweets, right=clean_tweets_meta, on='tweet_id', how='left')
clean_tweets = pd.merge(left=clean_tweets, right=clean_images, on='tweet_id', how='left')

<b>Test</b>

In [31]:
clean_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   in_reply_to_user_id  0 non-null      float64            
 1   timestamp            0 non-null      datetime64[ns, UTC]
 2   source               0 non-null      object             
 3   text                 0 non-null      object             
 4   retweeted_status_id  0 non-null      float64            
 5   rating_numerator     0 non-null      int64              
 6   rating_denominator   0 non-null      int64              
 7   name                 0 non-null      object             
 8   doggo                0 non-null      object             
 9   floofer              0 non-null      object             
 10  pupper               0 non-null      object             
 11  puppo                0 non-null      object             
 12  retweet_count        0 non-null      in

### Issue #8:

#### <b>Define</b>: Merge the `images_df` and the `tweets_meta_df` into the `tweets_df`

<b>Code</b>

In [32]:
clean_tweets = pd.merge(left=clean_tweets, right=clean_tweets_meta, on='tweet_id', how='left')
clean_tweets = pd.merge(left=clean_tweets, right=clean_images, on='tweet_id', how='left')

<b>Test</b>

In [33]:
clean_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 39 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   in_reply_to_user_id  0 non-null      float64            
 1   timestamp            0 non-null      datetime64[ns, UTC]
 2   source               0 non-null      object             
 3   text                 0 non-null      object             
 4   retweeted_status_id  0 non-null      float64            
 5   rating_numerator     0 non-null      int64              
 6   rating_denominator   0 non-null      int64              
 7   name                 0 non-null      object             
 8   doggo                0 non-null      object             
 9   floofer              0 non-null      object             
 10  pupper               0 non-null      object             
 11  puppo                0 non-null      object             
 12  retweet_count_x      0 non-null      in

<b>define</b>: doggo, floofer, pupper and puppo columns:` could be merged into a single column called let's say `dog_stage`

<b>Code</b>

In [34]:
for index, row in clean_tweets.iterrows():
    stages = []
    stages.append(clean_tweets.loc[index, 'doggo'])
    stages.append(clean_tweets.loc[index, 'floofer'])
    stages.append(clean_tweets.loc[index, 'pupper'])
    stages.append(clean_tweets.loc[index, 'puppo'])
    if (stages.count('None') < 3):
        clean_tweets.loc[index, 'dog_stage'] = 'Multiple'
    else:
        if (stages.count('doggo') == 1):
            clean_tweets.loc[index, 'dog_stage'] = 'Doggo'
        elif (stages.count('floofer') == 1):
            clean_tweets.loc[index, 'dog_stage'] = 'Floofer'
        elif (stages.count('pupper') == 1):
            clean_tweets.loc[index, 'dog_stage'] = 'Pupper'
        elif (stages.count('puppo') == 1):
            clean_tweets.loc[index, 'dog_stage'] = 'Puppo'
        else:
            clean_tweets.loc[index, 'dog_stage'] = 'None'

# drop unecessary columns
clean_tweets = clean_tweets.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1)

## Storing Data


In [35]:
clean_tweets.to_csv('twitter_archive_master.csv', index=False)

## Analyzing and Visualizing Data
In this section, analyze and visualize your wrangled data. You must produce at least **three (3) insights and one (1) visualization.**

In [36]:
master_clean_tweets = pd.read_csv('twitter_archive_master.csv')

In [37]:
master_clean_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   in_reply_to_user_id  0 non-null      object
 1   timestamp            0 non-null      object
 2   source               0 non-null      object
 3   text                 0 non-null      object
 4   retweeted_status_id  0 non-null      object
 5   rating_numerator     0 non-null      object
 6   rating_denominator   0 non-null      object
 7   name                 0 non-null      object
 8   retweet_count_x      0 non-null      object
 9   favorite_count_x     0 non-null      object
 10  jpg_url_x            0 non-null      object
 11  img_num_x            0 non-null      object
 12  p1_x                 0 non-null      object
 13  p1_conf_x            0 non-null      object
 14  p1_dog_x             0 non-null      object
 15  p2_x                 0 non-null      object
 16  p2_conf_x            0 no

we have to convert datatypes again, because most have been changed to object datatype after saving the master dataset.

In [38]:
master_clean_tweets.tweet_id = master_clean_tweets.tweet_id.astype(str)
master_clean_tweets.source = master_clean_tweets.source.astype("category")
master_clean_tweets.dog_stage = master_clean_tweets.dog_stage.astype("category")

AttributeError: 'DataFrame' object has no attribute 'dog_stage'

In [None]:
master_clean_tweets.info()

### Insights to be visualized:
1. Popularity of source of the most tweets.

2. The most popular dog names.

3. Correlation between retweet count and favorite count of the WeRateDogs dataset

### The most popular dog names
What are the most common names between the dogs in the WeRateDogs dataset?

In [47]:
popular_dog_names = master_clean_tweets.name.value_counts().drop('None')[0:5]

names_bar_graph = popular_dog_names.plot.bar(color = 'blue', fontsize=15)

names_bar_graph.figure.set_size_inches(10,7)

plt.title('5 Most popular Dog Names', color = 'black', fontsize = '15')
plt.xlabel('Dog Names', color = 'black', fontsize = '15')
plt.ylabel('Popularity Count', color = 'black', fontsize = '15')

plt.savefig('dog_names.png')

KeyError: "['None'] not found in axis"

The 5 most popular dog names are Charlie, Lucy, Cooper, Oliver and Penny, in that order.

### Popularity of the source of the most tweets

What is the source of most of the tweets?

In [None]:
tweet_sources = master_clean_tweets.source.value_counts()

sources_bar_graph = tweet_sources.plot.bar(color='blue', fontsize=15)

sources_bar_graph.figure.set_size_inches(10,7)

plt.title('Popularity of tweet sources', color = 'black', fontsize = '15')
plt.xlabel('Tweet Sources', color = 'black', fontsize = '15')
plt.ylabel('Usage Count', color = 'black', fontsize = '15');

plt.savefig('tweet_sources.png')

Most of the tweets were sent from; Twitter for iPhone.

### Correlation between retweet count and favorite count of the WeRateDogs dataset

Let us establish if there is a relation between how often tweets are liked and how often they are retweeted.

In [None]:
master_clean_tweets.plot(x='retweet_count', y='favorite_count', kind='scatter', title='Correlation between Retweet Count and Favorite Count', figsize=(10,7), color='blue')

plt.savefig('likes_vs_retweets_corr.png')

In [None]:
master_clean_tweets['retweet_count'].corr(master_clean_tweets['favorite_count'])

The value above leads to the deduction that there is a **high positive correlation** between tweet likes and retweets.