# Udacity Data Wrangling Project

### Importing Libraries and Files

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import tweepy
#from tweepy import OAuthHandler
#import json
#from timeit import default_timer as timer
import re
import requests


import twitter_credentials
%matplotlib inline

# Gather

###  Gathering Get Request

In [None]:
#response = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')
#if response.status_code == 200:
#    print('Success!')
#elif response.status_code == 404:
#    print('Not Found.')

In [None]:
images = pd.read_csv('image_predictions.tsv', sep = '\t', encoding = 'utf-8')

### Gathering data manually with Pandas

In [None]:
df_1 = pd.read_csv('twitter_archive_enhanced.csv')

### Gathering with the Twitter API



With a quick assessment of the tweet_id column I saw a unique problem I would need to solve first before being able to gather my last dataset containing all of the twitter data.

In [None]:
df_1.tweet_id.head()

Gathering tweets from the Twitter API requires precise ids, and unfortunately the **tweet_id** column in my imported csv appeared to have suffered some rounding errors.

In [None]:
with pd.option_context('float_format', '{:.0f}'.format):  
    print(df_1.tweet_id.head())

In [None]:
with pd.option_context('float_format', '{:.8f}'.format):  
    print(df_1.tweet_id.head())

This gave me an opportunity to begin some data wrangling before jumping into the rest of my data. My solution to this issue was to split the URL strings that were included in the twitter_archive_enhanced csv file. Using regular expressions, I wanted to extract the tweet ids from the end of every URL. This required separating the digits (these would be the ids I needed) from all of the characters that came before them, using '/' as a separator.

In [None]:
df_1.expanded_urls.head()


My first step was to replace any null value in **expanded_urls** with a placeholder URL, then take all of the rows and place them into a list.

In [None]:
# Rows without a URL get a placeholder address so every entry is a string,
# then the whole column is copied into a plain Python list.
df_1['expanded_urls'] = df_1['expanded_urls'].fillna('https://www.twitter.com')
urls = df_1['expanded_urls'].tolist()

Next I created a new empty list, **status**, and used a loop to extract the ids from **urls**, appending
each id to **status**. Whenever the search was unable to find a tweet id, I appended the string '0'
to **status** instead.

In [None]:
# Pull the numeric tweet id out of each URL. The id is the digit run that
# follows "/status/"; taking the first digit run anywhere in the URL would
# pick up digits from an account handle or from a non-Twitter link instead.
status_id_pattern = re.compile(r'/status/(\d+)')

status = []
for url in urls:
    match = status_id_pattern.search(url)
    # '0' marks rows with no recoverable id so they can be filtered out later.
    status.append(match.group(1) if match else '0')
status[:5]

I replaced my DataFrame column **tweet_id** with the values in **status** then removed all the values
that contained the string '0'.

In [None]:
# Swap in the extracted ids, then keep only the rows where an id was found
# ('0' is the placeholder for "no id").
df_1 = df_1.assign(tweet_id=status)
df_1 = df_1[df_1['tweet_id'] != '0']

My last step was to change the data type of **tweet_id** from strings to the int64 data type, standardizing the **tweet_id** data type across the various sources of data so they can be merged in the future, and preparing the ids for my next gathering step with the Twitter API. Additionally, int64 is the most efficient way to store these unique numeric ids, as integers require much less memory than strings.

In [None]:
df_1 = df_1.astype({'tweet_id':'int64'})

In [None]:
tweet_ids = df_1['tweet_id']

In [None]:
#auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_secret)

#api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
#count = 0
#fails_dict = {}
#start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
#with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
#    for tweet_id in tweet_ids:
#        count += 1
#        print(str(count) + ": " + str(tweet_id))
#        try:
#            tweet = api.get_status(tweet_id, tweet_mode='extended')
#            print("Success")
#            json.dump(tweet._json, outfile)
#            outfile.write('\n')
#        except tweepy.TweepError as e:
#            print("Fail")
#            fails_dict[tweet_id] = e
            
#            pass
#end = timer()
#print(end - start)
#print(fails_dict)


In [None]:
#df_2 = pd.DataFrame(columns=['tweet_id','retweet count','favorite_count'])

#with open('tweet_json.txt') as i:
#    for line in i:
#        stat = json.loads(line)
#        tweet_id = stat['id_str']
#        retweet_count = stat['retweet_count']
#        favorite_count = stat['favorite_count']
#        df_2 = df_2.append(pd.DataFrame([[tweet_id,retweet_count,
#                                         favorite_count]], 
#                                       columns=['tweet_id',
#                                        'retweet_count', 'favorite_count']))
#df_2 = df_2.reset_index(drop=True)
#df_2.head()
        

In [None]:
#df_2.to_csv (r'C:\Users\tssan\Desktop\Udacity Projects\4wrangle\WeRateDogs.csv',
#                          index = None, header=True, sep='\t', encoding='utf-8')

In [None]:
df_2 = pd.read_csv('WeRateDogs.csv', sep='\t',encoding='utf-8')

# Assessment
### Visual Assessment

In [None]:
df_1.loc[:9].style.applymap(lambda x: 'color: red' if pd.isnull(x) else '')

- Five columns in the **df_1** Dataframe appear to contain missing values: 
    - **in_reply_to_status_id**, 
    - **in_reply_to_user_id**, 
    - **retweeted_status_id**, 
    - **retweeted_status_user_id**,  
    - **retweeted_status_timestamp** 
 
- The **text** column contains the body of the tweet, but then ends each tweet with a hyperlink.  The hyperlink is already represented in the **expanded_urls** column, so having it shown twice is redundant information and is not really part of the text of the tweet. 
- The columns **doggo**, **floofer**, **pupper**, and **puppo** have the string "None" in many of their rows. These columns seem to be categorical variables and would benefit from being melted into a single categorical column.
- The **timestamp** column is formatted down to the millisecond. This is accurate and thorough information, but for future time series analysis I will convert the time to a date format of yyyy-MM-dd instead of the yyyy-MM-dd HH:mm:ss.SSS format it is in now.

In [None]:
images.loc[:10].style.applymap(lambda x: 'color: red' if pd.isnull(x) else '')

- The columns **p1**, **p2**, and **p3** are all in snake case format, and some of the categorical values are capitalized while others are not. To clean this up I would like to replace the underscore(s) in each row with spaces and put the classifications in a single consistent case.

In [None]:
df_2.loc[:9].style.applymap(lambda x: 'color: red' if pd.isnull(x) else '')

**retweet count** has an inconsistent format: it uses a space instead of an underscore between words. Writing column names in snake case makes them accessible as DataFrame attributes. This would normally be a column that I would simply rename to follow Python naming conventions, but it appears to be a duplicate of **retweet_count** and consists mostly of NaN values. I will drop the column **retweet count** to tidy up the **df_2** DataFrame.

### Programmatic Assessment

In [None]:
df_1.info()

- As observed above, five columns have missing values.
    - **in_reply_to_status_id** and **in_reply_to_user_id** each have 2247 null values
    - **retweeted_status_id**, **retweeted_status_user_id**, and **retweeted_status_timestamp** each have 2094 null values

In [None]:
df_2.info()

- After running a summary of the dataframe **df_2** we see that all 2166 values are missing in the column **retweet count**

In [None]:
images.info()

- The **images** dataframe has no missing values. 

In [None]:
# Frequency table for each dog-stage column, printed one after another.
a, b, c, d = (df_1[stage].value_counts()
              for stage in ('doggo', 'floofer', 'pupper', 'puppo'))

print(f'{a},\n\n{b},\n\n{c},\n\n{d}')

In [None]:
df_1.rating_numerator.describe()

In [None]:
df_1.rating_numerator.value_counts().sort_index()

For most of the cases if a numerator had a value
greater than 14 it was only used once, however 75 occurs twice.
I decided to look into this a little further to figure out why this was.

In [None]:
num = 75
# Text of every tweet whose stored numerator is 75; the unpacking below
# relies on there being exactly two of them.
flagged = df_1.text[df_1.rating_numerator == num]
ind1, ind2 = flagged.index.tolist()
text1, text2 = flagged[ind1], flagged[ind2]
print(f'Index: {ind1}\nNumerator: {num}\n{text1} \n\nIndex: {ind2}\nNumerator: {num}\n{text2}')

By taking a closer look at these two tweets, I found two problems. First, the numerator is not 75, as it is listed in the df_1 dataset; it is actually 9.75. The number 75 was recorded as the numerator in error when a rating of 9.75 (a sneaky nod to the secret entrance of the Hogwarts Express on platform 9 and 3/4) was given to Logan. Second, the tweet at index #340 happens to be a duplicate of the tweet at index #695.

In [None]:
df_1.tweet_id.duplicated().any()

Confirming what we already knew from the previous cell, there are duplicates in the **df_1** dataset that will need to be dropped.

In [None]:
df_2.tweet_id.duplicated().any()

In [None]:
images.tweet_id.duplicated().any()

Like the **df_1** dataset, duplicates were found in **df_2** as well. These duplicates will also need to be dropped. Luckily, no duplicates were found in the **images** dataset. 

In [None]:
df_1.rating_denominator.describe()

In [None]:
df_1.rating_denominator.value_counts().sort_index()

In [None]:
df_1.query('rating_denominator > 10').rating_denominator.count()

In [None]:
df_1.query('rating_denominator < 10').rating_denominator.count()

2251 of the 2270 observations have a denominator of ten. I decided to look into this further to see if I could find out why this was. 
- 2 tweets have a denominator less than ten
- 17 tweets have denominators greater than ten. 


In [None]:
# Show the tweets whose denominator is below 10; the unpacking relies on
# there being exactly two of them.
low_rows = df_1[df_1.rating_denominator < 10]
ind1, ind2 = low_rows.index.tolist()
denom1 = low_rows.at[ind1, 'rating_denominator']
denom2 = low_rows.at[ind2, 'rating_denominator']
text1 = low_rows.at[ind1, 'text']
text2 = low_rows.at[ind2, 'text']
print(f'Index: {ind1}\nDenominator:{denom1}\n{text1} \n\nIndex: {ind2} \nDenominator:{denom2}\n{text2}')

For the denominators that had values less than ten:
- Tweet indexed #516 was one of the few times where WeRateDogs did not give a rating.
- Tweet indexed #2335 had its rating misread because a fraction of 1/2 was included in the text.

In [None]:
# Run the filter once, then print index, denominator and text for every
# tweet whose denominator is above 10.
high_rows = df_1.query('rating_denominator > 10')
ind = high_rows.index.tolist()
denom = high_rows['rating_denominator'].tolist()
text = high_rows['text'].tolist()
for nex_ind, nex_denom, nex_text in zip(ind, denom, text):
    print(f'Index: {nex_ind}\nDenominator: {nex_denom}\n{nex_text}\n')

In [None]:
df_1['timestamp'].min(), df_1['timestamp'].max()

In [None]:
df_1.timestamp[:5]

### Summary of Assessment

#### Quality issues
**df_1** dataset
- stuff
- stuff
- stuff
- stuff
- stuff

**df_2** dataset
- stuff
- stuff
- stuff

**images** dataset
- No quality errors were found in this dataset


#### Tidiness Issues
**df_1** dataset 
- stuff
- stuff
- stuff

**df_2** dataset
- stuff
- stuff
- stuff

**images** dataset
- stuff
- stuff



# Clean

In [None]:
# Snapshot copies of the three tables taken before cleaning starts.
# NOTE(review): the cleaning cells that follow modify df_1, df_2 and images
# directly rather than these *_clean copies — confirm which set is intended.
df_1_clean = df_1.copy()
df_2_clean = df_2.copy()
images_clean = images.copy()

### Quality 

##### Define
- 1. Dropping the columns with a majority of NaN values from the **df_1** dataframe

##### Code

In [None]:
# Reply/retweet metadata columns that are mostly empty.
mostly_null_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
]
df_1 = df_1.drop(columns=mostly_null_columns)

##### Test the results

In [None]:
df_1.columns

##### Define
- 2. Melting Doggo, Pupper, Floofer, and Puppo down to one category variable named Cute_Name in the **df_1** Dataframe  

#####  Code

In [None]:
# Concatenate the four stage columns into one string per row. The doggo
# value is kept verbatim (including 'None'); in the other three columns
# 'None' is blanked out before it is appended.
combined = df_1['doggo']
for stage in ('pupper', 'floofer', 'puppo'):
    combined = combined.str.cat(df_1[stage].replace('None', ''))

df_1['cute_name'] = combined

In [None]:
# Map the raw concatenated strings onto readable labels; values that are
# not listed here are left untouched.
dict_dog = {
    'Nonefloofer': 'floofer',
    'Nonepupper': 'pupper',
    'Nonepuppo': 'puppo',
    'doggopupper': 'doggo, pupper',
    'doggofloofer': 'doggo, floofer',
    'doggopuppo': 'doggo, puppo',
    'None': 'Other',
}
df_1['cute_name'] = df_1['cute_name'].replace(dict_dog)

# The four source columns are now redundant.
df_1 = df_1.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])

##### Test the results

In [None]:
df_1.cute_name.value_counts()

In [None]:
df_1.columns

##### Define
- 3.  Fixing Tweet 516 (No rating given) in the **df_1** Dataframe

#####  Code

In [None]:
# Row 516 carries no real rating, so both parts are set to the column median.
for part in ('rating_numerator', 'rating_denominator'):
    df_1.loc[516, part] = np.median(df_1[part])

##### Test the results

In [None]:
print(f"Since Sam was never given a rating, I gave him the median rating which is {df_1.loc[516,'rating_numerator']}/{df_1.loc[516,'rating_denominator']}")

##### Define
- 4.Dropping the duplicate in the **df_1** and **df_2** Dataframes

#####  Code

In [None]:
# Keep the first row for each tweet_id. reset_index() without drop=True
# stores the old row labels in a new 'index' column of each frame.
df_1 = df_1.drop_duplicates(subset='tweet_id').reset_index()
df_2 = df_2.drop_duplicates(subset='tweet_id').reset_index()


_4. Test the results_

In [None]:
df_1.tweet_id.duplicated().any(), df_2.tweet_id.duplicated().any()

##### Define
- 6. Dropping the 'source' column from the **df_1** table

#####  Code

In [None]:
df_1.drop(columns=['source'],inplace=True)

##### Test the results

In [None]:
df_1.columns

##### Define
- 8. 

#####  Code

In [None]:
# For every row whose denominator is not 10, look for an "<n>/10" rating in
# the tweet text and, when one is found, overwrite the stored rating with it.
# `indices` records the rows that were changed so they can be checked afterwards.
rating_pattern = re.compile(r'(\d+(?:\.\d+)?)/(10)')

indices = []
for i, in_str in df_1.loc[df_1['rating_denominator'] != 10, 'text'].items():
    # Non-string text (e.g. a missing value) is treated as "no rating found".
    match = rating_pattern.search(in_str) if isinstance(in_str, str) else None
    if match is None:
        test = denominator = 'Unchanged'
    else:
        # Capture groups keep numerator and denominator apart even when the
        # numerator has a decimal part (e.g. "9.75/10"). Values are stored as
        # integers, not strings, so the rating columns stay numeric; a decimal
        # numerator is truncated to its whole part here.
        test = int(float(match.group(1)))
        denominator = den = int(match.group(2))
        df_1.at[i, 'rating_numerator'] = test
        df_1.at[i, 'rating_denominator'] = den
        indices.append(i)

    print(f'Index: {i}\nNumerator: {test}\nDenominator: {denominator}\n{in_str}\n')

##### Test the results

In [None]:
# Print the stored rating for each row that was rewritten.
for ind in indices:
    num = df_1.at[ind, 'rating_numerator']
    den = df_1.at[ind, 'rating_denominator']
    print(f'Tweet #{ind} has a rating of {num}/{den}')

##### Define
- 9. Normalizing Denominators to a Standard of 10 structuring data to facilitate analysis **df_1**

#####  Code

In [None]:
# Rescale every rating so that its denominator becomes 10; the numerator is
# multiplied by the same factor. The results are collected in two lists.
norm_numerator = []
norm_denominator = []
for num, den in zip(df_1['rating_numerator'], df_1['rating_denominator']):
    norm = 10 / den
    norm_numerator.append(num * norm)
    norm_denominator.append(den * norm)
    

In [None]:
df_1['rating_numerator'] = norm_numerator
df_1['rating_denominator'] = norm_denominator

##### Test the results

In [None]:
df_1.query('rating_denominator !=10')['rating_denominator'].any()

In [None]:
# Drop the denominator column and rename the numerator column to "rating".
df_1 = (
    df_1
    .drop(columns=['rating_denominator'])
    .rename(columns={"rating_numerator": "rating"})
)

In [None]:
df_1.columns

In [None]:
df_1.rating.value_counts()

##### Define
- 12.  Correcting ratings containing decimal places, using a regular expression to loop through each text and pull out any rating written with decimal places in **df_1**

#####  Code

In [None]:
# Some ratings are written with a decimal numerator (e.g. "9.75/10"). Look at
# the text before the first "/" and, if it ends in a decimal number, use that
# number as the rating.
decimal_at_end = re.compile(r'\d+\.\d+$')

for i, in_str in df_1['text'].items():
    match = decimal_at_end.search(in_str.split('/')[0])
    if match is None:
        continue
    # Use the decimal that sits right before the "/", not the first decimal
    # that happens to appear earlier in the tweet.
    after = match.group()
    before = df_1.loc[i, 'rating']
    # Store a float rather than the matched string so the column stays numeric.
    df_1.loc[i, 'rating'] = float(after)
    print(f"The rating in row {i} used to be {before} and now is {after}.")

In [None]:
df_1.rating.dtype

In [None]:
#No longer need the denominator columns, and the rating_numerator column
#can just become rating (on a standardized scale of 10)
df_1['rating'] = df_1['rating'].astype('float64')


##### Test the results

In [None]:
divide = "---" * 38

# Print every row whose rating still has a fractional part, together with
# the tweet text it came from.
for i in range(len(df_1)):
    chk_dec = df_1.rating[i]
    if chk_dec.is_integer():
        continue
    txt = df_1.text[i]
    print(f'Now row {i} has the correct rating of {chk_dec}. Take a look. \n\n{txt}\n{divide}\n')

##### Define
- 13. 

#####  Code

In [None]:
# Round every rating to a whole number to get a discrete scale, and report
# each row whose value changes.
rounded = df_1['rating'].round(0)
changed_rows = df_1.index[df_1['rating'] != rounded]
for i in changed_rows:
    before = df_1.loc[i, 'rating']
    after = rounded[i]
    print(f"The rating in row {i} used to be {before} and now it is {after}.")
df_1['rating'] = rounded

In [None]:
df_1.rating.dtype

##### Define
- 14. 


In [None]:
#####  Code

In [None]:
df_1.rating = df_1.rating.astype(int) 

In [None]:
##### Test the results


In [None]:
df_1.rating.dtype

#####  Code

In [None]:
# Keep only the part of each tweet that precedes the first "http".
df_1['text'] = df_1['text'].map(lambda tweet: tweet.partition('http')[0])

##### Test the results

In [None]:
# Spot-check three randomly chosen tweets (a row may be drawn more than once).
line1, line2, line3 = [df_1.text[np.random.randint(0, len(df_1))] for _ in range(3)]

print(f'{line1}\n\n{line2} \n\n{line3}')

##### Define
- 5. Dropping columns with null data in the **df_2** Dataframe

#####  Code

In [None]:
df_2.drop(columns='retweet count', inplace= True)

##### Test the results

In [None]:
df_2.info()

##### Define
- 18. 

#####  Code

In [None]:
df_2['retweet_count'].describe()

In [None]:
df_2['retweet_count'] = df_2['retweet_count'].astype(int)

##### Test the results

In [None]:
df_2.info()

##### Define
- 16.

##### Code

In [None]:
images['p1'] = images['p1'].astype(str)

def remove_snake(word):
    """Return `word` with underscores turned into spaces and letters lower-cased."""
    spaced = word.replace('_', ' ')
    # Lower-case one character at a time, then stitch the string back together.
    return ''.join(map(str.lower, spaced))

# Apply the same clean-up to each of the three prediction columns.
for prediction_col in ('p1', 'p2', 'p3'):
    images[prediction_col] = images[prediction_col].apply(remove_snake)

##### Test the results

In [None]:
images[['p1','p2','p3']].sample(10)


### Tidiness

##### Define
- 17. Merging **df_1**, **df_2**, and **images** Dataframes into a single table

#####  Code

In [None]:
df = df_1.merge(df_2, on='tweet_id', how='inner')


In [None]:
# Join the image predictions, discard the two leftover index columns that
# came in with the earlier merge, and renumber the rows from 0.
df = (
    df.merge(images, on='tweet_id', how='inner')
      .drop(columns=['index_x', 'index_y'])
      .reset_index(drop=True)
)

In [None]:
##### Test the results

In [None]:
df.columns

In [None]:
df.index

In [None]:
df.info()

##### Define
- 18. 

#####  Code

In [None]:
import datetime as dt
# Parse the timestamps and re-format them to minute precision.
# The format is inferred rather than hard-coded, so extra parts in the raw
# strings (for example a timezone suffix or fractional seconds) do not make
# the parse fail, and the cell can be re-run on already-formatted values.
try:
    time = pd.to_datetime(df['timestamp'])
except (ValueError, TypeError) as err:
    # Say why nothing changed instead of hiding every error behind a bare except.
    print(f'No changes made: {err}')
else:
    df['timestamp'] = time.dt.strftime('%Y/%m/%d %H:%M')
    print('Success')

##### Test the results

In [None]:
df.timestamp.sample(5)


In [None]:
#df.to_csv (r'~/Desktop/Udacity Projects/4wrangle/WeRateDogs_Clean.csv',
                        #index = False, header=True, sep='\t', encoding='utf-8')

In [None]:
df_cln = df.copy()

In [None]:
#df_cln = pd.read_csv('WeRateDogs_Clean.csv', sep='\t',encoding='utf-8')

# Finding a Winner

In [None]:
df_cln.groupby(['rating']).count()

In [None]:
df_cln[df_cln['rating'] == 420]

In [None]:
df_cln.text[1712]

In [None]:
print(df_cln.jpg_url[1712])

### In second place, Snoop Dogg comes in at a very impressive 420 out of 10
<img src="second.jpg"/>

In [None]:
df_cln[df_cln['rating'] == 1776]

In [None]:
df_cln.text[757]

In [None]:
print(df_cln.jpg_url[757])

### Congratulations to the winner, coming in at 1776 out of 10, such a good boy!
<img src="winner.jpg"/>

## Distribution of Dog Ratings by Dog Types

- The two winners above are data points that were accurately gathered, but their ratings are also global outliers to the rest of the observations. These global outliers greatly increase the standard deviation of the variable **rating** in our DataFrame 
- To simplify our analysis, I decided to remove the two outliers, and take a deeper look into the distribution of the ratings by dog type.

In [None]:
df_before = df_cln.copy()
# Ratings of 15 and above are treated as outliers and left out of df_after.
df_after = df_cln[df_cln['rating'] < 15]

# Side-by-side box plots of the rating distribution with and without the
# outliers.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    (df_before, 'Before, Including Global Outliers'),
    (df_after, 'After, Excluding Global Outliers'),
]
for ax, (frame, title) in zip(axes, panels):
    # seaborn accepts 'h' or 'v' for orient; 'h' draws the box along the x axis.
    sns.boxplot(data=frame, x='rating', orient='h', ax=ax)
    ax.set_title(title)

In [None]:
df_rate = df_after.copy()

In [None]:
df_rate['p1'].value_counts()

In [None]:
# p1 labels ordered from most to least frequent.
dog_types = df_rate['p1'].value_counts().index.tolist()

In [None]:
dog_types = dog_types[:9]

In [None]:
# Count how often each rating occurs for every breed in dog_types, plus an
# "other" bucket that holds the dogs whose p1 prediction is NOT one of them.
# Every row of df_rate lands in exactly one column, so 'Total' is the true
# number of tweets per rating.
in_top = df_rate['p1'].isin(dog_types)
breed_bucket = df_rate['p1'].where(in_top, 'other')

# crosstab keeps every rating that occurs, even one that no "other" dog received;
# reindex fixes the column order and fills breeds with no rows with 0.
df_other = (
    pd.crosstab(df_rate['rating'], breed_bucket)
      .reindex(columns=['other', *dog_types], fill_value=0)
)

df_other.loc[:, 'Total'] = df_other.sum(axis=1)
df_other.index = df_other.index.map(str)
df_other.columns = [x.lower() for x in df_other.columns]
df_all = df_other.copy()
df_all

In [None]:
# Horizontal bar chart: number of tweets per rating, all breeds together.
label_color = 'Black'
bar_color = 'whitesmoke'
background = sns.color_palette()[9]
font_size = 12
ratings = df_all.index
totals = df_all.total
positions = range(len(ratings))

fig = plt.figure(figsize=(6, 7), facecolor=background)
ax = plt.subplot(1, 1, 1)

plt.barh(positions, totals, color=bar_color)
plt.yticks(positions, ratings, fontsize=12, color=label_color)
plt.xticks(color=label_color, fontsize=font_size)
plt.ylabel('Rating', color=bar_color, fontsize=font_size)
plt.xlabel('Frequency', color=bar_color, fontsize=font_size)
plt.title('Total Distribution of Dog Ratings', color='Black', fontsize=22)

# Dashed white grid lines drawn behind the bars; no frame around the axes.
ax.set_axisbelow(True)
ax.yaxis.grid(color='white', linestyle='dashed')
plt.box(False)
plt.show()

In [None]:
from mpl_toolkits.axes_grid1 import Grid
# One horizontal bar chart of rating frequencies per breed, laid out 3 x 3.
# Each entry pairs a df_all column with the title shown above its panel, so
# every panel is guaranteed to plot the series it is labelled with.
breed_panels = [
    ('golden retriever', 'Golden Retriever'),
    ('labrador retriever', 'Labrador'),
    ('chihuahua', 'Chihuahua'),
    ('pembroke', 'Pembroke'),
    ('pug', 'Pug'),
    ('chow', 'Chow'),
    ('samoyed', 'Samoyed'),
    ('pomeranian', 'Pomeranian'),
    ('toy poodle', 'Toy Poodle'),
]
n_cols = 3
r = df_all.index
c = sns.color_palette()[9]
w = 'whitesmoke'
b = 'Black'
f_sz = 16
lim = [0, 40]  # shared x range so the panels are directly comparable

fig = plt.figure(figsize=(10, 14), facecolor=c)

for position, (column, title) in enumerate(breed_panels, start=1):
    ax = plt.subplot(3, n_cols, position)
    first_in_row = (position - 1) % n_cols == 0

    plt.barh(range(len(r)), df_all[column], color=w)
    if first_in_row:
        # Only the left-most panel of each row shows rating labels.
        plt.yticks(range(len(r)), r, fontsize=11, color=b)
        plt.ylabel('Rating', color=w)
    else:
        plt.yticks(range(len(r)), r)
        plt.setp(ax.get_yticklabels(), visible=False)
    plt.xticks(color=b, fontsize=11)
    plt.xlabel('Frequency', color=w)
    plt.box(False)
    plt.title(title, fontsize=f_sz, color=b)
    plt.xlim(lim)
    ax.set_axisbelow(True)
    ax.yaxis.grid(color='white', linestyle='dashed')

plt.tight_layout()
plt.show()

## Time Series of WeRateDogs Tweets

In [None]:
df_time = df_cln.copy()

In [None]:
# Time series of Twitter activity: favorites and retweets per tweet,
# ordered by timestamp and drawn as two area plots on the same axes.
timestamps = df_time['timestamp']
time_faves = pd.Series(df_time['favorite_count'].values, index=timestamps).sort_index()
time_retweets = pd.Series(df_time['retweet_count'].values, index=timestamps).sort_index()

size = (12, 8)
plot_specs = (
    (time_faves, 'favorites', 'deepskyblue'),
    (time_retweets, 'retweet', 'crimson'),
)
for series, label, color in plot_specs:
    series.plot(kind='area', figsize=size, label=label, color=color, legend=True)

# Titles, axis labels and legend styling.
plt.title('WeRateDogs Time Series of Favorites and Retweets', fontsize=18)
plt.xlabel('Date', fontsize=18, color='crimson')
plt.ylabel('Twitter Activity Count', fontsize=18, color='crimson')
plt.xticks(rotation=45)
plt.legend(fontsize=14, frameon=False)

plt.box(False)
plt.show();