In [1]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
import os
from twitter_api import get_twitter_data
import json
from sqlalchemy import create_engine
from PIL import Image
from io import BytesIO


# Notebook-wide pandas display settings.

# Show the full content of every cell instead of truncating long strings.
pd.options.display.max_colwidth = None

# Render every column of wide frames.
pd.options.display.max_columns = None

# Format floats with thousands separators and two decimals (field width 20).
pd.options.display.float_format = '{:20,.2f}'.format

ModuleNotFoundError: No module named 'twitter_api'

In [3]:
# Load the twitter archive CSV into a pandas DataFrame.
# NOTE(review): this file is read from the working directory while the other
# inputs in this notebook live under 'data/' — confirm the expected location.
df_arch = pd.read_csv('twitter-archive-enhanced.csv')

FileNotFoundError: File b'twitter-archive-enhanced.csv' does not exist

In [None]:
# Download the image-predictions TSV and store it under data/.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Fail loudly on an HTTP error instead of saving an error page as the dataset.
response.raise_for_status()
# The target directory may not exist on a fresh checkout.
os.makedirs('data', exist_ok=True)
# The local file name is the last path segment of the URL.
with open(os.path.join('data', url.split('/')[-1]), 'wb') as file:
    file.write(response.content)

In [None]:
# Load the downloaded image-predictions file (tab-separated) into a pandas DataFrame.
df_pred = pd.read_csv('data/image-predictions.tsv', sep='\t')

In [None]:
# Fetch tweet data through the project-local get_twitter_data helper, but only
# when the output file is not already on disk, so re-runs skip the download.
# NOTE(review): the helper is not visible here — presumably it writes one JSON
# object per line to the given path (the loader reads it with lines=True); confirm.
if not os.path.exists('data/tweet_json.txt'):
    get_twitter_data(df_arch, 'data/tweet_json.txt')

In [None]:
# Load the line-delimited tweet JSON into a pandas DataFrame.
# The encoding is given to open(), because that is where the bytes are decoded;
# read_json only receives the already-decoded text handle.
with open('data/tweet_json.txt', encoding='utf-8') as file:
    df_api = pd.read_json(file, lines=True)

In [None]:
# Visual assessment of the raw archive.
# NOTE(review): with IPython's default settings only the last expression of a
# cell is rendered, so a bare `df_arch` here shows nothing unless it was
# originally its own cell — confirm.
df_arch

# Column dtypes and non-null counts.
df_arch.info()


# Summary statistics of the two rating columns.
df_arch[['rating_numerator', 'rating_denominator']].describe()

In [None]:
# Distinct values of the name column.
df_arch.name.unique()

# How often each name occurs.
df_arch.name.value_counts()

# Rows whose name is entirely lower-case.
df_arch[df_arch.name.str.islower()]

In [None]:
# Look for suspicious numerators (<= 5) in chunks.
# .loc slices by index label (inclusive), so this shows rows labelled 0..1000.
df_arch[df_arch.rating_numerator <= 5].loc[0:1000, :]

In [None]:
# Second chunk of numerators <= 5: index labels 1000..2000 (label-based, inclusive).
df_arch[df_arch.rating_numerator <= 5].loc[1000:2000, :]

In [None]:
# Last chunk of numerators <= 5: index labels from 2000 onwards.
df_arch[df_arch.rating_numerator <= 5].loc[2000:, :]

In [None]:
# Column dtypes and non-null counts of the image predictions.
df_pred.info()

# Column names of the predictions table.
df_pred.columns

# Distribution of the img_num column.
df_pred.img_num.value_counts()

In [None]:
# Distinct labels in the first prediction column (p1).
df_pred.p1.unique()

In [None]:
# Visual assessment of the tweet data loaded from the JSON file.
df_api

# Column dtypes and non-null counts.
df_api.info()

# Number of tweets per value of the lang column.
df_api.lang.value_counts()

# Observations

# Cleaning

In [None]:
# Clean a copy so the raw archive (df_arch) stays untouched and the cleaning
# section can be re-run from a known starting state.
df_arch_cleaned = df_arch.copy()

col_list = ['doggo', 'floofer', 'pupper', 'puppo']

# Turn the literal string 'None' in the stage columns into a real missing value.
for col in col_list:
    df_arch_cleaned[col] = df_arch_cleaned[col].replace('None', np.nan)

df_arch_cleaned.info()

In [None]:
# Inspect the distinct raw values of the source column before cleaning it.
df_arch_cleaned.source.unique()

In [None]:

# Helper that extracts the text between the first pair of HTML tags.
def fix_source(x):
    """Return the text enclosed by the first HTML tag pair in ``x``.

    ``x`` is expected to be an HTML string from the ``source`` column of
    ``df_arch_cleaned``, e.g. ``'<a href="...">Twitter for iPhone</a>'``,
    for which ``'Twitter for iPhone'`` is returned.

    Inputs that cannot be parsed are handled without raising:

    * non-string values (``None``, ``NaN``) are returned unchanged;
    * a string without any ``>`` is returned unchanged;
    * if no ``<`` follows the first ``>``, everything after that ``>`` is
      returned (the last character is not dropped).
    """
    if not isinstance(x, str):
        return x

    # Position just after the first closing bracket of the opening tag.
    gt = x.find('>')
    if gt == -1:
        return x
    start = gt + 1

    # First opening bracket after that position marks the end of the text.
    end = x.find('<', start)
    if end == -1:
        return x[start:]
    return x[start:end]

In [None]:
# Replace every HTML anchor in `source` with its display text, then store the
# column as a categorical.
source_labels = df_arch_cleaned.source.map(fix_source)
df_arch_cleaned.source = source_labels.astype('category')


In [None]:
# Check the distinct values of the source column after cleaning.
df_arch_cleaned.source.unique()

In [None]:
# Tweets whose text contains a decimal rating such as "13.5/10", next to the
# numerator currently stored for them.
# No capture group in the pattern: str.contains only tests for a match and
# warns when the regex contains groups.
df_arch_cleaned.loc[df_arch_cleaned.text.str.contains(r"\d+\.\d*\/\d+"), ['text', 'rating_numerator']]

In [None]:
# Rows whose text contains a decimal rating (pattern without a capture group,
# because str.contains only tests for a match).
has_decimal_rating = df_arch_cleaned.text.str.contains(r"\d+\.\d*\/\d+")

# Extract the decimal numerator: the number directly in front of "/<denominator>".
# The result is a one-column DataFrame of strings indexed like the matching rows.
new_ratings = df_arch_cleaned.loc[has_decimal_rating, 'text'].str.extract(r"(\d+\.\d*(?=\/\d+))")
new_ratings

In [None]:
# Make the column float first so fractional numerators can be stored without
# writing values of a different type into an integer column.
df_arch_cleaned.rating_numerator = df_arch_cleaned.rating_numerator.astype('float')

# new_ratings holds the extracted numerators as strings in its single column;
# convert them to float and write them back, aligned on the row index.
df_arch_cleaned.loc[new_ratings.index, 'rating_numerator'] = new_ratings.iloc[:, 0].astype('float')

In [None]:
# Show the rows whose numerator was taken from a decimal rating in the text.
df_arch_cleaned.loc[new_ratings.index]

In [None]:
# Check the dtypes and non-null counts after the rating fix.
df_arch_cleaned.info()

In [None]:
# Remove the rating and everything after it (including links) from the text column.
# The pattern keeps everything before the last whitespace-delimited
# "<numerator>/<denominator>" token; the numerator may be decimal ("13.5/10")
# so those tweets keep their text too. Rows without such a token become NaN.
# expand=False returns a Series, which is what the column assignment expects.
df_arch_cleaned.text = df_arch_cleaned.text.str.extract(
    r'(.+(?=\s\d+(?:\.\d+)?/\d+\s))', expand=False)

In [None]:
# Spot-check ten randomly chosen cleaned texts (no seed, so the rows differ per run).
df_arch_cleaned.text.sample(10)

In [None]:
# Convert the timestamp column to datetime.

df_arch_cleaned.timestamp = pd.to_datetime(df_arch_cleaned.timestamp)
# Confirm the resulting dtype.
df_arch_cleaned.timestamp.dtype

In [None]:
# Remove retweets and replies: keep only rows where every reply/retweet id
# column is missing. isnull() is used because NaN never compares equal to
# anything, so an equality test cannot select missing values reliably.
reply_retweet_ids = ['in_reply_to_status_id', 'in_reply_to_user_id',
                     'retweeted_status_id', 'retweeted_status_user_id']
is_original_tweet = df_arch_cleaned[reply_retweet_ids].isnull().all(axis=1)

# Drop the reply/retweet columns, which carry no information any more.
# drop() returns a new frame, so nothing is mutated on a filtered slice.
cols = ['in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id',
           'retweeted_status_user_id', 'retweeted_status_timestamp']
df_arch_cleaned = df_arch_cleaned.loc[is_original_tweet].drop(columns=cols)


In [None]:
# Check the remaining columns and their non-null counts in df_arch_cleaned.
df_arch_cleaned.info()

In [None]:
# Keep only the tweets rated out of 10, then re-check the rating statistics.

has_standard_denominator = df_arch_cleaned.rating_denominator.eq(10)
df_arch_cleaned = df_arch_cleaned.loc[has_standard_denominator]
df_arch_cleaned[['rating_numerator', 'rating_denominator']].describe()

In [None]:
# Remove rows that are not about dogs.
# First drop rows whose text is missing ...
has_text = df_arch_cleaned.text.notnull()
df_arch_cleaned = df_arch_cleaned[has_text]
# ... then the ones whose text matches the "only rate dogs" pattern.
mentions_only_dogs = df_arch_cleaned.text.str.match('.*only rate dogs')
df_arch_cleaned = df_arch_cleaned.loc[~mentions_only_dogs]

# Sanity check: no matching row should be left.
df_arch_cleaned.loc[df_arch_cleaned.text.str.match('.*only rate dogs')]

In [None]:
# Check row counts and nulls after removing the non-dog rows.
df_arch_cleaned.info()

In [None]:
# Discard every row that has no value in the expanded_urls column.

df_arch_cleaned = df_arch_cleaned.dropna(subset=['expanded_urls'])
df_arch_cleaned.info()

In [None]:
# Create the dog_stage column and remove the (doggo, floofer, pupper, puppo) columns.
cols = ['doggo', 'floofer', 'pupper', 'puppo']

# Join the non-missing stage labels of each row; rows with more than one stage
# become e.g. "doggo, pupper", rows without any stage become an empty string.
dog_stage = df_arch_cleaned[cols].apply(
    lambda x: ', '.join(x.dropna().astype(str)), axis=1)

# Empty string -> NaN, stored as a categorical. assign()/drop() build a new
# frame, so no column is set on a filtered slice of an earlier frame.
df_arch_cleaned = (
    df_arch_cleaned
    .assign(dog_stage=dog_stage.replace('', np.nan).astype('category'))
    .drop(columns=cols)
)

In [None]:
# Confirm the stage columns are gone and dog_stage exists.
df_arch_cleaned.info()

In [None]:
# Number of tweets per dog stage (including combined stages).
df_arch_cleaned.dog_stage.value_counts()

In [None]:
# Replace 'None' and the invalid lower-case "names" in the name column with np.nan.
# Inspection only: names that are not title-cased.
df_arch_cleaned[~df_arch_cleaned.name.str.istitle()].name.unique()

# Assign the result back to the column. replace(..., inplace=True) on a Series
# obtained through attribute access is chained mutation and is not guaranteed
# to write through to the DataFrame.
df_arch_cleaned['name'] = df_arch_cleaned['name'].replace(
    ['such', 'a', 'quite', 'not', 'one', 'incredibly', 'mad',
     'an', 'very', 'just', 'my', 'his', 'actually', 'getting',
     'this', 'unacceptable', 'all', 'old', 'infuriating', 'the',
     'by', 'officially', 'life', 'light', 'space', 'None'], np.nan)

In [None]:
# Distinct names that remain after the replacement.
df_arch_cleaned.name.unique()

In [None]:
# Frequency of each remaining name.
df_arch_cleaned.name.value_counts()
# Non-null counts per column after the name cleanup.
df_arch_cleaned.info()

In [None]:
# Start the cleaned predictions table from the raw one, without the img_num column.
# drop() returns a new frame, so df_pred itself stays unchanged and this cell
# can be re-run safely.
df_pred_cleaned = df_pred.drop(columns='img_num')
df_pred_cleaned.info()

In [None]:
# Create breed and confidence columns from the highest-confidence prediction
# and drop the other prediction columns.
# A prediction is used only when it is flagged as a dog AND its confidence
# equals the row maximum over p1/p2/p3; otherwise both values are NaN.

conf_cols = ['p1_conf', 'p2_conf', 'p3_conf']
top_conf = df_pred_cleaned[conf_cols].max(axis=1)

breed = pd.Series(np.nan, index=df_pred_cleaned.index, dtype='object')
confidence = pd.Series(np.nan, index=df_pred_cleaned.index, dtype='float')

# Apply the candidates from lowest to highest priority so that p1 wins over p2,
# and p2 over p3, when several of them qualify (same order as an if/elif chain).
for rank in ('p3', 'p2', 'p1'):
    is_top_dog = (df_pred_cleaned[rank + '_dog'].astype(bool)
                  & (df_pred_cleaned[rank + '_conf'] == top_conf))
    breed = breed.mask(is_top_dog, df_pred_cleaned[rank])
    confidence = confidence.mask(is_top_dog, df_pred_cleaned[rank + '_conf'])

# Build the reduced table in one step instead of adding columns to the old
# frame and slicing it afterwards.
df_pred_cleaned = df_pred_cleaned[['tweet_id', 'jpg_url']].assign(
    breed=breed, confidence=confidence)

In [None]:
# Spot-check ten random rows of the cleaned predictions (no seed, so rows differ per run).
df_pred_cleaned.sample(10)

In [None]:
# Dtypes and non-null counts of the cleaned predictions.
df_pred_cleaned.info()

In [None]:
# Keep only the columns needed from the API data.
# .copy() makes the result an independent frame, so later column changes do
# not act on a slice of df_api.
df_api_cleaned = df_api[['id', 'retweet_count', 'favorite_count']].copy()

df_api_cleaned.info()

In [None]:
# Rename the id column in df_api_cleaned to tweet_id.
# An explicit rename does not depend on the order or number of columns and
# returns a new frame instead of overwriting .columns in place.
df_api_cleaned = df_api_cleaned.rename(columns={'id': 'tweet_id'})

df_api_cleaned.columns

In [None]:
# Merge the three cleaned tables on tweet_id (default inner join: only tweets
# present in all three are kept).
df = pd.merge(df_arch_cleaned, df_pred_cleaned, on='tweet_id')
df = pd.merge(df, df_api_cleaned, on = 'tweet_id')

# Create the SQLAlchemy engine for the twitter_archive_master database.
engine = create_engine('sqlite:///data/twitter_archive_master.db')

# Store the master frame in table `master`. Replace a table left by an earlier
# run so the database always mirrors `df`; any real failure is raised instead
# of being printed and ignored.
df.to_sql('master', engine, index=False, if_exists='replace')

In [None]:
# Read the master table back from the database.
# parse_dates takes a list (or dict) of column names, not a bare string.
df_master = pd.read_sql('SELECT * FROM master', engine, parse_dates=['timestamp'])

# Restore the categorical dtypes for these three columns after the round trip.
df_master = df_master.astype({'source': 'category',
                              'dog_stage': 'category',
                              'breed': 'category'})
df_master.info()

# Visualizations

In [None]:
# Ratings distribution: bar chart of how often each numerator occurs.

data = df_master.rating_numerator.value_counts()

x = data.index
y = data.values
fig, ax = plt.subplots(figsize=(12, 6))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
g = sns.barplot(x=x, y=y, palette='Blues_d', ax=ax)
ax.set(xlabel='Ratings', ylabel='Frequency', title='Ratings frequency')
plt.show()

In [None]:
# Distribution shown as a box plot.
# NOTE(review): `data` holds value_counts(), i.e. one frequency per distinct
# rating, so the box summarises those frequencies rather than the ratings
# themselves. The outlier check that follows filters on rating_numerator —
# confirm whether plotting df_master.rating_numerator was intended.

data = df_master.rating_numerator.value_counts()

ax = sns.boxplot(data, orient='v', width=.4)
ax.set(xlabel='Ratings', ylabel='Frequency', title='Ratings frequency')
plt.show()

In [None]:
# There appear to be 2 outliers, so investigate them: select the rows whose
# numerator is above 400, keeping only the columns needed to inspect them.

outliers_df = df_master[df_master.rating_numerator > 400][['rating_numerator', 'name', 'jpg_url', 'text']]
outliers_df

In [None]:
# Download, save and display the image of every outlier tweet.
os.makedirs('images', exist_ok=True)

# One subplot column per outlier, so the grid fits however many rows
# outliers_df contains.
n_images = len(outliers_df)
fig = plt.figure()
for position, (index, row) in enumerate(outliers_df.iterrows(), start=1):
    img_response = requests.get(row['jpg_url'])
    # Stop on an HTTP error instead of trying to decode an error page as an image.
    img_response.raise_for_status()
    img = Image.open(BytesIO(img_response.content))
    # File name: <row index>_<rating>_<name>.jpg
    img.save('images/' +  str(index) + '_' + str(row['rating_numerator']) + "_" + str(row['name']) + '.jpg')
    fig.add_subplot(1, n_images, position)
    plt.imshow(img)
    plt.axis("off")
plt.show()

In [None]:
# Scatter plot of the relation between favorites and retweets.
# Points are coloured by rating_numerator; hue_norm=(5, 20) sets the range of
# ratings that the colour scale is normalised over.
ax = sns.scatterplot(x='retweet_count', y='favorite_count', data=df_master,
                     hue='rating_numerator', hue_norm=(5, 20), s=15) 
ax.set(xlabel='Retweet count', ylabel='Favorite count', title='Favorits VS Retweets')
plt.show()

In [None]:
# Same relation with a fitted regression line; small, semi-transparent markers
# keep dense regions readable.
ax = sns.regplot(x='retweet_count', y='favorite_count', data=df_master, color='b', scatter_kws={'s':5, 'alpha':.3}) 
ax.set(xlabel='Retweet count', ylabel='Favorite count', title='Favorits VS Retweets')
plt.show()

In [None]:
# Compare dog stages: number of tweets per stage as a horizontal bar chart.

data = df_master.groupby('dog_stage')['tweet_id'].count()
ax = sns.barplot(x=data.values, y=data.index, palette='Blues_d')
ax.set(title='Dog Stage Counts', xlabel='Count', ylabel='Dog stage')
plt.show()

In [None]:
# Compare tweet sources: number of tweets per source as a horizontal bar chart.

data = df_master.groupby('source')['tweet_id'].count()
ax = sns.barplot(x=data.values, y=data.index, palette='Blues_d')
ax.set(title='Tweet Source Counts', xlabel='Count', ylabel='Tweet source')
plt.show()

In [None]:
# Ten most frequent dog names in the master table, plotted as a bar chart.
# GROUP BY 1 / ORDER BY 2 refer to the select-list positions (name, count).
# NOTE(review): rows with a NULL name are presumably left out only because
# `NULL <> 'A'` does not evaluate to true in SQL — confirm that this is the
# intent of the HAVING clause.
data = pd.read_sql("""SELECT name, COUNT(*) AS count
                        FROM master 
                    GROUP BY 1
                      HAVING name <> 'A'
                    ORDER BY 2 DESC
                       LIMIT 10;
                  """, engine)
fig, ax = plt.subplots(figsize=(12, 6))
g = sns.barplot(x='name', y='count', data=data, palette='Blues_d', ax=ax)
ax.set(ylabel='Count', xlabel='Dog name', title='Most popular dog names')
plt.show()