In [None]:
import requests
import pandas as pd
import numpy as np

import sys
import sys
!{sys.executable} -m pip install tweepy

import json
import time
import tweepy

In [None]:
pd.options.display.max_colwidth = 100

## Gather
Collect Twitter data from the CSV archive, dog-breed predictions via `requests`, and additional Twitter data through the API.

In [None]:
# Load the tweet archive that ships next to the notebook as a comma-separated file.
twitter_df = pd.read_csv("twitter-archive-enhanced.csv", sep=",")

In [None]:
# Download the image-prediction TSV, cache it on disk, then load it.

predictions_url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
response = requests.get(predictions_url)
# Fail loudly on an HTTP error rather than saving an error page as the TSV
# and hitting a confusing parse failure (or silently wrong data) below.
response.raise_for_status()
with open("image-predictions.tsv", mode="wb") as file:
    file.write(response.content)

predictions_df = pd.read_csv("image-predictions.tsv", delimiter="\t")

In [None]:
# Pull the full status JSON for every archived tweet from the Twitter API and
# append one JSON document per line to tweet_json.txt.

# NOTE(review): placeholders only. Real credentials should not be stored in the
# notebook; read them from the environment or prompt for them instead.
api_key = "api key"
api_secret_key = "api secret"
bearer_token = "bearer"
access_toekn = "access"
access_secret_token = "access secret"

auth = tweepy.AppAuthHandler(api_key, api_secret_key)
try:
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
except TypeError:
    # tweepy is installed unpinned; newer releases (4.x) no longer accept the
    # `wait_on_rate_limit_notify` keyword, so fall back to the remaining option.
    api = tweepy.API(auth, wait_on_rate_limit=True)

# The output file is opened in append mode, so collect the ids that are already
# stored and skip them: re-running this cell must not duplicate records.
already_fetched = set()
try:
    with open("tweet_json.txt") as fi:
        for stored_line in fi:
            if stored_line.strip():
                already_fetched.add(json.loads(stored_line).get("id"))
except FileNotFoundError:
    # First run: nothing has been downloaded yet.
    pass

start = time.time()
print("starting timer")
with open("tweet_json.txt", "a") as fo:
    for tid in twitter_df.tweet_id:
        if int(tid) in already_fetched:
            continue
        try:
            tweet = api.get_status(tid, tweet_mode="extended")._json
            fo.write("\n")
            fo.write(json.dumps(tweet))
        except Exception as e:
            # Best effort: deleted/protected tweets are reported and skipped.
            print (f"Error pulling tweet - {e}")
end = time.time()   
print (f"Ended after { end- start}")

In [None]:
# Read the line-delimited tweet JSON into a data frame holding the tweet id and
# its retweet / favourite counts.

df_list = []
with open("tweet_json.txt") as f:
    for line in f:
        # Skip blank lines (each record is written with a leading newline).
        if line.strip():
            js = json.loads(line)
            tweet = {"tweet_id": js.get("id"),
                "retweet_count": js.get("retweet_count"),
                "favorite_count": js.get("favorite_count")}
            df_list.append(tweet)

# tweet_json.txt is written in append mode, so the same tweet can be stored more
# than once. Keep only the most recently written record per tweet so that later
# merges on tweet_id do not multiply rows.
new_tweet_df = (
    pd.DataFrame(df_list, columns = ['tweet_id', 'retweet_count', 'favorite_count'])
    .drop_duplicates(subset="tweet_id", keep="last")
    .reset_index(drop=True)
)


# Assess and Clean the 3 dataframes

## Assess Twitter

In [None]:
twitter_df.sample(50)

In [None]:
twitter_df.info()

In [None]:
(twitter_df.doggo.unique(), twitter_df.floofer.unique(), twitter_df.pupper.unique(), twitter_df.puppo.unique())

In [None]:
twitter_df.rating_numerator.unique()

In [None]:
twitter_df.rating_denominator.unique()

In [None]:
twitter_df.source.unique()

In [None]:
# Gather [tweet_id, text, numerator, denominator] for every tweet whose
# denominator is one of the unusual values listed below, for manual review.
review_columns = ["tweet_id", "text", "rating_numerator", "rating_denominator"]
unusual_denominators = [ 0,  15,  70,   7,  11, 150, 170,  20,  50,  90,  80,  40, 130, 110,  16, 120,   2]

denom_list = []
for denominator in unusual_denominators:
    filtered = twitter_df[twitter_df.rating_denominator == denominator]
    denom_list.extend(filtered[review_columns].values.tolist())
denom_list

### Twitter data issues

#### cleanliness
* Retweets included (retweeted_status_id populated) 
* Look at timestamp, source fields formatting
* Some tweets do not have 10 denominator eg 704054845121142784.
* Incorrect names for some tweets, e.g. "just" for 770093767776997377
* Dont need columns in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp
* source xml, extract clearer value
* Incorrect score on some tweets - use denom_list to manually remove or change

#### tidiness
* doggo, floofer, pupper, puppo fields can be represented as a single column.
* Tweet information and dog information should be in separate tables.

## Clean Twitter

#### cleanliness
1. Delete retweets and replies (where retweeted_status_id or in_reply_to_status_id is populated)
2. Standardise timestamp
3. Change all denominators to 10.
4. Fix Incorrect names eg on 770093767776997377, change to None or find name where possible
5. remove columns related to retweets
6. Extract the source name from xml
7. remove null urls

#### tidiness

1. Flatten doggo, puppo, pupper and floofer columns
2. Split dog information and tweet details into separate tables

### Cleanliness 1 - remove retweets and replies

In [None]:
# Keep only original tweets: rows that are neither a reply nor a retweet.
twitter_df_copy = twitter_df.copy()
is_original_tweet = (
    twitter_df_copy.in_reply_to_status_id.isnull()
    & twitter_df_copy.retweeted_status_id.isnull()
)
# Take an explicit copy: later cells assign columns on this frame, and doing so
# on a boolean-indexed slice triggers SettingWithCopyWarning / unreliable writes.
filtered_twitter_df = twitter_df_copy[is_original_tweet].copy()

### Test

In [None]:
(twitter_df_copy.count(), filtered_twitter_df.count())

In [None]:
(filtered_twitter_df[~filtered_twitter_df.retweeted_status_id.isnull()].count(), filtered_twitter_df[~filtered_twitter_df.in_reply_to_status_id.isnull()].count())

In [None]:
(twitter_df_copy[~twitter_df_copy.retweeted_status_id.isnull()].count(), twitter_df_copy[~twitter_df_copy.in_reply_to_status_id.isnull()].count())

### Cleanliness 2 - reformat timestamp

In [None]:
# Parse the timestamps as UTC and then drop the timezone to obtain naive
# datetime64[ns] values. Casting tz-aware data straight to 'datetime64[ns]'
# with .astype is rejected by recent pandas versions, so strip the timezone
# explicitly first; the final astype keeps the column dtype as before.
filtered_twitter_df["timestamp"] = (
    pd.to_datetime(filtered_twitter_df["timestamp"], utc=True)
    .dt.tz_localize(None)
    .astype('datetime64[ns]')
)

### Test

In [None]:
filtered_twitter_df.sample(10)

In [None]:
filtered_twitter_df.info()

### Cleanliness 3 - Fix scores

Use denom list above to fix/remove entries

#### Update manually single dog

* 666287406224695296 -> 9/10
* 716439118184652801 -> 11/10
* 682962037429899265 -> 10/10
* 740373189193256964 -> 14/10

#### The following tweets should not be included: the rating is wrong or unclear, or the tweet rates a group of dogs, which could negatively affect predictions.
* 677716515794329600 
* 682808988178739200
* 684222868335505415
* 684225744407494656
* 697463031882764288
* 710658690886586372
* 713900603437621249
* 704054845121142784
* 709198395643068416
* 686035780142297088
* 758467244762497024
* 810984652412424192
* 832088576586297345
* 835246439529840640
* 820690176645140481
* 775096608509886464

In [None]:
denom_list

In [None]:
# remove entries
drop_list = [677716515794329600,682808988178739200,684222868335505415,684225744407494656,697463031882764288,710658690886586372
            ,713900603437621249,704054845121142784,709198395643068416,686035780142297088,758467244762497024
            ,810984652412424192,832088576586297345,835246439529840640,820690176645140481,775096608509886464]
clean_scores_df = filtered_twitter_df[~filtered_twitter_df.tweet_id.isin(drop_list)]

In [None]:
# update scores
chage_list = [[666287406224695296,9],[716439118184652801, 11],[682962037429899265, 10],
[740373189193256964, 14]]

for el in chage_list:
    clean_scores_df.loc[clean_scores_df.tweet_id == el[0], ["rating_numerator", "rating_denominator"]] = el[1], 10

### Test

In [None]:
clean_scores_df[clean_scores_df.tweet_id == 716439118184652801]

In [None]:
clean_scores_df.tweet_id.count()

### Cleanliness 4 - fix incorrect names: replace all lower-case names with None

In [None]:
# Lower-case "names" (a, the, just, ...) are ordinary words picked up from the
# tweet text rather than real dog names; blank them out.
# Build a strictly boolean mask: `.str.islower()` yields NaN for missing names
# (e.g. after this cell has already run once, or when the CSV reader treats
# "None" as missing), and `.loc` refuses a mask that contains NaN.
not_names = clean_scores_df.name.map(lambda value: isinstance(value, str) and value.islower())

clean_scores_df.loc[not_names, "name"] = None

### Test

In [None]:
clean_scores_df[clean_scores_df.name == "a"]

### Cleanliness 5 - remove columns related to retweets

* 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_user_id', 'retweeted_status_timestamp', 'retweeted_status_id'

In [None]:
# Reply / retweet bookkeeping columns are no longer needed once those rows are gone.
reply_and_retweet_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'retweeted_status_id',
]
reduced_df = clean_scores_df.drop(reply_and_retweet_columns, axis=1)

### Test

In [None]:
reduced_df.sample(1)

### Cleanliness 6 - clean source column

* href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     -> Twitter for iPhone
* href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                         -> Vine - Make a Scene
* href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                      -> Twitter Web Client
* href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>     -> TweetDeck

In [None]:
# Each entry pairs the raw anchor markup found in `source` with the readable
# client name it should be replaced by.
replace_list = [['<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'Twitter for iPhone'],
['<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>', 'Vine - Make a Scene'],
['<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'Twitter Web Client'],
['<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>', 'TweetDeck']]

# Replace all values in one pass and assign the result back to the column.
# An in-place replace on `reduced_df.source` operates on a temporary Series and
# is not guaranteed to update the frame (it does not under copy-on-write).
source_names = {markup: label for markup, label in replace_list}
reduced_df["source"] = reduced_df["source"].replace(source_names)

### Test

In [None]:
reduced_df.source.value_counts()

### Cleanliness 7 - Remove null urls

In [None]:
# Keep only the rows that actually carry an expanded URL.
has_url = reduced_df.expanded_urls.notnull()
final_clean_twitter = reduced_df[has_url]

### Test

In [None]:
final_clean_twitter[final_clean_twitter.expanded_urls.isnull()]

### Tidiness 1 flatten dog stage

In [None]:
#Nones are a string, need to change
for col in ["doggo", "floofer", "pupper", "puppo"]:
    reduced_df[col].replace(to_replace="None", value=None, inplace=True)

In [None]:
# back fill with stage
reduced_df['dog_status'] = reduced_df.bfill(axis=1).iloc[:,8]

In [None]:
# Drop the per-stage columns now that `dog_status` holds the stage.
# Also restrict to the rows kept in `final_clean_twitter` (tweets with a
# non-null expanded URL): that filter was computed earlier but never carried
# into the frames built from here on, so the null-URL rows would otherwise
# reappear in the final tables.
reduced_cols = (
    reduced_df.loc[final_clean_twitter.index]
    .drop(columns=["doggo", "floofer", "pupper", "puppo"])
)

### Test

In [None]:
reduced_cols.dog_status.value_counts()

In [None]:
reduced_cols.columns

### Tidiness 2 split into 2 dataframes

In [None]:
# One table for tweet-level details, one for dog-level details; both keep
# tweet_id so they can be joined again.
tweet_columns = ["tweet_id", "timestamp", "source", "text", "expanded_urls"]
dog_columns = ["tweet_id", "name", "rating_numerator", "rating_denominator", "dog_status"]

tweets = reduced_cols[tweet_columns]
dogs = reduced_cols[dog_columns]

### Test

In [None]:
tweets.sample(1)

In [None]:
dogs.sample(1)

## Assess Predictions

In [None]:
predictions_df.info()

In [None]:
predictions_df.sample(50)

In [None]:
predictions_df.describe()

In [None]:
predictions_df.img_num.unique()

In [None]:
predictions_df.p3_dog.value_counts()

## Predictions data issues

#### cleanliness

* columns names p1-p3 not very clear
* predictions names are not in a consistent format, eg some capitalised some with _ and some -

#### tidiness
* predictions should be in a separate table

## Clean Predictions


#### cleanliness
1. Rename p1, p2, p3 columns
2. Rename p1_dog, p2_dog, p3_dog 
3. Rename p1_conf, p2_conf, p3_conf
4. Reformat the values in the p1, p2, p3 columns: change all - to _ and make them all lower case


#### tidiness

1. Each prediction should be one row -> each row should be transform to 3 rows

### Tidy and rename columns at the same time

In [None]:
# Work on a copy so the raw predictions frame is left untouched.
predictions_df_copy = predictions_df.copy()

# Columns that identify one image; every melt below keeps them as id columns.
image_keys = ['tweet_id', 'jpg_url', 'img_num']

# Map the confidence / is-dog column names onto the bare prediction label
# (p1, p2, p3) so all three melted frames share the same `prediction_number`
# values and can be joined on it.
cols_to_rename = {"p1_conf": "p1", "p2_conf": "p2","p3_conf": "p3", "p1_dog": "p1", "p2_dog": "p2","p3_dog": "p3",}

# Predicted breed names: one row per (image, prediction number).
preds_df = predictions_df_copy[image_keys + ['p1', 'p2', 'p3']]
preds_melt = pd.melt(preds_df, id_vars=image_keys, var_name='prediction_number', value_name='breed')

# Confidence values, relabelled to p1..p3 before melting.
preds_conf_cols = predictions_df_copy[image_keys + ['p1_conf', 'p2_conf', 'p3_conf']]
preds_conf_df_rename = preds_conf_cols.rename(columns=cols_to_rename)
preds_conf_melt = pd.melt(preds_conf_df_rename, id_vars=image_keys, var_name='prediction_number', value_name='confidence')

# "Is this prediction a dog breed" flags, relabelled to p1..p3 before melting.
preds_is_dog_cols = predictions_df_copy[image_keys + ['p1_dog', 'p2_dog', 'p3_dog']]
preds_is_dog_rename = preds_is_dog_cols.rename(columns=cols_to_rename)
preds_is_dog_melt = pd.melt(preds_is_dog_rename, id_vars=image_keys, var_name='prediction_number', value_name='is_dog')


In [None]:
# Combine breed, confidence and is-dog frames so each row is one prediction for
# one image, matched on the image identifiers plus the prediction number.
merge_keys = ['tweet_id', 'jpg_url', 'img_num', 'prediction_number']
breed_with_confidence = preds_melt.merge(preds_conf_melt, on=merge_keys)
joined = breed_with_confidence.merge(preds_is_dog_melt, on=merge_keys)

### Test

In [None]:
joined.sample(10)

In [None]:
joined[joined.tweet_id ==754747087846248448]

### Clean breed column - less work now that there is a single column to fix

In [None]:
# Normalise breed labels: lower case, with hyphens turned into underscores.
joined["breed"] = joined["breed"].str.lower().str.replace("-", "_")

### Test

In [None]:
joined[joined.tweet_id ==846042936437604353]

In [None]:
joined[joined.tweet_id ==692901601640583168]

## Twitter API 

Assess

In [None]:
new_tweet_df.sample(10)

In [None]:
new_tweet_df.info()

In [None]:
new_tweet_df.describe()

## Final dataframes:

* joined (predictions single prediction per tweet_id so a tweet id will appear 3 times - 1 for each prediction)
* tweets (tweet details)
* dogs (dog details)
* new_tweet_df (retweet and favourite counts from the API)

In [None]:
# Attach the API retweet / favourite counts to the archived tweet details.
# The default (inner) join keeps only tweets present in both frames.
tweets_joined = pd.merge(tweets, new_tweet_df, on="tweet_id")

In [None]:
tweets_joined.sample(10)

In [None]:
# Persist the three final tables as UTF-8 CSV files without the index column.
csv_outputs = (
    (tweets_joined, "twitter_archive_master.csv"),
    (joined, "predictions.csv"),
    (dogs, "dogs.csv"),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, encoding='utf-8', index=False)

# Analysis and Visualisations

In [None]:
# Five most frequent dog names, ignoring the "None" placeholder.
dog_df = pd.read_csv("dogs.csv", encoding='utf-8')
named_dogs = dog_df[dog_df.name != "None"]
named_dogs.name.value_counts().head(5).index.tolist()

In [None]:
# Top 5 breeds posted, going by the first prediction when it is a dog breed.
# Load the saved long-format predictions under their own name: rebinding
# `predictions_df` would overwrite the raw predictions frame that the cleaning
# cells read, so those cells would fail if re-run afterwards.
predictions_long_df = pd.read_csv("predictions.csv", encoding='utf-8')
prediction_1_is_dog = predictions_long_df[
    (predictions_long_df.prediction_number == "p1") & (predictions_long_df.is_dog == True)
]
prediction_1_is_dog.breed.value_counts()[:5].index.tolist()

In [None]:
# Reload the master tweet table and attach the first-prediction breed to each
# tweet (inner join on tweet_id), as input for the most retweeted / favourited dogs.
tweets_df = pd.read_csv("twitter_archive_master.csv", encoding='utf-8')
tweets_and_dogs = pd.merge(tweets_df, prediction_1_is_dog, on="tweet_id")


In [None]:
#number of tweets
tweets_df.tweet_id.count()

In [None]:
# Date range of the tweets in the master dataset.
# Use `tweets_df` (the table every other figure in this analysis is based on)
# instead of the in-memory `tweets` frame, which can also contain tweets that
# are missing from the master file. Timestamps are parsed because they come
# back from the CSV as strings.
master_timestamps = pd.to_datetime(tweets_df.timestamp)
(master_timestamps.min(), master_timestamps.max())

In [None]:
# Ten tweets with the highest retweet count and ten with the highest favourite count.
by_retweets = tweets_and_dogs.sort_values(by="retweet_count", ascending=False)
by_favourites = tweets_and_dogs.sort_values(by="favorite_count", ascending=False)

retweet = by_retweets.iloc[:10]
fav = by_favourites.iloc[:10]

In [None]:
retweet.merge(dog_df, on="tweet_id")

In [None]:
fav.merge(dog_df, on="tweet_id")

In [None]:
# Bar chart of retweet counts for the rows in `retweet`, labelled by predicted breed.
%matplotlib inline
import matplotlib.pyplot as plt

fig = plt.figure()
# Axes rectangle [left, bottom, width, height] in figure fractions: fills the whole figure.
ax = fig.add_axes([0,0,1,1])
# Rotate the x tick labels so the breed names do not overlap.
plt.xticks(rotation=80)

plt.title("Most Retweeted Dogs")
plt.xlabel("Dog Breed")
plt.ylabel("Number of Retweets")
# NOTE(review): `retweet` holds one row per tweet, so a breed can occur more than
# once; bars sharing a breed label are drawn at the same x position.
ax.bar(retweet.breed, retweet.retweet_count)


In [None]:
# Bar chart of favourite counts for the rows in `fav`, labelled by predicted breed.
fig2 = plt.figure()
# Axes rectangle [left, bottom, width, height] in figure fractions: fills the whole figure.
ax = fig2.add_axes([0,0,1,1])
# Rotate the x tick labels so the breed names do not overlap.
plt.xticks(rotation=80)

plt.title("Most favourited dogs")
plt.xlabel("Dog Breed")
plt.ylabel("Favourite Count")
# NOTE(review): `fav` holds one row per tweet, so a breed can occur more than
# once; bars sharing a breed label are drawn at the same x position.
ax.bar(fav.breed, fav.favorite_count,  color="orange")

In [None]:
# Pie chart of how often each dog stage (`dog_status`) occurs.
# The previously built `labels` array was never passed to the plot, so it has
# been removed; the chart itself is unchanged.
plt.title("Most common dog age")
plt.xlabel("", fontsize=18)
dogs.dog_status.value_counts().plot(kind="pie")