In [1]:
import pandas as pd
import numpy as np
import json


In [2]:
df_archive = pd.read_csv('twitter-archive-enhanced.csv')
df_archive.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [3]:
df_class = pd.read_csv('twitter_image_class.csv')
df_class.head(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True


In [4]:
df_status = pd.read_csv('twitter_status_dogs.csv')
df_status.head(3)

Unnamed: 0,tweet_id,text_check,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
0,892420643555336193,This is Phineas. He's a mystical boy. Only eve...,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434,8138,
1,892177421306343426,This is Tilly. She's just checking pup on you....,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164,6029,
2,891815181378084864,This is Archie. He is a rare Norwegian Pouncin...,http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,https://twitter.com/dog_rates/status/891815181...,24251,3991,


In [5]:
df_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [6]:
df_class.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [7]:
df_status.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 7 columns):
tweet_id                 2335 non-null int64
text_check               2335 non-null object
dog_picture_url          2335 non-null object
tweet_web_url            2335 non-null object
favourite_count          2335 non-null int64
retweet_count            2335 non-null int64
retweet_status_id_api    2335 non-null object
dtypes: int64(3), object(4)
memory usage: 127.8+ KB


In [8]:
all_columns = pd.Series(list(df_archive) + list(df_class) + list(df_status))
all_columns[all_columns.duplicated()]

17    tweet_id
29    tweet_id
dtype: object

#### Good that names of columns are not duplicated apart from tweet_id on which the tables can be joined;  but there are actually duplicating contents in several columns (eg text in `df_archive` and text_check in `df_status` )

### Tidying/Cleaning

  - joining `df_archive` and `df_status`, which can sit in one table (following cleaning), complying with tidyness protocol; also once they do, it will be easier to examine the following
  - creating `class_df` from `df_class`
  - dropping rows that have no *favourite_rating* and *retween_count* (this is because those tweets were deleted, and where  subsequntly not received from Twitter API
  - checking tweet text is correct for each tweet (comparing two columns that store tweet text, deciding which to keep),
  - similarly checking other potentially duplicated columns, including tweet_url (columns `expanded_urls` and `tweet_web_url`) and retweet_id (columns `retweeted_status_id` and `retween_status_id_api`)
  - converting the `retweeted_status_id` column to integer (np.int64), in line with tweet_id
  - checking ratings (`rating_numerator` and `rating_denominator`) are correctly imported to ratings columns (from tweet text)
  - creating one column for 'dog_stages' (from the four "dummy" columns of `doggo`, `floofer`, `pupper`, `puppo`)
  - checking 'dog stages' are all correctly extracted
  - convert timestamp from string to datetime
  - identifying and dropping rows that are retweets (explicitly not required to be included in the project)
  - checking other cleanliness issues (e.g. ` dog_picture_url` in df_tweet should be the same as `jpg_url` in df_class; check and drop one)
  - renaming columns in `class_clean`
  
It seems that the image classification table `class_clean` (previously `df_class`) could also be joined without causing tidyness issues, that said, since it is such a different "unit" of information, with its own independent source (ie classification model output), will keep it separate for now.

In [9]:
# create TWEET_CLEAN from df_archive and df_status
tweet_clean = pd.merge(df_archive, df_status,
                            on=['tweet_id'], how='left')

#create CLASS_CLEAN from df_class
class_clean = df_class

In [10]:
tweet_clean

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,doggo,floofer,pupper,puppo,text_check,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,,,,,This is Phineas. He's a mystical boy. Only eve...,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,,,,,This is Tilly. She's just checking pup on you....,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,...,,,,,This is Archie. He is a rare Norwegian Pouncin...,http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,https://twitter.com/dog_rates/status/891815181...,24251.0,3991.0,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,...,,,,,This is Darla. She commenced a snooze mid meal...,http://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,https://twitter.com/dog_rates/status/891689557...,40731.0,8304.0,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,...,,,,,This is Franklin. He would like you to stop ca...,http://pbs.twimg.com/media/DF6hr6AVYAAZ8G8.jpg,https://twitter.com/dog_rates/status/891327558...,38968.0,8992.0,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,...,,,,,Here we have a majestic great white breaching ...,http://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg,https://twitter.com/dog_rates/status/891087950...,19575.0,2985.0,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,,,,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",...,,,,,Meet Jax. He enjoys ice cream so much he gets ...,http://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg,https://twitter.com/dog_rates/status/890971913...,11437.0,1979.0,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,...,,,,,When you watch your owner call another dog a g...,http://pbs.twimg.com/media/DFyBag_UQAAhhBC.jpg,https://twitter.com/dog_rates/status/890729181...,63112.0,18106.0,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,,,,https://twitter.com/dog_rates/status/890609185...,...,,,,,This is Zoey. She doesn't want to be one of th...,http://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg,https://twitter.com/dog_rates/status/890609185...,26940.0,4098.0,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,...,doggo,,,,This is Cassie. She is a college pup. Studying...,http://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,https://twitter.com/dog_rates/status/890240255...,30873.0,7078.0,


In [11]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 23 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
text_check                    23

#### Dropping rows that have no favourite_rating and retween_count (this is because those tweets were deleted, and where subsequntly not received from Twitter API

In [12]:
tweet_clean = tweet_clean[tweet_clean.favourite_count.notnull()]

In [13]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2335 entries, 0 to 2355
Data columns (total 23 columns):
tweet_id                      2335 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2335 non-null object
source                        2335 non-null object
text                          2335 non-null object
retweeted_status_id           165 non-null float64
retweeted_status_user_id      165 non-null float64
retweeted_status_timestamp    165 non-null object
expanded_urls                 2276 non-null object
rating_numerator              2335 non-null int64
rating_denominator            2335 non-null int64
name                          2335 non-null object
doggo                         2335 non-null object
floofer                       2335 non-null object
pupper                        2335 non-null object
puppo                         2335 non-null object
text_check                    23

In [14]:
tweet_clean.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,doggo,floofer,pupper,puppo,text_check,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
667,790337589677002753,,,2016-10-23 23:42:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Maggie. She can hear your cells divide. 1...,,,,https://twitter.com/dog_rates/status/790337589...,...,,,,,Meet Maggie. She can hear your cells divide. 1...,http://pbs.twimg.com/media/CvfX2AnWYAAQTay.jpg,https://twitter.com/dog_rates/status/790337589...,8241.0,2005.0,
739,780601303617732608,,,2016-09-27 02:53:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Hercules. He can have whatever he wants f...,,,,https://twitter.com/dog_rates/status/780601303...,...,,,,,Meet Hercules. He can have whatever he wants f...,http://pbs.twimg.com/media/CtVAvX-WIAAcGTf.jpg,https://twitter.com/dog_rates/status/780601303...,12700.0,3568.0,
1371,702276748847800320,,,2016-02-23 23:39:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...","""AND IIIIIIIIIIIEIIIIIIIIIIIII WILL ALWAYS LOV...",,,,https://twitter.com/dog_rates/status/702276748...,...,,,,,"""AND IIIIIIIIIIIEIIIIIIIIIIIII WILL ALWAYS LOV...",http://pbs.twimg.com/media/Cb78-nOWIAENNRc.jpg,https://twitter.com/dog_rates/status/702276748...,2551.0,797.0,
2190,668960084974809088,,,2015-11-24 01:11:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jaycob. He got scared of the vacuum. Hide...,,,,https://twitter.com/dog_rates/status/668960084...,...,,,,,Meet Jaycob. He got scared of the vacuum. Hide...,http://pbs.twimg.com/media/CUifpn4WUAAS5X3.jpg,https://twitter.com/dog_rates/status/668960084...,702.0,240.0,
592,798933969379225600,,,2016-11-16 17:01:16 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Iroh. He's in a predicament. 12/10 som...,,,,https://twitter.com/dog_rates/status/798933969...,...,,,,,This is Iroh. He's in a predicament. 12/10 som...,http://pbs.twimg.com/media/CxZiLcLXUAApMVy.jpg,https://twitter.com/dog_rates/status/798933969...,13843.0,4817.0,


In [15]:
#few checks to see values are sensible
tweet_clean.favourite_count.sort_values().head(5)

1295    51.0
342     68.0
2339    75.0
2298    82.0
2319    99.0
Name: favourite_count, dtype: float64

In [16]:
tweet_clean.favourite_count.sort_values().tail(5)

1079    119452.0
135     120097.0
534     124911.0
413     137875.0
1039    161664.0
Name: favourite_count, dtype: float64

In [17]:
tweet_clean.retweet_count.sort_values().head(5)

291     1.0
342     2.0
1295    2.0
274     2.0
113     3.0
Name: retweet_count, dtype: float64

In [18]:
tweet_clean.retweet_count.sort_values().tail(5)

66      42651.0
413     46757.0
534     59854.0
1079    60745.0
1039    82098.0
Name: retweet_count, dtype: float64

#### Checking tweet text is correct for each tweet (comparing two columns that store tweet text, deciding which to keep)

In [19]:
sum(tweet_clean.text != tweet_clean.text_check)

0

Ok, marvelous, all text from tweets was downloaded correctly;
therefore can just drop one of the text columns, will drop `text_check`

In [20]:
tweet_clean.drop(columns=['text_check'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [21]:
tweet_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,name,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,Phineas,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,Tilly,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,


#### Similarly checking other potentially duplicated columns, including tweet_url (columns `expanded_urls` and `tweet_web_url`) and retweet_id (columns `retweeted_status_id` and  `retween_status_id_api`)

In [22]:
sum(tweet_clean.expanded_urls != tweet_clean.tweet_web_url)

852

Hm, not such a great match..

In [23]:
tweet_clean[tweet_clean.expanded_urls != tweet_clean.tweet_web_url][['expanded_urls','tweet_web_url']]

Unnamed: 0,expanded_urls,tweet_web_url
4,https://twitter.com/dog_rates/status/891327558...,https://twitter.com/dog_rates/status/891327558...
6,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",https://twitter.com/dog_rates/status/890971913...
7,https://twitter.com/dog_rates/status/890729181...,https://twitter.com/dog_rates/status/890729181...
10,https://twitter.com/dog_rates/status/890006608...,https://twitter.com/dog_rates/status/890006608...
13,https://twitter.com/dog_rates/status/889638837...,https://twitter.com/dog_rates/status/889638837...
17,https://twitter.com/dog_rates/status/888804989...,https://twitter.com/dog_rates/status/888804989...
18,https://twitter.com/dog_rates/status/888554962...,https://twitter.com/dog_rates/status/888554962...
20,https://twitter.com/dog_rates/status/888078434...,https://twitter.com/dog_rates/status/888078434...
23,https://twitter.com/dog_rates/status/887473957...,https://twitter.com/dog_rates/status/887473957...
26,https://twitter.com/dog_rates/status/886983233...,https://twitter.com/dog_rates/status/886983233...


In [24]:
tweet_clean[tweet_clean.expanded_urls != tweet_clean.tweet_web_url].loc[2209,['expanded_urls','tweet_web_url']].values

array(['https://twitter.com/dog_rates/status/668623201287675904/photo/1,https://twitter.com/dog_rates/status/668623201287675904/photo/1,https://twitter.com/dog_rates/status/668623201287675904/photo/1,https://twitter.com/dog_rates/status/668623201287675904/photo/1',
       'https://twitter.com/dog_rates/status/668623201287675904/photo/1'],
      dtype=object)

Ok, seems that `tweet_web_url` is more complete and also devoid of repetitions/multiple entries (unlike `expanded_urls` column); threrefore will drop `expanded_urls` column

In [25]:
tweet_clean.drop(columns=['expanded_urls'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [26]:
tweet_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,rating_numerator,...,name,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,13,...,Phineas,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,13,...,Tilly,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,


#### Now checking for duplications in retweet_id (columns `retweeted_status_id` and  `retween_status_id_api`)

In [27]:
#normalising for NaN and 'None'
tweet_clean.retweet_status_id_api[tweet_clean.retweet_status_id_api == 'None'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [28]:
sum(tweet_clean.retweet_status_id_api == 'None')

0

In [29]:
sum(tweet_clean.retweet_status_id_api.isnull())

2170

In [30]:
#Ok, now let's compare the two columns
tweet_clean[tweet_clean.retweeted_status_id != tweet_clean.retweet_status_id_api][['retweeted_status_id','retweet_status_id_api']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2335 entries, 0 to 2355
Data columns (total 2 columns):
retweeted_status_id      165 non-null float64
retweet_status_id_api    165 non-null object
dtypes: float64(1), object(1)
memory usage: 54.7+ KB


In [31]:
#ok, for comparison let's try to use `retweet_status_id_api` values as float (also NaNs are accomodated for that way)
sum(tweet_clean.retweeted_status_id == tweet_clean.retweet_status_id_api.astype(float))

165

In [32]:
temp = tweet_clean.retweet_status_id_api.astype(float)
temp2 = tweet_clean.retweeted_status_id
sum(temp[temp.notnull()] != temp2[temp2.notnull()])

0

#### Ok, so in conclusion, the `retweeted_status_id`  and `retweet_status_id_api` are the same; therefore I will drop the `retweet_status_id_api` column

In [33]:
tweet_clean.drop(columns=['retweet_status_id_api'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [34]:
tweet_clean.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,13,10,Phineas,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,13,10,Tilly,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,12,10,Archie,,,,,http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,https://twitter.com/dog_rates/status/891815181...,24251.0,3991.0


#### For completeness with the `retweeted_status_id` column, will also convert it to integer (np.int64), in line with tweet_id
To deal with NaNs I will first convert all NaNs to 0s, then the column to int64

In [35]:
tweet_clean.retweeted_status_id[tweet_clean.retweeted_status_id.isnull()] = 0
tweet_clean.retweeted_status_id = tweet_clean.retweeted_status_id.astype(np.int64)
#tweet_clean.retweeted_status_id[tweet_clean.retweeted_status_id == 0] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [36]:
tweet_clean.retweeted_status_id.dtype

dtype('int64')

#### Checking ratings are correctly imported to ratings columns (from tweet text)

In [37]:
tweet_clean.loc[0].text

"This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU"

In [38]:
test = tweet_clean.text.str.extract(r'(\d{1,4}[\.]?[\d{1,2}]?)([\s]?[/][\s]?)(\d{2,4})',expand = True)
test

Unnamed: 0,0,1,2
0,13,/,10
1,13,/,10
2,12,/,10
3,13,/,10
4,12,/,10
5,13,/,10
6,13,/,10
7,13,/,10
8,13,/,10
9,14,/,10


In [39]:
sum(test[0].isnull())

1

In [40]:
#let's check where NaNs are coming from
test = test.join(tweet_clean['text'], how='left')
test.to_csv('test.csv')

In [41]:
tweet_clean.rating_numerator.value_counts()

12      554
11      464
10      459
13      340
9       157
8       102
7        55
14       51
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [42]:
test[0].value_counts()

12      554
11      464
10      459
13      340
9       158
8       102
7        54
14       51
5        35
6        32
3        19
4        17
2         9
1         8
75        2
420       2
15        2
0         2
204       1
007       1
960       1
143       1
84        1
27        1
13.5      1
165       1
80        1
99        1
60        1
1776      1
121       1
45        1
9.5       1
666       1
44        1
26        1
182       1
50        1
88        1
20        1
144       1
17        1
Name: 0, dtype: int64

In [43]:
tweet_clean.rating_denominator.value_counts()

10     2313
50        3
80        2
11        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [44]:
test[2].value_counts()

10     2314
50        3
80        2
20        2
11        2
16        1
00        1
170       1
120       1
150       1
90        1
15        1
70        1
130       1
40        1
110       1
Name: 2, dtype: int64

#### Overall quality of extratction looks good (check in CSV for exceptions, because it's much easier to see full text there); 
It seems that the really high values in denominator and occosationally numerator eg 1776 or 960 are actually valid ratings, and may be in some cases ratings of multiple of dogs in one picture.  Evcen 0 in denominator is a valid number, which was given twice, what it seems not to dogs but to random picture sent to @dog_rates. One real exception was that code reads 9.75 as 75.
For now, will not delete any extreme values, because they're valid, will deal with these in visuals/analysis as relevant.

#### Therefore will now attach new ratings to `tweet_clean` dataframe, to carry out comparison between existings ratings there and newly extracted ratings, and to drop or keep columns accordingly.

In [45]:
test.columns

Index([0, 1, 2, 'text'], dtype='object')

In [46]:
tweet_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'rating_numerator', 'rating_denominator',
       'name', 'doggo', 'floofer', 'pupper', 'puppo', 'dog_picture_url',
       'tweet_web_url', 'favourite_count', 'retweet_count'],
      dtype='object')

In [47]:
#drop uncenessary columns from test dataframe
test.drop(columns=[1,'text'], axis = 1, inplace=True)

#rename columns
test.rename(columns={0:'rating_numerator_alt',2:'rating_denominator_alt'}, inplace=True)

#join test to tweet_clean
tweet_clean = tweet_clean.join(test, how='left')

In [48]:
tweet_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,rating_numerator,...,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator_alt,rating_denominator_alt
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,0,,,13,...,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,13,10
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,0,,,13,...,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,13,10


In [49]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2335 entries, 0 to 2355
Data columns (total 22 columns):
tweet_id                      2335 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2335 non-null object
source                        2335 non-null object
text                          2335 non-null object
retweeted_status_id           2335 non-null int64
retweeted_status_user_id      165 non-null float64
retweeted_status_timestamp    165 non-null object
rating_numerator              2335 non-null int64
rating_denominator            2335 non-null int64
name                          2335 non-null object
doggo                         2335 non-null object
floofer                       2335 non-null object
pupper                        2335 non-null object
puppo                         2335 non-null object
dog_picture_url               2335 non-null object
tweet_web_url                 233

#### In order to do the comparison between existing ratings numerator and denominator columns (`rating_numerator` and `rating_denominator`) and newly extracted ones, we will need to convert both to float -- there were some ratings (e.g. 13.5/10) that were not intergers, so I decided to follow this rule

In [50]:
sum(tweet_clean.rating_numerator.astype(float) != tweet_clean.rating_numerator_alt.astype(float))

4

In [51]:
print(tweet_clean[tweet_clean.rating_numerator.astype(float) != tweet_clean.rating_numerator_alt.astype(float)]['text'].values)

['This is Bella. She hopes her smile made you smile. If not, she is also offering you her favorite monkey. 13.5/10 https://t.co/qjrljjt948'
 'Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx'
 "I've been told there's a slight possibility he's checking his mirror. We'll bump to 9.5/10. Still a menace"
 'This is an Albanian 3 1/2 legged  Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv']


In [52]:
tweet_clean[tweet_clean.rating_numerator.astype(float) != tweet_clean.rating_numerator_alt.astype(float)][['rating_numerator','rating_numerator_alt']]

Unnamed: 0,rating_numerator,rating_numerator_alt
45,5,13.5
516,24,
1689,5,9.5
2335,1,9.0


#### Conclusion:
Newly extraceted ratings numerators (in column `rating_numerator_alt`) are the accurate ones for these 4 instances of mismatch

In [53]:
sum(tweet_clean.rating_denominator.astype(float) != tweet_clean.rating_denominator_alt.astype(float))

2

In [54]:
print(tweet_clean[tweet_clean.rating_denominator.astype(float) != tweet_clean.rating_denominator_alt.astype(float)]['text'].values)

['Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx'
 'This is an Albanian 3 1/2 legged  Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv']


In [55]:
tweet_clean[tweet_clean.rating_denominator.astype(float) != tweet_clean.rating_denominator_alt.astype(float)][['rating_denominator','rating_denominator_alt']]

Unnamed: 0,rating_denominator,rating_denominator_alt
516,7,
2335,2,10.0


#### Conclusion:
Newly extraceted ratings denominators (in column `rating_denominator_alt`) are the accurate ones for these 2 instances of mismatch

#### ACTION:
Drop existing ratings columns and replace them with newly extracted ratings columns.

In [56]:
tweet_clean.drop(columns=['rating_numerator','rating_denominator'], axis=1, inplace=True)
tweet_clean.rename(columns={'rating_numerator_alt':'rating_numerator','rating_denominator_alt':'rating_denominator'}, inplace=True)

In [57]:
tweet_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,name,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,0,,,Phineas,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,13,10
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,0,,,Tilly,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,13,10


#### Will also fix the two numerator values that were 75 instead of actual 9.5, and convert all values to Float type, to complete cleaning of this column 

In [58]:
print(tweet_clean[tweet_clean.rating_numerator.astype(float) == 75][['text','rating_numerator']].values)

[["RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…"
  '75']
 ["This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS"
  '75']]


In [59]:
tweet_clean.loc[tweet_clean.rating_numerator.astype(float) == 75,'rating_numerator'] = 9.75

In [60]:
tweet_clean.loc[tweet_clean.rating_numerator.astype(float) == 75,'rating_numerator']

Series([], Name: rating_numerator, dtype: object)

In [61]:
tweet_clean.rating_numerator = tweet_clean.rating_numerator.astype(float)

In [62]:
tweet_clean.rating_denominator = tweet_clean.rating_denominator.astype(float)

#### Will now proceed to check if 'dog stages' were correctly extracted, steps will include:
* Creating one column for 'dog_stages' (from the four "dummy" columns of `doggo`, `floofer`, `pupper`, `puppo`)
* Extracting 'dog stages' anew
* Comparing to existing 'dog stages'
* Identifying and keeping accurate column, dropping inaccurate column

In [63]:
tweet_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'name', 'doggo', 'floofer', 'pupper',
       'puppo', 'dog_picture_url', 'tweet_web_url', 'favourite_count',
       'retweet_count', 'rating_numerator', 'rating_denominator'],
      dtype='object')

In [64]:
tweet_clean.loc[tweet_clean.puppo == 'None','puppo'] = ''
tweet_clean.loc[tweet_clean.doggo == 'None','doggo'] = ''
tweet_clean.loc[tweet_clean.floofer == 'None','floofer'] = ''
tweet_clean.loc[tweet_clean.pupper == 'None','pupper'] = ''

In [65]:
#checking the possible outcome of concatenation
(tweet_clean.puppo+tweet_clean.doggo+tweet_clean.floofer+tweet_clean.pupper).value_counts()

                1959
pupper           244
doggo             80
puppo             29
doggopupper       12
floofer            9
doggofloofer       1
puppodoggo         1
dtype: int64

Ok, looks like the given extraction was not too accurate, but the concatenation did work, so proceeding..

In [66]:
tweet_clean['dog_stages'] = tweet_clean.puppo+tweet_clean.doggo+tweet_clean.floofer+tweet_clean.pupper

In [67]:
#Quick sample check
tweet_clean.sample().values

array([[667065535570550784, nan, nan, '2015-11-18 19:43:11 +0000',
        '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
        'Here we have a Hufflepuff. Loves vest. Eyes wide af. Flaccid tail. Matches carpet. Always a little blurry. 8/10 https://t.co/7JdgVqDnvR',
        0, nan, nan, 'None', '', '', '', '',
        'http://pbs.twimg.com/media/CUHkkJpXIAA2w3n.jpg',
        'https://twitter.com/dog_rates/status/667065535570550784/photo/1',
        163.0, 46.0, 8.0, 10.0, '']], dtype=object)

In [68]:
#Checking out "floofer" class which is actually not defined, but will include it as separate from 'floof' and see what happens
tweet_clean[tweet_clean.floofer != ''].sample().values

array([[800388270626521089, nan, nan, '2016-11-20 17:20:08 +0000',
        '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
        'This is Doc. He takes time out of every day to worship our plant overlords. 12/10 quite the floofer https://t.co/azMneS6Ly5',
        0, nan, nan, 'Doc', '', 'floofer', '', '',
        'http://pbs.twimg.com/media/CxuM3ofWgAE72kK.jpg',
        'https://twitter.com/dog_rates/status/800388270626521089/photo/1',
        11758.0, 3025.0, 12.0, 10.0, 'floofer']], dtype=object)

In [69]:
#extract anew
test = tweet_clean.text.str.extractall(r'([Pp]uppo|[Dd]oggo|[Pp]upper|[Bb]lep|[Ss]noot|[Ff]loofer|[Ff]loof)').unstack()
test[0][0].value_counts()

pupper     262
doggo       88
puppo       37
floof       20
Floof       10
Doggo        8
Pupper       8
Floofer      5
floofer      3
Blep         2
Puppo        1
blep         1
Name: 0, dtype: int64

In [70]:
test = test[0]

In [71]:
#attach new values, lowercase them
test[0] = test[0].str.lower()
test[1] = test[1].str.lower()
test[2] = test[2].str.lower()
test[3] = test[3].str.lower()
test.sample(5)

match,0,1,2,3
351,doggo,,,
331,doggo,,,
1770,pupper,,,
1814,pupper,,,
1220,pupper,,,


In [72]:
test.rename(columns={0:'dog_stages_alt1', 1:'dog_stages_alt2', 2:'dog_stages_alt3', 3:'dog_stages_alt4'}, inplace=True)
tweet_clean = tweet_clean.join(test, how='left')

In [73]:
#Quick sample check
tweet_clean[(tweet_clean.dog_stages_alt1.notnull())&(tweet_clean.dog_stages_alt2.notnull())]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,name,...,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator,dog_stages,dog_stages_alt1,dog_stages_alt2,dog_stages_alt3,dog_stages_alt4
29,886366144734445568,,,2017-07-15 23:25:31 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Roscoe. Another pupper fallen victim t...,0,,,Roscoe,...,https://twitter.com/dog_rates/status/886366144...,20483.0,3071.0,12.0,10.0,pupper,pupper,blep,,
46,883360690899218434,,,2017-07-07 16:22:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Grizzwald. He may be the floofiest floofe...,0,,,Grizzwald,...,https://twitter.com/dog_rates/status/883360690...,21957.0,3561.0,13.0,10.0,floofer,floof,floofer,,
172,858843525470990336,,,2017-05-01 00:40:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I have stumbled puppon a doggo painting party....,0,,,,...,https://twitter.com/dog_rates/status/858843525...,15453.0,3489.0,13.0,10.0,doggo,puppo,doggo,puppo,
191,855851453814013952,,,2017-04-22 18:31:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a puppo participating in the #ScienceMa...,0,,,,...,https://twitter.com/dog_rates/status/855851453...,45582.0,17980.0,13.0,10.0,puppodoggo,puppo,doggo,,
200,854010172552949760,,,2017-04-17 16:34:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...","At first I thought this was a shy doggo, but i...",0,,,,...,https://twitter.com/dog_rates/status/854010172...,16290.0,3204.0,11.0,10.0,doggofloofer,doggo,floofer,,
433,820690176645140481,,,2017-01-15 17:52:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,0,,,,...,https://twitter.com/dog_rates/status/820690176...,12766.0,3448.0,84.0,70.0,,floof,floof,,
460,817777686764523521,,,2017-01-07 16:59:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...","This is Dido. She's playing the lead role in ""...",0,,,Dido,...,https://twitter.com/dog_rates/status/817777686...,11233.0,2866.0,13.0,10.0,doggopupper,pupper,doggo,,
531,808106460588765185,,,2016-12-12 00:29:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have Burke (pupper) and Dexter (doggo)...,0,,,,...,https://twitter.com/dog_rates/status/808106460...,9229.0,2368.0,12.0,10.0,doggopupper,pupper,doggo,pupper,doggo
565,802265048156610565,7.331095e+17,4196984000.0,2016-11-25 21:37:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...","Like doggo, like pupper version 2. Both 11/10 ...",0,,,,...,https://twitter.com/dog_rates/status/802265048...,6960.0,1468.0,11.0,10.0,doggopupper,doggo,pupper,,
575,801115127852503040,,,2016-11-22 17:28:25 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bones. He's being haunted by another d...,0,,,Bones,...,https://twitter.com/dog_rates/status/801115127...,8524.0,2255.0,12.0,10.0,doggopupper,doggo,pupper,,


In [74]:
tweet_clean.loc[822].values

array([770093767776997377, nan, nan, '2016-08-29 03:00:36 +0000',
       '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
       'RT @dog_rates: This is just downright precious af. 12/10 for both pupper and doggo https://t.co/o5J479bZUC',
       741067306818797568, 4196983835.0, '2016-06-10 00:39:48 +0000',
       'just', 'doggo', '', 'pupper', '',
       'http://pbs.twimg.com/media/CkjMx99UoAM2B1a.jpg',
       'https://twitter.com/dog_rates/status/741067306818797568/photo/1',
       9761.0, 3238.0, 12.0, 10.0, 'doggopupper', 'pupper', 'doggo', nan,
       nan], dtype=object)

In [75]:
tweet_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'name', 'doggo', 'floofer', 'pupper',
       'puppo', 'dog_picture_url', 'tweet_web_url', 'favourite_count',
       'retweet_count', 'rating_numerator', 'rating_denominator', 'dog_stages',
       'dog_stages_alt1', 'dog_stages_alt2', 'dog_stages_alt3',
       'dog_stages_alt4'],
      dtype='object')

#### Quick examination shows that the new extraction is more complete than existing one (e.g. picks up on floof, blep; even though its not black and white if there is a reference to floof that the dog stage is floof, sometimes the reference is purely to floofy nature of the dog)
#### Each tweet may contain more than one dog and therefore contain doggo and pupper, or one dog within single tweet can have several stages attributed to him (sometimes inadvertently, through an adjactive that contains dog stage indicator);
#### To really tidy it up one should create a separate table, that should contain tweet_id and dog_stage matches;
#### Also, a serious consideration should be given if floofer and floof is indeed one category;
#### But for here, will simply drop all the columns associated with existing classification ( `doggo`, `floofer`, `pupper`, `puppo` and  `dog_stages`) and leave four columns wiht newly extracted categories (that is `dog_stages_alt1`, `dog_stages_alt2`,`dog_stages_alt3`,`dog_stages_alt4`)

In [76]:
tweet_clean.drop(columns=['doggo','floofer','pupper','puppo','dog_stages'], axis=1, inplace=True)

In [77]:
tweet_clean.sample(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,name,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator,dog_stages_alt1,dog_stages_alt2,dog_stages_alt3,dog_stages_alt4
58,880935762899988482,,,2017-06-30 23:47:07 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Louis. He's crossing. It's a big deal....,0,,,Louis,http://pbs.twimg.com/media/DDm2Z5aXUAEDS2u.jpg,https://twitter.com/dog_rates/status/880935762...,16498.0,2686.0,13.0,10.0,,,,
718,783466772167098368,,,2016-10-05 00:40:09 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Finn. He likes eavesdropping from fili...,0,,,Finn,http://pbs.twimg.com/media/Ct9u3ljW8AEnVIm.jpg,https://twitter.com/dog_rates/status/783466772...,8879.0,2408.0,11.0,10.0,,,,


#### timestamp from string to datetime

In [78]:
# To datetime
tweet_clean.timestamp = pd.to_datetime(tweet_clean.timestamp)

In [79]:
tweet_clean.timestamp.dtype

datetime64[ns, UTC]

In [80]:
tweet_clean.sample(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,name,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator,dog_stages_alt1,dog_stages_alt2,dog_stages_alt3,dog_stages_alt4
785,775085132600442880,,,2016-09-11 21:34:30+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Tucker. He would like a hug. 13/10 som...,0,,,Tucker,http://pbs.twimg.com/media/CsGnz64WYAEIDHJ.jpg,https://twitter.com/dog_rates/status/775085132...,16281.0,5031.0,13.0,10.0,,,,
35,885518971528720385,,,2017-07-13 15:19:09+00:00,"<a href=""http://twitter.com/download/iphone"" r...",I have a new hero and his name is Howard. 14/1...,0,,,,,https://twitter.com/dog_rates/status/830583320...,19777.0,3596.0,14.0,10.0,,,,


#### Identifying and dropping rows that are retweets (explicitly not required to be included in the project)
* any tweets that have retweeted_status_id as non zero value will be retweets, so will select those rows and drop them
* prior will visually cross check with other retweet attributes such as  retweeted_status_user_id, retweeted_status_timestamp
* will also drop the columns that are retweet attributes, as they would in any case be NaNs

In [81]:
tweet_clean[tweet_clean.retweeted_status_id != 0].sample(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,name,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator,dog_stages_alt1,dog_stages_alt2,dog_stages_alt3,dog_stages_alt4
285,838916489579200512,,,2017-03-07 00:57:32+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @KibaDva: I collected all the good dogs!! 1...,838905980628819968,811740800.0,2017-03-07 00:15:46 +0000,,http://pbs.twimg.com/media/C6RkhU6UYAAMDpb.jpg,https://twitter.com/KibaDva/status/83890598062...,738.0,36.0,15.0,10.0,,,,
343,832040443403784192,,,2017-02-16 01:34:34+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Klein. These pics were ...,769940425801170944,4196984000.0,2016-08-28 16:51:16 +0000,Klein,http://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg,https://twitter.com/dog_rates/status/769940425...,32790.0,10240.0,12.0,10.0,,,,
767,777641927919427584,,,2016-09-18 22:54:18+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Arnie. He's a Nova Scot...,750429297815552000,4196984000.0,2016-07-05 20:41:01 +0000,Arnie,http://pbs.twimg.com/media/CmoPdmHW8AAi8BI.jpg,https://twitter.com/dog_rates/status/750429297...,13651.0,4527.0,12.0,10.0,floof,,,


In [82]:
tweet_clean = tweet_clean[tweet_clean.retweeted_status_id == 0]

In [83]:
tweet_clean[tweet_clean.retweeted_status_id != 0].shape

(0, 20)

In [84]:
tweet_clean.drop(columns=['retweeted_status_id','retweeted_status_user_id','retweeted_status_timestamp'], axis=1, inplace=True)

In [85]:
tweet_clean.sample(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,name,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator,dog_stages_alt1,dog_stages_alt2,dog_stages_alt3,dog_stages_alt4
1788,677557565589463040,,,2015-12-17 18:34:46+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Reckless pupper here. Not even looking at road...,,http://pbs.twimg.com/media/CWcrAVQWEAA6QMp.jpg,https://twitter.com/dog_rates/status/677557565...,2490.0,1204.0,10.0,10.0,pupper,pupper,,
257,843856843873095681,,,2017-03-20 16:08:44+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Sadie and Daisy. They do all thei...,Sadie,http://pbs.twimg.com/media/C7X7Ui0XgAA3m19.jpg,https://twitter.com/dog_rates/status/843856843...,21985.0,4814.0,12.0,10.0,,,,
2140,670003130994700288,,,2015-11-26 22:16:09+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Raphael. He is a Baskerville Conquista...,Raphael,http://pbs.twimg.com/media/CUxUSuaW4AAdQzv.jpg,https://twitter.com/dog_rates/status/670003130...,333.0,93.0,10.0,10.0,,,,


In [86]:
tweet_clean.shape

(2170, 17)

#### Check if ` dog_picture_url` in df_tweet should be the same as `jpg_url` in df_class; check and drop one
#### Will check, but ultimatelly, since the algorithm for identifying breeds was run on `jpg_url`, this column is the one to keep (unless one had access to the algorithm and could reidentify the breed)

In [87]:
tweet_clean.shape, class_clean.shape

((2170, 17), (2075, 12))

In [88]:
picture_df = pd.merge(tweet_clean, class_clean,
                            on=['tweet_id'], how='left')

In [89]:
picture_df.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'name', 'dog_picture_url', 'tweet_web_url',
       'favourite_count', 'retweet_count', 'rating_numerator',
       'rating_denominator', 'dog_stages_alt1', 'dog_stages_alt2',
       'dog_stages_alt3', 'dog_stages_alt4', 'jpg_url', 'img_num', 'p1',
       'p1_conf', 'p1_dog', 'p2', 'p2_conf', 'p2_dog', 'p3', 'p3_conf',
       'p3_dog'],
      dtype='object')

In [90]:
picture_df.sample().values

array([[722613351520608256, nan, nan,
        Timestamp('2016-04-20 02:30:23+0000', tz='UTC'),
        '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
        "Meet Schnitzel. He's a Tropicana Floofboop. Getting too big for his favorite basket. 12/10 just so damn fluffy https://t.co/qjd0UJKYUY",
        'Schnitzel', 'http://pbs.twimg.com/media/Cgc9AjMVIAERdUA.jpg',
        'https://twitter.com/dog_rates/status/722613351520608256/photo/1',
        5096.0, 1705.0, 12.0, 10.0, 'floof', nan, nan, nan,
        'https://pbs.twimg.com/media/Cgc9AjMVIAERdUA.jpg', 1.0,
        'Labrador_retriever', 0.5309149999999999, True,
        'golden_retriever', 0.28823000000000004, True, 'chow',
        0.044853699999999996, True]], dtype=object)

In [91]:
picture_df.shape

(2170, 28)

In [92]:
picture_df[picture_df.jpg_url.str.strip('https://') != picture_df.dog_picture_url.str.strip('http://')][['jpg_url','dog_picture_url']]

Unnamed: 0,jpg_url,dog_picture_url
4,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,http://pbs.twimg.com/media/DF6hr6AVYAAZ8G8.jpg
7,https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg,http://pbs.twimg.com/media/DFyBag_UQAAhhBC.jpg
18,https://pbs.twimg.com/media/DFTH_O-UQAACu20.jpg,http://pbs.twimg.com/media/DFTH_OiUMAE-k4M.jpg
22,https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg,http://pbs.twimg.com/media/DFDw2tsUAAEw7XW.jpg
25,https://pbs.twimg.com/media/DE8yicJW0AAAvBJ.jpg,http://pbs.twimg.com/media/DE8yicKXoAAnSF8.jpg
29,,
33,,
34,https://pbs.twimg.com/media/DEi_N9qXYAAgEEw.jpg,http://pbs.twimg.com/media/DEi_N9pXsAERvps.jpg
39,,
44,https://pbs.twimg.com/media/DEF2-_hXoAAs62q.jpg,http://pbs.twimg.com/media/DEF2-_jW0AEAS94.jpg


#### Conclusion -- there are too many different pictures used (probably because a tweet contained more than one pic), therefore will keep both ` dog_picture_url` in df_tweet and `jpg_url` in df_class, whereby  ` dog_picture_url` will refer to one of the dog's pictures, and  `jpg_url` will refer to a specific dog picture used in a classification algorithm

#### Rename columns in `class_clean` for easier comrehension

In [93]:
class_clean.columns

Index(['tweet_id', 'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2',
       'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog'],
      dtype='object')

In [94]:
class_clean.sample().values

array([[750086836815486976,
        'https://pbs.twimg.com/media/Cmf5WLGWYAAcmRw.jpg', 1, 'pug',
        0.9782770000000001, True, 'teddy', 0.00313446, False,
        'Brabancon_griffon', 0.00306149, True]], dtype=object)

In [95]:
# rename jpg_url to better represent its meaning
class_clean.rename(columns={'jpg_url':'dog_picture_used_for_classification',
                            'img_num':'total_image_number',
                            'p1': 'breed_alt1',
                            'p1_conf': 'probability_alt1',
                            'p1_dog': 'classified_as_dog_alt1',
                            'p2':'breed_alt2',
                            'p2_conf':'probability_alt2',
                            'p2_dog':'classified_as_dog_alt2',
                            'p3':'breed_alt3',
                            'p3_conf':'probability_alt3',
                            'p3_dog':'classified_as_dog_alt3',}, inplace=True)

In [96]:
class_clean.columns

Index(['tweet_id', 'dog_picture_used_for_classification', 'total_image_number',
       'breed_alt1', 'probability_alt1', 'classified_as_dog_alt1',
       'breed_alt2', 'probability_alt2', 'classified_as_dog_alt2',
       'breed_alt3', 'probability_alt3', 'classified_as_dog_alt3'],
      dtype='object')

In [97]:
class_clean.breed_alt1.value_counts()

golden_retriever             150
Labrador_retriever           100
Pembroke                      89
Chihuahua                     83
pug                           57
chow                          44
Samoyed                       43
toy_poodle                    39
Pomeranian                    38
malamute                      30
cocker_spaniel                30
French_bulldog                26
miniature_pinscher            23
Chesapeake_Bay_retriever      23
seat_belt                     22
German_shepherd               20
Staffordshire_bullterrier     20
Siberian_husky                20
Cardigan                      19
web_site                      19
beagle                        18
Eskimo_dog                    18
Shetland_sheepdog             18
Maltese_dog                   18
teddy                         18
Shih-Tzu                      17
Lakeland_terrier              17
Rottweiler                    17
kuvasz                        16
Italian_greyhound             16
          

In [98]:
class_clean[class_clean.breed_alt1 == 'web_site']

Unnamed: 0,tweet_id,dog_picture_used_for_classification,total_image_number,breed_alt1,probability_alt1,classified_as_dog_alt1,breed_alt2,probability_alt2,classified_as_dog_alt2,breed_alt3,probability_alt3,classified_as_dog_alt3
94,667550882905632768,https://pbs.twimg.com/media/CUObvUJVEAAnYPF.jpg,1,web_site,0.998258,False,dishwasher,0.000201,False,oscilloscope,0.000142,False
95,667550904950915073,https://pbs.twimg.com/media/CUOb_gUUkAACXdS.jpg,1,web_site,0.999335,False,vizsla,8.1e-05,True,collie,6.9e-05,True
130,668291999406125056,https://pbs.twimg.com/media/CUZABzGW4AE5F0k.jpg,1,web_site,0.995535,False,skunk,0.001363,False,badger,0.000686,False
213,670040295598354432,https://pbs.twimg.com/media/CUx2F6lVEAAvFev.jpg,1,web_site,0.901552,False,borzoi,0.02666,True,Chihuahua,0.012438,True
270,670822709593571328,https://pbs.twimg.com/media/CU89schWIAIHQmA.jpg,1,web_site,0.993887,False,Chihuahua,0.001252,True,menu,0.000599,False
715,685547936038666240,https://pbs.twimg.com/media/CYOONfZW8AA7IOA.jpg,1,web_site,0.923987,False,oscilloscope,0.009712,False,hand-held_computer,0.008769,False
960,705786532653883392,https://pbs.twimg.com/media/Cct1G6vVAAI9ZjF.jpg,1,web_site,0.550294,False,Labrador_retriever,0.148496,True,golden_retriever,0.148482,True
1210,742465774154047488,https://pbs.twimg.com/media/Ck3EribXEAAPhZn.jpg,1,web_site,0.997154,False,comic_book,0.000439,False,desktop_computer,0.000268,False
1317,755206590534418437,https://pbs.twimg.com/media/CnsIT0WWcAAul8V.jpg,1,web_site,0.906673,False,printer,0.0086,False,carton,0.004533,False
1443,775729183532220416,https://pbs.twimg.com/media/CsPxk85XEAAeMQj.jpg,1,web_site,0.989407,False,hand-held_computer,0.002139,False,menu,0.002115,False


#### Ok, not sure how good is the classification, or maybe could have chosen a different picture for classification, but will not dig much; will just lowercase all breads for clenliness

In [99]:
class_clean.breed_alt1 = class_clean.breed_alt1.str.lower()
class_clean.breed_alt2 = class_clean.breed_alt2.str.lower()
class_clean.breed_alt3 = class_clean.breed_alt3.str.lower()

### Last check and save

In [100]:
tweet_clean.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,name,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator,dog_stages_alt1,dog_stages_alt2,dog_stages_alt3,dog_stages_alt4
0,892420643555336193,,,2017-08-01 16:23:56+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,Phineas,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,13.0,10.0,,,,
1,892177421306343426,,,2017-08-01 00:17:27+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,Tilly,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,13.0,10.0,,,,
2,891815181378084864,,,2017-07-31 00:18:03+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,Archie,http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,https://twitter.com/dog_rates/status/891815181...,24251.0,3991.0,12.0,10.0,,,,
3,891689557279858688,,,2017-07-30 15:58:51+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,Darla,http://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,https://twitter.com/dog_rates/status/891689557...,40731.0,8304.0,13.0,10.0,,,,
4,891327558926688256,,,2017-07-29 16:00:24+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,Franklin,http://pbs.twimg.com/media/DF6hr6AVYAAZ8G8.jpg,https://twitter.com/dog_rates/status/891327558...,38968.0,8992.0,12.0,10.0,,,,


In [101]:
class_clean.head()

Unnamed: 0,tweet_id,dog_picture_used_for_classification,total_image_number,breed_alt1,probability_alt1,classified_as_dog_alt1,breed_alt2,probability_alt2,classified_as_dog_alt2,breed_alt3,probability_alt3,classified_as_dog_alt3
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,welsh_springer_spaniel,0.465074,True,collie,0.156665,True,shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,german_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,rottweiler,0.243682,True,doberman,0.154629,True


In [102]:
#write to file

tweet_clean.to_csv('twitter_archive_master.csv',index=False)
class_clean.to_csv('twitter_archive_master_class.csv',index=False)