In [1]:
import pandas as pd
import numpy as np
import json


In [2]:
df_archive = pd.read_csv('twitter-archive-enhanced.csv')
df_archive.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [3]:
df_class = pd.read_csv('twitter_image_class.csv')
df_class.head(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True


In [4]:
df_status = pd.read_csv('twitter_status_dogs.csv')
df_status.head(3)

Unnamed: 0,tweet_id,text_check,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
0,892420643555336193,This is Phineas. He's a mystical boy. Only eve...,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434,8138,
1,892177421306343426,This is Tilly. She's just checking pup on you....,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164,6029,
2,891815181378084864,This is Archie. He is a rare Norwegian Pouncin...,http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,https://twitter.com/dog_rates/status/891815181...,24251,3991,


In [5]:
df_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [6]:
df_class.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [7]:
df_status.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 7 columns):
tweet_id                 2335 non-null int64
text_check               2335 non-null object
dog_picture_url          2335 non-null object
tweet_web_url            2335 non-null object
favourite_count          2335 non-null int64
retweet_count            2335 non-null int64
retweet_status_id_api    2335 non-null object
dtypes: int64(3), object(4)
memory usage: 127.8+ KB


In [8]:
all_columns = pd.Series(list(df_archive) + list(df_class) + list(df_status))
all_columns[all_columns.duplicated()]

17    tweet_id
29    tweet_id
dtype: object

#### Good that names of columns are not duplicated apart from tweet_id on which the tables can be joined;  but there are actually duplicating contents in several columns (eg text in `df_archive` and text_check in `df_status` )

### Tidying

Joining `df_archive` and `df_status`, which can sit in one table (following cleaning), complying with tidyness protocol; also once they do, it will be easier to examine the following
  - dropping rows that have no *favourite_rating* and *retween_count* (this is because those tweets were deleted, and where  subsequntly not received from Twitter API
  - checking tweet text is correct for each tweet (comparing two columns that store tweet text, deciding which to keep),
  - similarly checking other potentially duplicated columns, including tweet_url (columns `expanded_urls` and `tweet_web_url`) and retweet_id (columns `retweeted_status_id` and `retween_status_id_api`)
  - converting the `retweeted_status_id` column to integer (np.int64), in line with tweet_id
  - checking ratings (`rating_numerator` and `rating_denominator`) are correctly imported to ratings columns (from tweet text)
  - creating one column for 'dog_stages' (from the four "dummy" columns of `doggo`, `floofer`, `pupper`, `puppo`)
  - checking 'dog stages' are all correctly extracted
  - identifying and dropping columns that are retweets (explicitly not required to be included in the project)
  - checking other cleanliness issues
  
It seems that the image classification table `df_class` could also be joined without causing tidyness issues, that said, since it is such a different "unit" of information, with its own independent source (ie classification model output), will keep it separate for now.

In [9]:
tweet_clean = pd.merge(df_archive, df_status,
                            on=['tweet_id'], how='left')

In [10]:
tweet_clean

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,doggo,floofer,pupper,puppo,text_check,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,,,,,This is Phineas. He's a mystical boy. Only eve...,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,,,,,This is Tilly. She's just checking pup on you....,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,...,,,,,This is Archie. He is a rare Norwegian Pouncin...,http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,https://twitter.com/dog_rates/status/891815181...,24251.0,3991.0,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,...,,,,,This is Darla. She commenced a snooze mid meal...,http://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,https://twitter.com/dog_rates/status/891689557...,40731.0,8304.0,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,...,,,,,This is Franklin. He would like you to stop ca...,http://pbs.twimg.com/media/DF6hr6AVYAAZ8G8.jpg,https://twitter.com/dog_rates/status/891327558...,38968.0,8992.0,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,...,,,,,Here we have a majestic great white breaching ...,http://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg,https://twitter.com/dog_rates/status/891087950...,19575.0,2985.0,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,,,,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",...,,,,,Meet Jax. He enjoys ice cream so much he gets ...,http://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg,https://twitter.com/dog_rates/status/890971913...,11437.0,1979.0,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,...,,,,,When you watch your owner call another dog a g...,http://pbs.twimg.com/media/DFyBag_UQAAhhBC.jpg,https://twitter.com/dog_rates/status/890729181...,63112.0,18106.0,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,,,,https://twitter.com/dog_rates/status/890609185...,...,,,,,This is Zoey. She doesn't want to be one of th...,http://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg,https://twitter.com/dog_rates/status/890609185...,26940.0,4098.0,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,...,doggo,,,,This is Cassie. She is a college pup. Studying...,http://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,https://twitter.com/dog_rates/status/890240255...,30873.0,7078.0,


In [11]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 23 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
text_check                    23

#### Dropping rows that have no favourite_rating and retween_count (this is because those tweets were deleted, and where subsequntly not received from Twitter API

In [12]:
tweet_clean = tweet_clean[tweet_clean.favourite_count.notnull()]

In [13]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2335 entries, 0 to 2355
Data columns (total 23 columns):
tweet_id                      2335 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2335 non-null object
source                        2335 non-null object
text                          2335 non-null object
retweeted_status_id           165 non-null float64
retweeted_status_user_id      165 non-null float64
retweeted_status_timestamp    165 non-null object
expanded_urls                 2276 non-null object
rating_numerator              2335 non-null int64
rating_denominator            2335 non-null int64
name                          2335 non-null object
doggo                         2335 non-null object
floofer                       2335 non-null object
pupper                        2335 non-null object
puppo                         2335 non-null object
text_check                    23

In [14]:
tweet_clean.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,doggo,floofer,pupper,puppo,text_check,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
701,786051337297522688,7.72743e+17,7.30505e+17,2016-10-12 03:50:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",13/10 for breakdancing puppo @shibbnbot,,,,,...,,,,puppo,13/10 for breakdancing puppo @shibbnbot,,https://twitter.com/dog_rates/status/830583320...,1505.0,165.0,
1471,693993230313091072,,,2016-02-01 03:04:14 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",These lil fellas are the best of friends. 12/1...,,,,https://vine.co/v/i5ETazP5hrm,...,,,,,These lil fellas are the best of friends. 12/1...,,https://twitter.com/dog_rates/status/830583320...,1927.0,423.0,
2350,666050758794694657,,,2015-11-16 00:30:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a truly beautiful English Wilson Staff...,,,,https://twitter.com/dog_rates/status/666050758...,...,,,,,This is a truly beautiful English Wilson Staff...,http://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg,https://twitter.com/dog_rates/status/666050758...,129.0,56.0,
2193,668902994700836864,,,2015-11-23 21:24:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Lambeau. He's a Whistling Haiku from the ...,,,,https://twitter.com/dog_rates/status/668902994...,...,,,,,Meet Lambeau. He's a Whistling Haiku from the ...,http://pbs.twimg.com/media/CUhruUgUAAAa8FQ.jpg,https://twitter.com/dog_rates/status/668902994...,317.0,102.0,
304,836380477523124226,,,2017-02-28 01:00:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Ava. She just blasted off. Streamline ...,,,,https://twitter.com/dog_rates/status/836380477...,...,,,pupper,,This is Ava. She just blasted off. Streamline ...,http://pbs.twimg.com/media/C5trm6iWgAQ22Hw.jpg,https://twitter.com/dog_rates/status/836380477...,15244.0,3127.0,


In [15]:
#few checks to see values are sensible
tweet_clean.favourite_count.sort_values().head(5)

1295    51.0
342     68.0
2339    75.0
2298    82.0
2319    99.0
Name: favourite_count, dtype: float64

In [16]:
tweet_clean.favourite_count.sort_values().tail(5)

1079    119452.0
135     120097.0
534     124911.0
413     137875.0
1039    161664.0
Name: favourite_count, dtype: float64

In [17]:
tweet_clean.retweet_count.sort_values().head(5)

291     1.0
342     2.0
1295    2.0
274     2.0
113     3.0
Name: retweet_count, dtype: float64

In [18]:
tweet_clean.retweet_count.sort_values().tail(5)

66      42651.0
413     46757.0
534     59854.0
1079    60745.0
1039    82098.0
Name: retweet_count, dtype: float64

#### Checking tweet text is correct for each tweet (comparing two columns that store tweet text, deciding which to keep)

In [19]:
sum(tweet_clean.text != tweet_clean.text_check)

0

Ok, marvelous, all text from tweets was downloaded correctly;
therefore can just drop one of the text columns, will drop `text_check`

In [20]:
tweet_clean.drop(columns=['text_check'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [21]:
tweet_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,name,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,Phineas,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,Tilly,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,


#### Similarly checking other potentially duplicated columns, including tweet_url (columns `expanded_urls` and `tweet_web_url`) and retweet_id (columns `retweeted_status_id` and  `retween_status_id_api`)

In [22]:
sum(tweet_clean.expanded_urls != tweet_clean.tweet_web_url)

852

Hm, not such a great match..

In [23]:
tweet_clean[tweet_clean.expanded_urls != tweet_clean.tweet_web_url][['expanded_urls','tweet_web_url']]

Unnamed: 0,expanded_urls,tweet_web_url
4,https://twitter.com/dog_rates/status/891327558...,https://twitter.com/dog_rates/status/891327558...
6,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",https://twitter.com/dog_rates/status/890971913...
7,https://twitter.com/dog_rates/status/890729181...,https://twitter.com/dog_rates/status/890729181...
10,https://twitter.com/dog_rates/status/890006608...,https://twitter.com/dog_rates/status/890006608...
13,https://twitter.com/dog_rates/status/889638837...,https://twitter.com/dog_rates/status/889638837...
17,https://twitter.com/dog_rates/status/888804989...,https://twitter.com/dog_rates/status/888804989...
18,https://twitter.com/dog_rates/status/888554962...,https://twitter.com/dog_rates/status/888554962...
20,https://twitter.com/dog_rates/status/888078434...,https://twitter.com/dog_rates/status/888078434...
23,https://twitter.com/dog_rates/status/887473957...,https://twitter.com/dog_rates/status/887473957...
26,https://twitter.com/dog_rates/status/886983233...,https://twitter.com/dog_rates/status/886983233...


In [24]:
tweet_clean[tweet_clean.expanded_urls != tweet_clean.tweet_web_url].loc[2209,['expanded_urls','tweet_web_url']].values

array(['https://twitter.com/dog_rates/status/668623201287675904/photo/1,https://twitter.com/dog_rates/status/668623201287675904/photo/1,https://twitter.com/dog_rates/status/668623201287675904/photo/1,https://twitter.com/dog_rates/status/668623201287675904/photo/1',
       'https://twitter.com/dog_rates/status/668623201287675904/photo/1'],
      dtype=object)

Ok, seems that `tweet_web_url` is more complete and also devoid of repetitions/multiple entries (unlike `expanded_urls` column); threrefore will drop `expanded_urls` column

In [25]:
tweet_clean.drop(columns=['expanded_urls'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [26]:
tweet_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,rating_numerator,...,name,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,retweet_status_id_api
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,13,...,Phineas,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,13,...,Tilly,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,


#### Now checking for duplications in retweet_id (columns `retweeted_status_id` and  `retween_status_id_api`)

In [27]:
#normalising for NaN and 'None'
tweet_clean.retweet_status_id_api[tweet_clean.retweet_status_id_api == 'None'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [28]:
sum(tweet_clean.retweet_status_id_api == 'None')

0

In [29]:
sum(tweet_clean.retweet_status_id_api.isnull())

2170

In [30]:
#Ok, now let's compare the two columns
tweet_clean[tweet_clean.retweeted_status_id != tweet_clean.retweet_status_id_api][['retweeted_status_id','retweet_status_id_api']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2335 entries, 0 to 2355
Data columns (total 2 columns):
retweeted_status_id      165 non-null float64
retweet_status_id_api    165 non-null object
dtypes: float64(1), object(1)
memory usage: 54.7+ KB


In [31]:
#ok, for comparison let's try to use `retweet_status_id_api` values as float (also NaNs are accomodated for that way)
sum(tweet_clean.retweeted_status_id == tweet_clean.retweet_status_id_api.astype(float))

165

In [32]:
temp = tweet_clean.retweet_status_id_api.astype(float)
temp2 = tweet_clean.retweeted_status_id
sum(temp[temp.notnull()] != temp2[temp2.notnull()])

0

#### Ok, so in conclusion, the `retweeted_status_id`  and `retweet_status_id_api` are the same; therefore I will drop the `retweet_status_id_api` column

In [33]:
tweet_clean.drop(columns=['retweet_status_id_api'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [34]:
tweet_clean.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,13,10,Phineas,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,13,10,Tilly,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,12,10,Archie,,,,,http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,https://twitter.com/dog_rates/status/891815181...,24251.0,3991.0


#### For completeness with the `retweeted_status_id` column, will also convert it to integer (np.int64), in line with tweet_id
To deal with NaNs I will first convert all NaNs to 0s, then the column to int64

In [35]:
tweet_clean.retweeted_status_id[tweet_clean.retweeted_status_id.isnull()] = 0
tweet_clean.retweeted_status_id = tweet_clean.retweeted_status_id.astype(np.int64)
#tweet_clean.retweeted_status_id[tweet_clean.retweeted_status_id == 0] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [36]:
tweet_clean.retweeted_status_id.dtype

dtype('int64')

#### Checking ratings are correctly imported to ratings columns (from tweet text)

In [37]:
tweet_clean.loc[0].text

"This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU"

In [38]:
test = tweet_clean.text.str.extract(r'(\d{1,4}[\.]?[\d{1,2}]?)([\s]?[/][\s]?)(\d{2,4})',expand = True)
test

Unnamed: 0,0,1,2
0,13,/,10
1,13,/,10
2,12,/,10
3,13,/,10
4,12,/,10
5,13,/,10
6,13,/,10
7,13,/,10
8,13,/,10
9,14,/,10


In [39]:
sum(test[0].isnull())

1

In [40]:
#let's check where NaNs are coming from
test = test.join(tweet_clean['text'], how='left')
test.to_csv('test.csv')

In [41]:
tweet_clean.rating_numerator.value_counts()

12      554
11      464
10      459
13      340
9       157
8       102
7        55
14       51
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [42]:
test[0].value_counts()

12      554
11      464
10      459
13      340
9       158
8       102
7        54
14       51
5        35
6        32
3        19
4        17
2         9
1         8
0         2
15        2
420       2
75        2
9.5       1
80        1
007       1
44        1
144       1
27        1
960       1
666       1
17        1
121       1
88        1
1776      1
26        1
60        1
13.5      1
99        1
50        1
45        1
165       1
143       1
20        1
182       1
84        1
204       1
Name: 0, dtype: int64

In [43]:
tweet_clean.rating_denominator.value_counts()

10     2313
50        3
80        2
11        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [44]:
test[2].value_counts()

10     2314
50        3
11        2
20        2
80        2
120       1
16        1
170       1
130       1
90        1
150       1
40        1
70        1
15        1
00        1
110       1
Name: 2, dtype: int64

#### Overall quality of extratction looks good (check in CSV for exceptions, because it's much easier to see full text there); 
It seems that the really high values in denominator and occosationally numerator eg 1776 or 960 are actually valid ratings, and may be in some cases ratings of multiple of dogs in one picture.  Evcen 0 in denominator is a valid number, which was given twice, what it seems not to dogs but to random picture sent to @dog_rates. One real exception was that code reads 9.75 as 75.
For now, will not delete any extreme values, because they're valid, will deal with these in visuals/analysis as relevant.

#### Therefore will now attach new ratings to `tweet_clean` dataframe, to carry out comparison between existings ratings there and newly extracted ratings, and to drop or keep columns accordingly.

In [45]:
test.columns

Index([0, 1, 2, 'text'], dtype='object')

In [46]:
tweet_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'rating_numerator', 'rating_denominator',
       'name', 'doggo', 'floofer', 'pupper', 'puppo', 'dog_picture_url',
       'tweet_web_url', 'favourite_count', 'retweet_count'],
      dtype='object')

In [47]:
#drop uncenessary columns from test dataframe
test.drop(columns=[1,'text'], axis = 1, inplace=True)

#rename columns
test.rename(columns={0:'rating_numerator_alt',2:'rating_denominator_alt'}, inplace=True)

#join test to tweet_clean
tweet_clean = tweet_clean.join(test, how='left')

In [48]:
tweet_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,rating_numerator,...,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator_alt,rating_denominator_alt
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,0,,,13,...,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,13,10
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,0,,,13,...,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,13,10


In [49]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2335 entries, 0 to 2355
Data columns (total 22 columns):
tweet_id                      2335 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2335 non-null object
source                        2335 non-null object
text                          2335 non-null object
retweeted_status_id           2335 non-null int64
retweeted_status_user_id      165 non-null float64
retweeted_status_timestamp    165 non-null object
rating_numerator              2335 non-null int64
rating_denominator            2335 non-null int64
name                          2335 non-null object
doggo                         2335 non-null object
floofer                       2335 non-null object
pupper                        2335 non-null object
puppo                         2335 non-null object
dog_picture_url               2335 non-null object
tweet_web_url                 233

#### In order to do the comparison between existing ratings numerator and denominator columns (`rating_numerator` and `rating_denominator`) and newly extracted ones, we will need to convert both to float -- there were some ratings (e.g. 13.5/10) that were not intergers, so I decided to follow this rule

In [50]:
sum(tweet_clean.rating_numerator.astype(float) != tweet_clean.rating_numerator_alt.astype(float))

4

In [51]:
print(tweet_clean[tweet_clean.rating_numerator.astype(float) != tweet_clean.rating_numerator_alt.astype(float)]['text'].values)

['This is Bella. She hopes her smile made you smile. If not, she is also offering you her favorite monkey. 13.5/10 https://t.co/qjrljjt948'
 'Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx'
 "I've been told there's a slight possibility he's checking his mirror. We'll bump to 9.5/10. Still a menace"
 'This is an Albanian 3 1/2 legged  Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv']


In [52]:
tweet_clean[tweet_clean.rating_numerator.astype(float) != tweet_clean.rating_numerator_alt.astype(float)][['rating_numerator','rating_numerator_alt']]

Unnamed: 0,rating_numerator,rating_numerator_alt
45,5,13.5
516,24,
1689,5,9.5
2335,1,9.0


#### Conclusion:
Newly extraceted ratings numerators (in column `rating_numerator_alt`) are the accurate ones for these 4 instances of mismatch

In [53]:
sum(tweet_clean.rating_denominator.astype(float) != tweet_clean.rating_denominator_alt.astype(float))

2

In [54]:
print(tweet_clean[tweet_clean.rating_denominator.astype(float) != tweet_clean.rating_denominator_alt.astype(float)]['text'].values)

['Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx'
 'This is an Albanian 3 1/2 legged  Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv']


In [55]:
tweet_clean[tweet_clean.rating_denominator.astype(float) != tweet_clean.rating_denominator_alt.astype(float)][['rating_denominator','rating_denominator_alt']]

Unnamed: 0,rating_denominator,rating_denominator_alt
516,7,
2335,2,10.0


#### Conclusion:
Newly extraceted ratings denominators (in column `rating_denominator_alt`) are the accurate ones for these 2 instances of mismatch

#### ACTION:
Drop existing ratings columns and replace them with newly extracted ratings columns.

In [56]:
tweet_clean.drop(columns=['rating_numerator','rating_denominator'], axis=1, inplace=True)
tweet_clean.rename(columns={'rating_numerator_alt':'rating_numerator','rating_denominator_alt':'rating_denominator'}, inplace=True)

In [57]:
tweet_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,name,doggo,floofer,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,0,,,Phineas,,,,,http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,https://twitter.com/dog_rates/status/892420643...,37434.0,8138.0,13,10
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,0,,,Tilly,,,,,http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,https://twitter.com/dog_rates/status/892177421...,32164.0,6029.0,13,10


#### Will also fix the two numerator values that were 75 instead of actual 9.5, and convert all values to Float type, to complete cleaning of this column 

In [58]:
print(tweet_clean[tweet_clean.rating_numerator.astype(float) == 75][['text','rating_numerator']].values)

[["RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…"
  '75']
 ["This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS"
  '75']]


In [59]:
tweet_clean.loc[tweet_clean.rating_numerator.astype(float) == 75,'rating_numerator'] = 9.75

In [60]:
tweet_clean.loc[tweet_clean.rating_numerator.astype(float) == 75,'rating_numerator']

Series([], Name: rating_numerator, dtype: object)

In [61]:
tweet_clean.rating_numerator = tweet_clean.rating_numerator.astype(float)

In [62]:
tweet_clean.rating_denominator = tweet_clean.rating_denominator.astype(float)

#### Will now proceed to check if 'dog stages' were correctly extracted, steps will include:
* Creating one column for 'dog_stages' (from the four "dummy" columns of `doggo`, `floofer`, `pupper`, `puppo`)
* Extracting 'dog stages' anew
* Comparing to existing 'dog stages'
* Identifying and keeping accurate column, dropping inaccurate column

In [63]:
tweet_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'name', 'doggo', 'floofer', 'pupper',
       'puppo', 'dog_picture_url', 'tweet_web_url', 'favourite_count',
       'retweet_count', 'rating_numerator', 'rating_denominator'],
      dtype='object')

In [64]:
tweet_clean.loc[tweet_clean.puppo == 'None','puppo'] = ''
tweet_clean.loc[tweet_clean.doggo == 'None','doggo'] = ''
tweet_clean.loc[tweet_clean.floofer == 'None','floofer'] = ''
tweet_clean.loc[tweet_clean.pupper == 'None','pupper'] = ''

In [65]:
#checking the possible outcome of concatenation
(tweet_clean.puppo+tweet_clean.doggo+tweet_clean.floofer+tweet_clean.pupper).value_counts()

                1959
pupper           244
doggo             80
puppo             29
doggopupper       12
floofer            9
puppodoggo         1
doggofloofer       1
dtype: int64

Ok, looks like the given extraction was not too accurate, but the concatenation did work, so proceeding..

In [66]:
tweet_clean['dog_stages'] = tweet_clean.puppo+tweet_clean.doggo+tweet_clean.floofer+tweet_clean.pupper

In [67]:
#Quick sample check
tweet_clean.sample().values

array([[681981167097122816, nan, nan, '2015-12-29 23:32:35 +0000',
        '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
        "This is Jimothy. He's a Trinidad Poliwhirl. Father was a velociraptor. Exceptionally unamused. 12/10 would adopt https://t.co/VwdIk0OwVx",
        0, nan, nan, 'Jimothy', '', '', '', '',
        'http://pbs.twimg.com/media/CXbiQHmWcAAt6Lm.jpg',
        'https://twitter.com/dog_rates/status/681981167097122816/photo/1',
        2886.0, 1085.0, 12.0, 10.0, '']], dtype=object)

In [68]:
#Checking out "floofer" class which is actually not defined, but will include it as separate from 'floof' and see what happens
tweet_clean[tweet_clean.floofer != ''].sample().values

array([[776218204058357768, nan, nan, '2016-09-15 00:36:55 +0000',
        '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
        "Atlas rolled around in some chalk and now he's a magical rainbow floofer. 13/10 please never take a bath https://t.co/nzqTNw0744",
        0, nan, nan, 'None', '', 'floofer', '', '',
        'http://pbs.twimg.com/media/CsWuVEdWcAAqbe9.jpg',
        'https://twitter.com/dog_rates/status/776218204058357768/photo/1',
        31404.0, 17075.0, 13.0, 10.0, 'floofer']], dtype=object)

In [69]:
#extract anew
dog_stages_full = ['doggo', 'pupper','puppo','blep','snoot','floof','floofer']
test = tweet_clean.text.str.extract(r'([Pp]uppo|[Dd]oggo|[Pp]upper|[Bb]lep|[Ss]noot|[Ff]loofer|[Ff]loof)', expand =True)
test[0].value_counts()

pupper     262
doggo       88
puppo       37
floof       20
Floof       10
Pupper       8
Doggo        8
Floofer      5
floofer      3
Blep         2
Puppo        1
blep         1
Name: 0, dtype: int64

In [70]:
#attach new values, lowercase them
test[0] = test[0].str.lower()
test.rename(columns={0:'dog_stages_alt'}, inplace=True)
tweet_clean = tweet_clean.join(test, how='left')

In [78]:
#Quick sample check
tweet_clean[tweet_clean.dog_stages_alt.notnull()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,name,...,pupper,puppo,dog_picture_url,tweet_web_url,favourite_count,retweet_count,rating_numerator,rating_denominator,dog_stages,dog_stages_alt
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,0,,,Cassie,...,,,http://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,https://twitter.com/dog_rates/status/890240255...,30873.0,7078.0,14.0,10.0,doggo,doggo
12,889665388333682689,,,2017-07-25 01:55:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a puppo that seems to be on the fence a...,0,,,,...,,puppo,http://pbs.twimg.com/media/DFi579UWsAAatzw.jpg,https://twitter.com/dog_rates/status/889665388...,46509.0,9627.0,13.0,10.0,puppo,puppo
14,889531135344209921,,,2017-07-24 17:02:04 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Stuart. He's sporting his favorite fan...,0,,,Stuart,...,,puppo,http://pbs.twimg.com/media/DFg_2PVW0AEHN3p.jpg,https://twitter.com/dog_rates/status/889531135...,14628.0,2163.0,13.0,10.0,puppo,puppo
25,887101392804085760,,,2017-07-18 00:07:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This... is a Jubilant Antarctic House Bear. We...,0,,,,...,,,http://pbs.twimg.com/media/DE-eAq6UwAA-jaE.jpg,https://twitter.com/dog_rates/status/887101392...,29581.0,5723.0,12.0,10.0,,floof
29,886366144734445568,,,2017-07-15 23:25:31 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Roscoe. Another pupper fallen victim t...,0,,,Roscoe,...,pupper,,http://pbs.twimg.com/media/DE0BTnQUwAApKEH.jpg,https://twitter.com/dog_rates/status/886366144...,20483.0,3071.0,12.0,10.0,pupper,pupper
43,884162670584377345,,,2017-07-09 21:29:42 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Yogi. He doesn't have any important dog m...,0,,,Yogi,...,,,http://pbs.twimg.com/media/DEUtQbzW0AUTv_o.jpg,https://twitter.com/dog_rates/status/884162670...,19706.0,2869.0,12.0,10.0,doggo,doggo
46,883360690899218434,,,2017-07-07 16:22:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Grizzwald. He may be the floofiest floofe...,0,,,Grizzwald,...,,,http://pbs.twimg.com/media/DEJT3FeXoAAtwUy.jpg,https://twitter.com/dog_rates/status/883360690...,21957.0,3561.0,13.0,10.0,floofer,floof
49,882762694511734784,,,2017-07-06 00:46:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Gus. He's quite the cheeky pupper. Alr...,0,,,Gus,...,pupper,,http://pbs.twimg.com/media/DEAz_HHXsAA-p_z.jpg,https://twitter.com/dog_rates/status/882762694...,27361.0,4712.0,12.0,10.0,pupper,pupper
53,881906580714921986,,,2017-07-03 16:04:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Rey. He's a Benebop Cumberfloof. 12/10...,0,,,Rey,...,,,http://pbs.twimg.com/media/DD0pWm9XcAAeSBL.jpg,https://twitter.com/dog_rates/status/881906580...,23528.0,3271.0,12.0,10.0,,floof
54,881666595344535552,,,2017-07-03 00:11:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Gary. He couldn't miss this puppertuni...,0,,,Gary,...,,,http://pbs.twimg.com/media/DDxPFwbWAAEbVVR.jpg,https://twitter.com/dog_rates/status/881666595...,48983.0,10269.0,13.0,10.0,,pupper


In [82]:
tweet_clean.loc[61].values

array([880221127280381952, nan, nan, '2017-06-29 00:27:25 +0000',
       '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
       0, nan, nan, 'Jesse', '', '', '', '',
       'http://pbs.twimg.com/media/DDcscbXU0AIfDzs.jpg',
       'https://twitter.com/dog_rates/status/880221127280381952/photo/1',
       26148.0, 4059.0, 12.0, 10.0, '', 'blep'], dtype=object)