# Clean and analyse tweets
We will clean the Twitter data and then create additional features based on the message text. This code draws heavily on cleaning code I produced previously for Visual Analytics coursework, which in turn was inspired by https://ourcodingclub.github.io/tutorials/topic-modelling-python/

First, load the file of Twitter data created by 1_6 Get Tweets.

In [2]:
import os
import tweepy
import datetime
import pandas as pd
import numpy as np

## 1. Load prefetched Twitter data

In [3]:
# Relative path of the CSV extract of raw tweets to load.
load_file_name = "./DataSources/TwitterData/raw_tweets_20210722.csv"

# Read the whole extract into a DataFrame, then show its (rows, columns)
# shape and the first few rows as a quick sanity check.
all_tweets = pd.read_csv(load_file_name)
print(all_tweets.shape)
all_tweets.head()

(2146, 12)


Unnamed: 0,tweet_id,tweet_date,tweeter_id,tweeter_user_name,tweeter_screen_name,tweeter_location,tweeter_coordinates,message_text,favourite_count,retweet_count,extract_run_date,retrieved_using_search_term
0,1418136545752752128,2021-07-22 09:11:14,2442936783,Jill,witherjay,,,"RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime
1,1418136215862321154,2021-07-22 09:09:56,82400915,mal ashy,MarilynAshy,manchester,,"RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime
2,1418135592165052417,2021-07-22 09:07:27,1097161319822909442,My Friends call me T,Tilly_TeaCup,"North West, England",,"RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime
3,1418135462976294912,2021-07-22 09:06:56,2263728375,Michael Fernandez,Ferdys_Tips,Harrow,,"RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime
4,1418134918346022915,2021-07-22 09:04:46,20128333,Trippy Pip,TrippyPip,"London, England",,"RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime


In [4]:
# Keep every row that has at least one populated column; a row is discarded
# only when all of its columns are NA.
row_has_data = all_tweets.notna().any(axis=1)
all_tweets = all_tweets[row_has_data]

all_tweets.shape

(2146, 12)

In [5]:
# Collect the rows that still contain at least one NaN, then print the
# per-column count of non-null values within that subset.
row_has_nan = all_tweets.isna().any(axis=1)
nan_values = all_tweets.loc[row_has_nan]
print(nan_values.count())

tweet_id                       2145
tweet_date                     2145
tweeter_id                     2145
tweeter_user_name              2145
tweeter_screen_name            2145
tweeter_location               1374
tweeter_coordinates               0
message_text                   2145
favourite_count                2145
retweet_count                  2145
extract_run_date               2145
retrieved_using_search_term    2145
dtype: int64


### Comments
- tweeter_coordinates is nearly always NaN so drop it
- tweeter_location is often NaN so set it to "unknown"

In [6]:
# Remove the coordinates column and label missing locations as 'unknown'.
all_tweets = all_tweets.drop(columns='tweeter_coordinates')
all_tweets['tweeter_location'] = all_tweets['tweeter_location'].fillna('unknown')

# Re-check: gather any rows that still hold a NaN and report the per-column
# non-null counts for them.
still_has_nan = all_tweets.isna().any(axis=1)
nan_values = all_tweets.loc[still_has_nan]
print(nan_values.count())
nan_values.head()

tweet_id                       0
tweet_date                     0
tweeter_id                     0
tweeter_user_name              0
tweeter_screen_name            0
tweeter_location               0
message_text                   0
favourite_count                0
retweet_count                  0
extract_run_date               0
retrieved_using_search_term    0
dtype: int64


Unnamed: 0,tweet_id,tweet_date,tweeter_id,tweeter_user_name,tweeter_screen_name,tweeter_location,message_text,favourite_count,retweet_count,extract_run_date,retrieved_using_search_term


### Comments
No columns contain nulls any more. Now select only the subset of tweets that contain meaningful text, i.e. more than 4 characters.
Also remove rows where the message text consists only of blank spaces.

In [7]:
# Keep only tweets whose message text is longer than 4 characters.
message_length = all_tweets['message_text'].str.len()
all_tweets = all_tweets.loc[message_length.gt(4)]

print(all_tweets.shape)

(2146, 11)


In [8]:
# Drop rows whose message text is missing, empty, or made up solely of
# whitespace.
#
# A single whitespace test replaces the hand-written list of blank strings,
# which skipped some lengths (e.g. exactly two spaces) and so let such rows
# through. It also avoids calling `replace(..., inplace=True)` on a column
# selection, which does not reliably modify the parent DataFrame (it is a
# no-op under pandas Copy-on-Write).
#
# Missing values are treated as empty strings for the purpose of the test;
# the text of the rows that are kept is left untouched (it is not stripped).
has_text = all_tweets['message_text'].fillna('').str.strip().ne('')

# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
all_tweets = all_tweets.loc[has_text].copy()

print(all_tweets.shape)

(2146, 11)


### Strip out noise
Code taken from Visual Analytics_Lab08 (Text)

In [9]:
def preprocess(ReviewText):
    """Strip HTML remnants from a pandas Series of text.

    The following are removed from every element:
      * ``<br/>`` tags
      * anchor elements (``<a ...>...</a>``), matched non-greedily so that
        text lying between two separate links is preserved
      * the HTML entities ``&amp``, ``&gt`` and ``&lt``, together with their
        trailing ``;`` when present
    Non-breaking spaces (``\\xa0``) are converted to ordinary spaces.

    Parameters
    ----------
    ReviewText : pandas.Series
        Series of strings to clean. Missing values pass through unchanged.

    Returns
    -------
    pandas.Series
        A new Series with the same index holding the cleaned text.
    """
    # (pattern, replacement) pairs, applied in order. These are regular
    # expressions, so regex=True is passed explicitly: from pandas 2.0 the
    # default for Series.str.replace is a literal match, under which none of
    # these patterns would ever fire.
    regex_rules = (
        (r"<br/>", ""),
        (r"<a.*?>.*?</a>", ""),
        (r"&amp;?", ""),
        (r"&gt;?", ""),
        (r"&lt;?", ""),
    )
    for pattern, replacement in regex_rules:
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)

    # A plain character swap needs no regex.
    return ReviewText.str.replace("\xa0", " ", regex=False)
# Overwrite the message text column with its cleaned version.
all_tweets['message_text'] = preprocess(all_tweets['message_text'])

### Find where tweets originated

In [10]:
# Build each substring mask once and reuse it for all the tallies below.
locations = all_tweets['tweeter_location']
mentions_london = locations.str.contains('London')
mentions_unknown = locations.str.contains('unknown')

# Tweets whose location text contains 'unknown'.
unknown_location = locations[mentions_unknown].count()

# Tweets whose location text contains neither 'London' nor 'unknown'.
not_london = locations[(~mentions_london) & (~mentions_unknown)].count()

# Tweets whose location text contains 'London', and the distinct spellings.
london_locations = locations[mentions_london]
count_london = london_locations.count()
unique_london = london_locations.unique()

print('Number of tweets where location is not known is {}, while number of tweets originating outside London is {}'.format(unknown_location, not_london))
print('Number of tweets where location is London is {}, and these unique locations are as follows\n'.format(count_london))

unique_london



Number of tweets where location is not known is 771, while number of tweets originating outside London is 1025
Number of tweets where location is London is 350, and these unique locations are as follows



array(['London, England', 'London', 'Central London ',
       'West Ealing, London', 'Barnet, London', 'Custom House, London',
       'London, UK', 'South West London', 'London 1888',
       'London, England, UK', 'South London', 'London, United Kingdom',
       'Kensington, London', 'London Town', 'Sarf’ East London',
       '90, Ebury Street, London SW1W 9QD', 'London & Yorkshire',
       'Croydon, London', 'London ', 'Sarf London', 'London Cali',
       "London via God's Own Country", 'Tottenham, London', 'SE London',
       'Stratford, London', 'Hackney, London', 'London.',
       'City of London, Londoni', ' London', 'Brent, London',
       'Brentford🐝 West London bro.', 'London and Cape Town',
       'London England; Europe -not EU', 'Abbey Rd, London NW6 4DN',
       'Camden Town, London', 'West London ', 'East London',
       'City of London, London', 'London/Edinburgh', 'North London',
       'Westminster, London', 'London(ish)', 'Wandsworth, London',
       'South West Englan

### Now have a look at tweets by time

In [11]:
from datetime import timedelta

# Parse the tweet timestamps into datetimes.
all_tweets['tweet_date_dt'] = pd.to_datetime(all_tweets['tweet_date'])

# Elapsed time of each tweet measured from the earliest tweet in the data.
start_date_time = all_tweets['tweet_date_dt'].min()
all_tweets['duration'] = all_tweets['tweet_date_dt'] - start_date_time

# Express the elapsed time as whole days and whole hours; the cast to int
# discards the fractional part.
for duration_column, one_unit in (
    ('duration_days', timedelta(days=1)),
    ('duration_hours', timedelta(hours=1)),
):
    all_tweets[duration_column] = (all_tweets['duration'] / one_unit).astype(int)

all_tweets.head()

Unnamed: 0,tweet_id,tweet_date,tweeter_id,tweeter_user_name,tweeter_screen_name,tweeter_location,message_text,favourite_count,retweet_count,extract_run_date,retrieved_using_search_term,tweet_date_dt,duration,duration_days,duration_hours
0,1418136545752752128,2021-07-22 09:11:14,2442936783,Jill,witherjay,unknown,"RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime,2021-07-22 09:11:14,8 days 06:57:12,8,198
1,1418136215862321154,2021-07-22 09:09:56,82400915,mal ashy,MarilynAshy,manchester,"RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime,2021-07-22 09:09:56,8 days 06:55:54,8,198
2,1418135592165052417,2021-07-22 09:07:27,1097161319822909442,My Friends call me T,Tilly_TeaCup,"North West, England","RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime,2021-07-22 09:07:27,8 days 06:53:25,8,198
3,1418135462976294912,2021-07-22 09:06:56,2263728375,Michael Fernandez,Ferdys_Tips,Harrow,"RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime,2021-07-22 09:06:56,8 days 06:52:54,8,198
4,1418134918346022915,2021-07-22 09:04:46,20128333,Trippy Pip,TrippyPip,"London, England","RT @MarcherLord1: Schoolboy, aged 7, took knif...",0,10,20210722,London AND knife AND crime,2021-07-22 09:04:46,8 days 06:50:44,8,198


In [12]:
# Number of tweets in each elapsed-hour bucket, as a two-column frame
# (duration_hours, Count).
#
# GroupBy.size() counts the rows per group directly. The previous
# apply(lambda ...) form read the grouping column from inside apply, which
# newer pandas versions deprecate and then exclude from the group frame, and
# it was also slower. Because the grouping key is never NA within a group,
# the counts are identical.
tweets_by_time = (
    all_tweets.groupby("duration_hours")
    .size()
    .reset_index(name='Count')
)
tweets_by_time.tail(10)

Unnamed: 0,duration_hours,Count
171,190,4
172,191,2
173,192,5
174,193,8
175,194,1
176,195,10
177,196,19
178,197,24
179,198,18
180,199,3


In [19]:
# Number of tweets in each elapsed-day bucket, as a two-column frame
# (duration_days, Count).
#
# GroupBy.size() counts the rows per group directly. The previous
# apply(lambda ...) form read the grouping column from inside apply, which
# newer pandas versions deprecate and then exclude from the group frame.
# Because the grouping key is never NA within a group, the counts are
# identical.
tweets_by_day = (
    all_tweets.groupby("duration_days")
    .size()
    .reset_index(name='Count')
)
tweets_by_day.tail(10)

Unnamed: 0,duration_days,Count
0,0,339
1,1,191
2,2,72
3,3,334
4,4,311
5,5,149
6,6,180
7,7,482
8,8,88


In [35]:
import altair as alt

# Turn off Altair's max-rows check on the data passed to charts.
alt.data_transformers.disable_max_rows()

# ---- Left panel: tweet count per elapsed hour, drawn as a line ----
hours_x = alt.X(
    'duration_hours:Q',
    axis=alt.Axis(title='duration (hours)', ticks=False, values=list(range(0, 193, 24))),
)
hours_y = alt.Y(
    'Count:Q',
    axis=alt.Axis(title='count', ticks=False, values=list(range(0, 61, 10))),
)
tweet_by_hours = (
    alt.Chart(tweets_by_time, title='Tweet volume by hour')
    .mark_line(opacity=0.6, color='firebrick')
    .encode(x=hours_x, y=hours_y)
    .properties(width=400, height=300)
)

# ---- Right panel: tweet count per elapsed day, drawn as labelled bars ----
days_x = alt.X(
    'duration_days:Q',
    axis=alt.Axis(title='duration (days)', ticks=False, values=list(range(9))),
)
tweet_by_days_bars = (
    alt.Chart(tweets_by_day, title='Tweet volume by day')
    .mark_bar(opacity=0.6, color='firebrick', size=30)
    .encode(x=days_x, y=alt.Y('Count:Q', axis=None))
)

# The count is written in white, shifted 8px down from the bar top (dy=8).
tweet_by_days_text = (
    alt.Chart(tweets_by_day)
    .mark_text(align='center', baseline='middle', color='white', dy=8)
    .encode(
        x=alt.X('duration_days:Q', axis=None),
        y=alt.Y('Count:Q', axis=None),
        text=alt.Text('Count:Q'),
    )
)

# Overlay the labels on the bars and size the layered chart like the line chart.
tweet_by_days = (tweet_by_days_bars + tweet_by_days_text).properties(width=400, height=300)

# Place the two panels side by side, without grid lines, axis domain lines,
# or a view border.
side_by_side = tweet_by_hours | tweet_by_days
side_by_side.configure_axis(grid=False, domain=False).configure_view(strokeWidth=0)


Unnamed: 0,duration_days,Count
0,0,339
1,1,191
2,2,72
3,3,334
4,4,311
5,5,149
6,6,180
7,7,482
8,8,88
