## Data Preprocessing

### read data

In [1]:
import pandas as pd

In [2]:
trump_tweets = pd.read_csv('../data/RAW_trump_tweets_20160101_20200930.csv', parse_dates=['created_at'])
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,I won the debate big based on compilation of p...,2020-10-01 15:14:28,44961,337926,False,1311685923097260034
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-10-01 03:45:25,19616,65721,False,1311512518800470016
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-10-01 03:25:31,29393,0,True,1311507509958471680
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-10-01 03:00:33,15992,63294,False,1311501225423073281
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-10-01 02:59:02,51445,190750,False,1311500843309387781


In [3]:
trump_tweets.dtypes

source                    object
text                      object
created_at        datetime64[ns]
retweet_count              int64
favorite_count             int64
is_retweet                object
id_str                     int64
dtype: object

### convert GMT into US Eastern timezone

In [4]:
import datetime
import pytz

In [5]:
gmt = pytz.timezone('GMT')
us_eastern = pytz.timezone('US/Eastern')

In [6]:
def convert_to_us_eastern(row):
    date = row['created_at']
    date_gmt = gmt.localize(date)
    date_us_eastern = date_gmt.astimezone(us_eastern).tz_localize(None) # remove tz info to obtain naive local time
    return date_us_eastern

In [7]:
trump_tweets['created_at'] = trump_tweets.apply(convert_to_us_eastern, axis=1)

In [8]:
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,I won the debate big based on compilation of p...,2020-10-01 11:14:28,44961,337926,False,1311685923097260034
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781


### filter tweets from 1 Jan 2016 to 30 Sep 2020

In [9]:
start_date = datetime.datetime.strptime('2016-01-01', '%Y-%m-%d')
end_date = datetime.datetime.strptime('2020-09-30', '%Y-%m-%d')

In [10]:
trump_tweets = trump_tweets[(trump_tweets.created_at >= start_date) & (trump_tweets.created_at <= end_date)]

In [11]:
trump_tweets

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
69,Twitter for iPhone,https://t.co/HUSFkHqsyC,2020-09-29 23:57:50,75761,321378,False,1311153253472636928
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123
72,Twitter for iPhone,https://t.co/58ssX7EfUj,2020-09-29 22:35:11,20658,65014,False,1311132452853706752
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885
...,...,...,...,...,...,...,...
27059,Twitter for Android,Well the year has officially begun. I have man...,2016-01-01 18:00:09,2642,8495,False,683060169677344768
27060,Twitter for Android,@sprinklermanus: @CNN @realDonaldTrump they're...,2016-01-01 16:29:56,933,3330,False,683037464504745985
27061,Twitter for Android,@jallenaip: Hillary said she was in a Fog of W...,2016-01-01 01:08:06,2721,7490,False,682805477168779264
27062,Twitter for iPhone,Happy New Year from #MarALago! Thank you to my...,2016-01-01 01:07:28,1948,8258,False,682805320217980929


### remove links

In [12]:
import re

In [13]:
def remove_links(row):
    tweet = row['text']
    tweet = re.sub(r"http\S+", "", tweet)
    return tweet

In [14]:
trump_tweets['text'] = trump_tweets.apply(remove_links, axis=1)

In [15]:
trump_tweets

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
69,Twitter for iPhone,,2020-09-29 23:57:50,75761,321378,False,1311153253472636928
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123
72,Twitter for iPhone,,2020-09-29 22:35:11,20658,65014,False,1311132452853706752
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885
...,...,...,...,...,...,...,...
27059,Twitter for Android,Well the year has officially begun. I have man...,2016-01-01 18:00:09,2642,8495,False,683060169677344768
27060,Twitter for Android,@sprinklermanus: @CNN @realDonaldTrump they're...,2016-01-01 16:29:56,933,3330,False,683037464504745985
27061,Twitter for Android,@jallenaip: Hillary said she was in a Fog of W...,2016-01-01 01:08:06,2721,7490,False,682805477168779264
27062,Twitter for iPhone,Happy New Year from #MarALago! Thank you to my...,2016-01-01 01:07:28,1948,8258,False,682805320217980929


### extract hashtags

In [16]:
# create a new column to store the hashtags
trump_tweets['hashtag'] = trump_tweets['text'].apply(lambda x: re.findall(r'\B#\w*[a-zA-Z]+\w*', x))

In [17]:
trump_tweets

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag
69,Twitter for iPhone,,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[]
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[]
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[]
72,Twitter for iPhone,,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[]
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain]
...,...,...,...,...,...,...,...,...
27059,Twitter for Android,Well the year has officially begun. I have man...,2016-01-01 18:00:09,2642,8495,False,683060169677344768,[]
27060,Twitter for Android,@sprinklermanus: @CNN @realDonaldTrump they're...,2016-01-01 16:29:56,933,3330,False,683037464504745985,[]
27061,Twitter for Android,@jallenaip: Hillary said she was in a Fog of W...,2016-01-01 01:08:06,2721,7490,False,682805477168779264,[]
27062,Twitter for iPhone,Happy New Year from #MarALago! Thank you to my...,2016-01-01 01:07:28,1948,8258,False,682805320217980929,[#MarALago]


### extract mentions

In [18]:
# extract mentions
# for retweets, this regex expression only extracts mentions, but ignores the user handle of the OP
# e.g. the regex expression on "RT @realDonaldTrump: Biden for resident! @JoeBiden" will extract @JoeBiden but not @realDonaldTrump
# as @realDonaldTrump is the OP of the tweet that has been retweeted, rather than a direct mention in the retweet.
trump_tweets['mention'] = trump_tweets['text'].apply(lambda x: re.findall(r'(?<!RT\s)@[a-zA-Z0-9]{1,15}', x))

In [19]:
trump_tweets

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention
69,Twitter for iPhone,,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[]
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[]
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[]
72,Twitter for iPhone,,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[]
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[]
...,...,...,...,...,...,...,...,...,...
27059,Twitter for Android,Well the year has officially begun. I have man...,2016-01-01 18:00:09,2642,8495,False,683060169677344768,[],[]
27060,Twitter for Android,@sprinklermanus: @CNN @realDonaldTrump they're...,2016-01-01 16:29:56,933,3330,False,683037464504745985,[],"[@sprinklermanus, @CNN, @realDonaldTrump]"
27061,Twitter for Android,@jallenaip: Hillary said she was in a Fog of W...,2016-01-01 01:08:06,2721,7490,False,682805477168779264,[],[@jallenaip]
27062,Twitter for iPhone,Happy New Year from #MarALago! Thank you to my...,2016-01-01 01:07:28,1948,8258,False,682805320217980929,[#MarALago],[]


### drop empty text

In [20]:
# after removal of links, some tweets do not contain any text or only contains empty space. Such occurrences are removed.
# firstly, remove extra white spaces using strip()
trump_tweets['text'] = trump_tweets['text'].apply(lambda x: x.strip())

In [21]:
# then, drop rows with no content in their tweets
trump_tweets = trump_tweets[trump_tweets.text != '']

### reset index

In [22]:
trump_tweets = trump_tweets.reset_index().drop(columns=['index'])
trump_tweets

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention
0,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[]
1,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[]
2,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[]
3,Twitter for iPhone,“EPA: One Trillion Trees Initiative will build...,2020-09-29 22:26:18,10408,36582,False,1311130217348345856,[],[]
4,Twitter for iPhone,Joe Biden could not name a single Law Enforcem...,2020-09-29 22:15:30,4988,20767,False,1311127502954196993,[],[]
...,...,...,...,...,...,...,...,...,...
26074,Twitter for Android,Well the year has officially begun. I have man...,2016-01-01 18:00:09,2642,8495,False,683060169677344768,[],[]
26075,Twitter for Android,@sprinklermanus: @CNN @realDonaldTrump they're...,2016-01-01 16:29:56,933,3330,False,683037464504745985,[],"[@sprinklermanus, @CNN, @realDonaldTrump]"
26076,Twitter for Android,@jallenaip: Hillary said she was in a Fog of W...,2016-01-01 01:08:06,2721,7490,False,682805477168779264,[],[@jallenaip]
26077,Twitter for iPhone,Happy New Year from #MarALago! Thank you to my...,2016-01-01 01:07:28,1948,8258,False,682805320217980929,[#MarALago],[]


### save cleaned data

In [23]:
trump_tweets.to_csv('../data/CLEANED_trump_tweets_20160101_20200930.csv', index=False)