In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import re
from data_science_toolkit.string_ops import remove_newlines, remove_excess_spaces, normalize_links, lower, custom_replace, html_to_unicode, unicode_to_html
from data_science_toolkit.utils import parallel_compute
import nltk
from unidecode import unidecode
import numpy as np
from collections import defaultdict
from dateutil import relativedelta
from data_science_toolkit.file_ops import write_pkl

In [2]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

# CHANGE THIS WHEN USING A DIFFERENT TWITTER ACCOUNT!!

In [3]:
input_file_name = 'Trump_Tweets.csv'
output_file_name = "trump_tweets_sp500.csv"

In [4]:
trumptweets = pd.read_csv("./tweets/{}".format(input_file_name))
trumptweets = trumptweets.dropna()
# stocks = pd.read_csv('./stocks/spx_preprocessed.csv')
# stocks['Date']= pd.to_datetime(stocks['Date']) 

In [5]:
trumptweets.head(5)

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter Media Studio,Thank you to @MarthaRaddatz and @TerryMoran for a job well done! https://t.co/mcHjqX1K2L,10-27-2019 21:24:55,11176.0,41087,False,1.188567e+18
1,Twitter for iPhone,RT @StateDept: Last night the United States brought the world's number one terrorist leader to justice. President @realDonaldTrump address…,10-27-2019 16:50:08,16384.0,0,True,1.188498e+18
2,Twitter for iPhone,RT @WhiteHouse: Thank you to the service members military leaders and agency officials who were critical to the success of this mission.…,10-27-2019 16:49:45,11357.0,0,True,1.188498e+18
3,Twitter for iPhone,https://t.co/7esnNSoa5D,10-27-2019 16:25:12,25546.0,108756,False,1.188492e+18
4,Twitter for iPhone,https://t.co/yJ0VKdNxHP,10-27-2019 14:31:33,22275.0,76549,False,1.188463e+18


In [6]:
trumptweets = trumptweets.drop(["source", "id_str"], axis=1)

In [7]:
trumptweets['created_at']= pd.to_datetime(trumptweets['created_at'], format="%m-%d-%Y %H:%M:%S") 

In [8]:
trumptweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39096 entries, 0 to 39160
Data columns (total 5 columns):
text              39096 non-null object
created_at        39096 non-null datetime64[ns]
retweet_count     39096 non-null float64
favorite_count    39096 non-null object
is_retweet        39096 non-null object
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 1.8+ MB


### Only keep original tweets

In [9]:
trumptweets['is_retweet'] = (trumptweets['is_retweet'] == 'true').astype(bool)
trumptweets = trumptweets[trumptweets['is_retweet'] == False]

In [10]:
trumptweets = trumptweets.sort_values(by="created_at")
trumptweets = trumptweets.reset_index(drop=True)
# stocks = stocks.sort_values(by="Date")
# stocks = stocks.reset_index(drop=True)

## Easy Text preprocessing
- Convert to lower case
- Convert links to `msciurl`
- Remove 'RT'
- Turn @mentions into `twitmention` (nltk splits the @ and the name)
- Turn #hashtags into `twithashtag` (nltk splits the # and the content)
- Tokenize sentence (split punctuation into its own tokens)
- Replace unicode characters with their ascii equivalents (makes things easier to analyze)
- Remove excess spaces/newlines

In [11]:
def preprocess_tweet(tweet):
    tweet = normalize_links(remove_newlines(html_to_unicode(tweet)), 'msciurl')
    # NLTK by default turns mentions into @ <tag>.  Makes for easier analysis this stops the mentions from being separated, and allows them to be used in the clustering and stuff later...
    tweet = custom_replace(tweet,
                          [re.compile('RT'), re.compile(r"@(?=\S)"), re.compile(r"#(?=\S)")],
                          [' ', 'twitmention', 'twithashtag'])
    tweet = ' '.join([word for sent in nltk.sent_tokenize(tweet) for word in nltk.word_tokenize(sent)])
    return unidecode(lower(remove_excess_spaces(tweet)))

In [12]:
tweets = trumptweets["text"].tolist()
processed_tweets = parallel_compute(tweets, preprocess_tweet)

100%|██████████| 36305/36305 [00:03<00:00, 9909.92it/s] 


In [13]:
trumptweets['preprocessed_text'] = processed_tweets

## Add Other Columns for Analysis

### Hour of day tweet was created

In [14]:
trumptweets["created_hour"] = trumptweets["created_at"].apply(lambda x: x.hour)

### Day of week tweet was created

In [15]:
trumptweets['dow'] = trumptweets['created_at'].dt.dayofweek

### Number of links in tweet

In [16]:
trumptweets['num_links'] = trumptweets.preprocessed_text.apply(lambda tweet: sum([1 if 'msciurl' in word else 0 for word in tweet.split("  ")]))

### Number of words in tweet (ignoring mentions, hashtags, and links)

In [17]:
trumptweets['num_words'] = trumptweets.preprocessed_text.apply(lambda tweet: sum([1 if len(word) > 1 and 'msciurl' not in word and 'twitmention' not in word and 'twithashtag' not in word else 0 for word in tweet.split(" ")]))

## There is one tweets that is messed up... remove it

In [18]:
trumptweets = trumptweets[trumptweets.num_words < 55]

In [19]:
trumptweets.head(5)

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,preprocessed_text,created_hour,dow,num_links,num_words
0,Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!,2009-05-04 18:54:25,253.0,202,False,be sure to tune in and watch donald trump on late night with david letterman as he presents the top ten list tonight !,18,0,0,23
1,Donald Trump will be appearing on The View tomorrow morning to discuss Celebrity Apprentice and his new book Think Like A Champion!,2009-05-05 01:00:10,2.0,3,False,donald trump will be appearing on the view tomorrow morning to discuss celebrity apprentice and his new book think like a champion !,1,1,0,21
2,Donald Trump reads Top Ten Financial Tips on Late Show with David Letterman: http://tinyurl.com/ooafwn - Very funny!,2009-05-08 13:38:08,3.0,2,False,donald trump reads top ten financial tips on late show with david letterman : msciurl - very funny !,13,4,1,15
3,New Blog Post: Celebrity Apprentice Finale and Lessons Learned Along the Way: http://tinyurl.com/qlux5e,2009-05-08 20:40:15,8.0,27,False,new blog post : celebrity apprentice finale and lessons learned along the way : msciurl,20,4,1,12
4,My persona will never be that of a wallflower - I’d rather build walls than cling to them --Donald J. Trump,2009-05-12 14:07:28,1421.0,1950,False,my persona will never be that of a wallflower - i ' d rather build walls than cling to them -- donald j. trump,14,1,0,19


# MORE OR LESS RETWEETS THAN AVG!!

In [20]:
min_year = min([x.year for x in trumptweets['created_at'].tolist()])
max_year = max([x.year for x in trumptweets['created_at'].tolist()])+1

In [21]:
curr_dt = datetime(min_year, 1, 1)
dt_avg = defaultdict(lambda: defaultdict(int))
while curr_dt < datetime(max_year, 1, 1):
    dt_avg[curr_dt.year][curr_dt.month] = np.mean(trumptweets[trumptweets['created_at'].between(curr_dt, curr_dt+relativedelta.relativedelta(months=1))]['retweet_count'].tolist())  
    curr_dt += relativedelta.relativedelta(months=1)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


## Save averages to pkl to read in EDA.ipynb

In [22]:
write_pkl("dt_avg_{}.pkl".format(output_file_name), dict(dt_avg))

In [23]:
greater_than_monthly_avg = list(map(int, [count > dt_avg[curr_date.year][curr_date.month] 
                                          for count, curr_date in zip(trumptweets['retweet_count'].tolist(), trumptweets['created_at'].tolist())]))
trumptweets['above_monthly_avg'] = greater_than_monthly_avg

## Percent Caps

In [24]:
orig = trumptweets['text'].tolist()
percent_caps = []
for tweet in orig:
    words = tweet.split(" ")
    num_allcaps = sum([1 for x in words if x.upper() == x and re.search('[a-zA-Z]', x)])
    percent_caps.append(int(num_allcaps/len(words)*10)/10 if len(words) else 0)

In [25]:
trumptweets['percent_caps'] = percent_caps
print(set(percent_caps))

{0.0, 0.4, 0.1, 0.3, 0.2, 1.0, 0.7, 0.5, 0.6, 0.8, 0.9}


## Num Hashtags

In [26]:
preprocessed = trumptweets['preprocessed_text'].tolist()
num_hashtags = []
for tweet in preprocessed:
    words = tweet.split(" ")
    num_hashtag = sum([1 for x in words if 'twithashtag' in x and re.search('[a-zA-Z]', x)])
    num_hashtags.append(num_hashtag if num_hashtag else 0)

In [27]:
trumptweets['num_hashtags'] = num_hashtags
print(set(num_hashtags))

{0, 1, 2, 3, 4, 5, 6, 7, 8}


## Num Mentions

In [28]:
num_mentions = []
for tweet in preprocessed:
    words = tweet.split(" ")
    num_mention = sum([1 for x in words if 'twitmention' in x and re.search('[a-zA-Z]', x)])
    num_mentions.append(num_mention if num_mention else 0)

In [29]:
trumptweets['num_mentions'] = num_mentions
print(set(num_mentions))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}


### Final Dataframe...

In [30]:
trumptweets.head(5)

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,preprocessed_text,created_hour,dow,num_links,num_words,above_monthly_avg,percent_caps,num_hashtags,num_mentions
0,Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!,2009-05-04 18:54:25,253.0,202,False,be sure to tune in and watch donald trump on late night with david letterman as he presents the top ten list tonight !,18,0,0,23,1,0.0,0,0
1,Donald Trump will be appearing on The View tomorrow morning to discuss Celebrity Apprentice and his new book Think Like A Champion!,2009-05-05 01:00:10,2.0,3,False,donald trump will be appearing on the view tomorrow morning to discuss celebrity apprentice and his new book think like a champion !,1,1,0,21,0,0.0,0,0
2,Donald Trump reads Top Ten Financial Tips on Late Show with David Letterman: http://tinyurl.com/ooafwn - Very funny!,2009-05-08 13:38:08,3.0,2,False,donald trump reads top ten financial tips on late show with david letterman : msciurl - very funny !,13,4,1,15,0,0.0,0,0
3,New Blog Post: Celebrity Apprentice Finale and Lessons Learned Along the Way: http://tinyurl.com/qlux5e,2009-05-08 20:40:15,8.0,27,False,new blog post : celebrity apprentice finale and lessons learned along the way : msciurl,20,4,1,12,0,0.0,0,0
4,My persona will never be that of a wallflower - I’d rather build walls than cling to them --Donald J. Trump,2009-05-12 14:07:28,1421.0,1950,False,my persona will never be that of a wallflower - i ' d rather build walls than cling to them -- donald j. trump,14,1,0,19,1,0.0,0,0


### Save

In [31]:
trumptweets.to_csv("./stocks/{}".format(output_file_name), index=False)