In [1]:
import bisect
import re
from collections import Counter
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

In [2]:
# Load the raw tweets, discarding any row that has a missing field.
trumptweets = pd.read_csv('./tweets/Trump_Tweets.csv').dropna()
# Load the stock table and parse its Date column into datetimes.
stocks = pd.read_csv('./stocks/spx_preprocessed.csv')
stocks = stocks.assign(Date=pd.to_datetime(stocks['Date']))

In [3]:
# Preview the first five tweets (five is the default for head()).
trumptweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter Media Studio,Thank you to @MarthaRaddatz and @TerryMoran fo...,10-27-2019 21:24:55,11176.0,41087,False,1.188567e+18
1,Twitter for iPhone,RT @StateDept: Last night the United States br...,10-27-2019 16:50:08,16384.0,0,True,1.188498e+18
2,Twitter for iPhone,RT @WhiteHouse: Thank you to the service membe...,10-27-2019 16:49:45,11357.0,0,True,1.188498e+18
3,Twitter for iPhone,https://t.co/7esnNSoa5D,10-27-2019 16:25:12,25546.0,108756,False,1.188492e+18
4,Twitter for iPhone,https://t.co/yJ0VKdNxHP,10-27-2019 14:31:33,22275.0,76549,False,1.188463e+18


In [4]:
# These two columns are not used anywhere below.
trumptweets = trumptweets.drop(columns=["source", "id_str"])

In [5]:
# Tweet timestamps are stored as month-day-year strings; parse them with an explicit format.
trumptweets = trumptweets.assign(
    created_at=pd.to_datetime(
        trumptweets['created_at'],
        format="%m-%d-%Y %H:%M:%S",
    )
)

In [6]:
# Inspect column dtypes and non-null counts after the column drop and timestamp parsing.
trumptweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39096 entries, 0 to 39160
Data columns (total 5 columns):
text              39096 non-null object
created_at        39096 non-null datetime64[ns]
retweet_count     39096 non-null float64
favorite_count    39096 non-null object
is_retweet        39096 non-null object
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 1.8+ MB


### We only want tweets where there are associated stock prices

In [7]:
# Keep only tweets that fall inside the stock date range, padded by one day on each side.
one_day = timedelta(days=1)
window_start = stocks["Date"].min() - one_day
window_end = stocks["Date"].max() + one_day
in_window = trumptweets["created_at"].between(str(window_start), str(window_end))
trumptweets = trumptweets[in_window]

In [8]:
# Order both tables chronologically and give each a fresh 0..n-1 index.
trumptweets = trumptweets.sort_values(by="created_at").reset_index(drop=True)
stocks = stocks.sort_values(by="Date").reset_index(drop=True)

In [9]:
# Preview the first ten trading days.
stocks.head(n=10)

Unnamed: 0,Date,Open,Close,1 Day Open,1 Day Close,2 Day Open,2 Day Close,EOW Close,SOW Open
0,2010-01-04,1116.560059,1132.98999,1132.660034,1136.52002,1135.709961,1137.140015,1144.97998,1145.959961
1,2010-01-05,1132.660034,1136.52002,1135.709961,1137.140015,1136.27002,1141.689941,1144.97998,1145.959961
2,2010-01-06,1135.709961,1137.140015,1136.27002,1141.689941,1140.52002,1144.97998,1144.97998,1145.959961
3,2010-01-07,1136.27002,1141.689941,1140.52002,1144.97998,1145.959961,1146.97998,1144.97998,1145.959961
4,2010-01-08,1140.52002,1144.97998,1145.959961,1146.97998,1143.810059,1136.219971,1144.97998,1145.959961
5,2010-01-11,1145.959961,1146.97998,1143.810059,1136.219971,1137.310059,1145.680054,1136.030029,1136.030029
6,2010-01-12,1143.810059,1136.219971,1137.310059,1145.680054,1145.680054,1148.459961,1136.030029,1136.030029
7,2010-01-13,1137.310059,1145.680054,1145.680054,1148.459961,1147.719971,1136.030029,1136.030029,1136.030029
8,2010-01-14,1145.680054,1148.459961,1147.719971,1136.030029,1136.030029,1150.22998,1136.030029,1136.030029
9,2010-01-15,1147.719971,1136.030029,1136.030029,1150.22998,1147.949951,1138.040039,1136.030029,1136.030029


### Search a sorted list for the closest value, rounding up or down as requested when there is no exact match

In [10]:
def get_closest_value(arr, target, round_dir):
    """Return the element of the ascending-sorted ``arr`` next to ``target``.

    Args:
        arr: Non-empty sequence sorted in ascending order; must support
            ``len()`` and integer indexing.
        target: Value to locate; must be comparable with the elements of ``arr``.
        round_dir: ``"down"`` (case-insensitive) selects the largest element
            below ``target``; any other string selects the smallest element
            above it. Only consulted when ``target`` lies strictly between
            two elements.

    Returns:
        * ``arr[-1]`` when ``target`` is at or beyond the last element, and
          ``arr[0]`` when it is at or before the first (clamping happens
          regardless of ``round_dir``);
        * the matching element when ``target`` is present in ``arr``;
        * otherwise the lower or upper neighbour, according to ``round_dir``.

    Raises:
        IndexError: if ``arr`` is empty.
    """
    n = len(arr)
    if n == 0:
        raise IndexError("get_closest_value() requires a non-empty sequence")

    # Edge case - last or above all.
    if target >= arr[n - 1]:
        return arr[n - 1]
    # Edge case - first or below all.
    if target <= arr[0]:
        return arr[0]

    # Here arr[0] < target < arr[n - 1], so the insertion point satisfies
    # 1 <= pos <= n - 1 and both arr[pos - 1] and arr[pos] exist.
    pos = bisect.bisect_left(arr, target)
    if arr[pos] == target:
        return arr[pos]
    # arr[pos - 1] < target < arr[pos]: pick the neighbour on the requested side.
    return arr[pos - 1] if round_dir.lower() == "down" else arr[pos]


# find_closest
# Chooses between the two candidates that bracket the target. An exact
# match wins; otherwise the choice is made purely by round_dir, not by
# which candidate is numerically nearer.
def find_closest(val1, val2, target, round_dir):
    """Pick ``val1`` or ``val2`` for ``target`` according to ``round_dir``.

    Returns whichever candidate equals ``target`` (``val1`` is checked
    first). When neither matches, returns the smaller candidate if
    ``round_dir`` is ``"down"`` (case-insensitive) and the larger one for
    any other value.
    """
    for candidate in (val1, val2):
        if candidate == target:
            return candidate
    pick = min if round_dir.lower() == "down" else max
    return pick(val1, val2)

### Find stock date/metric directly before/after the tweet was made (this should be the nearest timestep)

In [11]:
tweet_dates = trumptweets["created_at"].tolist()
stock_dates = stocks["Date"].tolist()
before_dates, before_metrics, after_dates, after_metrics = [], [], [], []

one_day = timedelta(days=1)
midnight = datetime.min.time()

for tweet_date in tweet_dates:
    tweet_day = tweet_date.date()
    early = tweet_date.hour < 9
    late = (not early) and tweet_date.hour >= 17

    # Calendar day to look up on each side of the tweet:
    #   before 09:00 -> previous day (before) / same day (after)
    #   from 17:00   -> same day (before)     / next day (after)
    #   otherwise    -> same day on both sides
    before_day = tweet_day - one_day if early else tweet_day
    after_day = tweet_day + one_day if late else tweet_day

    before_stock_date = get_closest_value(stock_dates, datetime.combine(before_day, midnight), "down")
    after_stock_date = get_closest_value(stock_dates, datetime.combine(after_day, midnight), "up")

    if early or late:
        # The "before" price is a close and the "after" price is an open.
        before_metric, after_metric = "Close", "Open"
    else:
        # If the tweet's own day is a stock date, use that day's open (before)
        # and close (after); otherwise fall back to the neighbouring stock
        # date's close (before) and open (after).
        before_metric = "Open" if before_stock_date.date() == tweet_day else "Close"
        after_metric = "Close" if after_stock_date.date() == tweet_day else "Open"

    before_dates.append(before_stock_date)
    before_metrics.append(before_metric)
    after_dates.append(after_stock_date)
    after_metrics.append(after_metric)

In [12]:
# Assign the lists positionally. Wrapping them in pd.Series would align on
# index labels instead, which silently produces NaN if trumptweets does not
# carry a fresh 0..n-1 index; a plain list raises on a length mismatch.
trumptweets["before_date"] = before_dates
trumptweets["before_metric"] = before_metrics
trumptweets["after_date"] = after_dates
trumptweets["after_metric"] = after_metrics

### We got them all (yeet)

In [13]:
# Show every row that has at least one missing value (none expected).
has_missing = trumptweets.isna().any(axis="columns")
trumptweets.loc[has_missing]

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,before_date,before_metric,after_date,after_metric


### Get price associated with before/after dates and metrics

In [14]:
# Bring in the Open/Close of each tweet's "before" stock date.
tweet_stock = pd.merge(left=trumptweets, right=stocks[["Date", "Open", "Close"]], how='left', left_on='before_date', right_on='Date')
# Select the price named by before_metric directly. Multiplying Open and
# Close by 0/1 indicator columns would let a NaN/inf in the *unselected*
# column poison the result (NaN * 0 is NaN). before_metric only ever holds
# "Open" or "Close", so anything that is not "Open" takes the Close price.
tweet_stock['before_price'] = np.where(tweet_stock['before_metric'] == "Open", tweet_stock["Open"], tweet_stock["Close"])
# Drop the merge helpers so the next merge can add Open/Close/Date again.
tweet_stock = tweet_stock.drop(columns=["Open", "Close", "Date"])

In [15]:
# Bring in every stock column for each tweet's "after" stock date; the
# look-ahead columns (1 Day Open, ..., SOW Open) stay on the frame.
tweet_stock = pd.merge(left=tweet_stock, right=stocks, how='left', left_on='after_date', right_on='Date')
# Select the price named by after_metric directly. Multiplying Open and
# Close by 0/1 indicator columns would let a NaN/inf in the *unselected*
# column poison the result (NaN * 0 is NaN). after_metric only ever holds
# "Open" or "Close", so anything that is not "Open" takes the Close price.
tweet_stock["after_price"] = np.where(tweet_stock['after_metric'] == "Open", tweet_stock["Open"], tweet_stock["Close"])
# Drop the merge helpers now that after_price has been derived.
tweet_stock = tweet_stock.drop(columns=["Open", "Close", "Date"])

In [16]:
# The matching dates/metrics are no longer needed once the prices are attached.
match_columns = ['before_date', 'before_metric', 'after_date', 'after_metric']
tweet_stock = tweet_stock.drop(columns=match_columns)

In [17]:
# Display the merged tweet/price frame (the notebook shows only the first and last rows).
tweet_stock

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,before_price,1 Day Open,1 Day Close,2 Day Open,2 Day Close,EOW Close,SOW Open,after_price
0,Celebrity Apprentice returns to NBC Sunday 3/1...,2010-01-12 18:05:08,20.0,3,false,1136.219971,1145.680054,1148.459961,1147.719971,1136.030029,1136.030029,1136.030029,1137.310059
1,Trump Tycoon App for iPhone & iPod Touch - It'...,2010-01-15 16:28:02,23.0,20,false,1147.719971,1136.030029,1150.229980,1147.949951,1138.040039,1136.030029,1136.030029,1136.030029
2,"from Donald Trump: ""I saw Lady Gaga last night...",2010-01-21 16:58:43,3813.0,3223,false,1138.680054,1115.489990,1091.760010,1092.400024,1096.780029,1091.760010,1092.400024,1116.479980
3,Golf Channel & Donald Trump's World of Golf ho...,2010-01-22 20:51:00,12.0,1,false,1091.760010,1095.800049,1092.170044,1091.939941,1097.500000,1073.869995,1073.890015,1092.400024
4,Superbowl Sunday is a great American tradition...,2010-02-07 16:54:42,16.0,3,false,1066.189941,1060.060059,1070.520020,1069.680054,1068.130005,1075.510010,1079.130005,1065.510010
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39035,https://t.co/yJ0VKdNxHP,2019-10-27 14:31:33,22275.0,76549,false,3022.550049,3035.389893,3036.889893,3039.739990,3046.770020,3066.909912,3078.959961,3032.120117
39036,https://t.co/7esnNSoa5D,2019-10-27 16:25:12,25546.0,108756,false,3022.550049,3035.389893,3036.889893,3039.739990,3046.770020,3066.909912,3078.959961,3032.120117
39037,RT @WhiteHouse: Thank you to the service membe...,2019-10-27 16:49:45,11357.0,0,true,3022.550049,3035.389893,3036.889893,3039.739990,3046.770020,3066.909912,3078.959961,3032.120117
39038,RT @StateDept: Last night the United States br...,2019-10-27 16:50:08,16384.0,0,true,3022.550049,3035.389893,3036.889893,3039.739990,3046.770020,3066.909912,3078.959961,3032.120117


### Now we need to change these to outputs - Up = 1, Down/No change = 0

In [18]:
# 1 when the after price is strictly above the before price, else 0.
immediate_change = tweet_stock['after_price'] - tweet_stock["before_price"]
tweet_stock["after_dir"] = np.where(immediate_change > 0, 1, 0)

In [19]:
# Direction flag for each look-ahead price relative to before_price:
# 1 when strictly higher, else 0. Maps output column -> source price column.
horizon_price_columns = {
    "1_open_dir": "1 Day Open",
    "1_close_dir": "1 Day Close",
    "2_open_dir": "2 Day Open",
    "2_close_dir": "2 Day Close",
    "eow_close_dir": "EOW Close",
    "sow_open_dir": "SOW Open",
}
for dir_column, price_column in horizon_price_columns.items():
    tweet_stock[dir_column] = np.where(tweet_stock[price_column] - tweet_stock["before_price"] > 0, 1, 0)

In [20]:
# Distinct values present in after_dir.
set(tweet_stock['after_dir'].unique().tolist())

{0, 1}

## We only care if the change is significant - let's take a look at after_price - before_price here only!!!!!

In [21]:
def _abs_change_stats(rows):
    """Return (mean, std) of |after_price - before_price| over ``rows``."""
    abs_change = np.abs(rows['after_price'] - rows['before_price'])
    return np.mean(abs_change), np.std(abs_change)


# Rows flagged 0 in after_dir.
stock_drops = tweet_stock[tweet_stock['after_dir'] == 0]
average_drop, sd_drop = _abs_change_stats(stock_drops)
print("avg drop: ", average_drop, "sd_drop: ", sd_drop)

# Rows flagged 1 in after_dir.
stock_incs = tweet_stock[tweet_stock['after_dir'] == 1]
average_inc, sd_inc = _abs_change_stats(stock_incs)
print("avg inc: ", average_inc, "sd_inc: ", sd_inc)

avg drop:  5.7550944555856445 sd_drop:  9.616544089711576
avg inc:  5.702514808508466 sd_inc:  7.186161278596358


In [22]:
# tweet_stock['is_significant'] = tweet_stock[((tweet_stock['after_dir'] == 0) & (tweet_stock['after_price'] - tweet_stock['before_price'] < -1*(average_drop+0.5*sd_drop)))]
#                                             | ((tweet_stock['after_dir'] == 1) & (tweet_stock['after_price'] - tweet_stock['before_price'] > (average_inc+0.5*sd_inc))))]

In [23]:
# Fraction of one standard deviation added on top of the mean move to form the threshold.
percent_sd = 0.01

price_change = tweet_stock['after_price'] - tweet_stock['before_price']
went_up = tweet_stock['after_dir'] == 1
went_down = tweet_stock['after_dir'] == 0
up_threshold = average_inc + percent_sd * sd_inc
down_threshold = -1 * (average_drop + percent_sd * sd_drop)

# A move is significant when it exceeds the threshold in its own direction.
tweet_stock['sig_up'] = np.where(went_up & (price_change > up_threshold), 1, 0)
tweet_stock['sig_down'] = np.where(went_down & (price_change < down_threshold), 1, 0)
tweet_stock['is_sig'] = tweet_stock['sig_up'] + tweet_stock['sig_down']

In [24]:
# after_dir should be untouched by the significance flags.
set(tweet_stock['after_dir'].unique().tolist())

{0, 1}

In [25]:
# Distinct values present in sig_down. (Counter is imported with the other
# imports at the top of the notebook, so the cells that use it do not depend
# on this cell having been run.)
set(tweet_stock['sig_down'].tolist())

{0, 1}

In [26]:
# How many tweets precede a significant drop (1) versus not (0).
Counter(tweet_stock['sig_down'].to_list())

Counter({0: 33685, 1: 5355})

In [27]:
# How many tweets precede a significant rise (1) versus not (0).
Counter(tweet_stock['sig_up'].to_list())

Counter({0: 32863, 1: 6177})

In [28]:
# How many tweets are significant in either direction.
Counter(tweet_stock['is_sig'].to_list())

Counter({0: 27508, 1: 11532})

In [29]:
# A row flagged both up and down would have is_sig == 2; none are expected.
flagged_both = tweet_stock["is_sig"] == 2
tweet_stock.loc[flagged_both].head(10)

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,before_price,1 Day Open,1 Day Close,2 Day Open,2 Day Close,...,after_dir,1_open_dir,1_close_dir,2_open_dir,2_close_dir,eow_close_dir,sow_open_dir,sig_up,sig_down,is_sig


In [30]:
# Total number of tweets before filtering on significance.
tweet_stock.shape[0]

39040

In [31]:
# Number of tweets flagged as significant.
int((tweet_stock['is_sig'] == 1).sum())

11532

In [32]:
# Keep only the significant tweets. The explicit copy makes the result an
# independent frame, so the column assignments further down modify it
# directly instead of raising SettingWithCopyWarning on a filtered slice.
tweet_stock = tweet_stock[tweet_stock['is_sig'] == 1].copy()

In [33]:
# Raw prices are dropped; only the direction/significance flags are kept.
price_columns = [
    "before_price",
    "1 Day Open", "1 Day Close",
    "2 Day Open", "2 Day Close",
    "EOW Close", "SOW Open",
    "after_price",
]
tweet_stock = tweet_stock.drop(columns=price_columns)

## Easy Text preprocessing
- Convert to lower case
- Convert links to <link>
- Remove excess spaces/newlines

In [34]:
from data_science_toolkit.string_ops import remove_newlines, remove_excess_spaces, normalize_links, lower, custom_replace, html_to_unicode, unicode_to_html
from data_science_toolkit.utils import parallel_compute
import nltk

In [35]:
def preprocess_tweet(tweet):
    """Normalise one raw tweet string into a lower-cased, space-joined token string.

    Steps, in order:
      1. ``html_to_unicode`` -> ``remove_newlines`` -> ``normalize_links(..., 'msciurl')``
         (helpers from data_science_toolkit; presumably links are replaced by the
         ``msciurl`` token - confirm against the helper).
      2. ``custom_replace`` with three patterns (presumably applied pairwise with
         the replacement list - confirm against the helper):
           * the standalone retweet marker ``RT`` -> a space,
           * ``@`` directly followed by a non-space -> ``twitmention``,
           * ``#`` directly followed by a non-space -> ``twithashtag``.
      3. NLTK sentence + word tokenisation, re-joined with single spaces.
      4. ``remove_excess_spaces`` and ``lower``.

    Args:
        tweet: Raw tweet text.

    Returns:
        The processed tweet text.
    """
    tweet = normalize_links(remove_newlines(html_to_unicode(tweet)), 'msciurl')
    # NLTK by default splits a mention into "@" and the tag. Rewriting the
    # "@"/"#" prefix as a word keeps mentions and hashtags as single tokens
    # for later analysis.
    # The RT pattern is anchored with word boundaries: a bare 'RT' would also
    # cut the letters out of upper-case words such as "SUPPORT" or "REPORT".
    tweet = custom_replace(tweet,
                          [re.compile(r'\bRT\b'), re.compile(r"@(?=\S)"), re.compile(r"#(?=\S)")],
                          [' ', 'twitmention', 'twithashtag'])
    tweet = ' '.join([word for sent in nltk.sent_tokenize(tweet) for word in nltk.word_tokenize(sent)])
    return lower(remove_excess_spaces(tweet))

In [36]:
# Run preprocess_tweet over every remaining tweet text via the toolkit's parallel helper.
tweets = tweet_stock["text"].to_list()
processed_tweets = parallel_compute(tweets, preprocess_tweet)

100%|██████████| 11532/11532 [00:00<00:00, 35857.58it/s]


In [37]:
# Store the processed text alongside the original tweet text.
tweet_stock = tweet_stock.assign(preprocessed_text=processed_tweets)

## Add Other Columns for Analysis

In [38]:
# Hour of day the tweet was posted. Uses the vectorised .dt accessor (as the
# `dow` column does) rather than a Python-level lambda applied row by row.
tweet_stock["created_hour"] = tweet_stock["created_at"].dt.hour

In [39]:
# Day of week the tweet was posted (Monday=0 ... Sunday=6).
tweet_stock['dow'] = tweet_stock['created_at'].dt.weekday

In [40]:
# Row count before retweets are removed.
len(tweet_stock)

11532

In [41]:
# is_retweet is loaded as an object column, and the notebook's displays show
# both True/False and true/false spellings. Normalising through a lower-cased
# string accepts all of them (bool True, 'True', 'true') and makes this cell
# idempotent: comparing an already-converted bool column with the literal
# 'true' would turn every flag to False on a re-run.
tweet_stock['is_retweet'] = tweet_stock['is_retweet'].astype(str).str.strip().str.lower() == 'true'

In [42]:
# Keep only original tweets (rows whose is_retweet flag is False).
not_retweet = tweet_stock['is_retweet'].eq(False)
tweet_stock = tweet_stock.loc[not_retweet]

In [43]:
# Row count after retweets are removed.
tweet_stock.shape[0]

9924

In [44]:
# Preview the first ten rows of the final table.
tweet_stock.head(n=10)

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,after_dir,1_open_dir,1_close_dir,2_open_dir,2_close_dir,eow_close_dir,sow_open_dir,sig_up,sig_down,is_sig,preprocessed_text,created_hour,dow
1,Trump Tycoon App for iPhone & iPod Touch - It'...,2010-01-15 16:28:02,23.0,20,False,0,0,1,1,0,0,0,0,1,1,trump tycoon app for iphone & ipod touch - it ...,16,4
2,"from Donald Trump: ""I saw Lady Gaga last night...",2010-01-21 16:58:43,3813.0,3223,False,0,0,0,0,0,0,0,0,1,1,from donald trump : `` i saw lady gaga last ni...,16,3
8,From Donald Trump: Andrea Bocelli @ Mar-a-Lago...,2010-03-05 14:51:32,7.0,6,False,1,1,1,1,1,1,1,1,0,1,from donald trump : andrea bocelli @ mar-a-lag...,14,4
11,Olympic Gold Medalist Evan Lysacek just left ...,2010-03-17 15:14:13,13.0,7,False,1,1,1,1,0,0,0,1,0,1,olympic gold medalist evan lysacek just left m...,15,2
13,Last week's episode of the Celebrity Apprentic...,2010-03-19 14:08:33,9.0,2,False,0,0,0,0,1,0,0,0,1,1,last week 's episode of the celebrity apprenti...,14,4
14,The Trump Hotel Collection is currently nomina...,2010-03-23 16:26:23,13.0,5,False,1,1,1,1,0,1,1,1,0,1,the trump hotel collection is currently nomina...,16,1
23,This is a terrific day for downtown New York. ...,2010-04-09 16:36:36,10.0,5,False,1,1,1,1,1,1,1,1,0,1,this is a terrific day for downtown new york ....,16,4
27,On Sunday Jerome Bettis 'the bus' from the Pit...,2010-04-16 15:55:06,8.0,3,False,0,0,0,0,0,0,0,0,1,1,on sunday jerome bettis 'the bus ' from the pi...,15,4
30,Melania and I will be appearing on The View to...,2010-04-22 14:31:53,19.0,5,False,1,1,1,1,1,1,1,1,0,1,melania and i will be appearing on the view to...,14,3
31,To put on your calendar for May: Miss USA 2010...,2010-04-23 15:22:08,25.0,3,False,1,1,1,1,0,1,1,1,0,1,to put on your calendar for may : miss usa 201...,15,4


### Save

In [45]:
# Write the final table to disk without the index column.
output_path = './stocks/trump_tweets_sp500.csv'
tweet_stock.to_csv(output_path, index=False)

In [46]:
# NEXT STEPS:
# - ACTUALLY GET THE PRICES FOR THESE DAYS TO GET PROPER INPUT/OUTPUT
# - SPLIT DATA INTO TRAIN/TEST SETS (MAKE SURE TO RANDOMIZE DATES!)
# - EDA (CLASS BALANCE, MOST COMMON WORDS, CLUSTERING FOR COMMON TOPICS)
# - CLASSIFICATION (RANDOM FOREST? SOFT MARGIN SVM?)