In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import re

In [2]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

# CHANGE THIS WHEN USING A DIFFERENT TWITTER ACCOUNT!!

In [3]:
input_file_name = 'Trump_Tweets.csv'
output_file_name = "trump_tweets_sp500.csv"

In [4]:
trumptweets = pd.read_csv("./tweets/{}".format(input_file))
trumptweets = trumptweets.dropna()
stocks = pd.read_csv('./stocks/spx_preprocessed.csv')
stocks['Date']= pd.to_datetime(stocks['Date']) 

In [5]:
trumptweets.head(5)

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter Media Studio,Thank you to @MarthaRaddatz and @TerryMoran for a job well done! https://t.co/mcHjqX1K2L,10-27-2019 21:24:55,11176.0,41087,False,1.188567e+18
1,Twitter for iPhone,RT @StateDept: Last night the United States brought the world's number one terrorist leader to justice. President @realDonaldTrump address…,10-27-2019 16:50:08,16384.0,0,True,1.188498e+18
2,Twitter for iPhone,RT @WhiteHouse: Thank you to the service members military leaders and agency officials who were critical to the success of this mission.…,10-27-2019 16:49:45,11357.0,0,True,1.188498e+18
3,Twitter for iPhone,https://t.co/7esnNSoa5D,10-27-2019 16:25:12,25546.0,108756,False,1.188492e+18
4,Twitter for iPhone,https://t.co/yJ0VKdNxHP,10-27-2019 14:31:33,22275.0,76549,False,1.188463e+18


In [6]:
trumptweets = trumptweets.drop(["source", "id_str"], axis=1)

In [7]:
trumptweets['created_at']= pd.to_datetime(trumptweets['created_at'], format="%m-%d-%Y %H:%M:%S") 

In [8]:
trumptweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39096 entries, 0 to 39160
Data columns (total 5 columns):
text              39096 non-null object
created_at        39096 non-null datetime64[ns]
retweet_count     39096 non-null float64
favorite_count    39096 non-null object
is_retweet        39096 non-null object
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 1.8+ MB


### We only want tweets where there are associated stock prices

In [9]:
trumptweets = trumptweets[trumptweets["created_at"].between(str(stocks["Date"].min() - timedelta(days=1)), str(stocks["Date"].max() + timedelta(days=1)))]

In [10]:
trumptweets = trumptweets.sort_values(by="created_at")
trumptweets = trumptweets.reset_index(drop=True)
stocks = stocks.sort_values(by="Date")
stocks = stocks.reset_index(drop=True)

In [11]:
stocks.head(10)

Unnamed: 0,Date,Open,Close,1 Day Open,1 Day Close,2 Day Open,2 Day Close,EOW Close,SOW Open
0,2010-01-04,1116.560059,1132.98999,1132.660034,1136.52002,1135.709961,1137.140015,1144.97998,1145.959961
1,2010-01-05,1132.660034,1136.52002,1135.709961,1137.140015,1136.27002,1141.689941,1144.97998,1145.959961
2,2010-01-06,1135.709961,1137.140015,1136.27002,1141.689941,1140.52002,1144.97998,1144.97998,1145.959961
3,2010-01-07,1136.27002,1141.689941,1140.52002,1144.97998,1145.959961,1146.97998,1144.97998,1145.959961
4,2010-01-08,1140.52002,1144.97998,1145.959961,1146.97998,1143.810059,1136.219971,1144.97998,1145.959961
5,2010-01-11,1145.959961,1146.97998,1143.810059,1136.219971,1137.310059,1145.680054,1136.030029,1136.030029
6,2010-01-12,1143.810059,1136.219971,1137.310059,1145.680054,1145.680054,1148.459961,1136.030029,1136.030029
7,2010-01-13,1137.310059,1145.680054,1145.680054,1148.459961,1147.719971,1136.030029,1136.030029,1136.030029
8,2010-01-14,1145.680054,1148.459961,1147.719971,1136.030029,1136.030029,1150.22998,1136.030029,1136.030029
9,2010-01-15,1147.719971,1136.030029,1136.030029,1150.22998,1147.949951,1138.040039,1136.030029,1136.030029


### Search for closest value, round up or down depending...

In [12]:
def get_closest_value(arr, target, round_dir):
    n = len(arr)
    left = 0
    right = n - 1
    mid = 0

    # edge case - last or above all
    if target >= arr[n - 1]:
        return arr[n - 1]
    # edge case - first or below all
    if target <= arr[0]:
        return arr[0]
    # BSearch solution: Time & Space: Log(N)

    while left < right:
        mid = (left + right) // 2  # find the mid
        if target < arr[mid]:
            right = mid
        elif target > arr[mid]:
            left = mid + 1
        else:
            return arr[mid]

    if target < arr[mid]:
        return find_closest(arr[mid - 1], arr[mid], target, round_dir)
    else:
        return find_closest(arr[mid], arr[mid + 1], target, round_dir)


# findClosest
# We find the closest by taking the difference
# between the target and both values. It assumes
# that val2 is greater than val1 and target lies
# between these two. 
def find_closest(val1, val2, target, round_dir):
    if val1 == target:
        return val1
    elif val2 == target:
        return val2
    return min(val1, val2) if round_dir.lower() == "down" else max(val1, val2)
#     return val2 if target - val1 >= val2 - target else val1

### Find stock date/metric directly before/after the tweet was made (this should be the nearest timestep)

In [13]:
tweet_dates, stock_dates = trumptweets["created_at"].tolist(), stocks["Date"].tolist()
before_dates, before_metrics, after_dates, after_metrics = [], [], [], []
for tweet_date in tweet_dates:
    if tweet_date.hour < 9:
        # Use previous day's close as before price
        stock_date = get_closest_value(stock_dates, datetime.combine(tweet_date.date()-timedelta(days=1), datetime.min.time()), "down")
        before_dates.append(stock_date)
        before_metrics.append("Close")
        # Use current day's open as after price
        after_stock_date = get_closest_value(stock_dates, datetime.combine(tweet_date.date(), datetime.min.time()), "up")
        after_dates.append(after_stock_date)
        after_metrics.append("Open")
    elif tweet_date.hour >= 17:
        # Use current day's close as before price
        stock_date = get_closest_value(stock_dates, datetime.combine(tweet_date.date(), datetime.min.time()), "down")
        before_dates.append(stock_date)
        before_metrics.append("Close")
        # Use next day's open as after price
        after_stock_date = get_closest_value(stock_dates, datetime.combine(tweet_date.date()+timedelta(days=1), datetime.min.time()), "up")
        after_dates.append(after_stock_date)
        after_metrics.append("Open")
    else:
        # Use current day's open as before price
        stock_date = get_closest_value(stock_dates, datetime.combine(tweet_date.date(), datetime.min.time()), "down")
        before_dates.append(stock_date)
        if stock_date.date() == tweet_date.date():
            before_metrics.append("Open")
        else:
            before_metrics.append("Close")
        # Use current day's close as after price
        after_stock_date = get_closest_value(stock_dates, datetime.combine(tweet_date.date(), datetime.min.time()), "up")
        after_dates.append(after_stock_date)
        if after_stock_date.date() == tweet_date.date():
            after_metrics.append("Close")
        else:
            after_metrics.append("Open")

In [14]:
trumptweets["before_date"] = pd.Series(before_dates)
trumptweets["before_metric"] = pd.Series(before_metrics)
trumptweets["after_date"] = pd.Series(after_dates)
trumptweets["after_metric"] = pd.Series(after_metrics)

### We got them all (yeet)

In [15]:
trumptweets[trumptweets.isnull().any(axis=1)]

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,before_date,before_metric,after_date,after_metric


### Get price associated with before/after dates and metrics

In [16]:
tweet_stock = pd.merge(left=trumptweets,right=stocks[["Date", "Open", "Close"]], how='left', left_on='before_date', right_on='Date')
tweet_stock['before_open'] = np.where(tweet_stock['before_metric'] == "Open", 1, 0)
tweet_stock['before_close'] = np.where(tweet_stock['before_metric'] == "Close", 1, 0)
tweet_stock['before_price'] = tweet_stock["Open"] * tweet_stock["before_open"] + tweet_stock["Close"] * tweet_stock['before_close']
tweet_stock = tweet_stock.drop(['before_open', 'before_close', "Open", "Close", "Date"], axis=1)

In [17]:
tweet_stock = pd.merge(left=tweet_stock, right=stocks, how='left', left_on='after_date', right_on='Date')
tweet_stock['after_open'] = np.where(tweet_stock['after_metric'] == "Open", 1, 0)
tweet_stock['after_close'] = np.where(tweet_stock['after_metric'] == "Close", 1, 0)
tweet_stock["after_price"] = tweet_stock["Open"] * tweet_stock["after_open"] + tweet_stock["Close"] * tweet_stock['after_close']
tweet_stock = tweet_stock.drop(['after_open', 'after_close', "Open", "Close", "Date"], axis=1)

In [18]:
tweet_stock = tweet_stock.drop(['before_date', 'before_metric', 'after_date', 'after_metric'], axis=1)

In [19]:
tweet_stock

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,before_price,1 Day Open,1 Day Close,2 Day Open,2 Day Close,EOW Close,SOW Open,after_price
0,Celebrity Apprentice returns to NBC Sunday 3/14 9-11PM ET/PT. Outstanding list of celebrities & season should be the best one yet!,2010-01-12 18:05:08,20.0,3,false,1136.219971,1145.680054,1148.459961,1147.719971,1136.030029,1136.030029,1136.030029,1137.310059
1,Trump Tycoon App for iPhone & iPod Touch - It's $2.99 but the advice is priceless! http://bit.ly/UGUF0,2010-01-15 16:28:02,23.0,20,false,1147.719971,1136.030029,1150.229980,1147.949951,1138.040039,1136.030029,1136.030029,1136.030029
2,"from Donald Trump: ""I saw Lady Gaga last night and she was fantastic!""",2010-01-21 16:58:43,3813.0,3223,false,1138.680054,1115.489990,1091.760010,1092.400024,1096.780029,1091.760010,1092.400024,1116.479980
3,Golf Channel & Donald Trump's World of Golf host a Celebrity Match 1/25 @ TNGC LA CA - Mark Wahlberg vs. Kevin Dillon http://bit.ly/4MubN6,2010-01-22 20:51:00,12.0,1,false,1091.760010,1095.800049,1092.170044,1091.939941,1097.500000,1073.869995,1073.890015,1092.400024
4,Superbowl Sunday is a great American tradition. The Colts and Saints are already champions but may the best team win!,2010-02-07 16:54:42,16.0,3,false,1066.189941,1060.060059,1070.520020,1069.680054,1068.130005,1075.510010,1079.130005,1065.510010
5,Donald Trump appearing today on CNN International’s ‘Connect the World’ as ‘Connector of the Day’. Submit questions: http://bit.ly/bPiP7T,2010-02-10 15:17:56,7.0,1,false,1069.680054,1067.099976,1078.469971,1075.949951,1075.510010,1075.510010,1079.130005,1068.130005
6,Donald Trump appeared on the final episode of The Jay Leno Show to deliver a very special message: http://bit.ly/cv1En7,2010-02-11 19:57:36,16.0,3,false,1078.469971,1079.130005,1094.869995,1096.140015,1099.510010,1075.510010,1079.130005,1075.949951
7,From Donald Trump: “I’m so proud of my wife Melania and the launch of her new jewelry line to debut on QVC on April 30th at 9 p.m.”,2010-02-26 21:18:26,25.0,21,false,1104.489990,1117.010010,1118.310059,1119.359985,1118.790039,1138.699951,1138.400024,1105.359985
8,From Donald Trump: Andrea Bocelli @ Mar-a-Lago - Many say best night of entertainment in long history of Palm Beach http://bit.ly/2fNgOz,2010-03-05 14:51:32,7.0,6,false,1125.119995,1138.400024,1138.500000,1137.560059,1140.449951,1138.699951,1138.400024,1138.699951
9,The Celebrity Apprentice has a two-hour premiere this Sunday March 14th at 9 p.m. on NBC. This will be the best season yet see you then!,2010-03-12 20:30:52,23.0,7,false,1149.989990,1150.829956,1159.459961,1159.939941,1166.209961,1159.900024,1157.250000,1148.530029


### Now we need to change these to outputs - Up/No change = 1, Down = 0

In [20]:
tweet_stock["after_dir"] = np.where(tweet_stock['after_price'] - tweet_stock["before_price"] > 0, 1, 0)

In [21]:
tweet_stock["1_open_dir"] = np.where(tweet_stock["1 Day Open"] - tweet_stock["before_price"] > 0, 1, 0)
tweet_stock["1_close_dir"] = np.where(tweet_stock["1 Day Close"] - tweet_stock["before_price"] > 0, 1, 0)
tweet_stock["2_open_dir"] = np.where(tweet_stock["2 Day Open"] - tweet_stock["before_price"] > 0, 1, 0)
tweet_stock["2_close_dir"] = np.where(tweet_stock["2 Day Close"] - tweet_stock["before_price"] > 0, 1, 0)
tweet_stock["eow_close_dir"] = np.where(tweet_stock["EOW Close"] - tweet_stock["before_price"] > 0, 1, 0)
tweet_stock["sow_open_dir"] = np.where(tweet_stock["SOW Open"] - tweet_stock["before_price"] > 0, 1, 0)

In [22]:
set(tweet_stock['after_dir'].tolist())

{0, 1}

## We only care if the change is significant - let's take a look at after_price - before_price here only!!!!!

In [23]:
stock_drops = tweet_stock[tweet_stock['after_dir'] == 0]
average_drop = np.mean(np.abs(stock_drops['after_price'] - stock_drops['before_price']))
sd_drop = np.std(np.abs(stock_drops['after_price'] - stock_drops['before_price']))
print("avg drop: ", average_drop, "sd_drop: ", sd_drop)
stock_incs = tweet_stock[tweet_stock['after_dir'] == 1]
average_inc = np.mean(np.abs(stock_incs['after_price'] - stock_incs['before_price']))
sd_inc = np.std(np.abs(stock_incs['after_price'] - stock_incs['before_price']))
print("avg inc: ", average_inc, "sd_inc: ", sd_inc)

avg drop:  5.755094455585635 sd_drop:  9.6165440897116
avg inc:  5.702514808508458 sd_inc:  7.186161278596292


In [24]:
# tweet_stock['is_significant'] = tweet_stock[((tweet_stock['after_dir'] == 0) & (tweet_stock['after_price'] - tweet_stock['before_price'] < -1*(average_drop+0.5*sd_drop)))]
#                                             | ((tweet_stock['after_dir'] == 1) & (tweet_stock['after_price'] - tweet_stock['before_price'] > (average_inc+0.5*sd_inc))))]

In [25]:
percent_sd = 0
tweet_stock['sig_up'] = np.where(((tweet_stock['after_dir'] == 1) & (tweet_stock['after_price'] - tweet_stock['before_price'] > (average_inc+percent_sd*sd_inc))), 1, 0)
tweet_stock['sig_down'] = np.where(((tweet_stock['after_dir'] == 0) & (tweet_stock['after_price'] - tweet_stock['before_price'] < -1*(average_drop+percent_sd*sd_drop))), 1, 0)
tweet_stock['is_sig'] = tweet_stock['sig_up'] + tweet_stock['sig_down']

In [26]:
set(tweet_stock['after_dir'].tolist())

{0, 1}

In [27]:
from collections import Counter
set(tweet_stock['sig_down'].tolist())

{0, 1}

In [28]:
Counter(tweet_stock['sig_down'].tolist())

Counter({0: 33647, 1: 5393})

In [29]:
Counter(tweet_stock['sig_up'].tolist())

Counter({0: 32840, 1: 6200})

In [30]:
Counter(tweet_stock['is_sig'].tolist())

Counter({0: 27447, 1: 11593})

In [31]:
tweet_stock[tweet_stock.is_sig == 2].head(10)

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,before_price,1 Day Open,1 Day Close,2 Day Open,2 Day Close,...,after_dir,1_open_dir,1_close_dir,2_open_dir,2_close_dir,eow_close_dir,sow_open_dir,sig_up,sig_down,is_sig


In [32]:
len(tweet_stock.index)

39040

In [33]:
len(tweet_stock[tweet_stock['is_sig'] == 1].index)

11593

In [34]:
# tweet_stock = tweet_stock[tweet_stock['is_sig'] == 1]

In [35]:
tweet_stock = tweet_stock.drop(["before_price", "1 Day Open", "1 Day Close", "2 Day Open", "2 Day Close", "EOW Close", "SOW Open", "after_price"], axis=1)

## Easy Text preprocessing
- Convert to lower case
- Convert links to <link>
- Remove excess spaces/newlines

In [36]:
from data_science_toolkit.string_ops import remove_newlines, remove_excess_spaces, normalize_links, lower, custom_replace, html_to_unicode, unicode_to_html
from data_science_toolkit.utils import parallel_compute
import nltk
from unidecode import unidecode

In [37]:
def preprocess_tweet(tweet):
    tweet = normalize_links(remove_newlines(html_to_unicode(tweet)), 'msciurl')
#     , re.compile(r"['\"]"), re.compile(r"\.\.\.")
    # NLTK by default turns mentions into @ <tag>.  Makes for easier analysis this stops the mentions from being separated, and allows them to be used in the clustering and stuff later...
    tweet = custom_replace(tweet,
                          [re.compile('RT'), re.compile(r"@(?=\S)"), re.compile(r"#(?=\S)")],
                          [' ', 'twitmention', 'twithashtag'])
    tweet = ' '.join([word for sent in nltk.sent_tokenize(tweet) for word in nltk.word_tokenize(sent)])
    return unidecode(lower(remove_excess_spaces(tweet)))

In [38]:
tweets = tweet_stock["text"].tolist()
processed_tweets = parallel_compute(tweets, preprocess_tweet)

100%|██████████| 39040/39040 [00:04<00:00, 9644.13it/s] 


In [39]:
tweet_stock['preprocessed_text'] = processed_tweets

## Add Other Columns for Analysis

In [40]:
tweet_stock["created_hour"] = tweet_stock["created_at"].apply(lambda x: x.hour)

In [41]:
tweet_stock['dow'] = tweet_stock['created_at'].dt.dayofweek

In [42]:
len(tweet_stock.index)

39040

In [43]:
tweet_stock['is_retweet'] = (tweet_stock['is_retweet'] == 'true').astype(bool)

In [44]:
tweet_stock = tweet_stock[tweet_stock['is_retweet'] == False]

In [45]:
len(tweet_stock.index)

36249

In [46]:
tweet_stock['num_links'] = tweet_stock.preprocessed_text.apply(lambda tweet: sum([1 if 'msciurl' in word else 0 for word in tweet.split("  ")]))
tweet_stock['num_words'] = tweet_stock.preprocessed_text.apply(lambda tweet: sum([1 if len(word) > 1 and 'msciurl' not in word and 'twitmention' not in word and 'twithashtag' not in word else 0 for word in tweet.split(" ")]))

In [67]:
sum(tweet_stock['num_links'].tolist())

8790

In [47]:
# tweet_stock = tweet_stock[tweet_stock.num_words > 5]

In [48]:
len(tweet_stock.index)

36249

In [49]:
# tweet_stock[tweet_stock.retweet_count > 41000].head(100)

In [50]:
# avg_retweet = np.mean(tweet_stock['retweet_count'].tolist())
# sd_retweet = np.std(tweet_stock['retweet_count'].tolist())
# tweet_stock = tweet_stock[tweet_stock['retweet_count'] > (avg_retweet+0*sd_retweet)]

In [51]:
len(tweet_stock.index)

36249

In [52]:
tweet_stock.head(10)

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,after_dir,1_open_dir,1_close_dir,2_open_dir,2_close_dir,eow_close_dir,sow_open_dir,sig_up,sig_down,is_sig,preprocessed_text,created_hour,dow,num_links,num_words
0,Celebrity Apprentice returns to NBC Sunday 3/14 9-11PM ET/PT. Outstanding list of celebrities & season should be the best one yet!,2010-01-12 18:05:08,20.0,3,False,1,1,1,1,0,0,0,0,0,0,celebrity apprentice returns to nbc sunday 3/14 9-11pm et/pt . outstanding list of celebrities & season should be the best one yet !,18,1,0,20
1,Trump Tycoon App for iPhone & iPod Touch - It's $2.99 but the advice is priceless! http://bit.ly/UGUF0,2010-01-15 16:28:02,23.0,20,False,0,0,1,1,0,0,0,0,1,1,trump tycoon app for iphone & ipod touch - it 's $ 2.99 but the advice is priceless ! msciurl,16,4,1,15
2,"from Donald Trump: ""I saw Lady Gaga last night and she was fantastic!""",2010-01-21 16:58:43,3813.0,3223,False,0,0,0,0,0,0,0,0,1,1,from donald trump : `` i saw lady gaga last night and she was fantastic ! '',16,3,0,14
3,Golf Channel & Donald Trump's World of Golf host a Celebrity Match 1/25 @ TNGC LA CA - Mark Wahlberg vs. Kevin Dillon http://bit.ly/4MubN6,2010-01-22 20:51:00,12.0,1,False,1,1,1,1,1,0,0,0,0,0,golf channel & donald trump 's world of golf host a celebrity match 1/25 @ tngc la ca - mark wahlberg vs. kevin dillon msciurl,20,4,1,20
4,Superbowl Sunday is a great American tradition. The Colts and Saints are already champions but may the best team win!,2010-02-07 16:54:42,16.0,3,False,0,0,1,1,1,1,1,0,0,0,superbowl sunday is a great american tradition . the colts and saints are already champions but may the best team win !,16,6,0,19
5,Donald Trump appearing today on CNN International’s ‘Connect the World’ as ‘Connector of the Day’. Submit questions: http://bit.ly/bPiP7T,2010-02-10 15:17:56,7.0,1,False,0,0,1,1,1,1,1,0,0,0,donald trump appearing today on cnn international ' s ' connect the world ' as ' connector of the day ' . submit questions : msciurl,15,2,1,17
6,Donald Trump appeared on the final episode of The Jay Leno Show to deliver a very special message: http://bit.ly/cv1En7,2010-02-11 19:57:36,16.0,3,False,0,1,1,1,1,0,1,0,0,0,donald trump appeared on the final episode of the jay leno show to deliver a very special message : msciurl,19,3,1,17
7,From Donald Trump: “I’m so proud of my wife Melania and the launch of her new jewelry line to debut on QVC on April 30th at 9 p.m.”,2010-02-26 21:18:26,25.0,21,False,1,1,1,1,1,1,1,0,0,0,"from donald trump : "" i ' m so proud of my wife melania and the launch of her new jewelry line to debut on qvc on april 30th at 9 p.m . """,21,4,0,26
8,From Donald Trump: Andrea Bocelli @ Mar-a-Lago - Many say best night of entertainment in long history of Palm Beach http://bit.ly/2fNgOz,2010-03-05 14:51:32,7.0,6,False,1,1,1,1,1,1,1,1,0,1,from donald trump : andrea bocelli @ mar-a-lago - many say best night of entertainment in long history of palm beach msciurl,14,4,1,18
9,The Celebrity Apprentice has a two-hour premiere this Sunday March 14th at 9 p.m. on NBC. This will be the best season yet see you then!,2010-03-12 20:30:52,23.0,7,False,0,1,1,1,1,1,1,0,0,0,the celebrity apprentice has a two-hour premiere this sunday march 14th at 9 p.m. on nbc . this will be the best season yet see you then !,20,4,0,24


# MORE OR LESS RETWEETS THAN AVG!!

In [53]:
tweet_stock.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36249 entries, 0 to 39039
Data columns (total 20 columns):
text                 36249 non-null object
created_at           36249 non-null datetime64[ns]
retweet_count        36249 non-null float64
favorite_count       36249 non-null object
is_retweet           36249 non-null bool
after_dir            36249 non-null int64
1_open_dir           36249 non-null int64
1_close_dir          36249 non-null int64
2_open_dir           36249 non-null int64
2_close_dir          36249 non-null int64
eow_close_dir        36249 non-null int64
sow_open_dir         36249 non-null int64
sig_up               36249 non-null int64
sig_down             36249 non-null int64
is_sig               36249 non-null int64
preprocessed_text    36249 non-null object
created_hour         36249 non-null int64
dow                  36249 non-null int64
num_links            36249 non-null int64
num_words            36249 non-null int64
dtypes: bool(1), datetime64[ns](1), fl

In [54]:
import numpy as np
from collections import defaultdict
from dateutil import relativedelta
curr_dt = datetime(2010, 1, 1)
dt_avg = defaultdict(lambda: defaultdict(int))
while curr_dt < datetime(2020, 1, 1):
    dt_avg[curr_dt.year][curr_dt.month] = np.mean(tweet_stock[tweet_stock['created_at'].between(curr_dt, curr_dt+relativedelta.relativedelta(months=1))]['retweet_count'].tolist())  
    curr_dt += relativedelta.relativedelta(months=1)
dt_avg

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


defaultdict(<function __main__.<lambda>()>,
            {2010: defaultdict(int,
                         {1: 967.0,
                          2: 16.0,
                          3: 21.0,
                          4: 200.61111111111111,
                          5: 22.529411764705884,
                          6: 15.823529411764707,
                          7: 27.46153846153846,
                          8: 39.875,
                          9: 35.38461538461539,
                          10: 29.555555555555557,
                          11: 32.54545454545455,
                          12: 78.8}),
             2011: defaultdict(int,
                         {1: 66.33333333333333,
                          2: 72.8125,
                          3: 59.166666666666664,
                          4: 42.6,
                          5: 82.6923076923077,
                          6: 51.875,
                          7: 165.45205479452054,
                          8: 279.69565217391306,
         

In [55]:
from data_science_toolkit.file_ops import write_pkl
write_pkl("dt_avg_{}.pkl".format(output_file_name), dict(dt_avg))

In [56]:
# tweet_stock['above_avg'] = np.where((tweet_stock['retweet_count'] > dt_avg[tweet_stock['created_at'].year][tweet_stock[created_at].month]), 1, 0)
# tweet_stock['above_avg'] = tweet_stock.apply(lambda row: row[''])
greater_than_monthly_avg = list(map(int, [count > dt_avg[lit_date.year][lit_date.month] for count, lit_date in zip(tweet_stock['retweet_count'].tolist(), tweet_stock['created_at'].tolist())]))
tweet_stock['above_monthly_avg'] = greater_than_monthly_avg

## There is one tweets that is messed up... remove it

In [57]:
tweet_stock = tweet_stock[tweet_stock.num_words < 55]

## Percent Caps

In [58]:
orig = tweet_stock['text'].tolist()
percent_caps = []
for tweet in orig:
    words = tweet.split(" ")
    num_allcaps = sum([1 for x in words if x.upper() == x and re.search('[a-zA-Z]', x)])
    percent_caps.append(int(num_allcaps/len(words)*10)/10 if len(words) else 0)

In [59]:
tweet_stock['percent_caps'] = percent_caps
print(set(percent_caps))

{0.1, 0.0, 0.3, 0.2, 1.0, 0.4, 0.7, 0.5, 0.6, 0.8, 0.9}


## Num Hashtags

In [60]:
preprocessed = tweet_stock['preprocessed_text'].tolist()
num_hashtags = []
for tweet in preprocessed:
    words = tweet.split(" ")
    num_hashtag = sum([1 for x in words if 'twithashtag' in x and re.search('[a-zA-Z]', x)])
    num_hashtags.append(num_hashtag if num_hashtag else 0)

In [61]:
tweet_stock['num_hashtags'] = num_hashtags
print(set(num_hashtags))

{0, 1, 2, 3, 4, 5, 6, 7, 8}


## Num Mentions

In [62]:
num_mentions = []
for tweet in preprocessed:
    words = tweet.split(" ")
    num_mention = sum([1 for x in words if 'twitmention' in x and re.search('[a-zA-Z]', x)])
    num_mentions.append(num_mention if num_mention else 0)

In [63]:
tweet_stock['num_mentions'] = num_mentions
print(set(num_mentions))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}


### Save

In [64]:
tweet_stock.to_csv('./stocks/trump_tweets_sp500.csv', index=False)

In [65]:
# NEXT STEPS:
# - ACTUALLY GET THE PRICES FOR THESE DAYS TO GET PROPER INPUT/OUTPUT
# - SPLIT DATA INTO TRAIN/TEST SETS (MAKE SURE TO RANDOMIZE DATES!)
# - EDA (CLASS BALANCE, MOST COMMON WORDS, CLUSTERING FOR COMMON TOPICS)
# - CLASSIFICATION (RANDOM FOREST? SOFT MARGIN SVM?)