# Reddit WallStreetBets Sentiment Analysis for use in Stock Trading 

In [None]:
# Imports for processing Data, performing calculations, and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn


# Reddits API for compiling posts
#!pip install praw --upgrade
import praw #reddit data api

#Used for compiling stock information
#!pip install ffn
import ffn 


# RegEx 
import re #regex
!pip install vaderSentiment

# VADER used for social media sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer #VADER sentiment model

# Used to store and pull in large DataFrames that are compiled
import requests
import json
import csv

# For processing time functions
import time
import datetime

# For Regularization
from sklearn.preprocessing import MinMaxScaler


pd.set_option('display.max_rows', 300)

In [None]:
# Import TensorFlow for building a neural network
import tensorflow as tf
from tensorflow import keras
# Last expression of the cell, so the notebook displays whatever GPU device
# name TensorFlow reports.
tf.test.gpu_device_name()

In [None]:
# Functions to compile post data for use in sentiment Analysis
def getData(query, after, before, sub):
    """Fetch one page of submissions from the Pushshift submission-search endpoint.

    Parameters
    ----------
    query : text matched against submission titles (sent as ``title``).
    after, before : lower / upper time bounds, forwarded unchanged to the API.
    sub : subreddit name.

    Returns
    -------
    The value stored under the ``'data'`` key of the JSON response.

    Raises
    ------
    requests.RequestException on network failure, timeout or a non-2xx status.
    """
    url = 'https://api.pushshift.io/reddit/search/submission/'
    # Let requests build the query string so spaces and reserved characters in
    # `query` are URL-encoded instead of being pasted into the URL verbatim.
    params = {
        'title': query,
        'size': 1000,
        'after': after,
        'before': before,
        'subreddit': sub,
    }
    red = requests.get(url, params=params, timeout=30)
    print(red.url)
    # Surface HTTP errors directly instead of as a confusing JSON decode error.
    red.raise_for_status()
    return red.json()['data']

# Get relevant data from data extracted using previous function
def getSubData(subm):
    """Append one submission's fields of interest to the global ``stats`` list.

    The appended row is ``[id, title, url, created date, flair]``. The flair is
    the string "NaN" when the submission has no ``link_flair_text`` key.
    Returns None.
    """
    # NOTE(review): fromtimestamp() without a tz argument converts to the
    # machine's local time zone although the field is named created_utc --
    # confirm this is intended.
    row = [
        subm['id'],
        subm['title'],
        subm['url'],
        datetime.datetime.fromtimestamp(subm['created_utc']).date(),
        subm.get('link_flair_text', "NaN"),
    ]
    stats.append(row)
    

In [None]:
# Indicate which subreddit to pull from

sub = 'wallstreetbets'

# Time range for the search, given as Unix epoch seconds stored in strings:
# January 1 2021 up to March 15 2021

before = "1615827608" #March 15 2021
after = "1609520408" #January 1 2021

# Set query string (matched against submission titles by getData)
query = "Daily Discussion Thread"
# count: number of submissions processed so far
# stats: rows appended by getSubData, one per submission
count = 0
stats = []

In [None]:
# Fetch the first page of matching submissions between the desired dates;
# the following loop keeps requesting further pages.
data = getData(query, after, before, sub)

In [None]:
# Compile submissions page by page until the API returns an empty page
while len(data) > 0:
    for submission in data:
        getSubData(submission)
        count += 1

    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    # The next request starts after the last submission of the current page
    after = data[-1]['created_utc']
    # Rebind `data` itself: assigning the next page to any other name leaves
    # the loop condition unchanged and the loop never terminates.
    data = getData(query, after, before, sub)

In [None]:
# Put data into DataFrame

# The DataFrame-creation cell fills a dict named `reddit`, so the empty
# container must be created under that name.
reddit = {}

# Each row in `stats` is [id, title, url, date, flair] (see getSubData)
ids = [stat[0] for stat in stats]
titles = [stat[1] for stat in stats]
urls = [stat[2] for stat in stats]
dates = [stat[3] for stat in stats]
flairs = [stat[4] for stat in stats]

In [None]:
# Create DataFrame

# Store each column list under its column name
for column, values in (
    ('id', ids),
    ('title', titles),
    ('url', urls),
    ('date', dates),
    ('flair', flairs),
):
    reddit[column] = values

reddit_df = pd.DataFrame(reddit)
# Keep only the posts flaired as 'Daily Discussion'
reddit_df = reddit_df[reddit_df['flair'] == 'Daily Discussion']

In [None]:
# Preview the first rows of the filtered DataFrame
reddit_df.head()

In [None]:
# Create Reddit connection
# Withholding these fields for privacy.
reddit = praw.Reddit(client_id='', client_secret='', user_agent='')

# Collect the top-level comment bodies of every daily thread.
# A thread that cannot be loaded contributes None so that daily_comments stays
# aligned row-for-row with reddit_df.
daily_comments = []
for url in reddit_df['url'].tolist():
    try:
        submission = reddit.submission(url=url)
        # limit=0 drops the "load more comments" placeholders instead of expanding them
        submission.comments.replace_more(limit=0)
        comments = [comment.body for comment in submission.comments]
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) can still stop this long-running loop, and report
        # the failure instead of hiding it.
        print('Could not load comments for', url, '-', repr(err))
        comments = None
    daily_comments.append(comments)

In [None]:
# Using the VADER sentiment analyzer to get one sentiment score per daily thread

vader = SentimentIntensityAnalyzer()

# scores[i] is the sum of the 'compound' scores of all comments of thread i
scores = []
for comments in daily_comments:
    try:
        # Iterate over THIS thread's comments; looping over the leftover global
        # `comments` from the scraping cell gives every day the same score.
        sent_score = sum(vader.polarity_scores(body)['compound'] for body in comments)
    except TypeError:
        # `comments` is None when the thread could not be loaded
        sent_score = 0

    scores.append(sent_score)


In [None]:
# Plug the scores into the DataFrame (one score per row, in the same order)
reddit_df['sentiment score'] = scores

In [None]:
# Retrieve stock prices for SPY - S&P 500 aggregate stock

spy = ffn.get('spy', start = '2021-01-01')

# One entry per row of reddit_df: the SPY price on that date, or None when
# there is no price for the date.
spy_values = []

for date in reddit_df['date'].tolist():
    try:
        # Append to `spy_values`; the misspelled `spy_value` raised a NameError,
        # which the KeyError handler below does not catch.
        spy_values.append(float(spy.loc[date]))
    except KeyError:
        spy_values.append(None)

In [None]:
# Attach the stock prices to the original DataFrame
reddit_df['spy'] = spy_values

# Keep only the columns of interest and index the rows by date
reddit_df = reddit_df[['date', 'sentiment score', 'spy']].set_index('date')

# Drop the days that have no SPY price
has_price = reddit_df['spy'].notna()
reddit_df = reddit_df[has_price]

reddit_df.to_csv('positive+negative_sentiment_data_1.csv')

In [None]:
# Reload the saved data from CSV so the long scraping cells do not have to be
# re-run every time.
# The file was written with 'date' as its index, so restore it as a parsed date
# index; otherwise 'date' comes back as a plain text column and the plots lose
# their date axis.
reddit_df = pd.read_csv('positive+negative_sentiment_data_1.csv', index_col='date', parse_dates=True)

In [None]:
# Sentiment score (secondary y-axis) against the SPY price

reddit_df.plot(
    figsize=(15, 10),
    secondary_y='sentiment score',
)

In [None]:
# Use a Fourier transform to remove noise from the sentiment score

open_fft = np.fft.fft(np.asarray(reddit_df['sentiment score'].tolist()))

fourier_df = pd.DataFrame({'fft': open_fft})

# Magnitude and phase of every frequency component
fourier_df['absolute'] = fourier_df['fft'].apply(lambda x: np.abs(x))
fourier_df['angular'] = fourier_df['fft'].apply(lambda x: np.angle(x))

fourier_list = np.asarray(fourier_df['fft'].tolist())

for num in [5, 10, 20]:
    # Keep only the first and last `num` components and zero the rest
    fourier_list_2 = np.copy(fourier_list)
    fourier_list_2[num:-num] = 0
    # Invert the filtered spectrum (`fourier_list_2`, not the undefined
    # `fft_list_2`) and store the real part so the column holds plain floats
    # rather than complex numbers.
    reddit_df['fourier ' + str(num)] = np.fft.ifft(fourier_list_2).real
    


In [None]:
# Raw sentiment score next to its Fourier-filtered versions (5, 10 and 20 components)
reddit_df[
    ['sentiment score', 'fourier 5', 'fourier 10', 'fourier 20']
].plot(figsize=(16, 10));

In [None]:
# SPY price against the 20-component Fourier-filtered sentiment (secondary y-axis)
reddit_df[['spy', 'fourier 20']].plot(
    figsize=(16, 10),
    secondary_y='fourier 20',
);

In [None]:
# Normalizing the data: every series is rescaled to the range [0, 1]

scaler = MinMaxScaler(feature_range=(0, 1))

price_column = reddit_df['spy'].to_numpy().reshape(-1, 1)
reddit_df['normalized price'] = scaler.fit_transform(price_column)

# Log return of SPY relative to the previous row
reddit_df['log spy'] = np.log(reddit_df['spy'] / reddit_df['spy'].shift(1))

sentiment_column = reddit_df['sentiment score'].to_numpy().reshape(-1, 1)
reddit_df['normalized sentiment'] = scaler.fit_transform(sentiment_column)

# Same treatment for each Fourier-filtered series
for num in (5, 10, 20):
    as_floats = [float(x) for x in reddit_df['fourier ' + str(num)].to_numpy()]
    reddit_df['normalized fourier ' + str(num)] = scaler.fit_transform(
        np.asarray(as_floats).reshape(-1, 1)
    )


In [None]:
# All normalized series on one chart
reddit_df[[
    'normalized price',
    'normalized sentiment',
    'normalized fourier 5',
    'normalized fourier 10',
    'normalized fourier 20',
]].plot(figsize=(16, 10))

## SPY Price vs. +/- Sentiment 

In [None]:
# Check correlation between the smoothed sentiment and the normalized price.
# The price column created by the normalization cell is named
# 'normalized price' (with a space).

four_vs_price_corr = reddit_df[['normalized fourier 20', 'normalized price']].corr()
four_vs_price_corr

In [None]:
# Rolling correlation over a 14-row window between price and smoothed sentiment.
# The price column is named 'normalized price' (with a space).

two_week_rolling = reddit_df['normalized price'].rolling(window=14).corr(reddit_df['normalized fourier 20'])

# Correlation over the whole period, for comparison with the rolling values
corr = reddit_df[['normalized fourier 20', 'normalized price']].corr().iloc[0, 1]

mean = two_week_rolling.mean()
print(mean)

In [None]:
# Standard deviation (ddof=0) of the two-week rolling correlation
std = two_week_rolling.std(ddof=0)
print(std)

In [None]:
# Two-week rolling correlation, with the whole-period correlation as a red line
ax = two_week_rolling.plot(figsize=(16, 10))

ax.axhline(y=corr, color='r')

In [None]:
# Plot two week rolling correlation vs. normalized SPY price
# with lines at mean and mean minus std of rolling correlation.

reddit_df['two week rolling'] = two_week_rolling

# Use the column names exactly as they were created: 'two week rolling' (above)
# and 'normalized price' (normalization cell) -- both contain spaces.
ax = reddit_df[['two week rolling', 'normalized price']].plot(secondary_y = 'normalized price', figsize=(16, 10))

ax.axhline(mean-std, c='black');
ax.axhline(mean, c='red');

# Bullish and Bearish Sentiment Analysis 

I will follow the same approach as before, but this time I only need the title of each Reddit post to determine whether the post is bearish or bullish. From there I will build a neural network to attempt to predict the next day's stock price.

In [None]:
# Subreddit to scrape
sub = 'wallstreetbets'

# Date range, given as Unix epoch seconds stored in strings

before = "1615827608" #March 15 2021
after = "1609520408" #January 1 2021
# Empty query string: no title text is supplied to the search
query = ""

# Reset the submission counter and the row buffer that getSubData appends to
count = 0
stats = []

In [None]:
# Fetch the first page of subreddit posts within the indicated time frame;
# the following loop keeps requesting further pages.
reddit2 = getData(query, after, before, sub)

In [None]:
# Compile posts page by page until the API returns an empty page

max_retries = 3  # attempts per page before giving up

while len(reddit2) > 0:
    for submission in reddit2:
        getSubData(submission)
        count += 1

    # Progress output and paging happen once per PAGE. Doing them inside the
    # per-submission loop requests a new page for every single post and skips
    # most of the data.
    print(len(reddit2))
    print(str(datetime.datetime.fromtimestamp(reddit2[-1]['created_utc'])))

    # The next request starts after the last submission of the current page
    after = reddit2[-1]['created_utc']

    for attempt in range(max_retries):
        try:
            reddit2 = getData(query, after, before, sub)
            break
        except Exception as err:
            # Report the failure and retry a bounded number of times instead of
            # silently re-processing the same page forever.
            print('Request failed (attempt', attempt + 1, 'of', max_retries, '):', repr(err))
            time.sleep(1)
    else:
        # Every attempt failed: stop paging and keep what was collected so far
        break
        

In [None]:
# Extract all relevant information
reddit2 = {}

# Each row in `stats` is [id, title, url, date, flair]
ids = [row[0] for row in stats]
titles = [row[1] for row in stats]
urls = [row[2] for row in stats]
dates = [row[3] for row in stats]
flairs = [row[4] for row in stats]
    


In [None]:
# Create a DataFrame of the information
for column, values in (
    ('id', ids),
    ('title', titles),
    ('url', urls),
    ('date', dates),
    ('flair', flairs),
):
    reddit2[column] = values

reddit_df2 = pd.DataFrame(reddit2)

In [None]:
# Extract just the titles of the reddit posts, lower-cased for keyword matching.
titles = [title.lower() for title in reddit_df2['title'].tolist()]

In [None]:
# Keyword lists that mark a post title as bull-ish or bear-ish.
# The scoring cell tests each keyword with `word in title` on the lower-cased
# titles, i.e. as a substring match.
bullish = ['call', 'long', 'all in', 'moon', 'going up', 'rocket', 'buy', 'long term', 'green']
bearish = ['put', 'short', 'going down', 'drop', 'bear', 'sell', 'red']

In [None]:
# Create a scoring index to give to bearish and bullish posts.
# A title scores 1 as bullish (or bearish) only when it shows signs of that side
# and none of the other; titles with both or neither score 0 on both.

# Option notation such as "300c" / "300 c" (call) and "300p" / "300 p" (put)
call_pattern = re.compile(r'(\b\d{1,4}[c]\b)|(\b\d{1,4}[ ][c]\b)')
put_pattern = re.compile(r'(\b\d{1,4}[p]\b)|(\b\d{1,4}[ ][p]\b)')

bullish_scores = []
bearish_scores = []

for title in titles:
    # A bullish keyword must raise the BULL flag; setting `bear` here makes
    # every bullish title count as bearish.
    # NOTE(review): `word in title` is a substring test, so e.g. 'red' also
    # matches 'reddit' -- confirm whether whole-word matching is wanted.
    bull = any(word in title for word in bullish) or bool(call_pattern.search(title))
    bear = any(word in title for word in bearish) or bool(put_pattern.search(title))

    bullish_scores.append(int(bull and not bear))
    bearish_scores.append(int(bear and not bull))

reddit_df2['bullish score'] = bullish_scores
reddit_df2['bearish score'] = bearish_scores

In [None]:
# Isolate posts with desired flairs

wanted_flairs = ('DD', 'Discussion', 'YOLO', 'Fundamentals', 'Stocks')

flairs = reddit_df2['flair'].tolist()

# Row positions whose flair is one of the wanted values
ind = [position for position, flair in enumerate(flairs) if flair in wanted_flairs]

reddit_df2 = reddit_df2.iloc[ind]

In [None]:
# Average the bullish / bearish title scores per day, i.e. the sum of the
# scores for a day divided by the number of submissions on that day.
# This must be stored in scores_df: writing `==` instead of `=` only compares
# the values and discards the result, leaving raw sums behind.

scores_df = reddit_df2.groupby('date')[['bullish score', 'bearish score']].mean()

In [None]:
# Get the SPY stock price for the time frame and assign the appropriate bear and bull scores

reddit_df2 = ffn.get('spy', start = '2021-01-01')

reddit_df2 = reddit_df2.loc[:'2021-03-14']

# scores_df is indexed by calendar date, so look the scores up by the date part
# of each trading-day timestamp.
trading_days = [stamp.date() for stamp in reddit_df2.index.tolist()]

# reindex() yields NaN for a trading day without any scored post (a plain
# .loc lookup raises KeyError there); such days get a score of 0.0.
daily_scores = scores_df.reindex(trading_days)[['bullish score', 'bearish score']].fillna(0.0)

bullish_values = daily_scores['bullish score'].astype(float).tolist()
bearish_values = daily_scores['bearish score'].astype(float).tolist()

reddit_df2['bullish score'] = bullish_values
reddit_df2['bearish score'] = bearish_values

In [None]:
# Save to CSV so I do not have to reload the data every time
reddit_df2.to_csv('bull+bearish_sentiment_data_1.csv')

In [None]:
# Bullish score (secondary y-axis) against the SPY price

reddit_df2.loc['2021-01-01':, ['spy', 'bullish score']].plot(
    figsize=(16, 10),
    secondary_y='bullish score',
)

In [None]:
# Bearish score (secondary y-axis) against the SPY price

reddit_df2.loc['2021-01-01':, ['spy', 'bearish score']].plot(
    figsize=(16, 10),
    secondary_y='bearish score',
)

In [None]:
# Perform a Fourier transformation to smooth the bullish and bearish scores

def _fourier_smooth(values, keep):
    """Return `values` rebuilt from only the first and last `keep` FFT components."""
    spectrum = np.fft.fft(np.asarray(values))
    spectrum[keep:-keep] = 0
    # Real part only, so the DataFrame column holds floats instead of complex numbers
    return np.fft.ifft(spectrum).real

# The score columns are named 'bullish score' / 'bearish score' and live in
# reddit_df2; 'bull score', 'bear score' and `df_2` do not exist.
for label, column in (('bull', 'bullish score'), ('bear', 'bearish score')):
    for num in [10, 30]:
        reddit_df2['fourier ' + label + ' ' + str(num)] = _fourier_smooth(reddit_df2[column].tolist(), num)

In [None]:
# Bullish score next to its Fourier-filtered versions

reddit_df2[
    ['bullish score', 'fourier bull 10', 'fourier bull 30']
].plot(figsize=(16, 10));

In [None]:
# Bearish score next to its Fourier-filtered versions

reddit_df2[
    ['bearish score', 'fourier bear 10', 'fourier bear 30']
].plot(figsize=(16, 10));

In [None]:
# Normalize all of the variables to the range [0, 1].
# Everything here operates on `reddit_df2` with the single `scaler` below, and
# reads the score columns under their real names ('bullish score' /
# 'bearish score').

scaler = MinMaxScaler(feature_range = (0, 1))

reddit_df2['normalized_price'] = scaler.fit_transform(reddit_df2['spy'].to_numpy().reshape(-1, 1))
# Log return of SPY relative to the previous trading day
reddit_df2['log spy'] = np.log(reddit_df2['spy'] / reddit_df2['spy'].shift(1))
reddit_df2['normalized_bull'] = scaler.fit_transform(reddit_df2['bullish score'].to_numpy().reshape(-1, 1))
reddit_df2['normalized_bear'] = scaler.fit_transform(reddit_df2['bearish score'].to_numpy().reshape(-1, 1))

for num in (10, 30):
    for side in ('bull', 'bear'):
        source = 'fourier ' + side + ' ' + str(num)
        # np.real() makes this work whether the column holds floats or the
        # complex output of an inverse FFT.
        values = np.real(reddit_df2[source].to_numpy()).astype(float).reshape(-1, 1)
        reddit_df2['normalized_fourier_' + side + '_' + str(num)] = scaler.fit_transform(values)

In [None]:
# Normalized SPY price against the Fourier-filtered bullish scores

reddit_df2[
    ['normalized_price', 'normalized_fourier_bull_10', 'normalized_fourier_bull_30']
].plot(figsize=(16, 10));


In [None]:
# Normalized SPY price against the Fourier-filtered bearish scores

reddit_df2[
    ['normalized_price', 'normalized_fourier_bear_10', 'normalized_fourier_bear_30']
].plot(figsize=(16, 10));


## Neural Network to predict the following day's SPY price

In [None]:
# Function to manipulate numpy array
def remove_first(array):
    """Return a NumPy array built from every item of ``array`` with its first element dropped."""
    return np.asarray([row[1:] for row in array])

Create two-week windows of data from the array. The resulting array's dimensions are the number of windows, the number of days in each window, and the number of features.

In [None]:
# Feature matrix as a NumPy array; column 0 is the normalized price.
# NOTE(review): 'normalized_bear' is not among the selected columns although
# 'normalized_bull' is -- confirm this is intended.
df = reddit_df2.loc[:, [
    'normalized_price',
    'normalized_bull',
    'normalized_fourier_bull_10',
    'normalized_fourier_bear_10',
    'normalized_fourier_bull_30',
    'normalized_fourier_bear_30',
]].to_numpy()



In [None]:
# Set window, lag, and aggregate data
window = 15     # rows per window
gap = 1         # trailing rows of each window that are held back from the features
test_size = 50  # number of most recent windows reserved for testing

# One window per start row i (the loop variable is `i`, not an undefined `x`)
data = np.asarray([df[i:i + window] for i in range(len(df) - window)])

if len(data) <= test_size:
    # Fail here with a clear message instead of training on an empty set later
    raise ValueError(
        'Only %d windows available; more than %d are needed to leave any training data'
        % (len(data), test_size)
    )

train = data[:-test_size]
# Slice with a colon: data[-50] would pick ONE window instead of the last 50
test = data[-test_size:]
# NOTE(review): this shuffle runs before the random seeds are set in the model
# cell, so the training order differs between runs -- confirm that is acceptable.
np.random.shuffle(train)

At this point I manually split the data into training and testing sets for validation and to run a neural network. 

In [None]:
# Create training set:
# features = the first (window - gap) rows of each window without column 0,
# target   = column 0 of the window's last row.
X_train = [remove_first(d[:window - gap]) for d in train]
y_train = [d[-1][0] for d in train]
    


In [None]:
# Create testing set, built the same way as the training set
X_test = [remove_first(d[:window - gap]) for d in test]
y_test = [d[-1][0] for d in test]

In [None]:
# Convert the training and testing lists into NumPy arrays

X_train, y_train = np.asarray(X_train), np.asarray(y_train)
X_test, y_test = np.asarray(X_test), np.asarray(y_test)

I then construct a Long Short-Term Memory (LSTM) neural network.

In [None]:
# Start from a clean Keras state and fix the random seeds
tf.keras.backend.clear_session()
tf.random.set_seed(51)
np.random.seed(51)

# Save the model only when the validation loss improves
mc = tf.keras.callbacks.ModelCheckpoint(filepath='lstm_bullbear_sentiment_1.h5', monitor='val_loss', save_best_only=True)

# Three stacked LSTM layers followed by a small dense head with one output
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(48, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    tf.keras.layers.LSTM(48, return_sequences=True),
    tf.keras.layers.LSTM(48),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(12, activation='relu'),
    tf.keras.layers.Dense(1),
])

# NOTE(review): 'accuracy' is not informative for a regression target; it is
# kept because the history cell reads history['accuracy'] / ['val_accuracy'].
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# validation_data is documented as a tuple (x_val, y_val), so pass a tuple
# rather than a list.
# NOTE(review): the test set doubles as the validation set that selects the
# checkpoint -- confirm this is acceptable.
history = model.fit(X_train, y_train, batch_size = 32, validation_data = (X_test, y_test), epochs = 250, callbacks = [mc]).history

Plotting the loss

In [None]:
# Per-epoch metrics recorded by model.fit
accuracy = history['accuracy']
val_accuracy = history['val_accuracy']
loss = history['loss']
val_loss = history['val_loss']

# One x-value per recorded epoch (`acc` was never defined)
epochs = range(len(loss))



In [None]:
# Training and validation loss per epoch
plt.figure(figsize=(16, 12))
plt.plot(epochs, loss, 'r', label = 'Loss in Training')
plt.plot(epochs, val_loss, 'b', label = 'Validation loss')
plt.title("Training and Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Mean squared error")
plt.legend()
# No second plt.figure() call here: it only opened an extra, empty figure
plt.show()

Making Predictions

In [None]:
# Predict the held-out targets and plot them next to the true values
preds = model.predict(X_test)

preds_df = pd.DataFrame(y_test, columns=['price'])
# Flatten the (n, 1) prediction array into one value per row
preds_df['preds'] = preds.ravel()

# Plot the frame that was just built (`pred_df` was never defined)
preds_df.plot(figsize=(18, 12))
plt.show()

Finding the Mean Absolute Error of the predictions

In [None]:
# Mean absolute error between the true values and the predictions.
# Both arrays are flattened first: y_test has shape (n,) and preds has shape
# (n, 1), so subtracting them directly broadcasts to an (n, n) matrix of
# pairwise differences instead of element-wise errors.
mae = float(np.mean(np.abs(np.asarray(y_test).ravel() - np.asarray(preds).ravel())))

print("The MAE of the model is: ", mae)