In [1]:
import pandas as pd
import numpy as np
import praw
import nltk

In [2]:
# Build the read-only Reddit API client used by the rest of the notebook.
# Credentials are read from the environment so that secrets never live in the
# notebook file. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting
# the kernel.
import os

user_agent = "S&P 500 Sentiment Analysis Model v.0.1"

_missing_credentials = [
    name
    for name in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET")
    if not os.environ.get(name)
]
if _missing_credentials:
    # Fail here with an actionable message instead of a later API error.
    raise RuntimeError(
        "Missing Reddit API credentials; set the environment variable(s): "
        + ", ".join(_missing_credentials)
    )

reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=user_agent,
)

Version 7.7.0 of praw is outdated. Version 7.7.1 was released Tuesday July 11, 2023.


In [3]:
# Gather the distinct submission titles from each subreddit's "hot" listing,
# then wrap every collection in a single-column DataFrame.
def _hot_titles(subreddit_name):
    """Return the set of titles in the "hot" listing of ``subreddit_name``.

    Uses the module-level ``reddit`` client; ``limit=None`` is passed through
    to the listing unchanged. A set is used, so duplicate titles collapse.
    """
    listing = reddit.subreddit(subreddit_name).hot(limit=None)
    return {post.title for post in listing}

WSB_headlines = _hot_titles('wallstreetbets')
POL_headlines = _hot_titles('Politics')
ECON_headlines = _hot_titles('Economics')

WSB_df = pd.DataFrame(WSB_headlines)
POL_df = pd.DataFrame(POL_headlines)
ECON_df = pd.DataFrame(ECON_headlines)

In [4]:
# Initial look at the collected data.
# Only the last expression of a cell is rendered automatically, so the first
# preview is displayed explicitly; otherwise its result is silently discarded.
from IPython.display import display

display(WSB_df.head())
ECON_df.head()

Unnamed: 0,0
0,Biden Administration Announces Indo-Pacific De...
1,China Overtakes Japan as World's Top Car Exporter
2,BOE Faces ‘Tricky’ Task Ahead of Rate Decision...
3,Inflation rose 0.4% in April and 4.7% from a y...
4,Dollar bears bide their time as US economic st...


In [5]:
# Preview the first five rows of the Politics headlines frame.
POL_df.head(n=5)

Unnamed: 0,0
0,Trump finally reveals how he thinks he could e...
1,"Buoys, razor wire, and a Trump-y wall: How Gre..."
2,Biden Announces New Rules To Tackle Big Corpor...
3,Official concedes 8-year-old who died in U.S. ...
4,Trump asks the judge to delay the start of his...


In [6]:
#Setting up the NLP & Creating the Scores
import nltk
import pprint
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# The VADER analyzer needs its lexicon resource on disk. On a fresh
# environment it is absent and SIA() raises LookupError, so fetch it once.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

sia = SIA()


def _score_headlines(headlines):
    """Return one VADER score dict per headline.

    Each dict is the result of ``sia.polarity_scores(headline)`` with the
    original text added under the 'headline' key. The output order follows
    the iteration order of ``headlines``.
    """
    return [
        {**sia.polarity_scores(headline), 'headline': headline}
        for headline in headlines
    ]


#WSB
results = _score_headlines(WSB_headlines)

#Politics
results_1 = _score_headlines(POL_headlines)

#Economics
results_2 = _score_headlines(ECON_headlines)

# Show the first three scored headlines of each subreddit as a sanity check.
pprint.pprint(results[:3], width=100)
print('\n')
pprint.pprint(results_1[:3], width=100)
print('\n')
pprint.pprint(results_2[:3], width=100)

[{'compound': 0.0,
  'headline': 'Just updating the group on my carvana puts. How do they make you feel? 🩸',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.3612,
  'headline': 'What would stop me from making more on call options if I had won that billion '
              'dollar lottery?',
  'neg': 0.105,
  'neu': 0.718,
  'pos': 0.177},
 {'compound': -0.4215,
  'headline': 'The Verge: Apple is testing an AI chatbot but has no idea what to do with it',
  'neg': 0.149,
  'neu': 0.851,
  'pos': 0.0}]


[{'compound': -0.5994,
  'headline': 'Trump finally reveals how he thinks he could end Russia’s war in Ukraine in a day',
  'neg': 0.218,
  'neu': 0.782,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': 'Buoys, razor wire, and a Trump-y wall: How Greg Abbott turned the Rio Grande into '
              'an immigration ‘war zone’',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': 'Biden Announces New Rules To Tackle Big Corporate Consolidation',
  'neg': 0

In [7]:
# Convert each list of headline/score dictionaries into a DataFrame
# (one row per headline, one column per dictionary key).
from IPython.display import display

WSB_df = pd.DataFrame.from_records(results)
POL_df = pd.DataFrame.from_records(results_1)
ECON_df = pd.DataFrame.from_records(results_2)

# Only a cell's last expression is rendered automatically, so the first two
# previews are displayed explicitly instead of being silently discarded.
display(WSB_df.head())
display(POL_df.head())
ECON_df.head()

Unnamed: 0,neg,neu,pos,compound,headline
0,0.0,1.0,0.0,0.0,Biden Administration Announces Indo-Pacific De...
1,0.0,0.795,0.205,0.2023,China Overtakes Japan as World's Top Car Exporter
2,0.0,1.0,0.0,0.0,BOE Faces ‘Tricky’ Task Ahead of Rate Decision...
3,0.0,1.0,0.0,0.0,Inflation rose 0.4% in April and 4.7% from a y...
4,0.0,0.738,0.262,0.4939,Dollar bears bide their time as US economic st...


In [8]:
#Label each headline with a positive(1), neutral(0) or negative(-1) overall value
from IPython.display import display

# Compound scores strictly beyond +/- this cutoff count as positive / negative.
LABEL_THRESHOLD = 0.2


def _add_sentiment_label(scored_df, threshold=LABEL_THRESHOLD):
    """Add an integer 'label' column to ``scored_df`` in place and return it.

    The label is 1 where 'compound' > threshold, -1 where
    'compound' < -threshold, and 0 everywhere else.
    """
    scored_df['label'] = 0
    scored_df.loc[scored_df['compound'] > threshold, 'label'] = 1
    scored_df.loc[scored_df['compound'] < -threshold, 'label'] = -1
    return scored_df


# Wall Street Bets, Politics, Economics
for _frame in (WSB_df, POL_df, ECON_df):
    _add_sentiment_label(_frame)

# Only a cell's last expression is rendered automatically, so the first two
# previews are displayed explicitly instead of being silently discarded.
display(WSB_df.head())
display(POL_df.head())
ECON_df.head()

Unnamed: 0,neg,neu,pos,compound,headline,label
0,0.0,1.0,0.0,0.0,Biden Administration Announces Indo-Pacific De...,0
1,0.0,0.795,0.205,0.2023,China Overtakes Japan as World's Top Car Exporter,1
2,0.0,1.0,0.0,0.0,BOE Faces ‘Tricky’ Task Ahead of Rate Decision...,0
3,0.0,1.0,0.0,0.0,Inflation rose 0.4% in April and 4.7% from a y...,0
4,0.0,0.738,0.262,0.4939,Dollar bears bide their time as US economic st...,1


In [9]:
# For each subreddit: keep only the headline and label columns, then compute
# the percentage of headlines carrying each label. Despite the "_df" suffix,
# each *_Analysis_df is a Series indexed by label value.

# Wall Street Bets
WSB_df = WSB_df[['headline', 'label']]
WSB_Analysis_df = WSB_df['label'].value_counts(normalize=True) * 100

# Politics
POL_df = POL_df[['headline', 'label']]
POL_Analysis_df = POL_df['label'].value_counts(normalize=True) * 100

# Economics
ECON_df = ECON_df[['headline', 'label']]
ECON_Analysis_df = ECON_df['label'].value_counts(normalize=True) * 100

WSB_Analysis_df

label
 0    59.894459
 1    24.010554
-1    16.094987
Name: proportion, dtype: float64

In [10]:
# Percentage of Politics headlines carrying each sentiment label.
POL_Analysis_df

label
 0    42.762284
-1    36.520584
 1    20.717131
Name: proportion, dtype: float64

In [11]:
# Percentage of Economics headlines carrying each sentiment label.
ECON_Analysis_df

label
 0    47.647059
-1    32.794118
 1    19.558824
Name: proportion, dtype: float64

In [12]:
#Find the sentiment of each subreddit
def _net_sentiment(label_percentages):
    """Return the positive share minus the negative share, as a fraction.

    ``label_percentages`` is a Series of percentages indexed by label
    (1 = positive, -1 = negative). A label that never occurs is absent from
    the index, so it is read with a default of 0.0 instead of raising
    KeyError.
    """
    positive_pct = label_percentages.get(1, 0.0)
    negative_pct = label_percentages.get(-1, 0.0)
    return positive_pct / 100 - negative_pct / 100


# Relative weight of each subreddit in the combined score (the three sum to 1).
# NOTE(review): these are fixed constants described as being based on each
# subreddit's size relative to the total; they are not derived from the rows
# collected in this run -- confirm their source and refresh them if needed.
WSB_WEIGHT = .5447
POL_WEIGHT = .3269
ECON_WEIGHT = .1284

Net_Change_WSB = _net_sentiment(WSB_Analysis_df) * WSB_WEIGHT
Net_Change_POL = _net_sentiment(POL_Analysis_df) * POL_WEIGHT
Net_Change_ECON = _net_sentiment(ECON_Analysis_df) * ECON_WEIGHT

#Combine Sets
Weighted_Total = Net_Change_WSB + Net_Change_POL + Net_Change_ECON

Weighted_Total

-0.025539510044049574

In [13]:
# Historical observations used to fit the regressions below, kept as one row
# per observation: (sentiment score, max high, max low). The sentiment score
# is the independent variable; the two price changes are the dependent ones.
_observations = np.array([
    (0.0178, 0.68, -2.36),
    (0.0244, 4.01, -0.33),
    (0.045, 1.66, -0.59),
    (0.0467, 1.09, -1.48),
    (0.0659, 2.48, -0.44),
])

# Split the table column-wise into the three parallel 1-D arrays.
Sentiment_Scores, Max_High, Max_Low = (column.copy() for column in _observations.T)

In [14]:
# First of the two prediction functions: a degree-1 least-squares fit of the
# high of the day against the sentiment score.
# np.polyfit returns the coefficients highest power first: [slope, intercept].
High_reg = np.polyfit(x=Sentiment_Scores, y=Max_High, deg=1)
High_reg

array([1.74677634, 1.91419882])

In [15]:
# Second prediction function: a degree-1 least-squares fit of the low of the
# day against the sentiment score, returned as [slope, intercept].
Low_reg = np.polyfit(x=Sentiment_Scores, y=Max_Low, deg=1)
Low_reg

array([22.39161699, -1.93476901])

In [16]:
# Evaluate the fitted lines at today's weighted sentiment score.
# High prediction: slope * sentiment + intercept.
high_slope, high_intercept = High_reg
High = high_slope * Weighted_Total + high_intercept
High

1.8695870056770283

In [17]:
# Low prediction: slope * sentiment + intercept, using the low-of-day fit.
low_slope, low_intercept = Low_reg
Low = low_slope * Weighted_Total + low_intercept
Low

-2.506639941956976

In [18]:
# Ask the user for today's opening price and convert the typed text to a
# float in one step (float() raises ValueError if the entry is not numeric).
Stock_Price = float(input("Today\'s S&P500 Opening Price: "))

Today's S&P500 Opening Price: 453.96


In [21]:
# Offset the opening price by the predicted high and low changes to obtain
# the predicted highest and lowest prices.
High_Pred, Low_Pred = Stock_Price + High, Stock_Price + Low

In [22]:
# Show the predicted high, the opening price and the predicted low, each
# under its own caption and separated by a blank gap.
from IPython.display import display

_readout = (
    ("High of the Day: ", High_Pred),
    ("Opening Price: ", Stock_Price),
    ("Low of the Day: ", Low_Pred),
)
for _position, (_caption, _value) in enumerate(_readout):
    if _position:
        # Separator between entries (not before the first one).
        print("\n")
    print(_caption)
    display(_value)

High of the Day: 


455.829587005677



Opening Price: 


453.96



Low of the Day: 


451.453360058043