In [312]:
import yfinance as yf
import pandas as pd
import numpy as np

# load symbols

In [2]:
filepath = "/Users/Vincent/Desktop/nlp-stock-market-trend-prediction-with-reddit-posts/data/"

In [3]:
df_symbols = pd.read_pickle(filepath + "symbols.pd")

In [4]:
df_symbols["Symbol"] = df_symbols["Symbol"].astype(str).apply(lambda sym: sym.lower())

In [5]:
df_symbols.head()

Unnamed: 0,index,Symbol,Description,STOCK_EXCHANGE
0,0,aaa,First Priority Clo Bond ETF,AMEX
1,1,aaau,GS Physical Gold ETF,AMEX
2,2,aamc,Altisource Asset,AMEX
3,3,aau,Almaden Minerals,AMEX
4,4,abeq,Absolute Core Strategy ETF,AMEX


# load reddit data

In [6]:
df = pd.read_pickle(filepath + "wallstreetbets.pd")

In [7]:
df.head()

Unnamed: 0,created_at,title,selftext,score,permalink,all_comments
0,2021-05-16,Life savings of 43k into UWMC - I have a crapp...,,1120.0,/r/wallstreetbets/comments/ndgwki/life_savings...,[Can't you just turn your civic into a sports ...
1,2021-06-08,"My current $3.7 mil portfolio, powered entirel...",,4467.0,/r/wallstreetbets/comments/nv5gs3/my_current_3...,[You could retire comfortably. I’m glad you’ve...
2,2021-06-09,$UWMC This rocket is fueled and ready 🚀🚀🚀,,358.0,/r/wallstreetbets/comments/nw73p0/uwmc_this_ro...,[I’m in for 400 shares at 7.90 cost basis. Bee...
3,2021-06-10,$ASO - Your YOLOs don't have to be retarded(A ...,All you retards dumping your life savings in c...,208.0,/r/wallstreetbets/comments/nwysob/aso_your_yol...,"[Smooth brain enjoy, picturebook good, easy.\n..."
4,2021-06-10,"$CLOV was a meme stock, but it was undervalued...","I’ll admit it, I didn’t do any fucking researc...",240.0,/r/wallstreetbets/comments/nwz20e/clov_was_a_m...,[Just a dumb ape here but isn't Ortex reportin...


# text pre-processing

## lower and remove punctuation etc.

In [8]:
import re
import string

In [9]:
def cleaning(text):
    text = text.lower()
    text = re.sub("\[.*\]", "", text)
    text = re.sub("\(.*\)", "", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub("\w*\d\w*", "", text)
    text = re.sub("\s{2}", " ", text)
    text = text.encode("ascii", "ignore").decode("ascii") # removing emojis
    return text

In [10]:
df["title_clean"] = df.title.apply(cleaning)
df["text_clean"] = df.apply(lambda row: cleaning(row["title"] + " " + row["selftext"]), axis=1)

## remove daily threads

In [457]:
df = df[~df.title_clean.str.contains("daily.*thread")]

## remove stop words and tokenize

In [458]:
import nltk
nltk.download("stopwords")
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.corpus import words
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [504]:
stop_words = set(stopwords.words("english"))
stop_words.add("yolo")

In [505]:
len(stop_words & set(df_symbols["Symbol"].tolist()))

54

In [506]:
df["title_tokens"] = df.title_clean.apply(lambda t: [token for token in word_tokenize(t) if token not in stop_words])
df["text_tokens"] = df.text_clean.apply(lambda t: [token for token in word_tokenize(t) if token not in stop_words])

## filter stocks

In [507]:
from collections import Counter

In [508]:
symbols = set(df_symbols["Symbol"].tolist()) - set(words.words())
symbols.add("wish")
symbols.remove("yolo")
len(symbols)

65579

In [509]:
df["title_symbols"] = df.title_tokens.apply(lambda tokens: [token for token in tokens if token in symbols])
df["text_symbols"] = df.text_tokens.apply(lambda tokens: [token for token in tokens if token in symbols])

In [510]:
df["title_symbols"] = df.title_symbols.apply(Counter)
df["text_symbols"] = df.text_symbols.apply(Counter)

In [548]:
def remove_stocks(row):
    title, stocks = row["title_clean"], row["title_symbols"].keys()
    for stock in stocks:
        title = title.replace(stock, "")
    return title

In [549]:
df["title_clean_no_stocks"] = df.apply(remove_stocks, axis=1)

In [550]:
df["title_tokens_clean"] = df.title_clean_no_stocks.apply(lambda t: [token for token in word_tokenize(t) if token not in stop_words])

## stemming

In [551]:
sno = nltk.stem.SnowballStemmer("english")

In [552]:
df["title_tokens"] = df.title_tokens.apply(lambda tokens: [sno.stem(token) for token in tokens])
df["text_tokens"] = df.text_tokens.apply(lambda tokens: [sno.stem(token) for token in tokens])
df["title_tokens_clean"] = df.title_tokens_clean.apply(lambda tokens: [sno.stem(token) for token in tokens])

## lemmatization

In [554]:
lem = nltk.stem.WordNetLemmatizer()

In [555]:
df["title_tokens"] = df.title_tokens.apply(lambda tokens: [lem.lemmatize(token) for token in tokens])
df["text_tokens"] = df.text_tokens.apply(lambda tokens: [lem.lemmatize(token) for token in tokens])
df["title_tokens_clean"] = df.title_tokens_clean.apply(lambda tokens: [lem.lemmatize(token) for token in tokens])

# Sentiment Analysis

In [556]:
# take only entires where we have some symbols
df_sa = df[df.title_symbols.apply(len) > 0]

## lexicon-based
__Different lexicons__:
* AFINN
* Bing Liu's
* MPQA subjectivity
* SentiWordNet
* VADER
* TextBlob

### AFINN

In [557]:
from afinn import Afinn

In [558]:
af = Afinn()

In [559]:
df_sa["AFINN_polarity"] = df_sa.text_clean.apply(lambda text: af.score(text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


### Bing Liu's

In [560]:
nltk.download("opinion_lexicon")
from nltk.corpus import opinion_lexicon

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [561]:
pos = set(opinion_lexicon.positive())
neg = set(opinion_lexicon.negative())

In [562]:
df_sa["Bing_Liu_polarity"] = df_sa.text_tokens.apply(lambda tokens: sum(1 if token in pos else -1 if token in neg else 0 for token in tokens))

### Vader

In [563]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [564]:
sid = SentimentIntensityAnalyzer()

In [565]:
df_sa["VADER_polarity"] = df_sa.text_clean.apply(lambda text: sid.polarity_scores(text)["compound"])

### TextBlob

In [566]:
from textblob import TextBlob

In [567]:
df_sa["TextBlob_polarity"] = df_sa.text_clean.apply(lambda text: TextBlob(text).polarity)

### Comparison

In [568]:
df_sa[["text_clean","AFINN_polarity", "Bing_Liu_polarity", "VADER_polarity", "TextBlob_polarity"]].head(50)

Unnamed: 0,text_clean,AFINN_polarity,Bing_Liu_polarity,VADER_polarity,TextBlob_polarity
0,life savings of into uwmc i have a crappy hond...,-1.0,0,-0.5106,0.0
1,my current mil portfolio powered entirely by m...,0.0,0,0.0,0.0
2,uwmc this rocket is fueled and ready,0.0,0,0.3612,0.2
3,aso your yolos dont have to be retarded all yo...,-29.0,-5,-0.9917,-0.12142
4,clov was a meme stock but it was undervalued b...,-13.0,-1,0.9832,0.011038
5,clov i did a reverse retard finally,-2.0,-1,-0.5267,-0.45
6,wish and amazon buyout offer in remember amazo...,14.0,5,0.9677,0.131944
7,wish holding the line boys another shares toda...,2.0,0,0.5994,0.0
8,wkhs and why it could be a nobrainer hey guys ...,24.0,10,0.9901,0.130811
9,loss on bb calls in two days,-3.0,-1,-0.3182,0.0


## Naive Bayes

In [569]:
if False:
    df_train.to_csv("/Users/Vincent/Desktop/nlp-stock-market-trend-prediction-with-reddit-posts/data/Classifier/training_list.csv")

## Maximum Entropy (ME)

## Support Vector Machines (SVM)

## Multilayer perceptron (MLP)

# Latent Dirichlet Allocation (LDA)

In [570]:
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models

In [571]:
df_sa["title_bigrams"] = df_sa.title_tokens_clean.apply(lambda t: ["_".join(bigram) for bigram in nltk.bigrams(t)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [572]:
flatten = lambda nested_list: [elm for sublist in nested_list for elm in sublist]

In [573]:
id2word = corpora.Dictionary(df_sa.title_tokens_clean)
#id2word = corpora.Dictionary(df_sa.title_bigrams)

In [574]:
corpus = [id2word.doc2bow(tokens) for tokens in df_sa.title_tokens_clean]
#corpus = [id2word.doc2bow(tokens) for tokens in df_sa.title_bigrams]

In [536]:
print(id2word)

Dictionary(2166 unique tokens: ['buy', 'car', 'civic', 'crappy', 'honda']...)


In [496]:
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True)

In [497]:
topics = lda_model.print_topics(num_words=10)

In [498]:
for topic in topics:
    print(topic)

(0, '0.025*"like" + 0.024*"play" + 0.024*"im" + 0.022*"next" + 0.021*"update" + 0.020*"week" + 0.010*"fuck" + 0.010*"one" + 0.009*"buying" + 0.009*"fly"')
(1, '0.021*"go" + 0.021*"still" + 0.020*"lets" + 0.016*"tomorrow" + 0.016*"sold" + 0.016*"time" + 0.012*"make" + 0.012*"holding" + 0.011*"last" + 0.009*"strong"')
(2, '0.022*"stock" + 0.016*"buy" + 0.014*"analysis" + 0.013*"technical" + 0.011*"money" + 0.010*"meme" + 0.010*"way" + 0.010*"true" + 0.008*"says" + 0.008*"rocket"')
(3, '0.020*"apes" + 0.018*"short" + 0.016*"got" + 0.014*"get" + 0.013*"stock" + 0.013*"back" + 0.011*"year" + 0.011*"bullish" + 0.009*"need" + 0.009*"dont"')
(4, '0.121*"yolo" + 0.027*"moon" + 0.024*"shares" + 0.017*"going" + 0.016*"options" + 0.016*"calls" + 0.014*"good" + 0.013*"today" + 0.012*"retard" + 0.011*"long"')


In [499]:
# Perplexity: how probable new unseen data is (but mustn't correlate with human judgement)
print("Perplexity:", lda_model.log_perplexity(corpus))

coherence_model_lda = CoherenceModel(model=lda_model, texts=df_sa.title_bigrams, dictionary=id2word, coherence="c_v")
print("Coherence Score:", coherence_model_lda.get_coherence())

Perplexity: -7.929074689785096


  numerator = (co_occur_count / num_docs) + EPSILON
  denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
  co_doc_prob = co_occur_count / num_docs


Coherence Score: nan


## hyperparameter tuning

In [501]:
# Topic parameter
topics = range(2,100)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.1))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.1, 1, 0.1))
beta.append('symmetric')

for k in topics:
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=k,
        random_state=100,
        update_every=1,
        chunksize=1000,
        passes=10,
        #alpha=.91,
        #eta=b,
        per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df_sa.title_tokens_clean, dictionary=id2word, coherence="c_v")
    print(k, coherence_model_lda.get_coherence())

2 0.49176312423169044
3 0.5596221473673748
4 0.5941907589748419
5 0.5554028807812793
6 0.5599493342804385
7 0.5691848695067179
8 0.571672380313652
9 0.5992379512108791
10 0.585493984232982
11 0.5910865929986246
12 0.5742824532201031
13 0.5849851560418828
14 0.5965099506337707
15 0.5856288931415011
16 0.6075242210191873
17 0.6105487566904504
18 0.6063205202305365
19 0.5908104597299652
20 0.5976307399512523
21 0.6014351014056121
22 0.599244488932293
23 0.5870801526777898
24 0.5806243811957262
25 0.6044643159269018
26 0.5665093131854143
27 0.5888858366899637
28 0.5783665702317039
29 0.5800679388974411
30 0.5924328810331445
31 0.5781609938001471
32 0.5713453327898305
33 0.5725985560784537
34 0.5667743295853699
35 0.5536313555575257
36 0.5488864947504919
37 0.5463976639533691
38 0.5277385440443103
39 0.5186600844823881
40 0.537416853931823
41 0.5352778600974869
42 0.521947062015588
43 0.5233249513644409
44 0.5052922809009084
45 0.5132886779552367
46 0.5100643823460284
47 0.4895577779048218


KeyboardInterrupt: 

# visualization

In [575]:
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=25,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha=.91,
    eta="symmetric",
    per_word_topics=True)

interesting topics:
6 - buy
7 - gain 20
9 - selling

In [576]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
