In [1]:
import pandas as pd
import numpy as np

# load symbols

In [2]:
filepath = "/Users/Vincent/Desktop/nlp-stock-market-trend-prediction-with-reddit-posts/data/"

In [3]:
df_symbols = pd.read_pickle(filepath + "symbols.pd")

In [4]:
df_symbols["Symbol"] = df_symbols["Symbol"].astype(str).apply(lambda sym: sym.lower())

In [5]:
df_symbols.head()

Unnamed: 0,index,Symbol,Description,STOCK_EXCHANGE
0,0,aaa,First Priority Clo Bond ETF,AMEX
1,1,aaau,GS Physical Gold ETF,AMEX
2,2,aamc,Altisource Asset,AMEX
3,3,aau,Almaden Minerals,AMEX
4,4,abeq,Absolute Core Strategy ETF,AMEX


# load reddit data

In [6]:
df = pd.read_pickle(filepath + "wallstreetbets.pd")

In [7]:
df.head()

Unnamed: 0,id,created_at,title,selftext,score,permalink,all_comments
0,t3_olv4sd,2021-07-17,$MU is extremely undervalued and set for a big...,"Never done a DD before, so this will def be sh...",32.0,/r/wallstreetbets/comments/olv4sd/mu_is_extrem...,[\n**User Report**| | | |\n:--|:--|:--|:--\n**...
1,t3_om0qrd,2021-07-17,Fraternal Association of Gambling Gentlemen an...,Inductions\n------\n\n\n----------\nUser|Cast ...,11.0,/r/wallstreetbets/comments/om0qrd/fraternal_as...,"[Good morning, partially clowdy today., hangov..."
2,t3_olzpz7,2021-07-17,CLNE - Back of the Envelope Valuation,"&amp;#x200B;\n\nCLNE, maybe to the moon, or ma...",5.0,/r/wallstreetbets/comments/olzpz7/clne_back_of...,[\n**User Report**| | | |\n:--|:--|:--|:--\n**...
3,t3_olzc1w,2021-07-17,We Are In The Mids Of A Major Correction Right...,Some Cold Hard Facts:\n\nHere's a line chart o...,8.0,/r/wallstreetbets/comments/olzc1w/we_are_in_th...,[\n**User Report**| | | |\n:--|:--|:--|:--\n**...
4,t3_olxsn6,2021-07-17,Let's visualize green bars together (TLRY),"LOOK, I can't say this is a DD, I just felt li...",23.0,/r/wallstreetbets/comments/olxsn6/lets_visuali...,[\n**User Report**| | | |\n:--|:--|:--|:--\n**...


# text pre-processing

## lower and remove punctuation etc.

In [8]:
import re
import string

In [9]:
def cleaning(text):
    text = text.lower()
    text = re.sub("\[.*\]", "", text)
    text = re.sub("\(.*\)", "", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub("\w*\d\w*", "", text)
    text = re.sub("\s{2}", " ", text)
    text = text.encode("ascii", "ignore").decode("ascii") # removing emojis
    return text

In [10]:
df["title_clean"] = df.title.apply(cleaning)
df["text_clean"] = df.apply(lambda row: cleaning(row["title"] + " " + row["selftext"]), axis=1)

## remove daily threads

In [11]:
df = df[~df.title_clean.str.contains("daily.*thread")]

## remove stop words and tokenize

In [12]:
import nltk
nltk.download("stopwords")
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.corpus import words
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [13]:
stop_words = set(stopwords.words("english"))
stop_words.add("yolo")

In [14]:
len(stop_words & set(df_symbols["Symbol"].tolist()))

54

In [15]:
df["title_tokens"] = df.title_clean.apply(lambda t: [token for token in word_tokenize(t) if token not in stop_words])
df["text_tokens"] = df.text_clean.apply(lambda t: [token for token in word_tokenize(t) if token not in stop_words])

## filter stocks

In [16]:
from collections import Counter

In [17]:
symbols = set(df_symbols["Symbol"].tolist()) - set(words.words())
symbols.add("wish")
symbols.remove("yolo")
symbols.remove("app")
len(symbols)

65578

In [18]:
df["title_symbols"] = df.title_tokens.apply(lambda tokens: [token for token in tokens if token in symbols])
df["text_symbols"] = df.text_tokens.apply(lambda tokens: [token for token in tokens if token in symbols])

In [19]:
df["title_symbols"] = df.title_symbols.apply(Counter)
df["text_symbols"] = df.text_symbols.apply(Counter)

In [20]:
def remove_stocks(row):
    title, stocks = row["title_clean"], row["title_symbols"].keys()
    for stock in stocks:
        title = title.replace(stock, "")
    return title

In [21]:
df["title_clean_no_stocks"] = df.apply(remove_stocks, axis=1)

In [22]:
df["title_tokens_clean"] = df.title_clean_no_stocks.apply(lambda t: [token for token in word_tokenize(t) if token not in stop_words])

## stemming

In [23]:
sno = nltk.stem.SnowballStemmer("english")

In [24]:
df["title_tokens"] = df.title_tokens.apply(lambda tokens: [sno.stem(token) for token in tokens])
df["text_tokens"] = df.text_tokens.apply(lambda tokens: [sno.stem(token) for token in tokens])
df["title_tokens_clean"] = df.title_tokens_clean.apply(lambda tokens: [sno.stem(token) for token in tokens])

## lemmatization

In [25]:
lem = nltk.stem.WordNetLemmatizer()

In [26]:
df["title_tokens"] = df.title_tokens.apply(lambda tokens: [lem.lemmatize(token) for token in tokens])
df["text_tokens"] = df.text_tokens.apply(lambda tokens: [lem.lemmatize(token) for token in tokens])
df["title_tokens_clean"] = df.title_tokens_clean.apply(lambda tokens: [lem.lemmatize(token) for token in tokens])

# Sentiment Analysis

In [27]:
# take only entires where we have some symbols
df_sa = df[df.title_symbols.apply(len) > 0]

## lexicon-based
__Different lexicons__:
* AFINN
* Bing Liu's
* MPQA subjectivity
* SentiWordNet
* VADER
* TextBlob

### AFINN

In [28]:
from afinn import Afinn

In [29]:
af = Afinn()

In [30]:
df_sa["AFINN_polarity"] = df_sa.text_clean.apply(lambda text: af.score(text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


### Bing Liu's

In [31]:
nltk.download("opinion_lexicon")
from nltk.corpus import opinion_lexicon

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [32]:
pos = set(opinion_lexicon.positive())
neg = set(opinion_lexicon.negative())

In [33]:
df_sa["Bing_Liu_polarity"] = df_sa.text_tokens.apply(lambda tokens: sum(1 if token in pos else -1 if token in neg else 0 for token in tokens))

### Vader

In [34]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [35]:
sid = SentimentIntensityAnalyzer()

In [36]:
df_sa["VADER_polarity"] = df_sa.text_clean.apply(lambda text: sid.polarity_scores(text)["compound"])

### TextBlob

In [37]:
from textblob import TextBlob

In [38]:
df_sa["TextBlob_polarity"] = df_sa.text_clean.apply(lambda text: TextBlob(text).polarity)

### Comparison

In [39]:
df_sa[["text_clean","AFINN_polarity", "Bing_Liu_polarity", "VADER_polarity", "TextBlob_polarity"]].head(50)

Unnamed: 0,text_clean,AFINN_polarity,Bing_Liu_polarity,VADER_polarity,TextBlob_polarity
2,clne back of the envelope valuation \nclne may...,30.0,3,0.9981,0.094055
6,negg yolo gainz,0.0,0,0.2732,0.0
8,tsmc amp amd stocks huge gain soon they both t...,2.0,4,0.8231,0.086129
11,follow up on my bear bet on zlab currently bel...,6.0,-1,0.9776,0.107197
20,yolo into bngo,0.0,0,0.2732,0.0
22,crsr reasons why it will go up and touch the m...,19.0,1,0.9805,0.098934
25,i put on cnk friday am down and dont know why,0.0,0,0.0,-0.155556
28,stld earning play leveraged minimum risk steel...,-7.0,-1,0.9591,-0.143333
33,payo your baby momma so she can feed the kids ...,34.0,3,0.9973,0.090007
34,nvda dip thoughts nvda is going insane here ar...,5.0,3,0.8414,0.125714


## Naive Bayes

In [40]:
if False:
    df_train.to_csv("/Users/Vincent/Desktop/nlp-stock-market-trend-prediction-with-reddit-posts/data/Classifier/training_list.csv")

## Maximum Entropy (ME)

## Support Vector Machines (SVM)

## Multilayer perceptron (MLP)

# Latent Dirichlet Allocation (LDA)

In [41]:
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models



In [42]:
df_sa["title_bigrams"] = df_sa.title_tokens_clean.apply(lambda t: ["_".join(bigram) for bigram in nltk.bigrams(t)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [43]:
id2word = corpora.Dictionary(df_sa.title_tokens_clean)
#id2word = corpora.Dictionary(df_sa.title_bigrams)

In [44]:
corpus = [id2word.doc2bow(tokens) for tokens in df_sa.title_tokens_clean]
#corpus = [id2word.doc2bow(tokens) for tokens in df_sa.title_bigrams]

## hyperparameter tuning

In [45]:
if False:
    # Topic parameter
    topics = range(2,50)

    # Alpha parameter
    alpha = list(np.arange(0.01, 1, 0.05))
    alpha.append('symmetric')
    alpha.append('asymmetric')

    # Beta parameter
    beta = list(np.arange(0.1, 1, 0.05))
    beta.append('symmetric')

    for b in beta:
        lda_model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=34,
            random_state=100,
            update_every=1,
            chunksize=1000,
            passes=10,
            alpha=.96,
            eta="symmetric",
            per_word_topics=True)

        coherence_model_lda = CoherenceModel(model=lda_model, texts=df_sa.title_tokens_clean, dictionary=id2word, coherence="c_v")
        print(b, coherence_model_lda.get_coherence())

# model

In [46]:
if False:
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=34,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha=.96,
        eta="symmetric",
        per_word_topics=True)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=df_sa.title_tokens_clean, dictionary=id2word, coherence="c_v")
    print(b, coherence_model_lda.get_coherence())

interesting topics:
6 - buy
7 - gain 20
9 - selling

In [47]:
# safe model
if False:
    filepath = "/Users/Vincent/Desktop/nlp-stock-market-trend-prediction-with-reddit-posts/models/lda_model.model"
    lda_model.save(filepath)

In [48]:
# load model
if True:
    filepath = "/Users/Vincent/Desktop/nlp-stock-market-trend-prediction-with-reddit-posts/models/lda_model.model"
    lda_model = gensim.models.ldamodel.LdaModel.load(filepath)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df_sa.title_tokens_clean, dictionary=id2word, coherence="c_v")
    print(coherence_model_lda.get_coherence())

0.7296691429575735


## evaluation

__interesting topics:__
* 3 - buy
* 8 - earn
* 12 - save
* 13 - bought
* 15 - sell
* 16 - rise
* 17 - moon
* 28 - drop

In [49]:
topics_definition = {
    "positiv": [4,6,7,10,12,25,33,21],
    "negativ": [15,23],
}

In [50]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


## Apply LDA model on DataFrame

In [51]:
def lda_score_threshold(tokens, threshold=0.04):
    bow = id2word.doc2bow(tokens)
    return [topic for topic, score in lda_model[bow][0] if score > threshold]

In [52]:
df_sa["topics"] = df_sa.title_tokens_clean.apply(lda_score_threshold)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [53]:
print("df_size:", df_sa.shape[0], "\ndf_no_topic_found:", sum(df_sa.topics.str.len() == 0), "\ndf_topics_found:", sum(df_sa.topics.str.len() >= 1))

df_size: 1663 
df_no_topic_found: 575 
df_topics_found: 1088


## Results

### Top 3 stock symbols per day

In [241]:
polarity_cols = df_sa.columns[df_sa.columns.str.contains(r"polarity$")].tolist()
polarity_cols

['AFINN_polarity', 'Bing_Liu_polarity', 'VADER_polarity', 'TextBlob_polarity']

In [335]:
stock_polarity_cols = [polarity + "_stock_score" for polarity in polarity_cols]
# score is based on all symbols per day, not a subset (e.g. "SPCE" got 50 threads for one day, but only 20 are in the relevant ones -> but the calculation is done on the 50)
series = df_sa[["title_symbols"] + polarity_cols].apply(lambda row: [Counter({symbol: polarity for symbol in row["title_symbols"]}) for polarity in row[polarity_cols]], axis=1)
df_sa[stock_polarity_cols] = pd.DataFrame(series.tolist(), index=series.index)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [336]:
def update_counters(series):
    total = Counter()
    for counter in series.tolist():
        if not type(counter) is Counter:
            continue
        total.update(counter)
    return total

In [371]:
df_results = df_sa.groupby("created_at")[["title_symbols"] + stock_polarity_cols].agg(update_counters)
df_results["most_common_symbols"] = df_results.title_symbols.apply(lambda counter: counter.most_common(3))
df_results["unique_symbol_count"] = df_results.title_symbols.str.len()

for col in stock_polarity_cols:
    df_results["most_common_" + col] = df_results[col].apply(lambda counter: [(stock, value) for stock, value in counter.most_common(3) if value > 0])
    df_results["least_common_" + col] = df_results[col].apply(lambda counter: [(stock, value) for stock, value in counter.most_common()[::-1] if value < 0][0:3])

df_results.drop(columns=["title_symbols"] + stock_polarity_cols, inplace=True)
df_results.head(100).style

Unnamed: 0_level_0,most_common_symbols,unique_symbol_count,most_common_AFINN_polarity_stock_score,least_common_AFINN_polarity_stock_score,most_common_Bing_Liu_polarity_stock_score,least_common_Bing_Liu_polarity_stock_score,most_common_VADER_polarity_stock_score,least_common_VADER_polarity_stock_score,most_common_TextBlob_polarity_stock_score,least_common_TextBlob_polarity_stock_score
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-05-16,"[('uwmc', 1)]",1,[],"[('uwmc', -1.0)]",[],[],[],"[('uwmc', -0.5106)]",[],[]
2021-06-08,"[('amc', 1), ('gme', 1), ('pltr', 1)]",4,[],[],[],[],[],[],[],[]
2021-06-09,"[('uwmc', 1)]",1,[],[],[],[],"[('uwmc', 0.3612)]",[],"[('uwmc', 0.2)]",[]
2021-06-10,"[('bb', 8), ('wish', 4), ('dd', 4)]",19,"[('dd', 100.0), ('clne', 68.0), ('wish', 64.0)]","[('aso', -29.0), ('clov', -15.0), ('ive', -1.0)]","[('rkt', 16), ('wkhs', 10), ('clne', 10)]","[('aso', -5), ('clov', -2), ('mvis', -1)]","[('dd', 3.4196), ('clne', 3.3083), ('wish', 3.1484)]","[('aso', -0.9917)]","[('bb', 0.906188897309587), ('gme', 0.5920879120879121), ('wkhs', 0.3808112874779541)]","[('clov', -0.43896198830409355), ('uwmc', -0.4), ('mvis', -0.3)]"
2021-06-11,"[('wish', 26), ('clf', 13), ('amc', 13)]",58,"[('adds', 113.0), ('rkt', 110.0), ('amc', 109.0)]","[('gme', -24.0), ('gnus', -5.0), ('def', -2.0)]","[('clov', 44), ('clne', 30), ('vs', 22)]","[('gnus', -4), ('viac', -2), ('def', -1)]","[('wish', 13.587000000000002), ('clov', 5.8439000000000005), ('clf', 5.3177)]","[('gnus', -0.6249), ('def', -0.5267), ('ccs', -0.4767)]","[('wish', 3.308289968526577), ('amc', 1.8746572871572873), ('clov', 1.4818364661654133)]","[('def', -0.9), ('ccs', -0.4), ('pt', -0.3)]"
2021-06-12,"[('sens', 7), ('wish', 5), ('clf', 5)]",36,"[('clne', 125.0), ('wish', 42.0), ('bpmc', 33.0)]","[('sens', -6.0), ('amc', -2.0), ('tac', -2.0)]","[('clne', 35), ('wish', 9), ('aso', 8)]","[('tac', -7), ('sens', -7), ('wb', -2)]","[('clf', 3.156), ('clne', 2.9897), ('wish', 2.1513)]","[('nio', -0.2441), ('bb', -0.10460000000000008), ('tac', -0.0919)]","[('clf', 1.3934322213733978), ('uwmc', 0.7205882352941176), ('info', 0.7)]","[('nio', -0.052243589743589745)]"
2021-06-14,"[('wish', 7), ('dd', 5), ('crsr', 3)]",11,"[('dd', 223.0), ('bgs', 72.0), ('crsr', 51.0)]",[],"[('dd', 60), ('crsr', 13), ('bgs', 9)]","[('amp', -6), ('clov', -3), ('wish', -1)]","[('dd', 4.3551), ('wish', 4.2911), ('crsr', 1.9602)]",[],"[('wish', 0.8297373459873459), ('dd', 0.7108093254951839), ('bb', 0.5083333333333333)]",[]
2021-06-15,"[('wish', 28), ('amc', 11), ('dkng', 9)]",47,"[('dd', 202.0), ('nio', 171.0), ('crsr', 157.0)]","[('dkng', -18.0), ('bbby', -16.0), ('ive', -2.0)]","[('dd', 83), ('crsr', 52), ('clne', 32)]","[('dkng', -17), ('uwmc', -4), ('fomo', -3)]","[('wish', 13.4779), ('dd', 5.7209), ('clf', 3.4037)]","[('bbby', -0.7343), ('sndl', -0.4588), ('idex', -0.2263)]","[('bb', 1.0541666666666667), ('amc', 1.0141414141414142), ('wish', 0.9842255319636273)]","[('dkng', -0.9019425212207979), ('ive', -0.45), ('fomo', -0.19040998217468807)]"
2021-06-16,"[('wish', 20), ('dd', 13), ('amc', 9)]",40,"[('dd', 205.0), ('cano', 114.0), ('wish', 78.0)]","[('cars', -11.0), ('crsr', -3.0), ('amc', -3.0)]","[('dd', 83), ('cano', 29), ('lesl', 25)]","[('aso', -4), ('amp', -2), ('cars', -2)]","[('wish', 11.2803), ('dd', 8.2871), ('itub', 4.305400000000001)]","[('cars', -0.9166), ('crsr', -0.3182)]","[('dd', 1.2511302264652744), ('amc', 0.7963007147890868), ('wkhs', 0.5794612794612795)]","[('eod', -1.0), ('tqqq', -1.0), ('bb', -0.25202020202020214)]"
2021-06-17,"[('wish', 16), ('amc', 12), ('pltr', 8)]",63,"[('dd', 274.0), ('prpl', 127.0), ('uwmc', 99.0)]","[('sens', -59.0), ('qqq', -4.0), ('xlf', -4.0)]","[('dd', 42), ('jmia', 25), ('clne', 25)]","[('nvda', -9), ('sens', -6), ('hr', -5)]","[('wish', 10.3804), ('dd', 4.6784), ('amc', 3.814)]","[('qqq', -0.9451), ('xlf', -0.9451), ('kre', -0.9451)]","[('amc', 1.1080314422419686), ('bb', 0.7578932178932178), ('amd', 0.7237519936204146)]","[('jim', -0.2916666666666667), ('ath', -0.1), ('fsr', -0.09999999999999992)]"


In [372]:
flatten = lambda nested_list: [elm for sublist in nested_list for elm in sublist]

In [373]:
df_lda_results = df_sa[["created_at", "title", "score", "AFINN_polarity", "Bing_Liu_polarity", "VADER_polarity", "TextBlob_polarity", "topics", "title_symbols"]].copy()

### No topics found

In [374]:
df_lda_results[df_lda_results.topics.str.len() == 0].head(200).style

Unnamed: 0,created_at,title,score,AFINN_polarity,Bing_Liu_polarity,VADER_polarity,TextBlob_polarity,topics,title_symbols
6,2021-07-17,NEGG 400K+ YOLO GAINZ,119.0,0.0,0,0.2732,0.0,[],Counter({'negg': 1})
20,2021-07-16,Yolo into bngo,44.0,0.0,0,0.2732,0.0,[],Counter({'bngo': 1})
37,2021-07-16,NVDA to the 🌙!!!,23.0,0.0,0,0.0,0.0,[],Counter({'nvda': 1})
38,2021-07-16,Check out NRZ,6.0,7.0,-1,0.6705,0.135124,[],Counter({'nrz': 1})
55,2021-07-16,Wish,119.0,1.0,0,0.4019,0.0,[],Counter({'wish': 1})
57,2021-07-16,Avepoint DD + YOLO,243.0,48.0,4,0.9874,0.117697,[],Counter({'dd': 1})
72,2021-07-16,AAPL 149c 7/23,28.0,0.0,0,0.0,0.0,[],Counter({'aapl': 1})
91,2021-07-16,$F DD ABC 123 🌙🚀,14.0,7.0,4,0.8092,0.084253,[],"Counter({'dd': 1, 'abc': 1})"
96,2021-07-15,Canoo DD,53.0,-12.0,-12,0.973,0.082066,[],Counter({'dd': 1})
154,2021-07-15,TQQQ Sept 17 $150c. LFG 🚀,7.0,0.0,0,0.0,0.0,[],"Counter({'tqqq': 1, 'lfg': 1})"


### relevant topics found

In [375]:
relevant_topics = flatten(topics_definition.values())

In [376]:
mask = df_lda_results.topics.apply(lambda topics: len(set(topics) & set(relevant_topics)) >= 1)
print(sum(mask))

512


In [377]:
df_lda_results[mask].head(200).style

Unnamed: 0,created_at,title,score,AFINN_polarity,Bing_Liu_polarity,VADER_polarity,TextBlob_polarity,topics,title_symbols
11,2021-07-17,Follow up on my Bear Bet on $ZLAB,0.0,6.0,-1,0.9776,0.107197,"[4, 17]",Counter({'zlab': 1})
22,2021-07-16,CRSR : Reasons why it will go up and touch the moon 🚀🌕 (Elgato),361.0,19.0,1,0.9805,0.098934,"[2, 5, 15]",Counter({'crsr': 1})
25,2021-07-16,"I put 100k on CNK Friday, am down $15k, and don’t know why",0.0,0.0,0,0.0,-0.155556,"[1, 4, 32, 33]",Counter({'cnk': 1})
28,2021-07-16,STLD Earning Play - Leveraged Minimum Risk,16.0,-7.0,-1,0.9591,-0.143333,"[8, 25]",Counter({'stld': 1})
41,2021-07-16,"The used-car market sent American inflation soaring in June | Blame stimulus cheques, a shortage of new vehicles and rising demand for rental cars",29.0,-5.0,-1,-0.5994,0.068182,"[15, 16, 21, 26, 30]",Counter({'cars': 1})
45,2021-07-16,BB Keeps Busting My Balls,1465.0,0.0,-1,0.0,0.0,[33],Counter({'bb': 1})
46,2021-07-16,Guess cashing out my AMC gains and buying a single GME call was not a good idea…,80.0,5.0,2,-0.0015,-0.210714,"[1, 7, 10, 22]","Counter({'amc': 1, 'gme': 1})"
49,2021-07-16,"At market open I started playing Scatman ski-ba-bop-ba-dop-bop. I shit you not, the louder I played this song the better my options started to look. By eod I was playing this so loud that pieces of my popcorn ceiling were breaking up and falling all over me and my room. AMC vs Scanman connection?",30.0,-3.0,-4,-0.1867,0.1,"[8, 21, 25, 26, 28, 30, 32]","Counter({'eod': 1, 'amc': 1, 'vs': 1})"
52,2021-07-16,"CLF yolo pt 2, after a 67% return last time we’re back entry at 20.62 7/23 $20 call",83.0,0.0,0,0.2732,0.0,"[9, 10, 14, 19, 23]","Counter({'clf': 1, 'pt': 1})"
58,2021-07-16,"-$100,000 Loss on GME Options",3098.0,-3.0,-1,-0.3182,0.0,"[21, 24]",Counter({'gme': 1})


In [378]:
df_results2 = df_results.copy()
df_results2["title_symbols_relevant"] = df_lda_results[mask].groupby("created_at")[["title_symbols"]].apply(lambda c: c.sum())
df_results2["most_common_symbols_relevant"] = df_results2.title_symbols_relevant.apply(lambda counter: counter.most_common(3) if type(counter) == Counter else float('nan'))
df_results2["unique_symbol_count_relevant"] = df_results2.title_symbols_relevant.str.len()
df_results2["unique_symbol_count_relevant"] = df_results2.unique_symbol_count_relevant.fillna(0).astype(int)
df_results2.drop(columns="title_symbols_relevant", inplace=True)
df_results2.head(100).style

Unnamed: 0_level_0,most_common_symbols,unique_symbol_count,most_common_AFINN_polarity_stock_score,least_common_AFINN_polarity_stock_score,most_common_Bing_Liu_polarity_stock_score,least_common_Bing_Liu_polarity_stock_score,most_common_VADER_polarity_stock_score,least_common_VADER_polarity_stock_score,most_common_TextBlob_polarity_stock_score,least_common_TextBlob_polarity_stock_score,most_common_symbols_relevant,unique_symbol_count_relevant
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-05-16,"[('uwmc', 1)]",1,[],"[('uwmc', -1.0)]",[],[],[],"[('uwmc', -0.5106)]",[],[],"[('uwmc', 1)]",1
2021-06-08,"[('amc', 1), ('gme', 1), ('pltr', 1)]",4,[],[],[],[],[],[],[],[],,0
2021-06-09,"[('uwmc', 1)]",1,[],[],[],[],"[('uwmc', 0.3612)]",[],"[('uwmc', 0.2)]",[],,0
2021-06-10,"[('bb', 8), ('wish', 4), ('dd', 4)]",19,"[('dd', 100.0), ('clne', 68.0), ('wish', 64.0)]","[('aso', -29.0), ('clov', -15.0), ('ive', -1.0)]","[('rkt', 16), ('wkhs', 10), ('clne', 10)]","[('aso', -5), ('clov', -2), ('mvis', -1)]","[('dd', 3.4196), ('clne', 3.3083), ('wish', 3.1484)]","[('aso', -0.9917)]","[('bb', 0.906188897309587), ('gme', 0.5920879120879121), ('wkhs', 0.3808112874779541)]","[('clov', -0.43896198830409355), ('uwmc', -0.4), ('mvis', -0.3)]","[('bb', 3), ('clf', 2), ('wkhs', 1)]",11
2021-06-11,"[('wish', 26), ('clf', 13), ('amc', 13)]",58,"[('adds', 113.0), ('rkt', 110.0), ('amc', 109.0)]","[('gme', -24.0), ('gnus', -5.0), ('def', -2.0)]","[('clov', 44), ('clne', 30), ('vs', 22)]","[('gnus', -4), ('viac', -2), ('def', -1)]","[('wish', 13.587000000000002), ('clov', 5.8439000000000005), ('clf', 5.3177)]","[('gnus', -0.6249), ('def', -0.5267), ('ccs', -0.4767)]","[('wish', 3.308289968526577), ('amc', 1.8746572871572873), ('clov', 1.4818364661654133)]","[('def', -0.9), ('ccs', -0.4), ('pt', -0.3)]","[('wish', 10), ('amc', 5), ('clov', 4)]",31
2021-06-12,"[('sens', 7), ('wish', 5), ('clf', 5)]",36,"[('clne', 125.0), ('wish', 42.0), ('bpmc', 33.0)]","[('sens', -6.0), ('amc', -2.0), ('tac', -2.0)]","[('clne', 35), ('wish', 9), ('aso', 8)]","[('tac', -7), ('sens', -7), ('wb', -2)]","[('clf', 3.156), ('clne', 2.9897), ('wish', 2.1513)]","[('nio', -0.2441), ('bb', -0.10460000000000008), ('tac', -0.0919)]","[('clf', 1.3934322213733978), ('uwmc', 0.7205882352941176), ('info', 0.7)]","[('nio', -0.052243589743589745)]","[('amc', 4), ('msft', 2), ('sens', 2)]",17
2021-06-14,"[('wish', 7), ('dd', 5), ('crsr', 3)]",11,"[('dd', 223.0), ('bgs', 72.0), ('crsr', 51.0)]",[],"[('dd', 60), ('crsr', 13), ('bgs', 9)]","[('amp', -6), ('clov', -3), ('wish', -1)]","[('dd', 4.3551), ('wish', 4.2911), ('crsr', 1.9602)]",[],"[('wish', 0.8297373459873459), ('dd', 0.7108093254951839), ('bb', 0.5083333333333333)]",[],"[('dd', 2), ('wish', 2), ('clne', 1)]",9
2021-06-15,"[('wish', 28), ('amc', 11), ('dkng', 9)]",47,"[('dd', 202.0), ('nio', 171.0), ('crsr', 157.0)]","[('dkng', -18.0), ('bbby', -16.0), ('ive', -2.0)]","[('dd', 83), ('crsr', 52), ('clne', 32)]","[('dkng', -17), ('uwmc', -4), ('fomo', -3)]","[('wish', 13.4779), ('dd', 5.7209), ('clf', 3.4037)]","[('bbby', -0.7343), ('sndl', -0.4588), ('idex', -0.2263)]","[('bb', 1.0541666666666667), ('amc', 1.0141414141414142), ('wish', 0.9842255319636273)]","[('dkng', -0.9019425212207979), ('ive', -0.45), ('fomo', -0.19040998217468807)]","[('wish', 8), ('amc', 5), ('dd', 3)]",26
2021-06-16,"[('wish', 20), ('dd', 13), ('amc', 9)]",40,"[('dd', 205.0), ('cano', 114.0), ('wish', 78.0)]","[('cars', -11.0), ('crsr', -3.0), ('amc', -3.0)]","[('dd', 83), ('cano', 29), ('lesl', 25)]","[('aso', -4), ('amp', -2), ('cars', -2)]","[('wish', 11.2803), ('dd', 8.2871), ('itub', 4.305400000000001)]","[('cars', -0.9166), ('crsr', -0.3182)]","[('dd', 1.2511302264652744), ('amc', 0.7963007147890868), ('wkhs', 0.5794612794612795)]","[('eod', -1.0), ('tqqq', -1.0), ('bb', -0.25202020202020214)]","[('wish', 6), ('dd', 4), ('rkt', 3)]",21
2021-06-17,"[('wish', 16), ('amc', 12), ('pltr', 8)]",63,"[('dd', 274.0), ('prpl', 127.0), ('uwmc', 99.0)]","[('sens', -59.0), ('qqq', -4.0), ('xlf', -4.0)]","[('dd', 42), ('jmia', 25), ('clne', 25)]","[('nvda', -9), ('sens', -6), ('hr', -5)]","[('wish', 10.3804), ('dd', 4.6784), ('amc', 3.814)]","[('qqq', -0.9451), ('xlf', -0.9451), ('kre', -0.9451)]","[('amc', 1.1080314422419686), ('bb', 0.7578932178932178), ('amd', 0.7237519936204146)]","[('jim', -0.2916666666666667), ('ath', -0.1), ('fsr', -0.09999999999999992)]","[('wish', 9), ('amc', 5), ('amd', 3)]",28


### check topics of specific stock symbols

In [370]:
Counter(flatten(df_sa[df_sa.title_symbols.apply(lambda c: "spce" in c)].topics.tolist())).most_common(5)

[(33, 11), (21, 9), (15, 6), (0, 5), (11, 5)]

### more then 3 threads a day (TODO)

In [349]:
df_results2.most_common_symbols_relevant.apply(lambda c: [stock for stock, count in c if count >= 3] if type(c) is list else list())

created_at
2021-05-16                    []
2021-06-08                    []
2021-06-09                    []
2021-06-10                  [bb]
2021-06-11     [wish, amc, clov]
2021-06-12                 [amc]
2021-06-14                    []
2021-06-15       [wish, amc, dd]
2021-06-16       [wish, dd, rkt]
2021-06-17      [wish, amc, amd]
2021-06-18     [wish, pltr, amc]
2021-06-19                    []
2021-06-20                 [amc]
2021-06-21                [wish]
2021-06-22    [wish, clov, clne]
2021-06-23          [wish, pltr]
2021-06-24     [wish, tsla, amc]
2021-06-25            [bb, wish]
2021-06-26                    []
2021-06-27                    []
2021-06-28          [sofi, wish]
2021-06-29                    []
2021-06-30           [wish, amd]
2021-07-01                 [gme]
2021-07-02    [sofi, spce, wish]
2021-07-03                    []
2021-07-04                    []
2021-07-05                    []
2021-07-06                    []
2021-07-07                [negg]

# Read Stock Data

In [387]:
import yfinance as yf

In [388]:
start_date, end_date = df_results2.index[[1,-1]]

In [389]:
stocks_per_day = df_results2.most_common_symbols.apply(lambda stocks: stocks[0][0]).str.upper().tolist()
unique_stocks = list(set(stocks_per_day))

In [390]:
stock_data = yf.download(unique_stocks, start=start_date, end=end_date)

[*********************100%***********************]  16 of 16 completed


## Candlestick Charts

In [391]:
import plotly.graph_objects as go

In [392]:
def candlestick(symbols):
    if isinstance(symbols, str):
        symbols = [symbols]
    
    fig = go.Figure(data=[go.Candlestick(
        x=stock_data.index,
        open=stock_data.Open[symbol],
        high=stock_data.High[symbol],
        low=stock_data.Low[symbol],
        close=stock_data.Close[symbol],
        name=symbol
    ) for symbol in symbols])
    fig.update_layout(
        title=", ".join(symbols),
        xaxis_title="Date",
        yaxis_title="Price",
        legend_title="Stocks",
    )
    fig.show()

In [393]:
candlestick(unique_stocks[0:2])

## Line Charts

In [382]:
import plotly.express as px

In [383]:
def line(symbols):
    fig = px.line(stock_data.Open[symbols])
    fig.show()

In [384]:
line(unique_stocks)

## OHLC Charts

In [385]:
def ohlc(symbols):
    if isinstance(symbols, str):
        symbols = [symbols]
        
    fig = go.Figure(data=[go.Ohlc(
        x=stock_data.index,
        open=stock_data.Open[symbol],
        high=stock_data.High[symbol],
        low=stock_data.Low[symbol],
        close=stock_data.Close[symbol],
        name=symbol
    ) for symbol in symbols])
    fig.update_layout(
        title=", ".join(symbols),
        xaxis_title="Date",
        yaxis_title="Price",
        legend_title="Stocks",
    )
    fig.show()

In [386]:
ohlc(unique_stocks[0:2])

In [None]:
#s.actions

In [None]:
#s.options

In [None]:
#s.calendar

In [None]:
#s.recommendations

In [None]:
#s.sustainability

In [None]:
#s.info

# Word embedding

## Skip-Gram

# Ideas for further progression

* nlp on emojis