In [1]:
import numpy as np
from constants import SHARED_RANDOM_STATE
from db_helper_functions import get_stock_news_from_db
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from text_cleaning_functions import clean_text
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Fetch the news rows for the "AAPL" ticker through the project's DB helper
# (presumably one row per stored article — confirm against the helper).
df = get_stock_news_from_db("AAPL")
# Keep only the rows whose article text is present. The explicit .copy() makes
# the filtered frame an independent object, so a later column assignment such
# as df["article"] = ... does not trigger pandas' SettingWithCopyWarning.
df = df[df["article"].notna()].copy()

In [3]:
# Run every article through the project's clean_text helper. The function is
# passed directly (the wrapper lambda added nothing), and .assign() builds a new
# frame with the replaced column instead of writing into the filtered frame,
# which avoids pandas' SettingWithCopyWarning.
df = df.assign(article=df["article"].apply(clean_text))

In [4]:
# Term counts over unigrams through trigrams. English stop words are removed,
# and terms that appear in more than 90% of the articles or in fewer than 10
# articles are left out of the vocabulary.
tf = CountVectorizer(
    ngram_range=(1, 3),
    min_df=10,
    max_df=0.90,
    stop_words="english",
)
tf_mat = tf.fit_transform(df["article"])

# Vocabulary terms indexed like the columns of tf_mat; echoed as the cell output.
tf_vocab = tf.get_feature_names_out()
tf_vocab

array(['00', '00 flat', '001', ..., 'zymeworks', 'zymeworks zyme',
       'zynga'], dtype=object)

In [5]:
# num_topics: number of components the LDA model is fitted with.
# num_top_words: how many highest-weighted terms are printed for each topic.
num_topics, num_top_words = 2, 20

In [6]:
# Fit an LDA topic model with num_topics components on the term-count matrix.
# The seed is the project's SHARED_RANDOM_STATE constant, so reruns use the
# same random state.
# NOTE(review): max_iter=5 is below scikit-learn's default of 10 — confirm the
# model is converged enough for the resulting topics to be meaningful.
lda = LatentDirichletAllocation(
    n_components=num_topics,
    max_iter=5,
    random_state=SHARED_RANDOM_STATE,
)

# Keep the document-topic matrix (one row per article, one column per topic)
# in a variable instead of only echoing it, so it can be reused later without
# refitting the model. The cell still displays the same array.
doc_topic_dist = lda.fit_transform(tf_mat)
doc_topic_dist

array([[9.91665511e-01, 8.33448893e-03],
       [9.99446521e-01, 5.53479222e-04],
       [9.99028558e-01, 9.71442242e-04],
       ...,
       [9.91911128e-01, 8.08887191e-03],
       [9.96641695e-01, 3.35830464e-03],
       [9.97050249e-01, 2.94975096e-03]])

In [7]:
# Print the highest-weighted vocabulary terms of every fitted topic.
for topic_idx, word_weights in enumerate(lda.components_):
    # argsort is ascending, so reverse it to rank terms from heaviest to lightest.
    ranked_terms = np.argsort(word_weights)[::-1]
    top_words = ranked_terms[:num_top_words]
    print(f"TOPIC #{topic_idx + 1}", f"TOP WORDS: {tf_vocab[top_words]}", sep="\n")

TOPIC #1
TOP WORDS: ['company' 'share' 'market' 'said' 'stock' 'earnings' 'year' 'report'
 'revenue' 'billion' 'new' 'million' 'week' 'price' 'analyst' 'investor'
 'reported' 'iphone' 'time' 'trading']
TOPIC #2
TOP WORDS: ['high' 'share' 'new' '52week' '52week high' 'stock' 'new 52week'
 'new 52week high' 'hit' 'traded' 'morning' 'session' 'yearly'
 'yearly high' 'change' 'friday' 'set' 'set new' 'monday' 'hit new']
