In [1]:
import numpy as np
from constants import SHARED_RANDOM_STATE
from db_helper_functions import get_stock_news_from_db
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the stored news rows for AAPL and keep only the rows that actually
# carry article text (presumably a pandas DataFrame with an `article`
# column -- confirm against db_helper_functions).
df = get_stock_news_from_db("AAPL").loc[
    lambda news: news.article.notnull()
]

In [3]:
# Bag-of-words term counts over the article text. English stop words are
# removed; terms appearing in more than 95% of the documents, or in fewer
# than 10 documents, are dropped from the vocabulary.
tf = CountVectorizer(stop_words="english", max_df=0.95, min_df=10)

# fit_transform learns the vocabulary and builds the document-term matrix in
# one pass; calling fit() and then transform() tokenizes the corpus twice for
# the same result. `tf` is fitted in place, so it stays usable afterwards.
tf_mat = tf.fit_transform(df.article)

# Vocabulary terms, indexed by their column position in tf_mat.
tf_vocab = tf.get_feature_names_out()
tf_vocab

array(['00', '000', '01', ..., 'zyme', 'zymeworks', 'zynga'], dtype=object)

In [19]:
# Topic-model settings: the number of LDA topics to fit, and how many of the
# highest-weighted vocabulary terms to list for each topic.
num_topics, num_top_words = 2, 20

In [5]:
# LDA topic model over the term-count matrix. The seed comes from the shared
# project constant so repeated runs give the same topics.
lda = LatentDirichletAllocation(
    random_state=SHARED_RANDOM_STATE,
    max_iter=5,
    n_components=num_topics,
)

# Fit the model and display the per-document topic distribution
# (one row per document, one column per topic).
lda.fit_transform(tf_mat)

array([[0.10299392, 0.89700608],
       [0.03235789, 0.96764211],
       [0.01714897, 0.98285103],
       ...,
       [0.24459318, 0.75540682],
       [0.00433708, 0.99566292],
       [0.00509012, 0.99490988]])

In [21]:
# Print the highest-weighted vocabulary terms of every fitted topic.
for topic_number, word_weights in enumerate(lda.components_, start=1):
    # argsort is ascending, so flip it to put the largest weights first.
    ranked_word_ids = np.flip(np.argsort(word_weights))
    print(f"TOPIC #{topic_number}")
    print(f"TOP WORDS: {tf_vocab[ranked_word_ids[:num_top_words]]}")

TOPIC #1
TOP WORDS: ['shares' 'high' 'new' 'week' 'stock' '52' 'traded' 'hit' 'company'
 'morning' 'session' 'share' 'earnings' 'revenue' 'friday' 'billion'
 'report' 'quarterly' 'change' 'monday']
TOPIC #2
TOP WORDS: ['said' 'year' 'company' 'market' 'new' 'earnings' 'investors' 'companies'
 'time' 'stocks' 'iphone' 'week' 'price' 'like' 'data' 'tech' 'china'
 'billion' 'according' 'higher']
