In [1]:
import numpy as np
from constants import SHARED_RANDOM_STATE
from db_helper_functions import get_stock_news_from_db
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from text_cleaning_functions import clean_text

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/raulmartinez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raulmartinez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raulmartinez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the news rows for "AAPL" through the project helper
# (presumably a pandas DataFrame with an `article` column — confirm in db_helper_functions).
df = get_stock_news_from_db("AAPL")
# Keep only the rows that have article text. The explicit .copy() detaches the
# filtered frame from the one it was sliced from, so the later column
# assignment on `df` cannot raise pandas' SettingWithCopyWarning.
df = df[df["article"].notna()].copy()

In [3]:
# Run every article through the project's clean_text helper. The function is
# passed directly: wrapping it in `lambda x: clean_text(x)` added nothing.
df["article"] = df["article"].apply(clean_text)

In [4]:
# Build raw term counts over unigrams, bigrams and trigrams.
tf = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",  # scikit-learn's built-in English stop-word list
    min_df=10,  # ignore terms that occur in fewer than 10 documents
    max_df=0.90,  # ignore terms that occur in more than 90% of documents
)
tf_mat = tf.fit_transform(df["article"])  # document-term count matrix
tf_vocab = tf.get_feature_names_out()  # term belonging to each column of tf_mat
tf_vocab

array(['10', '10 15', '10 20', ..., 'zymeworks', 'zymeworks zyme',
       'zynga'], dtype=object)

In [5]:
# Tunable settings used by the LDA cells below.
num_topics: int = 2  # number of LDA components to fit
num_top_words: int = 20  # how many top-weighted terms to print per topic

In [6]:
# Fit an LDA model with `num_topics` components on the term-count matrix and
# show the per-document topic weights that fit_transform returns.
# NOTE(review): max_iter=5 is below scikit-learn's default of 10 — confirm the
# fit has converged enough for the topics to be meaningful.
lda_kwargs = dict(
    n_components=num_topics,
    max_iter=5,
    random_state=SHARED_RANDOM_STATE,  # fixed seed so the fit is reproducible
)
lda = LatentDirichletAllocation(**lda_kwargs)

doc_topic_weights = lda.fit_transform(tf_mat)
doc_topic_weights

array([[9.99356477e-01, 6.43522507e-04],
       [9.99455505e-01, 5.44495409e-04],
       [9.99317820e-01, 6.82179931e-04],
       ...,
       [9.93224494e-01, 6.77550650e-03],
       [9.96528823e-01, 3.47117683e-03],
       [9.96834065e-01, 3.16593542e-03]])

In [7]:
# For every fitted topic, print the `num_top_words` vocabulary terms that carry
# the largest component weight, in descending order of weight.
for topic_number, term_weights in enumerate(lda.components_, start=1):
    ranked_terms = np.argsort(term_weights)[::-1]  # term indices, largest weight first
    best_terms = tf_vocab[ranked_terms[:num_top_words]]
    print(f"TOPIC #{topic_number}")
    print(f"TOP WORDS: {best_terms}")

TOPIC #1
TOP WORDS: ['company' 'market' 'said' 'stock' 'share' 'year' 'new' 'price' 'week'
 'earnings' 'analyst' 'investor' 'index' 'billion' 'report' 'million'
 'reported' 'time' 'iphone' 'higher']
TOPIC #2
TOP WORDS: ['share' 'high' 'new' 'stock' 'new high' 'hit' 'morning' 'traded'
 'session' 'yearly' 'yearly high' 'change' 'friday' 'quarterly' 'earnings'
 'revenue' 'share revenue' 'report' 'set' 'monday']
