In [41]:
import pandas as pd
import pickle
import numpy as np

In [7]:
df = pd.read_csv('benzinga.csv')
body = df[['body']]
print(body[:1])
print(len(body))

                                                body
0  Gainers Heat Biologics, Inc. (NASDAQ:HTBX) ros...
14072


In [20]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
def tokenize_bodys(body):
    tokens = nltk.word_tokenize(body)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords.words('english'):
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                tokenizer=tokenize_bodys,
                                max_features=250,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 50,
                                ngram_range=(2,4))

df = pd.read_csv('benzinga.csv')
body = df['body']

dtm_tf = tf_vectorizer.fit_transform(body)
print(dtm_tf.shape)

lda_tf = LatentDirichletAllocation(n_components=10, 
                                   max_iter=100,
                                   learning_method='online', 
                                   random_state = 0)

lda_tf.fit(dtm_tf)




(14072, 250)


In [34]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()

topics = dict()
for topic_idx, topic in enumerate(lda_tf.components_):
    topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print(f"Topic {topic_idx+1}:")
    print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

Topic 1:
  nasdaq |   nasdaq  | close  |  close |  close  |  share |  fell |  rise |  gain | company report
Topic 2:
billion  |  company | company  | year  | quarter  | share  |  read  | million  |  walmart | estimate 
Topic 3:
 p | p 500 |  p 500 | week  | index  | market  |  investors | year  |  stock |   p
Topic 4:
 msft |  microsoft | msft  |  msft  |   nasdaq |   nasdaq  |  nasdaq  msft | nasdaq  msft | nasdaq  msft  |  amzn
Topic 5:
say  |  price | price action | action  | price action  |  say |  price action |  price action  |  relate | relate link
Topic 6:
 stock |  share | new 52week | low  | morning  | 52week low | new 52week low | 52week low  | set new | high 
Topic 7:
percent  | trade  | premarket trade | price target | target  | price target  | corporation  |   nyse |   nyse  | premarket trade 
Topic 8:
revenue  | share revenue | share revenue  | earn  | quarterly earn | report quarterly | quarterly earn  | report quarterly earn | report quarterly earn  | million 
Topic 9:

In [36]:
# Save the LDA model
with open('lda_model.pkl', 'wb') as f:
    pickle.dump(lda_tf, f)

# Save the vectorizer
with open('tf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tf_vectorizer, f)


In [62]:
def get_topic_keywords(lda_model, vectorizer, text, num_keywords):
    test_matrix = vectorizer.transform([text])
    test_lda_output = lda_model.transform(test_matrix)
    topic_names = ['Topic ' + str(i) for i in range(lda_model.n_components)]
    lda_output_df = pd.DataFrame(np.round(test_lda_output, 2), columns=topic_names)
    dominant_topic = np.argmax(lda_output_df.values, axis=1)[0]
    topics = lda_model.components_
    feature_names = vectorizer.get_feature_names_out()
    keywords = [feature_names[i].strip() for i in topics[dominant_topic].argsort()[:-num_keywords-1:-1]]
    keywords = list(dict.fromkeys(keywords))
    keywords_str = ", ".join(keywords)
    return dominant_topic, keywords_str


### Example usage

In [63]:
# Load the LDA model
with open('lda_model.pkl', 'rb') as f:
    lda_model_loaded = pickle.load(f)

# Load the vectorizer
with open('tf_vectorizer.pkl', 'rb') as f:
    tf_vectorizer_loaded = pickle.load(f)
    
# Test the function
dominant_topic, keywords = get_topic_keywords(lda_model_loaded, tf_vectorizer_loaded, str(body[0:1].values[0]), 10)
print(f"Category: {dominant_topic}")
print(f"Keywords: {keywords}")

Category: 6
Keywords: percent, trade, premarket trade, price target, target, corporation, nyse


