In [1]:
import numpy as np
from constants import SHARED_RANDOM_STATE
from db_helper_functions import get_stock_news_from_db
from text_cleaning_functions import clean_text
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from finbert_embedding.embedding import FinbertEmbedding
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import torch
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm





In [2]:
df = get_stock_news_from_db("AAPL")
df = df[~df.article.isnull()]

In [3]:
#df["article"] = df["article"].apply(lambda x: clean_text(x))
df["article"] = df["article"].apply(
        lambda x: x.replace("\xa0", " ").replace("\n", "").replace("Loading...", "")
    )

In [4]:
tf_vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 3))

In [5]:
# bert_topic_model = BERTopic(
#     vectorizer_model=tf_vectorizer,
#     language="english",
#     calculate_probabilities=True,
#     nr_topics=20,  
# )
# topics, probs = bert_topic_model.fit_transform(list(df["article"]))
# freq = bert_topic_model.get_topic_info()
# freq.head(20)

# Calculate the semantic score of each topic and get the score of the document by using topic probability distribution

 - Assign each topic a sentiment score based on the representative documents.
 - Calcualte the score of each article by using the topics distribution.

In [6]:
# for i in range(5):
#     topic = topics[i]
#     prob = probs[i]
#     print(f"The topic of the document is {topic}; The prob distribution is: {prob}")

#### One problem we can see with the above result is that some documents have a probability near 1 for certain topic and the rest of the probability for the rest of the topics are near 0.  In practice, documents may contain a mixture of topics. This will cause inaccurate results.
#### Using the approximate_distribution() can solve the problem
#### Reference: https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html#example

In [7]:
docs = list(df["article"])

model = BERTopic(
    vectorizer_model=tf_vectorizer,
    language="english",
    calculate_probabilities=True,
    nr_topics=20,
)

topic_model = model.fit(docs)
topic_distr, _ = topic_model.approximate_distribution(docs)

In [9]:
topic_distr[1]

array([0.18190981, 0.03373106, 0.0059593 , 0.09417619, 0.06263898,
       0.16513067, 0.02391345, 0.16485909, 0.02450132, 0.11628619,
       0.00949275, 0.        , 0.04088374, 0.02648213, 0.        ,
       0.0262721 , 0.        , 0.02376322, 0.        ])

In [10]:
topic_model.visualize_distribution(topic_distr[1])

In [11]:
topic_info_df = topic_model.get_topic_info()
topic_info_df.head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1495,-1_apple_market_said_company,"[apple, market, said, company, aapl, earnings,...",[If there’s a single day that defines earnings...
1,0,943,0_apple_iphone_said_aapl,"[apple, iphone, said, aapl, apples, analyst, a...",[Apple Inc. AAPL reported solid results for th...
2,1,842,1_high_shares_52week_52week high,"[high, shares, 52week, 52week high, new 52week...",[Thursday's morning session saw 122 companies ...
3,2,502,2_index_market_fed_stocks,"[index, market, fed, stocks, week, investors, ...","[We begin the week near record highs, and majo..."
4,3,394,3_apple_billion_said_qualcomm,"[apple, billion, said, qualcomm, company, prod...",[Semiconductor company Qualcomm Inc QCOM repor...
5,4,266,4_streaming_netflix_disney_apple,"[streaming, netflix, disney, apple, million, t...","[In the parlance of old-time show business, Ne..."
6,5,227,5_tesla_ev_electric_apple,"[tesla, ev, electric, apple, car, company, veh...",[The electric vehicle market is expected to to...
7,6,143,6_facebook_meta_fb_apple,"[facebook, meta, fb, apple, users, said, app, ...","[Facebook, Inc. FB shares were retreating Tues..."
8,7,108,7_twitter_social_musk_media,"[twitter, social, musk, media, said, trump, tr...",[A new social media app from former President ...
9,8,79,8_nft_cryptocurrency_coinbase_crypto,"[nft, cryptocurrency, coinbase, crypto, bitcoi...",[Crypto exchange Coinbase Global Inc COIN anno...


In [12]:
# finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# representative_docs = topic_info_df.loc[1]["Representative_Docs"]
# representative_docs_sentences = [sent_tokenize(x) for x in representative_docs]
# representative_docs_score= []

# for sentence_arr in representative_docs_sentences:
#     embedding = finbert_tokenizer(
#             sentence_arr, padding=True, return_tensors="pt", truncation=True
#         )
#     outputs = finbert_model(**embedding)
#     predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

#     score = np.round(np.mean(predictions.tolist(), axis=0), 4)
#     representative_docs_score.append(list(score))
    
# representative_docs_score = np.array(representative_docs_score)
# mean_value = np.mean(representative_docs_score, axis=0)


## Step1: Get the semantic score for each topic

In [13]:
# Initialize Finbert topkenizer and model
finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Get number of topics 
topic_num = len(topic_info_df) 

# Start with 1 because topic -1 are outliers
for i in range(1,topic_num):

    # Get representative documents
    representative_docs = topic_info_df.loc[i]["Representative_Docs"]
    
    # Tokenize the articles into sentences
    representative_docs_sentences = [sent_tokenize(x) for x in representative_docs]
    
    # Initiate empty array for the sentiment score of each representitive document
    representative_docs_score= []
    
    # Traverse each document and calculate the score 
    for sentence_arr in representative_docs_sentences:
        embedding = finbert_tokenizer(
                sentence_arr, padding=True, return_tensors="pt", truncation=True
            )
        outputs = finbert_model(**embedding)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
        score = np.round(np.mean(predictions.tolist(), axis=0), 4)
        representative_docs_score.append(list(score))
    
    representative_docs_score = np.array(representative_docs_score)
    mean_value_list = np.mean(representative_docs_score, axis=0)

    # Add the semantic score to the topic dataframe
    topic_info_df.loc[i,"positive"] = mean_value_list[0]
    topic_info_df.loc[i,"negative"] = mean_value_list[1]
    topic_info_df.loc[i,"neutral"] = mean_value_list[2]

In [14]:
topic_info_df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,positive,negative,neutral
0,-1,1495,-1_apple_market_said_company,"[apple, market, said, company, aapl, earnings,...",[If there’s a single day that defines earnings...,,,
1,0,943,0_apple_iphone_said_aapl,"[apple, iphone, said, aapl, apples, analyst, a...",[Apple Inc. AAPL reported solid results for th...,0.8163,0.0948,0.0889
2,1,842,1_high_shares_52week_52week high,"[high, shares, 52week, 52week high, new 52week...",[Thursday's morning session saw 122 companies ...,0.764933,0.1681,0.066933
3,2,502,2_index_market_fed_stocks,"[index, market, fed, stocks, week, investors, ...","[We begin the week near record highs, and majo...",0.314367,0.363167,0.3224
4,3,394,3_apple_billion_said_qualcomm,"[apple, billion, said, qualcomm, company, prod...",[Semiconductor company Qualcomm Inc QCOM repor...,0.660433,0.141067,0.1985
5,4,266,4_streaming_netflix_disney_apple,"[streaming, netflix, disney, apple, million, t...","[In the parlance of old-time show business, Ne...",0.281167,0.197967,0.5208
6,5,227,5_tesla_ev_electric_apple,"[tesla, ev, electric, apple, car, company, veh...",[The electric vehicle market is expected to to...,0.2091,0.220767,0.570133
7,6,143,6_facebook_meta_fb_apple,"[facebook, meta, fb, apple, users, said, app, ...","[Facebook, Inc. FB shares were retreating Tues...",0.314233,0.426533,0.259233
8,7,108,7_twitter_social_musk_media,"[twitter, social, musk, media, said, trump, tr...",[A new social media app from former President ...,0.148633,0.381967,0.4694
9,8,79,8_nft_cryptocurrency_coinbase_crypto,"[nft, cryptocurrency, coinbase, crypto, bitcoi...",[Crypto exchange Coinbase Global Inc COIN anno...,0.144167,0.035133,0.8207


In [33]:
topic_info_df.loc[1]["Representative_Docs"]

['Apple Inc. AAPL reported solid results for the fiscal third quarter, sending its shares to record territory. The stock was jumping 7% to $411.47 at the time of publication.The Apple AnalystsCredit Suisse analyst Matthew Cabral maintained a Neutral rating on Apple and increased the price target from $340 to $380.UBS analyst Timothy Arcuri maintained a Buy rating and lifted the price target from $400 to $425.Morgan Stanley analyst Katy Huberty named Apple as the firm\'s top pick. The analyst maintained an Overweight rating and hiked the price target from $419 to $431.Needham analyst Laura Martin maintained a Buy rating and $450 price target,Raymond James analyst Chris Caso reiterated an Outperform rating and hiked the price target from $400 to $440.Wedbush analyst Daniel Ives maintained an Outperform rating and raised the price target from $450 to a Street-high $475.‘We Underestimated Apple\'s Resiliency\': Apple delivered very impressive June quarter results, with the upside, broad-ba

## Calculate the semantic score for each articles

In [18]:
def calculate_article_score(top_distribution, score_array):
    product = [a * b for a, b in zip(top_distribution, score_array)]
    sum_of_products = sum(product)
    
    return sum_of_products

In [40]:
# Get the topic scores
topics_positive_score = topic_info_df["positive"][1:].tolist()
topics_negative_score = topic_info_df["negative"][1:].tolist()
topics_neutral_score = topic_info_df["neutral"][1:].tolist()

article_score_array = []

for article_topic_distribution in topic_distr:
    article_positive = calculate_article_score(article_topic_distribution, topics_positive_score)
    article_negative = calculate_article_score(article_topic_distribution, topics_negative_score)
    article_neutral = calculate_article_score(article_topic_distribution, topics_neutral_score)

    article_score_array.append([article_positive, article_negative,  article_neutral])

df[["positive", "negative", "neutral"]] = article_score_array
sort_df = df.sort_values(by="positive", ascending=False)
sort_df.head()

Unnamed: 0,fk_stock_news_id,ticker,title,url,article,date,positive,negative,neutral
4358,3086,AAPL,Homeland Security Warns Of Critical Flaw In Wi...,https://www.benzinga.com/news/21/12/24562511/h...,The Department of Homeland Security's (DHS) to...,2021-12-12,0.8163,0.0948,0.0889
2981,400,AAPL,Google Wants To Be Your Bank Account,https://www.benzinga.com/node/14796825,Google wants to expand its reach to consumers ...,2019-11-13,0.8163,0.0948,0.0889
3906,2417,AAPL,Why Are Baidu's Shares Trading Higher Today?,https://www.benzinga.com/news/21/08/22630905/w...,Baidu Inc's BIDU Xiaodu Technology closed Seri...,2021-08-24,0.806015,0.109477,0.084502
5371,5366,AAPL,Microsoft Nears Workers' Union Debut In US,https://www.benzinga.com/news/23/01/30269267/m...,A group of Microsoft Corp MSFT employees voted...,2023-01-04,0.799179,0.119232,0.081578
2154,4137,AAPL,Microsoft Azure Data Centers are Operating Wit...,https://www.benzinga.com/news/22/07/27935265/m...,Global Microsoft Corp MSFT Azure data centers ...,2022-07-01,0.798841,0.119713,0.081434


In [39]:
print(sort_df.iloc[4]["article"])

Investors who placed their hard-earned cash into major U.S. indices have enjoyed respectable returns over the past five years. Despite a number of market corrections in recent years, the recent market downturn partially generated by the Russia-Ukraine war and the previous stock market crash of 2020, the SPDR S&P 500 ETF SPY, Invesco QQQ Trust Series 1 QQQ and SPDR Dow Jones Industrial Average ETF Trust DIA have returned 58.31%, 104.65% and 45.65%.As good as investors in the major U.S. indices have had it over the past five years, a number of the world’s most popular consumer discretionary, EV and tech stocks have provided even better returns. Bulls that took a chance on these names were rewarded with gains that outperformed much of the broader market.Winners Since July 2017: Here’s how much $100 in each of the following stocks bought back in summer 2017 would be worth today:Bitcoin BTC/USD: $655.70Ethereum ETH/USD: $744.73Apple Inc AAPL: $405.56Microsoft Corporation MSFT: $344.88Tesla 

Investors who placed their hard-earned cash into major U.S. indices have enjoyed respectable returns over the past five years. Despite a number of market corrections in recent years, the recent market downturn partially generated by the Russia-Ukraine war and the previous stock market crash of 2020, the SPDR S&P 500 ETF SPY, Invesco QQQ Trust Series 1 QQQ and SPDR Dow Jones Industrial Average ETF Trust DIA have returned 58.31%, 104.65% and 45.65%.As good as investors in the major U.S. indices have had it over the past five years, a number of the world’s most popular consumer discretionary, EV and tech stocks have provided even better returns. Bulls that took a chance on these names were rewarded with gains that outperformed much of the broader market.Winners Since July 2017: Here’s how much $100 in each of the following stocks bought back in summer 2017 would be worth today:Bitcoin BTC/USD: $655.70Ethereum ETH/USD: $744.73Apple Inc AAPL: $405.56Microsoft Corporation MSFT: $344.88Tesla Inc TSLA: $1,158.90Amazon.com, Inc. AMZN: $225.12NVIDIA Corporation NVDA: $402.26


In [35]:
## Save the DataFrame to csv file
df = df.rename(columns={"id": "fk_stock_news_id"})
df.to_csv('bertopic_sentiment_score.csv', index=False)

In [None]:
#bert_topic_model.visualize_barchart()

In [8]:
#bert_topic_model.visualize_heatmap()

In [9]:
#bert_topic_model.visualize_topics()

# Use Finbert Embedding

In [82]:
#finbert_embedding = FinbertEmbedding()

In [83]:
# bert_topic_model = BERTopic(
#     embedding_model=finbert_embedding,
#     language="english",
#     calculate_probabilities=True,
#     nr_topics=10,
# )
# topics, probs = bert_topic_model.fit_transform(list(df["article"]))
# freq = bert_topic_model.get_topic_info()
# freq.head(10)

In [None]:
#bert_topic_model.visualize_barchart()

In [None]:
#bert_topic_model.visualize_heatmap()

In [None]:
#bert_topic_model.visualize_topics()