In [1]:
import numpy as np
from constants import SHARED_RANDOM_STATE
from db_helper_functions import get_stock_news_from_db
from text_cleaning_functions import clean_text
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from finbert_embedding.embedding import FinbertEmbedding
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import torch
from nltk.tokenize import sent_tokenize
import pandas as pd
import time

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Load data from db and keep only the rows that have article text
df = get_stock_news_from_db("AAPL")
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments performed on `df` in later cells operate on a real
# frame instead of a slice (avoids pandas' SettingWithCopyWarning).
df = df[df["article"].notna()].copy()

In [3]:
# Initial cleaning: literal substitutions applied to every article, in order.
#   "\xa0"       -> " "   (non-breaking space becomes a plain space)
#   "\n"         -> ""    (newlines are removed)
#   "Loading..." -> ""    (placeholder text is removed)
# NOTE(review): newlines are deleted rather than replaced by a space, so the
# words on either side of a line break end up joined - confirm this is intended.
def _strip_page_artifacts(text):
    for old, new in (("\xa0", " "), ("\n", ""), ("Loading...", "")):
        text = text.replace(old, new)
    return text


df["article"] = df["article"].map(_strip_page_artifacts)

In [4]:
# Initialize vectorizer: counts 1- to 3-word n-grams, using the built-in
# English stop-word list.
tf_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
)

In [5]:
# Helper functions

def calculate_article_score(top_distribution, score_array):
    """Return the sum of the pairwise products of two sequences.

    Args:
        top_distribution: Iterable of per-topic weights for one article.
        score_array: Iterable of per-topic scores, paired with the weights
            by position.

    Returns:
        The sum of ``weight * score`` over the paired elements. Pairing is
        done with ``zip``, so elements beyond the shorter input are ignored,
        and two empty inputs give ``0``.
    """
    weighted_terms = (
        weight * topic_score
        for weight, topic_score in zip(top_distribution, score_array)
    )
    return sum(weighted_terms)

In [6]:
# Function to calculate the mean sentiment score

def _finbert_document_score(sentences, tokenizer, model, max_tokens=512):
    """Score one document: mean of the per-sentence class probabilities, rounded to 4 places."""
    encoded = tokenizer(
        sentences, padding=True, return_tensors="pt", max_length=max_tokens, truncation=True
    )
    # Inference only: disable autograd so no computation graph is built or kept
    # alive for every batch of sentences.
    with torch.no_grad():
        outputs = model(**encoded)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    return list(np.round(np.mean(predictions.tolist(), axis=0), 4))


def get_mean_sentiment_score(df, vectorizer, num_topic, max_sentences=100, max_tokens=512):
    """Fit a topic model on the articles and return their mean sentiment scores.

    Steps:
      1. Fit BERTopic on ``df["article"]`` with ``nr_topics=num_topic`` and
         compute each article's approximate topic distribution.
      2. Score every non-outlier topic by running FinBERT over the sentences
         of its representative documents and averaging the class probabilities.
      3. Score every article as the topic-distribution-weighted sum of the
         topic scores.

    Args:
        df: DataFrame with an ``article`` column of text. It is modified in
            place: ``positive``, ``negative`` and ``neutral`` columns are
            written (overwritten on repeated calls).
        vectorizer: Vectorizer passed to BERTopic as ``vectorizer_model``.
        num_topic: Value passed to BERTopic as ``nr_topics``.
        max_sentences: Only the first ``max_sentences`` sentences of each
            representative document are scored.
        max_tokens: Token limit per sentence passed to the tokenizer
            (longer sentences are truncated).

    Returns:
        Tuple ``(positive_mean, negative_mean, neutral_mean)`` of the column
        means over all articles.

    Raises:
        ValueError: If the number of scored topics does not match the width
            of the topic distribution, since the weighted sum would then pair
            weights with the wrong topics.
    """
    docs = list(df["article"])

    # NOTE(review): no seed is passed to BERTopic here although
    # SHARED_RANDOM_STATE is imported at the top of the notebook; results may
    # differ between runs - confirm whether a seeded model should be supplied.
    model = BERTopic(
        vectorizer_model=vectorizer,
        language="english",
        calculate_probabilities=True,
        nr_topics=num_topic,
    )

    topic_model = model.fit(docs)
    topic_distr, _ = topic_model.approximate_distribution(docs)

    # Load FinBERT, used to get the sentiment score for each topic
    finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    # Get topic info from the topic model
    topic_info_df = topic_model.get_topic_info()

    # Topic -1 holds the outliers. Select the real topics by label instead of
    # assuming the outlier row is always first: when no outlier topic exists,
    # skipping row 0 would drop topic 0 and shift every score by one topic.
    real_topics_df = topic_info_df[topic_info_df["Topic"] != -1].sort_values("Topic")

    # One row of class scores per real topic, in topic order
    topic_scores = []

    for i in real_topics_df.index:

        print(f"processing topic{i}")
        # Get representative documents
        representative_docs = topic_info_df.loc[i, "Representative_Docs"]

        # Tokenize each article into sentences (capped at max_sentences) and
        # compute the sentiment score of each representative document
        representative_docs_score = [
            _finbert_document_score(
                sent_tokenize(doc)[:max_sentences],
                finbert_tokenizer,
                finbert_model,
                max_tokens,
            )
            for doc in representative_docs
        ]

        # The topic score is the mean over its representative documents
        topic_scores.append(np.mean(np.array(representative_docs_score), axis=0))

    topic_scores = np.array(topic_scores, dtype=float).reshape(-1, 3)

    distribution_width = np.shape(topic_distr)[1]
    if distribution_width != len(topic_scores):
        raise ValueError(
            f"Topic distribution has {distribution_width} columns but "
            f"{len(topic_scores)} topics were scored"
        )

    # Columns are mapped in the same order as before: index 0 -> positive,
    # 1 -> negative, 2 -> neutral.
    # NOTE(review): assumes this matches the model's class order - confirm
    # against finbert_model.config.id2label.
    topics_positive_score = topic_scores[:, 0].tolist()
    topics_negative_score = topic_scores[:, 1].tolist()
    topics_neutral_score = topic_scores[:, 2].tolist()

    # Calculate the sentiment score for each article
    article_score_array = [
        [
            calculate_article_score(article_topic_distribution, topics_positive_score),
            calculate_article_score(article_topic_distribution, topics_negative_score),
            calculate_article_score(article_topic_distribution, topics_neutral_score),
        ]
        for article_topic_distribution in topic_distr
    ]

    # Written onto the caller's frame so the per-article scores stay available
    df[["positive", "negative", "neutral"]] = article_score_array

    positive_mean = df["positive"].mean()
    negative_mean = df["negative"].mean()
    neutral_mean = df["neutral"].mean()

    return (positive_mean, negative_mean, neutral_mean)



In [8]:
# Topic counts to sweep in the sensitivity analysis
topic_num_arr = [5, 10, 15, 20, 25, 30, 35, 40]
result_columns = [
    "number_of_topic",
    "mean_positive_score",
    "mean_negative_score",
    "mean_neutral_score",
]

# Collect one record per run and build the frame once at the end, instead of
# growing a DataFrame row by row through the private DataFrame._append.
result_rows = []

for topic_num in topic_num_arr:
    print(f"Start testing with {topic_num} topics")
    positive_mean, negative_mean, neutral_mean = get_mean_sentiment_score(df, tf_vectorizer, topic_num)

    result_rows.append(
        {
            "number_of_topic": topic_num,
            "mean_positive_score": positive_mean,
            "mean_negative_score": negative_mean,
            "mean_neutral_score": neutral_mean,
        }
    )

    # NOTE(review): fixed 60 s pause after every run, kept from the original;
    # its purpose is not evident from this notebook - confirm it is still needed.
    time.sleep(60)

result_df = pd.DataFrame(result_rows, columns=result_columns)

# Write result to csv
result_df.to_csv('sensitivity_analysis.csv', index=False)


Start testing with 5 topics
processing topic1
processing topic2
processing topic3
processing topic4
Start testing with 10 topics
processing topic1
processing topic2
processing topic3
processing topic4
processing topic5
processing topic6
processing topic7
processing topic8
processing topic9
Start testing with 15 topics
processing topic1
processing topic2
processing topic3
processing topic4
processing topic5
processing topic6
processing topic7
processing topic8
processing topic9
processing topic10
processing topic11
processing topic12
processing topic13
processing topic14
Start testing with 20 topics
processing topic1
processing topic2
processing topic3
processing topic4
processing topic5
processing topic6
processing topic7
processing topic8
processing topic9
processing topic10
processing topic11
processing topic12
processing topic13
processing topic14
processing topic15
processing topic16
processing topic17
processing topic18
processing topic19
Start testing with 25 topics
processing to