# Topic Modeling with BERTopic

In [None]:
import logging
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch
from bertopic import BERTopic
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from transformers import BertModel, BertTokenizer
from umap import UMAP

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import csv

import numpy as np
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

random.seed(42)

# Get Topic with Highest Probability

In this section, I assign each rationale to the topic with the highest probability, in order to minimize the number of -1 (outlier) classifications.

In [None]:
# Load the lemmatized text snippets and their pre-computed embeddings.
# NOTE(review): both paths are absolute and machine-specific.
# The csv module expects files opened with newline="" so that quoted fields
# containing line breaks are parsed correctly; the encoding is made explicit
# instead of relying on the platform default.
with open(
    "/Users/nicolasroever/Documents/Promotion/Debt_Crisis/debt_crisis/bld/data/topic_model_intermediate/text_snippets_lemmatized_v003.csv",
    encoding="utf-8",
    newline="",
) as file:
    reader = csv.reader(file)
    # Only the first CSV row is read; its fields become the list of snippets.
    text_snippets = next(reader)
embeddings = np.load(
    "/Users/nicolasroever/Documents/Promotion/Debt_Crisis/debt_crisis/bld/data/topic_model_intermediate/embeddings.npy",
)

In [None]:
def fit_BERT_model_and_return_topic_probabilities(
    text_snippets: list[str],
    embeddings: np.ndarray,
    min_topic_size: int = 50,
    n_neighbors: int = 5,
) -> "tuple[pd.DataFrame, BERTopic]":
    """Fit a BERTopic model on pre-computed embeddings and tabulate the result.

    Parameters:
    text_snippets (list[str]): The documents to cluster.
    embeddings (np.ndarray): Pre-computed embeddings passed to BERTopic together
        with the snippets.
    min_topic_size (int, optional): Forwarded to BERTopic. Defaults to 50.
    n_neighbors (int, optional): Forwarded to UMAP. Defaults to 5.

    Returns:
    tuple[pd.DataFrame, BERTopic]: A DataFrame with the columns "Text Snippet",
        "Topic" and one "Topic_Probability_<i>" column per column of the
        probability matrix, plus the fitted topic model.
    """
    # Create vectorizer and UMAP models with the specified hyperparameters;
    # the fixed random_state makes the UMAP projection reproducible.
    vectorizer_model = CountVectorizer(stop_words="english")
    umap_model = UMAP(n_neighbors=n_neighbors, random_state=42)

    # calculate_probabilities=True is what makes fit_transform return the
    # per-topic probability matrix used below.
    topic_model = BERTopic(
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,
        min_topic_size=min_topic_size,
        umap_model=umap_model,
    )

    # Fit the model on text snippets and embeddings
    topics, probabilities = topic_model.fit_transform(text_snippets, embeddings)

    # Convert the probabilities array to a DataFrame, one column per topic index
    probabilities_df = pd.DataFrame(
        probabilities,
        columns=[f"Topic_Probability_{i}" for i in range(probabilities.shape[1])],
    )

    # Snippets with the topic label BERTopic assigned to them
    df = pd.DataFrame(
        {
            "Text Snippet": text_snippets,
            "Topic": topics,
        },
    )

    # Both frames use a default RangeIndex, so the column-wise concat lines up
    # row by row.
    df = pd.concat([df, probabilities_df], axis=1)

    return df, topic_model

In [None]:
def self_assign_topics(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """Assign each row its most probable topic when that topic clearly leads.

    A row receives the position of its highest "Topic_Probability*" column when
    that probability exceeds the second highest one by more than ``threshold``;
    otherwise it is labelled -1 (unassigned).

    Parameters:
    df (pd.DataFrame): DataFrame containing text snippets and topic probabilities.
        It is modified in place (column "Self_Assigned_Topic" is added or
        overwritten).
    threshold (float): The minimum difference required between the highest and
        second highest probabilities to assign a topic.

    Returns:
    pd.DataFrame: The same DataFrame with the "Self_Assigned_Topic" column.
    """
    # Extract only the probability columns
    probability_columns = [
        col for col in df.columns if col.startswith("Topic_Probability")
    ]
    probabilities = df[probability_columns].to_numpy(dtype=float)
    n_rows, n_topics = probabilities.shape

    # Nothing to compare: every row stays unassigned.
    if n_rows == 0 or n_topics == 0:
        df["Self_Assigned_Topic"] = -1
        return df

    # Position (within the probability columns) of each row's best topic;
    # argmax keeps the first position on ties.
    best_topic = probabilities.argmax(axis=1)
    best_probability = probabilities[np.arange(n_rows), best_topic]

    if n_topics == 1:
        # With a single topic there is no runner-up; compare against zero so
        # the topic is assigned whenever its probability exceeds the threshold.
        runner_up_probability = np.zeros(n_rows)
    else:
        # Second largest value per row (equal to the largest when tied).
        runner_up_probability = np.sort(probabilities, axis=1)[:, -2]

    margin = best_probability - runner_up_probability

    # Positional assignment, so duplicated index labels cannot make one row's
    # result overwrite another's.
    df["Self_Assigned_Topic"] = np.where(margin > threshold, best_topic, -1)

    return df

In [None]:
# Inspect the dimensions of the loaded embedding array.
embeddings.shape

In [None]:
# Fit BERTopic on the loaded snippets and their pre-computed embeddings; the
# returned DataFrame holds the model's topic label and the per-topic
# probability columns for every snippet.
df, topic_model = fit_BERT_model_and_return_topic_probabilities(
    text_snippets,
    embeddings,
    min_topic_size=10,
    n_neighbors=5,
)

# Add "Self_Assigned_Topic": the most probable topic when it leads the
# runner-up by more than 0.01, otherwise -1.
df = self_assign_topics(df, 0.01)

In [None]:
# Preview the first rows of the snippet/topic/probability table.
df.head()

In [None]:
# Bar chart of how many snippets fall into each self-assigned topic, with the
# topic ids sorted ascending (-1 is the "unassigned" label).
plt.figure(figsize=(10, 6))
df["Self_Assigned_Topic"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Self_Assigned_Topic")
plt.ylabel("Frequency")
plt.title("Histogram of Self_Assigned_Topic Frequencies")
plt.show()

In [None]:
# Render BERTopic's bar-chart visualization for the top 10 topics.
frequency_plot = topic_model.visualize_barchart(top_n_topics=10)
frequency_plot.show()

In [None]:
# Show the first 30 rows of the fitted model's topic frequency table.
topic_model.get_topic_freq().head(30)

# Hyperparameter Tuning

I tune the hyperparameters so that the model produces a reasonable number of topics while leaving as few snippets as possible unclassified.

In [None]:
# Load the lemmatized text snippets and their pre-computed embeddings.
# NOTE(review): both paths are absolute and machine-specific. This cell reads
# "text_snippets_lemmatized.csv" whereas the earlier loading cell reads
# "text_snippets_lemmatized_v003.csv", both with the same embeddings file —
# confirm the snippets and embeddings actually belong together.
# The csv module expects files opened with newline="" so that quoted fields
# containing line breaks are parsed correctly; the encoding is made explicit
# instead of relying on the platform default.
with open(
    "/Users/nicolasroever/Documents/Promotion/Debt_Crisis/debt_crisis/bld/data/topic_model_intermediate/text_snippets_lemmatized.csv",
    encoding="utf-8",
    newline="",
) as file:
    reader = csv.reader(file)
    # Only the first CSV row is read; its fields become the list of snippets.
    text_snippets = next(reader)
embeddings = np.load(
    "/Users/nicolasroever/Documents/Promotion/Debt_Crisis/debt_crisis/bld/data/topic_model_intermediate/embeddings.npy",
)

In [None]:
def evaluate_topic_model(
    text_snippets: list[str],
    embeddings: np.ndarray,
    n_neighbors: int,
    min_topic_size: int,
    number_of_topics: "int | None" = None,
) -> pd.DataFrame:
    """Fit a BERTopic model and return the first 30 rows of its topic frequencies.

    As a side effect, an interactive topic map is written to
    ``topics_visualization_n<n_neighbors>_min<min_topic_size>.html`` in the
    working directory when the visualization succeeds. The file name does not
    include ``number_of_topics``, so runs that differ only in that argument
    overwrite each other's file.

    Parameters:
    ----------
    text_snippets : list[str]
        The documents to cluster.

    embeddings : np.ndarray
        Pre-computed embeddings passed to BERTopic together with the snippets.

    n_neighbors : int
        Forwarded to UMAP as ``n_neighbors``.

    min_topic_size : int
        Forwarded to BERTopic as ``min_topic_size``.

    number_of_topics : int | None, optional
        Forwarded to BERTopic as ``nr_topics``. The default ``None`` is
        BERTopic's own default for that argument.

    Returns:
    -------
    pd.DataFrame
        The first 30 rows of ``topic_model.get_topic_freq()``.
    """
    # Create vectorizer and UMAP models with the specified hyperparameters;
    # the fixed random_state makes the UMAP projection reproducible.
    vectorizer_model = CountVectorizer(stop_words="english")
    umap_model = UMAP(n_neighbors=n_neighbors, random_state=42)

    topic_model = BERTopic(
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        min_topic_size=min_topic_size,
        calculate_probabilities=False,
        nr_topics=number_of_topics,
    )

    # Only the fitted model is needed below; the per-document output is unused.
    topic_model.fit_transform(text_snippets, embeddings)

    # Get the first 30 rows of the topic frequency table
    topic_freq = topic_model.get_topic_freq().head(30)

    if topic_freq.shape[0] == 0:
        print(
            f"No topics to visualize for n_neighbors={n_neighbors}, min_topic_size={min_topic_size}",
        )
        return topic_freq

    # The visualization is best effort: any failure is reported, and the
    # frequency table is still returned.
    try:
        fig = topic_model.visualize_topics()
        # visualize_topics returns a Plotly figure, so the title has to be set
        # on the figure itself for it to appear in the saved HTML.
        fig.update_layout(
            title_text=f"Topics Visualization (n_neighbors={n_neighbors}, min_topic_size={min_topic_size})",
        )

        # Save the figure
        filename = f"topics_visualization_n{n_neighbors}_min{min_topic_size}.html"
        fig.write_html(filename)
    except Exception as e:
        print(
            f"Could not visualize topics for n_neighbors={n_neighbors}, min_topic_size={min_topic_size}: {e}",
        )
    else:
        print(f"Saved topics visualization to {filename}")

    return topic_freq

In [None]:
n_neighbors = 15
min_topic_size = 10

# number_of_topics is passed explicitly: None is forwarded to BERTopic as
# nr_topics=None. Without the argument the call fails when evaluate_topic_model
# declares the parameter without a default.
topic_freq = evaluate_topic_model(
    text_snippets=text_snippets,
    embeddings=embeddings,
    n_neighbors=n_neighbors,
    min_topic_size=min_topic_size,
    number_of_topics=None,
)

# Log results: number of rows in the returned frequency table and the
# topic -> count mapping.
num_topics_after_fitting = topic_freq.shape[0]
topic_counts = topic_freq.set_index("Topic").to_dict()["Count"]
print(f"Found {num_topics_after_fitting} topics with the following counts:")
print(topic_counts)

In [None]:
# Define hyperparameter grid
param_grid = {
    "n_neighbors": [3, 5, 10],  # UMAP n_neighbors
    "min_topic_size": [5, 10, 20],  # BERTopic min_topic_size
    # NOTE(review): these values are not searched; the loop below requests a
    # fixed cap of 200 topics for every fit — confirm which is intended.
    "num_topics": [20, 40, 80],
}

# Store results
results = []

# Grid search over n_neighbors x min_topic_size (one model fit per combination)
for n_neighbors in param_grid["n_neighbors"]:
    for min_topic_size in param_grid["min_topic_size"]:
        num_topics = 200

        # Evaluate the topic model
        topic_freq = evaluate_topic_model(
            text_snippets=text_snippets,
            embeddings=embeddings,
            n_neighbors=n_neighbors,
            min_topic_size=min_topic_size,
            number_of_topics=num_topics,
        )

        # Number of rows in the returned frequency table. evaluate_topic_model
        # returns at most 30 rows, so this count is capped at 30.
        num_topics_after_fitting = topic_freq.shape[0]
        topic_counts = topic_freq.set_index("Topic").to_dict()["Count"]

        # "num_topics" is the requested cap; the fitted count is stored as well
        # so the results table carries the quantity that is printed below.
        result = {
            "n_neighbors": n_neighbors,
            "min_topic_size": min_topic_size,
            "num_topics": num_topics,
            "num_topics_after_fitting": num_topics_after_fitting,
            "topic_counts": topic_counts,
        }
        results.append(result)
        print(
            f"n_neighbors: {n_neighbors}, min_topic_size: {min_topic_size}, num_topics: {num_topics_after_fitting}, topic_counts: {topic_counts}",
        )

# Display results
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
def create_set_with_all_country_words(country_names_file):
    """Collect every non-missing cell of a country-name table into a set.

    Parameters:
    country_names_file (pd.DataFrame): Table whose cells hold the country names.

    Returns:
    set: The unique cell values of the table, with missing values left out.
    """
    # Walk over all cells of the table in one pass and keep the non-missing ones
    return {
        cell for cell in country_names_file.values.ravel() if pd.notna(cell)
    }

# Preprocessing

In [None]:
# Load the sentiment data (a pickled object) and the country-name table (CSV).
# NOTE(review): both paths are absolute and machine-specific. Unpickling can
# execute arbitrary code, so only load pickle files from a trusted source.
data = pd.read_pickle(
    "/Users/nicolasroever/Documents/Promotion/Debt_Crisis/debt_crisis/bld/data/GPT_Output_Data/sentiment_data_clean_full.pkl",
)
country_names_file = pd.read_csv(
    "/Users/nicolasroever/Documents/Promotion/Debt_Crisis/debt_crisis/src/debt_crisis/data/country_names/country_names.csv",
)

Now, I delete all country words from the rationales. 

In [None]:
countries_set = create_set_with_all_country_words(country_names_file)
# A cell may hold several ";"-separated spellings: split them into single names,
# trim surrounding whitespace and drop empty pieces.
country_words = {
    word.strip()
    for entry in countries_set
    for word in str(entry).split(";")
    if word.strip()
}

# Create a regex pattern that matches any of these names as whole words.
# Alternatives are tried left to right, so the names are ordered longest first:
# otherwise a short name could match before a longer name that starts with it
# and leave the remainder behind. Ties are broken alphabetically so the pattern
# does not depend on set iteration order.
pattern = re.compile(
    r"\b("
    + "|".join(
        map(re.escape, sorted(country_words, key=lambda word: (-len(word), word))),
    )
    + r")\b",
    re.IGNORECASE,
)

# Remove every country name from the "Rationale_for_Prediction" column of `data`
data["Rationale_for_Prediction"] = data["Rationale_for_Prediction"].str.replace(
    pattern,
    "",
    regex=True,
)

In [None]:
def lemmatize_text(text: str) -> str:
    """Return the space-joined lemmas of the alphabetic tokens in a text.

    The text is run through the module-level spaCy pipeline ``nlp``.

    Parameters:
    text (str): The text to lemmatize.

    Returns:
    str: The lemmas of all tokens for which ``is_alpha`` is true, separated by
         single spaces; all other tokens are dropped.
    """
    lemmas = [token.lemma_ for token in nlp(text) if token.is_alpha]
    return " ".join(lemmas)

In [None]:
# Inspect the first rationale text.
display(data["Rationale_for_Prediction"].iloc[0])

In [None]:
# Draw a random sample of 1000 rows with a fixed seed so the subset is
# reproducible (assumes `data` has at least 1000 rows — TODO confirm).
data_subset = data.sample(n=1000, random_state=42)

In [None]:
# Register tqdm's `progress_apply` on pandas objects so the apply below shows a
# progress bar labelled "Lemmatizing Texts".
tqdm.pandas(desc="Lemmatizing Texts")

# Lemmatize every rationale of the sample and store the result in a new column
data_subset["Rationale_for_Prediction_Lemmatized"] = data_subset[
    "Rationale_for_Prediction"
].progress_apply(lemmatize_text)

In [None]:
# Plain Python lists of the lemmatized rationales and of their dates (parsed
# with pd.to_datetime), both taken from the same sampled rows.
text_snippets = data_subset["Rationale_for_Prediction_Lemmatized"].to_list()
timestamps = pd.to_datetime(data_subset["Date"])
timestamps = timestamps.tolist()

# Run FinBert Topic Model

In [None]:
def get_embeddings_for_text_snippets(
    texts: list[str],
    tokenizer: "BertTokenizer | None" = None,
    model: "BertModel | None" = None,
    max_length: int = 128,
    batch_size: int = 32,
    use_gpu: bool = False,
) -> np.ndarray:
    """Generate mean-pooled embeddings for a list of texts with a BERT-based model.

    Parameters:
    texts (list[str]): A list of text snippets to be encoded into embeddings.
    tokenizer (BertTokenizer, optional): The tokenizer to process the input texts.
        When omitted, "yiyanghkust/finbert-pretrain" is loaded on each call;
        pass a tokenizer explicitly to avoid reloading it.
    model (BertModel, optional): The BERT-based model to generate embeddings.
        When omitted, "yiyanghkust/finbert-pretrain" is loaded on each call;
        pass a model explicitly to avoid reloading it.
    max_length (int, optional): The maximum length for tokenization. Defaults to 128.
    batch_size (int, optional): The number of texts to process in a batch. Defaults to 32.
    use_gpu (bool, optional): Whether to use GPU acceleration (only takes effect
        when CUDA is available). Defaults to False.

    Returns:
    np.ndarray: A 2D numpy array where each row corresponds to the embedding of a
        text snippet. For an empty input the array has zero rows.
    """
    # Load the defaults lazily: as default argument values they would be
    # loaded as soon as the function is defined, even if never used.
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-pretrain")
    if model is None:
        model = BertModel.from_pretrained("yiyanghkust/finbert-pretrain")

    total_texts = len(texts)
    if total_texts == 0:
        # np.vstack cannot stack an empty list; return an empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    on_gpu = use_gpu and torch.cuda.is_available()
    if on_gpu:
        model = model.to("cuda")

    embeddings = []

    # Process texts in batches
    for start_idx in range(0, total_texts, batch_size):
        end_idx = min(start_idx + batch_size, total_texts)
        batch_texts = texts[start_idx:end_idx]

        # Tokenize and encode the text data; shorter texts are padded to the
        # longest text of the batch.
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        if on_gpu:
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        # Generate embeddings without gradients
        with torch.no_grad():
            outputs = model(**inputs)

        # Mean pooling over the real tokens only. The attention mask zeroes out
        # padding positions so that an embedding does not depend on how much
        # padding the rest of the batch forced onto the text.
        hidden_states = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        mean_embeddings = (hidden_states * mask).sum(dim=1) / token_counts
        embeddings.append(mean_embeddings.cpu().numpy())

        # Print progress whenever the running total is a multiple of 20, and
        # always after the final batch.
        if end_idx % 20 == 0 or end_idx == total_texts:
            progress = end_idx / total_texts * 100
            print(f"Progress: {progress:.2f}% ({end_idx}/{total_texts})")

    # Stack embeddings into a single numpy array
    return np.vstack(embeddings)

In [None]:
# Embed the lemmatized snippets in batches of 100 with the default tokenizer
# and model, requesting GPU use.
embeddings = get_embeddings_for_text_snippets(
    text_snippets,
    batch_size=100,
    use_gpu=True,
)

In [None]:
# Create a CountVectorizer for BERTopic that drops English stop words
vectorizer_model = CountVectorizer(stop_words="english")

# UMAP with n_neighbors=10 and a fixed random_state for reproducibility
umap_model = UMAP(n_neighbors=10, random_state=42)

# Initialize BERTopic with the vectorizer and UMAP models;
# calculate_probabilities=True makes fit_transform return the probabilities too.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    min_topic_size=15,
    umap_model=umap_model,
    calculate_probabilities=True,
)

# Fit on the snippets using the pre-computed embeddings from above
logger.info("Starting topic modeling with BERTopic")
topics, probabilities = topic_model.fit_transform(text_snippets, embeddings)
logger.info("Completed topic modeling with BERTopic")

## Analyze Model Output

In [None]:
# Show the first 30 rows of the fitted model's topic frequency table.
topic_model.get_topic_freq().head(30)

In [None]:
# Inspect the dimensions of the probabilities returned by fit_transform.
probabilities.shape

In [None]:
# Wrap the probabilities in a DataFrame with one "Topic_<i>" column per column
# of the array, then preview the first rows.
probabilities_df = pd.DataFrame(
    probabilities,
    columns=[f"Topic_{i}" for i in range(probabilities.shape[1])],
)
probabilities_df.head()

In [None]:
# Collect the snippets whose assigned topic is -1
outlier_texts = [text for text, topic in zip(text_snippets, topics) if topic == -1]

# Print the first two of them, numbered from 1
print("Text snippets assigned to topic -1:")
for idx, text in enumerate(outlier_texts[0:2]):
    print(f"{idx + 1}: {text}")

In [None]:
# Display BERTopic's bar-chart visualization for the top 5 topics.
topic_model.visualize_barchart(top_n_topics=5)

In [None]:
# Display BERTopic's topic visualization for the fitted model.
topic_model.visualize_topics()

In [None]:
# Compute the topic representation over time from the snippets and their
# timestamps, using 20 bins, and plot topics 0 through 5.
topics_over_time = topic_model.topics_over_time(text_snippets, timestamps, nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time, topics=[0, 1, 2, 3, 4, 5])

# Decompose Sentiment Score

In [None]:
# Get the topic names
topic_names = topic_model.get_topic_info()

# Create a mapping of topic numbers to their names
topic_name_dict = topic_names.set_index("Topic")["Name"].to_dict()

# `topics` holds one label per snippet of the sample (the model was fitted on
# the rows of `data_subset`, in order), so the names are attached to the
# sample rather than to the full table, whose length differs.
data_subset["topic"] = [
    topic_name_dict[topic] if topic != -1 else "Outlier" for topic in topics
]

# Copy the labels onto the full table, aligned on the index; rows outside the
# sample get NaN. (Assumes the index labels of `data` are unique — TODO confirm.)
data["topic"] = data_subset["topic"]

In [None]:
# Group by 'topic' and sum the 'Prediction' column
topic_sums = data.groupby("topic")["Prediction"].sum().reset_index()

# Order the topics by the absolute value of their summed predictions (largest
# first) and keep the top 10; the signed sums themselves are retained.
top_10_topics = topic_sums.reindex(
    topic_sums["Prediction"].abs().sort_values(ascending=False).index,
).head(10)

# Column chart of the signed prediction sums for those topics
plt.figure(figsize=(10, 6))
plt.bar(top_10_topics["topic"], top_10_topics["Prediction"], color="skyblue")
plt.xlabel("Topic")
plt.ylabel("Sum of Predictions")
plt.title("Top 10 Topics by Sum of Predictions")
plt.xticks(rotation=45, ha="right")  # Rotate labels and align them to the right
plt.tight_layout()  # Adjust layout to make room for the rotated labels
plt.show()

# Graveyard

In [None]:
# Persist the fitted topic model and the embeddings in the working directory.
import pickle

# Serialize the whole `topic_model` object with pickle.
# NOTE(review): a pickle file should only ever be loaded from a trusted source.
with open("saved_bert_model_1000_lemma.pkl", "wb") as f:
    pickle.dump(topic_model, f)

# Save embeddings as a .npy file
np.save("embeddings_lemma.npy", embeddings)

In [None]:
def get_embeddings_for_text_snippets(
    texts: list[str],
    tokenizer: "BertTokenizer | None" = None,
    model: "BertModel | None" = None,
    max_length: int = 128,
    batch_size: int = 32,
    use_gpu: bool = False,
) -> np.ndarray:
    """Generate mean-pooled embeddings for a list of texts with a BERT-based model.

    Parameters:
    texts (list[str]): A list of text snippets to be encoded into embeddings.
    tokenizer (BertTokenizer, optional): The tokenizer to process the input texts.
        When omitted, "yiyanghkust/finbert-pretrain" is loaded on each call;
        pass a tokenizer explicitly to avoid reloading it.
    model (BertModel, optional): The BERT-based model to generate embeddings.
        When omitted, "yiyanghkust/finbert-pretrain" is loaded on each call;
        pass a model explicitly to avoid reloading it.
    max_length (int, optional): The maximum length for tokenization. Defaults to 128.
    batch_size (int, optional): The number of texts to process in a batch. Defaults to 32.
    use_gpu (bool, optional): Whether to use GPU acceleration (only takes effect
        when CUDA is available). Defaults to False.

    Returns:
    np.ndarray: A 2D numpy array where each row corresponds to the embedding of a
        text snippet. For an empty input the array has zero rows.
    """
    # Load the defaults lazily: as default argument values they would be
    # loaded as soon as the function is defined, even if never used.
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-pretrain")
    if model is None:
        model = BertModel.from_pretrained("yiyanghkust/finbert-pretrain")

    total_texts = len(texts)
    if total_texts == 0:
        # np.vstack cannot stack an empty list; return an empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    on_gpu = use_gpu and torch.cuda.is_available()
    if on_gpu:
        model = model.to("cuda")

    embeddings = []

    # Process texts in batches
    for start_idx in range(0, total_texts, batch_size):
        end_idx = min(start_idx + batch_size, total_texts)
        batch_texts = texts[start_idx:end_idx]

        # Tokenize and encode the text data; shorter texts are padded to the
        # longest text of the batch.
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        if on_gpu:
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        # Generate embeddings without gradients
        with torch.no_grad():
            outputs = model(**inputs)

        # Mean pooling over the real tokens only. The attention mask zeroes out
        # padding positions so that an embedding does not depend on how much
        # padding the rest of the batch forced onto the text.
        hidden_states = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        mean_embeddings = (hidden_states * mask).sum(dim=1) / token_counts
        embeddings.append(mean_embeddings.cpu().numpy())

        # Print progress whenever the running total is a multiple of 20, and
        # always after the final batch.
        if end_idx % 20 == 0 or end_idx == total_texts:
            progress = end_idx / total_texts * 100
            print(f"Progress: {progress:.2f}% ({end_idx}/{total_texts})")

    # Stack embeddings into a single numpy array
    return np.vstack(embeddings)

In [None]:
# NOTE(review): neither `get_embeddings_for_text_snippets_parallel` nor
# `selected_text_snippets` is defined anywhere in the visible notebook; unless
# they are defined elsewhere, this cell raises NameError.
embeddings = get_embeddings_for_text_snippets_parallel(selected_text_snippets)

# Run Topic Model

In [None]:
# Count unigrams and bigrams and drop English stop words
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# BERTopic with a requested 50 topics; no pre-computed embeddings are passed to
# fit_transform below.
model = BERTopic(
    vectorizer_model=vectorizer_model,
    language="english",
    calculate_probabilities=True,
    verbose=True,
    nr_topics=50,
)

# NOTE(review): `selected_text_snippets` is not defined in the visible notebook.
topics, probabilities = model.fit_transform(selected_text_snippets)

# Analyze Model Output

In [None]:
# Show the first 30 rows of the topic frequency table.
model.get_topic_freq().head(30)

In [None]:
# Display the bar-chart visualization for the top 6 topics.
model.visualize_barchart(top_n_topics=6)

In [None]:
# Display the topic visualization for the fitted model.
model.visualize_topics()

In [None]:
# Print the topic assigned to each of the first five documents.
for i, topic in enumerate(topics[:5]):
    print(f"Document {i}: Topic {topic}")

# Topics over Time

In [None]:
# Compute the topic representation over time with 20 bins.
# NOTE(review): `selected_text_snippets` and `selected_timestamps` are not
# defined in the visible notebook.
topics_over_time = model.topics_over_time(
    selected_text_snippets,
    selected_timestamps,
    nr_bins=20,
)

In [None]:
# Plot the over-time representation of topics 0 through 5.
model.visualize_topics_over_time(topics_over_time, topics=[0, 1, 2, 3, 4, 5])

# Decompose Sentiment Score into Topic Components

In [None]:
# Get the topic names
topic_names = model.get_topic_info()

# Create a mapping of topic numbers to their names
topic_name_dict = topic_names.set_index("Topic")["Name"].to_dict()

# Map the topic numbers to their names; topic -1 is labelled "Outlier".
# NOTE(review): `full_data` is not defined in the visible notebook, and the
# assignment only works if it has exactly one row per entry of `topics`.
full_data["topic"] = [
    topic_name_dict[topic] if topic != -1 else "Outlier" for topic in topics
]

In [None]:
# Group by 'topic' and sum the 'Prediction' column
# NOTE(review): `full_data` is not defined in the visible notebook.
topic_sums = full_data.groupby("topic")["Prediction"].sum().reset_index()

# Order the topics by the absolute value of their summed predictions (largest
# first) and keep the top 10; the signed sums themselves are retained.
top_10_topics = topic_sums.reindex(
    topic_sums["Prediction"].abs().sort_values(ascending=False).index,
).head(10)

# Column chart of the signed prediction sums for those topics
plt.figure(figsize=(10, 6))
plt.bar(top_10_topics["topic"], top_10_topics["Prediction"], color="skyblue")
plt.xlabel("Topic")
plt.ylabel("Sum of Predictions")
plt.title("Top 10 Topics by Sum of Predictions")
plt.xticks(rotation=45, ha="right")  # Rotate labels and align them to the right
plt.tight_layout()  # Adjust layout to make room for the rotated labels
plt.show()