In [4]:
import json
from bertopic import BERTopic
import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer


In [30]:
# Load the notebook configuration from config.json.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative or configurable so the notebook runs on other machines.
config_file_path = r'C:\Users\nikla\OneDrive\Dokumente\winfoMaster\Masterarbeit\bertopic_ecc\config.json'
# Decode explicitly as UTF-8 (the JSON standard encoding) instead of relying on
# the platform's locale-dependent default used by open().
with open(config_file_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Path of the saved BERTopic model, as given in the config
model_load_path = config["model_load_path"]

# Load the embedding model named in the config and place it on the CPU
embedding_model = SentenceTransformer(config["embedding_model_choice"], device="cpu")

# Manually load the model and ensure all GPU-based tensors are mapped to CPU
def custom_torch_load(path):
    """Load the object stored at `path` with torch, mapping its tensors to the CPU."""
    cpu_device = torch.device('cpu')
    return torch.load(path, map_location=cpu_device)

# Load the BERTopic model from the local path.
# The CPU embedding model is handed to `BERTopic.load` instead of being assigned
# to `topic_model.embedding_model` afterwards: the loader wraps it in BERTopic's
# own embedding backend, whereas a raw SentenceTransformer assigned directly
# would bypass that wrapper.
topic_model = BERTopic.load(model_load_path, embedding_model=embedding_model)



In [15]:
# Get the path of the model saved together with its data from the config
model_load_path_with_data = config["model_load_path_with_data"]

# Load the model from the local path (this rebinds `topic_model`)
topic_model = BERTopic.load(model_load_path_with_data)

# Access the original documents (untransformed)
# NOTE(review): `original_documents_` is not among BERTopic's documented
# attributes; presumably it was attached to the model before saving — confirm.
original_documents = topic_model.original_documents_

# Access the topic assignments and probabilities stored on the model
transformed_topics = topic_model.topics_
transformed_probabilities = topic_model.probabilities_

In [32]:
# topic_model.visualize_topics()

In [21]:
# Access original documents
# NOTE(review): `original_documents_` is not among BERTopic's documented
# attributes; presumably it was attached to the model before saving — confirm.
documents = topic_model.original_documents_

# Access topics assigned to each document
topics = topic_model.topics_

# Access topic probabilities (if available)
probabilities = topic_model.probabilities_

In [28]:
# Print the first 5 documents as a quick sanity check of the loaded data
print(documents[:5])

['Good day, ladies and gentlemen, and welcome to the Neuronetics Fourth Quarter and Full Year 2018 Earnings Conference Call. (Operator Instructions) As a reminder, this conference call is being recorded', "I would now like to introduce your host for today's conference, Mr. Mark Klausner from Westwicke. Sir, you may begin", "Thank you, operator. Good morning, and thank you for joining us for Neuronetics' Fourth Quarter and Full Year 2018 Conference Call. A replay of this call will be available on our website for 30 days. Joining me on today's call are: Neuronetics' Chief Executive Officer, Chris Thatcher; and its Chief Financial Officer, Peter Donato", "Before we begin, I would like to caution listeners that certain information discussed by management during this conference call will include forward-looking statements covered under the safe harbor provisions of the Private Securities Litigation Reform Act of 1995, including statements related to our business strategy, financial and reve

In [29]:
# Reduce the loaded model to 20 topics.
# In current BERTopic releases `reduce_topics` updates the model in place rather
# than returning a (topics, probabilities) pair, so the results are read from
# the model afterwards.
# NOTE(review): the AttributeError recorded for this cell ('_outliers') suggests
# the model was saved with a different BERTopic version than the one installed —
# confirm that the versions match.
topic_model.reduce_topics(documents, nr_topics=20)
new_topics, new_probs = topic_model.topics_, topic_model.probabilities_

AttributeError: 'BERTopic' object has no attribute '_outliers'

In [17]:
def explore_basic_info(topic_model, example_topic=0):
    """
    Print a short overview of a topic model.

    Prints the number of rows in the topic-info table, its first five rows,
    and the top words of one example topic.

    Args:
        topic_model: Object exposing `get_topic_info()` (a table supporting
            `len()` and `.head()`) and `get_topic(topic_id)`.
        example_topic: Id of the topic whose top words are printed. Defaults
            to 0, the value that used to be hard-coded.

    Returns:
        None. Everything is written to stdout.
    """
    print("Exploring Basic Information about the Model...")

    # Fetch the topic table once and reuse it for both the count and the preview.
    topic_info = topic_model.get_topic_info()
    # NOTE(review): this counts every row of the table; if the table contains an
    # outlier row (topic -1) it is included in the number — confirm intent.
    print(f"Number of Topics: {len(topic_info)}")

    # Preview of the topic table (first five rows)
    print("Top 5 Topics by Frequency:")
    print(topic_info.head(5))

    # Top words for the example topic
    top_words = topic_model.get_topic(example_topic)
    print(f"Top words for Topic {example_topic}:")
    if top_words:
        for word, score in top_words:
            print(f"  - {word}: {score}")
    else:
        # A falsy result (e.g. for an unknown topic id) cannot be iterated as
        # (word, score) pairs; report it instead of crashing.
        print("  (no words available for this topic)")

    print("\n" + "-"*50 + "\n")

# Print the overview for the currently loaded model
explore_basic_info(topic_model)


Exploring Basic Information about the Model...


AttributeError: 'BERTopic' object has no attribute 'topic_labels_'

In [None]:
def reduce_and_explore(topic_model, nr_topics):
    """
    Reduce a copy of the model to `nr_topics` topics and print its overview.

    `reduce_topics` modifies the model it is called on, so the reduction is
    applied to a deep copy. The model passed in keeps its original topics and
    can be reduced again to a different number later.

    Args:
        topic_model: Loaded BERTopic model carrying an `original_documents_`
            attribute with the documents it was fitted on.
        nr_topics: Target number of topics.

    Returns:
        The reduced copy of the model.
    """
    import copy  # local import so this cell does not depend on the import cell

    print(f"Reducing number of topics to {nr_topics}...")

    # Work on a copy: reducing in place would make every "reduced" model the
    # same object as the original. Note that copying a large model costs memory.
    reduced_model = copy.deepcopy(topic_model)
    # The return value is deliberately ignored; the reduced state lives on the model.
    reduced_model.reduce_topics(reduced_model.original_documents_, nr_topics=nr_topics)

    # Print reduced model info
    explore_basic_info(reduced_model)

    return reduced_model

# Example: reduce to 30 topics and keep the result as `reduced_model`
reduced_model = reduce_and_explore(topic_model, nr_topics=30)


In [10]:
def visualize_topics_distribution(topic_model):
    """
    Show a bar chart of the number of documents per topic.

    Reads the 'Topic' and 'Count' columns of `topic_model.get_topic_info()`
    and displays the figure with matplotlib.
    """
    print("Visualizing Topic Distribution...")
    info = topic_model.get_topic_info()
    topic_ids, doc_counts = info['Topic'], info['Count']

    _, axes = plt.subplots()
    axes.bar(topic_ids, doc_counts)
    # Label the axes and title in one call
    axes.set(xlabel='Topic', ylabel='Number of Documents', title='Topic Distribution')
    plt.show()

# Visualize topic distribution of the currently loaded model (`topic_model`)
visualize_topics_distribution(topic_model)


Visualizing Topic Distribution...


AttributeError: 'BERTopic' object has no attribute 'topic_labels_'

In [None]:
def _topic_diversity_scores(topic_model, top_n_words):
    """Return {topic_id: share of that topic's top words used by no other topic}."""
    from collections import Counter  # local import keeps this cell self-contained

    top_words = {}
    for topic_id, words in sorted(topic_model.get_topics().items()):
        if topic_id == -1:
            continue  # -1 is BERTopic's outlier bucket, not a real topic
        # Keep the first `top_n_words` words, dropping empty padding entries.
        vocabulary = {word for word, _ in list(words)[:top_n_words] if word}
        if vocabulary:
            top_words[topic_id] = vocabulary

    # Number of topics in which each word appears among the top words
    usage = Counter(word for vocabulary in top_words.values() for word in vocabulary)
    return {
        topic_id: sum(1 for word in vocabulary if usage[word] == 1) / len(vocabulary)
        for topic_id, vocabulary in top_words.items()
    }


def explore_topic_diversity(topic_model, top_n_words=10, n_preview=5):
    """
    Print per-topic diversity scores and their average.

    BERTopic does not provide a `topic_diversity` method, so the score is
    computed here from `topic_model.get_topics()`: for each topic it is the
    fraction of its top words that appear in no other topic's top words
    (1.0 = fully distinct vocabulary, 0.0 = fully shared).

    Args:
        topic_model: Object exposing `get_topics()`, a mapping from topic id
            to a list of (word, score) pairs.
        top_n_words: Number of top words per topic taken into account.
        n_preview: Maximum number of topics whose score is printed.

    Returns:
        None. Everything is written to stdout.
    """
    print("Exploring Topic Diversity...")
    diversity_scores = _topic_diversity_scores(topic_model, top_n_words)

    if diversity_scores:
        average = sum(diversity_scores.values()) / len(diversity_scores)
        print(f"Average Topic Diversity: {average}")

        # Print diversity of the first few topics (fewer if the model has fewer)
        for topic_id in list(diversity_scores)[:n_preview]:
            print(f"Topic {topic_id} Diversity: {diversity_scores[topic_id]}")
    else:
        # Avoid dividing by zero when there is nothing to score.
        print("No topics available to compute diversity.")

    print("\n" + "-"*50 + "\n")

# Explore topic diversity of the model bound to `reduced_model`
explore_topic_diversity(reduced_model)


In [None]:
# Try reducing to a different number of topics
# NOTE(review): this starts from `topic_model` again; if an earlier reduction
# already modified `topic_model` in place, fewer than 50 topics may remain — confirm.
nr_topics = 50
reduced_model_50 = reduce_and_explore(topic_model, nr_topics=nr_topics)

# Visualize topic distribution of the model returned by the 50-topic reduction
visualize_topics_distribution(reduced_model_50)


In [None]:
# Save the reduced model
# NOTE(review): "path_to_save_reduced_model" looks like a placeholder — replace
# it with the real target path before running.
reduced_model.save("path_to_save_reduced_model")
print("Reduced model saved.")
