In [1]:
import json
from bertopic import BERTopic
import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer


In [3]:
# Load configuration from config.json
config_file_path = r'C:\Users\nikla\OneDrive\Dokumente\winfoMaster\Masterarbeit\bertopic_ecc\config.json'
with open(config_file_path, 'r') as f:
    config = json.load(f)

# Get the correct model path from the config
model_load_path_with_data = config["model_load_path_with_data"]

# Load the embedding model and ensure it's loaded onto the CPU
embedding_model = SentenceTransformer(config["embedding_model_choice"], device="cpu")

# Manually load the model and ensure all GPU-based tensors are mapped to CPU
def custom_torch_load(path):
    return torch.load(path, map_location=torch.device('cpu'))

# Load the BERTopic model from the local path
# Catching any model-level components that need to be loaded on the CPU
topic_model = BERTopic.load(model_load_path_with_data)

# Manually adjust any GPU-related components within the model if necessary (for example, handling specific components that still have GPU data)
# Ensure all parts are mapped to the CPU
topic_model.embedding_model = embedding_model

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
def explore_basic_info(topic_model):
    """
    Explore the basic information about the BERTopic model.
    """
    print("Exploring Basic Information about the Model...")
    # Get the number of topics
    num_topics = len(topic_model.get_topic_info())
    print(f"Number of Topics: {num_topics}")
    
    # Get topic frequency (number of documents per topic)
    topic_info = topic_model.get_topic_info()
    print("Top 5 Topics by Frequency:")
    print(topic_info.head(5))
    
    # Get top words for a specific topic (example: topic 0)
    example_topic = 0
    top_words = topic_model.get_topic(example_topic)
    print(f"Top words for Topic {example_topic}:")
    for word, score in top_words:
        print(f"  - {word}: {score}")
    
    print("\n" + "-"*50 + "\n")

# Call this function
explore_basic_info(topic_model)


In [None]:
def reduce_and_explore(topic_model, nr_topics):
    """
    Reduce the number of topics in the model and explore the new model.
    """
    print(f"Reducing number of topics to {nr_topics}...")
    reduced_model = topic_model.reduce_topics(topic_model.original_documents_, nr_topics=nr_topics)
    
    # Print reduced model info
    explore_basic_info(reduced_model)
    
    return reduced_model

# Example: Reduce topics to 30
reduced_model = reduce_and_explore(topic_model, nr_topics=30)


In [None]:
def visualize_topics_distribution(topic_model):
    """
    Visualize the distribution of topics.
    """
    print("Visualizing Topic Distribution...")
    topic_info = topic_model.get_topic_info()
    fig, ax = plt.subplots()
    ax.bar(topic_info['Topic'], topic_info['Count'])
    ax.set_xlabel('Topic')
    ax.set_ylabel('Number of Documents')
    ax.set_title('Topic Distribution')
    plt.show()

# Visualize topic distribution of the reduced model
visualize_topics_distribution(reduced_model)


In [None]:
def explore_topic_diversity(topic_model):
    """
    Explore the diversity of topics.
    """
    print("Exploring Topic Diversity...")
    diversity_scores = topic_model.topic_diversity(topic_model.topics_)
    print(f"Average Topic Diversity: {sum(diversity_scores)/len(diversity_scores)}")
    
    # Print diversity of the first few topics
    for i in range(5):
        print(f"Topic {i} Diversity: {diversity_scores[i]}")
    
    print("\n" + "-"*50 + "\n")

# Call the function to explore topic diversity
explore_topic_diversity(reduced_model)


In [None]:
# Try reducing to a different number of topics
nr_topics = 50
reduced_model_50 = reduce_and_explore(topic_model, nr_topics=nr_topics)

# Visualize topic distribution after further reduction
visualize_topics_distribution(reduced_model_50)


In [None]:
# Save the reduced model
reduced_model.save("path_to_save_reduced_model")
print("Reduced model saved.")
