# Creating topics for the cases
This notebook explores the topics we can create from the cases. It is going to involve a lot of trial and error, but it will hopefully be well documented.

In [1]:
import pickle
import numpy as np
from pathlib import Path
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from typing import List

In [7]:
def read_pickle(file_path):
    """Deserialize and return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and is closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(file_path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

def flatten_embeddings(embedding_dict):
    """Stack every embedding array in ``embedding_dict`` into one matrix.

    The values are stacked row-wise with ``np.vstack`` in the dict's iteration
    order, so the row order of the result follows the order of the values.

    Args:
        embedding_dict: Mapping whose values are arrays that ``np.vstack`` can
            stack on top of each other.

    Returns:
        A single ``np.ndarray`` holding all values stacked vertically.

    Raises:
        ValueError: If the dict is empty or the value shapes are incompatible
            (raised by ``np.vstack``).
    """
    # np.vstack needs a real sequence: a dict_values view is deprecated input
    # since NumPy 1.16 and is rejected by newer releases, so materialize it.
    return np.vstack(list(embedding_dict.values()))

def flatten_list(lst: List):
    """Concatenate the elements of every sub-iterable in ``lst`` into one flat list.

    Only one level of nesting is removed and the original order is preserved.
    """
    flattened = []
    for sublist in lst:
        flattened.extend(sublist)
    return flattened

def get_paragraphs(paragraph_dict):
    """Return the contents of all of ``paragraph_dict``'s values as one flat list.

    The values are concatenated in the dict's iteration order via
    ``flatten_list``. Presumably each value is the list of paragraphs for one
    case — TODO confirm against where the dict is built.
    """
    per_key_paragraphs = list(paragraph_dict.values())
    return flatten_list(per_key_paragraphs)

## Loading the data

In [3]:
# Location of the pickled inputs, relative to the notebook's working directory.
DATA_DIR = Path("../../BscThesisData/data")
EMBEDDING_PATH = DATA_DIR / "embedding_dict.pkl"
PARAGRAPH_PATH = DATA_DIR / "paragraph_dict.pkl"

# Both files are deserialized with the read_pickle helper.
embedding_dict = read_pickle(EMBEDDING_PATH)
paragraphs_dict = read_pickle(PARAGRAPH_PATH)

Now it's time to extract the embeddings for BERTopic to process

In [8]:
# Stack all embeddings into one matrix and flatten the paragraphs into one list.
embeddings = flatten_embeddings(embedding_dict)
docs = get_paragraphs(paragraphs_dict)

# The model is fitted on docs together with embeddings, so the two must have
# the same length; fail here with a clear message rather than during fitting.
# NOTE(review): this only checks the counts. It assumes both dicts list the
# cases (and their paragraphs) in the same order — confirm where they are built.
assert len(docs) == embeddings.shape[0], (
    f"{len(docs)} documents but {embeddings.shape[0]} embedding rows"
)

### Now we initialize the models

In [15]:
# Topic-model settings, named so they are easy to find and tune.
EMBEDDING_MODEL_NAME = "Maltehb/-l-ctra-danish-electra-small-cased"
NR_TOPICS = 10

# NOTE(review): the model name is passed positionally; which parameter it binds
# to depends on the installed BERTopic version — confirm, and prefer a keyword.
# NOTE(review): no random seed is set in the visible cells, so the resulting
# topics may differ between runs — confirm whether that is acceptable.
topic_model = BERTopic(EMBEDDING_MODEL_NAME, nr_topics=NR_TOPICS)

In [16]:
# Fit the topic model on the paragraphs, supplying the embeddings that were
# loaded from disk instead of having them computed during the fit.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

In [17]:
# Last expression of the cell, so whatever visualize_topics() returns is
# rendered as the cell's output.
topic_model.visualize_topics()