# Bringing it all together!
This notebook brings the pipeline together: it defines helpers for computing topic centroids and cosine similarities, and loads the trained topic model.

In [6]:
import random
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from bertopic import BERTopic
from typing import List, Tuple, Union, Mapping, Any
from pathlib import Path

In [2]:
def weighted_mean(X, weights):
    """
    Compute the weighted average of the rows of X.

    Arguments:
        X: array whose rows are the observations (find_centroid passes a 2d array)
        weights: one weight per row of X
    returns:
        The weight-scaled sum of the rows divided by the sum of the weights
    """
    weighted_sum = np.dot(X.T, weights)
    total_weight = np.sum(weights)
    return weighted_sum / total_weight


def get_unique_topics(topic_model):
    """
    Return the distinct values of the "Topic" column from the model's topic info.

    Arguments:
        topic_model: object exposing get_topic_info(), whose result is indexed by "Topic"
    returns:
        The result of calling .unique() on that column
    """
    return topic_model.get_topic_info()["Topic"].unique()


def find_centroid(
    embeddings: np.ndarray,
    topics: Union[np.ndarray, List[int]],
    probs: Union[np.ndarray, List[float]],
    target_topic: int,
) -> np.ndarray:
    """
    Find the probability-weighted centroid of the documents assigned to one topic.

    Arguments:
        embeddings: 2d with dimensions (num_documents, num_dimensions)
        topics: array or list of length num_documents with the topic of each document
        probs: array or list of length num_documents showing the probability of the assigned topic
        target_topic: the topic, we want to find the centroid for
    returns:
        The centroid for the cluster (1d, length num_dimensions).
        If no document is assigned to target_topic the weights sum to zero,
        so the entries of the result are NaN.
    """
    # Comparing a plain Python list with `==` yields a single bool instead of an
    # element-wise mask, so coerce both sequences to arrays before filtering.
    topics = np.asarray(topics)
    probs = np.asarray(probs)

    # Filtering the embeddings (the mask is built once and reused for the weights)
    in_topic = topics == target_topic
    filtered_embeddings = embeddings[in_topic, :]
    filtered_probs = probs[in_topic]

    # Calculating the centroid
    return weighted_mean(filtered_embeddings, filtered_probs)

def calc_cosine_sim(centroids, embedding):
    """
    Compare one embedding against the centroids using cosine similarity.

    The embedding is reshaped into a single row and passed, together with
    centroids, to cosine_similarity; that call's result is returned as-is.
    """
    embedding_row = embedding.reshape(1, -1)
    return cosine_similarity(centroids, embedding_row)


In [7]:
# Loading topic model
# NOTE: MODEL_PATH is relative, so it is resolved against the kernel's current
# working directory rather than against the location of this notebook.
MODEL_PATH = Path("../models/topic_model")
EMBEDDING_MODEL = "Maltehb/-l-ctra-danish-electra-small-cased"

# Fail early, reporting the absolute path that was actually looked up, so a
# wrong working directory is easy to spot.
if not MODEL_PATH.exists():
    raise FileNotFoundError(
        f"No saved topic model found at '{MODEL_PATH.resolve()}'. "
        "Check the kernel's working directory or point MODEL_PATH at the saved model."
    )

topic_model = BERTopic.load(str(MODEL_PATH), embedding_model=EMBEDDING_MODEL)

FileNotFoundError: [Errno 2] No such file or directory: '..\\models\\topic_model'