# Bringing it all together!
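This notebook brings together the steps of the feature pipeline: it loads the document embeddings and the trained topic model, and computes a centroid for each topic.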

In [1]:
import random
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from bertopic import BERTopic
from typing import List, Tuple, Union, Mapping, Any
from pathlib import Path

In [11]:
def weighted_mean(X, weights):
    """
    Weighted average of the rows of X.

    Arguments:
        X: array whose rows are averaged; find_centroid passes a 2d array
           with dimensions (num_rows, num_dimensions)
        weights: one weight per row of X
    returns:
        sum_i(weights[i] * X[i]) / sum_i(weights[i])
    """
    weighted_sum = np.dot(X.T, weights)
    total_weight = np.sum(weights)
    return weighted_sum / total_weight


def get_unique_topics(topic_model):
    """
    Distinct topic ids known to the topic model.

    Arguments:
        topic_model: object whose get_topic_info() returns a DataFrame with a "Topic" column
    returns:
        Array with the unique values of the "Topic" column, in order of first appearance
    """
    return topic_model.get_topic_info()["Topic"].unique()


def get_topic_range(topic_model):
    """
    All topic ids from 0 up to and including the largest topic id of the model.

    Negative topic ids (if the model reports any) are therefore never part of the result.

    Arguments:
        topic_model: object whose get_topic_info() returns a DataFrame with a "Topic" column
    returns:
        The list [0, 1, ..., max_topic]; empty if the largest topic id is negative
    """
    highest_topic = np.max(topic_model.get_topic_info()["Topic"])
    return list(range(highest_topic + 1))
    

def find_centroid(embeddings: np.ndarray, topics: np.ndarray, probs: np.ndarray, target_topic: int):
    """
    Computes the probability-weighted centroid of the documents assigned to one topic.

    Arguments:
        embeddings: 2d with dimensions (num_documents, num_dimensions)
        topics: array or list of length num_documents with the topic assigned to each document
        probs: array or list of length num_documents showing the probability of the assigned topic
        target_topic: the topic we want to find the centroid for
    returns:
        The centroid for the cluster: 1d array of length num_dimensions.
        If no document is assigned to target_topic, or the selected probabilities
        sum to 0, the weighted mean divides by zero and the result is not finite.
    """
    # np.asarray keeps the comparison element-wise even for plain lists;
    # `some_list == target_topic` would be a single bool and select the wrong data.
    in_topic = np.asarray(topics) == target_topic

    # Filtering the embeddings and their weights with the same mask
    filtered_embeddings = np.asarray(embeddings)[in_topic, :]
    filtered_probs = np.asarray(probs)[in_topic]

    # Calculating the centroid
    return weighted_mean(filtered_embeddings, filtered_probs)

def calc_cosine_sim(centroids, embedding):
    """
    Calculates the cosine similarity between a single embedding and the centroids

    Arguments:
        centroids: 2d with dimensions (num_centroids, num_dimensions)
        embedding: a single embedding; it is reshaped into one row
    returns:
        Whatever cosine_similarity returns for (centroids, one-row matrix);
        per the scikit-learn docs that is a 2d array with dimensions (num_centroids, 1)
    """
    # The embedding is passed on as a matrix with exactly one row
    embedding_row = embedding.reshape(1, -1)
    return cosine_similarity(centroids, embedding_row)

def read_pickle(file_path):
    """
    Loads and returns the object stored in a pickle file.

    NOTE: unpickling can execute arbitrary code, so only use this on files you trust.

    Arguments:
        file_path: path of the pickle file
    returns:
        The unpickled object
    """
    with open(file_path, mode="rb") as handle:
        loaded = pickle.load(handle)
    return loaded

def flatten_embeddings(embedding_dict):
    """
    Creates a big matrix with all the embeddings from the dict.

    Arguments:
        embedding_dict: dict whose values are embeddings with the same number of
            dimensions; np.vstack treats a 1d value as one row and a 2d value as
            several rows. The keys are ignored.
    returns:
        2d array with the values stacked row-wise in the dict's iteration order
    raises:
        ValueError: if the dict is empty or the values cannot be stacked
    """
    # np.vstack needs a real sequence: passing the dict_values view directly was
    # deprecated in NumPy 1.16 and is rejected by newer releases.
    return np.vstack(list(embedding_dict.values()))

In [3]:
# Loading data
DATA_DIR = Path("../../BscThesisData/data")

# Topic assignments (the "topic" and "prob" columns are used further down)
doc_topics = pd.read_csv(DATA_DIR.joinpath("doc_topics.csv"))

# Pickled dict of embeddings, stacked into one 2d matrix
embeddings_dict = read_pickle(DATA_DIR.joinpath("embedding_dict.pkl"))
embeddings = flatten_embeddings(embeddings_dict)

In [4]:
# Loading topic model
MODEL_PATH = Path("../models/topic_model")
topic_model = BERTopic.load(str(MODEL_PATH), embedding_model="Maltehb/-l-ctra-danish-electra-small-cased")

Some weights of the model checkpoint at C:\Users\jhr/.cache\torch\sentence_transformers\Maltehb_-l-ctra-danish-electra-small-cased were not used when initializing ElectraModel: ['generator.encoder.layer.11.output.LayerNorm.bias', 'discriminator_predictions.LayerNorm.weight', 'generator.encoder.layer.8.attention.self.query.bias', 'generator.encoder.layer.1.attention.self.query.bias', 'generator.encoder.layer.7.output.dense.weight', 'generator.encoder.layer.7.intermediate.dense.bias', 'generator.encoder.layer.1.attention.self.value.bias', 'generator.encoder.layer.10.intermediate.dense.bias', 'generator.encoder.layer.2.attention.self.key.bias', 'generator.encoder.layer.8.attention.output.LayerNorm.weight', 'generator.encoder.layer.4.attention.output.LayerNorm.bias', 'generator.encoder.layer.8.output.dense.weight', 'generator.encoder.layer.9.attention.self.key.bias', 'generator.encoder.layer.7.intermediate.dense.weight', 'generator.encoder.layer.6.attention.self.query.bias', 'generator.enc

In [7]:
# Assigned topic per row of doc_topics and the probability of that assignment.
# .to_numpy() always yields plain NumPy arrays, which find_centroid's boolean masking needs.
topics = doc_topics["topic"].to_numpy()
probs = doc_topics["prob"].to_numpy()

# find_centroid masks the rows of `embeddings` with `topics`, so they must line up one-to-one.
assert len(topics) == embeddings.shape[0], (
    f"doc_topics has {len(topics)} rows but embeddings has {embeddings.shape[0]} rows"
)

In [17]:
# Topic ids 0..max_topic; negative ids are not part of the range
unique_topics = get_topic_range(topic_model)

# Centroids need dimensions (number of topics, embedding-dimensionality)
embedding_dim = embeddings.shape[1]
centroids = np.zeros((len(unique_topics), embedding_dim))

# The range starts at 0 and is contiguous, so the topic id doubles as the row index
for topic_id in unique_topics:
    centroids[topic_id, :] += find_centroid(embeddings, topics, probs, topic_id)

In [18]:
# Sanity check: expect (number of topics, embedding dimensionality)
centroids.shape

(5, 256)

### Pipeline for calculating features
1. Calculate centroids
2. Calculate the paragraph-centroid similarity for each paragraph (in `embeddings_dict`)
3. Average the similarities