# Bringing it all together!
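This notebook brings the pipeline together: it computes one centroid per topic from the paragraph embeddings and then featurizes each document by its average similarity to those centroids.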

In [1]:
import random
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from bertopic import BERTopic
from typing import List, Tuple, Union, Mapping, Any
from pathlib import Path

In [32]:
def weighted_mean(X, weights):
    """Return the weighted average of the rows of ``X``.

    Arguments:
        X: array whose rows are the vectors to average (find_centroid passes
           a 2d array of shape (n_rows, n_dims))
        weights: one weight per row of ``X``
    returns:
        The weighted sum of the rows divided by the sum of the weights
    """
    weighted_sum = np.dot(X.T, weights)
    total_weight = np.sum(weights)
    return weighted_sum / total_weight


def get_unique_topics(topic_model):
    """Return the distinct values found in the "Topic" column of the model's topic info table."""
    topic_column = topic_model.get_topic_info()["Topic"]
    return topic_column.unique()


def get_topic_range(topic_model):
    """Return the consecutive topic ids ``[0, 1, ..., max_topic]``.

    ``max_topic`` is the largest value in the "Topic" column of the model's
    topic info table. The list always starts at 0, so negative topic ids are
    left out.
    """
    topic_ids = topic_model.get_topic_info()["Topic"]
    highest_topic = np.max(topic_ids)
    return [*range(highest_topic + 1)]
    

def find_centroid(embeddings: np.ndarray, topics: np.ndarray, probs: np.ndarray, target_topic: int):
    """
    Computes the probability-weighted centroid of the embeddings assigned to one topic.

    Arguments:
        embeddings: 2d with dimensions (num_documents, num_dimensions)
        topics: array or list of length num_documents with the topic assigned to each row
        probs: array or list of length num_documents showing the probability of the assigned topic
        target_topic: the topic, we want to find the centroid for
    returns:
        The centroid for the cluster (1d, length num_dimensions). If no row is
        assigned to target_topic the weights sum to zero and the result is NaN.
    """
    # Coerce before comparing: `list == int` yields a single bool rather than an
    # element-wise mask, which would silently select nothing.
    in_topic = np.asarray(topics) == target_topic

    # Filtering the embeddings and their weights with the same mask
    filtered_embeddings = np.asarray(embeddings)[in_topic, :]
    filtered_probs = np.asarray(probs)[in_topic]

    # Calculating the centroid
    return weighted_mean(filtered_embeddings, filtered_probs)

def calc_cosine_sim(centroids, embedding):
    """
    Cosine similarity between every centroid and one embedding vector.

    The embedding is reshaped into a single row before it is compared
    against ``centroids``.
    """
    single_row = embedding.reshape(1, -1)
    similarities = cosine_similarity(centroids, single_row)
    return similarities

def read_pickle(file_path):
    """Open ``file_path`` in binary mode and return the unpickled object."""
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(file_path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

def flatten_embeddings(embedding_dict):
    """ Creates a big matrix with all the embeddings from the dict

    Arguments:
        embedding_dict: dict whose values are 2d arrays with the same number of columns
    returns:
        2d np.array with the rows of every value stacked in the dict's iteration order
    """
    # np.vstack expects a real sequence (list/tuple); a dict_values view is only
    # tolerated with a warning by older NumPy and rejected by newer versions.
    return np.vstack(list(embedding_dict.values()))

def featurize_document(doc_embeddings, centroids):
    """ Turns a document into its average similarity to each centroid
    Arguments:
        doc_embeddings (np.array): the paragraph embeddings for the document with shape (n_paragraphs, embedding_dim)
        centroids (np.array): centroid embeddings with shape (n_topics, embedding_dim)
    returns:
        np.array of size (n_topics, ) with the cosine similarity to each centroid,
        averaged over the paragraphs of the document
    """
    # Rows are paragraphs, columns are centroids
    paragraph_similarities = cosine_similarity(doc_embeddings, centroids)
    # Collapse the paragraph axis
    return np.mean(paragraph_similarities, axis=0)

def remove_empty_embeddings(embedding_dict):
    """Return a new dict without the entries whose array has zero rows."""
    kept = {}
    for key, embedding in embedding_dict.items():
        if embedding.shape[0] == 0:
            continue
        kept[key] = embedding
    return kept

In [3]:
# Loading data: the topic table and the pickled embeddings
# DATA_DIR is a relative path, so it is resolved against the notebook's working directory.
DATA_DIR = Path("../../BscThesisData/data")
# Table whose "topic" and "prob" columns are read further down.
doc_topics = pd.read_csv(DATA_DIR / "doc_topics.csv")
# Dict of document key -> array of paragraph embeddings.
# NOTE(review): this unpickles the file, which can execute arbitrary code — only load trusted files.
embeddings_dict = read_pickle(DATA_DIR / "embedding_dict.pkl")

# Stack the embeddings of every document into one 2d matrix.
embeddings = flatten_embeddings(embeddings_dict)

In [4]:
# Loading topic model
# The saved BERTopic model is loaded from MODEL_PATH (relative to the working directory)
# together with the named embedding model.
MODEL_PATH = Path("../models/topic_model")
topic_model = BERTopic.load(str(MODEL_PATH), embedding_model="Maltehb/-l-ctra-danish-electra-small-cased")

Some weights of the model checkpoint at C:\Users\jhr/.cache\torch\sentence_transformers\Maltehb_-l-ctra-danish-electra-small-cased were not used when initializing ElectraModel: ['generator.encoder.layer.11.output.LayerNorm.bias', 'discriminator_predictions.LayerNorm.weight', 'generator.encoder.layer.8.attention.self.query.bias', 'generator.encoder.layer.1.attention.self.query.bias', 'generator.encoder.layer.7.output.dense.weight', 'generator.encoder.layer.7.intermediate.dense.bias', 'generator.encoder.layer.1.attention.self.value.bias', 'generator.encoder.layer.10.intermediate.dense.bias', 'generator.encoder.layer.2.attention.self.key.bias', 'generator.encoder.layer.8.attention.output.LayerNorm.weight', 'generator.encoder.layer.4.attention.output.LayerNorm.bias', 'generator.encoder.layer.8.output.dense.weight', 'generator.encoder.layer.9.attention.self.key.bias', 'generator.encoder.layer.7.intermediate.dense.weight', 'generator.encoder.layer.6.attention.self.query.bias', 'generator.enc

In [7]:
# Topic assignment and its probability per row, pulled out of the table for use in find_centroid.
# NOTE(review): find_centroid masks `embeddings` with `topics`, so this assumes the rows of
# doc_topics.csv line up one-to-one with the rows of `embeddings` — confirm.
topics = doc_topics["topic"].values
probs = doc_topics["prob"].values

In [17]:
# Build one centroid per topic id in 0..max_topic; the topic id doubles as the row index.
unique_topics = get_topic_range(topic_model)
centroids = np.zeros((len(unique_topics),embeddings.shape[1])) # Centroids need dimensions (number of topics, embedding-dimensionality)
for i in unique_topics:
    # Each row starts at zero, so adding the centroid fills the row with it.
    centroids[i, :] += find_centroid(embeddings, topics, probs, i)

In [18]:
# Sanity check: expect (number of topics, embedding dimensionality).
centroids.shape

(5, 256)

### Pipeline for calculating features
1. Calculate the topic centroids
2. Calculate the paragraph-centroid similarity for each paragraph (in `embeddings_dict`)
3. Average the similarities over the paragraphs of each document

## Steps 2 and 3

In [33]:
# Add the centroid similarity array to each document's entry.
# Documents without any paragraph embeddings are dropped first.
filtered_embedding_dict = remove_empty_embeddings(embeddings_dict)
# "raw" keeps the paragraph embeddings; "dist" holds the per-centroid features from
# featurize_document (mean cosine similarities, despite the key name).
embeddings_dict_new = {k: {"raw": v, "dist": featurize_document(v, centroids)} for k, v in filtered_embedding_dict.items()}

In [36]:
# Spot-check one document: the feature vector should have one entry per centroid,
# and the raw embedding one row per paragraph.
example_key = "D_2298497"
example_embedding = embeddings_dict[example_key]
print(embeddings_dict_new[example_key]["dist"].shape)
print(example_embedding.shape)

(5,)
(4, 256)


In [31]:
# Collect the documents that have no paragraph embeddings (zero rows) and show how many there are.
empty_embeddings = [embedding for embedding in embeddings_dict.values() if embedding.shape[0] == 0]
len(empty_embeddings)

427

In [25]:
# Shape check of the raw similarity matrix for the example document before averaging:
# one row per paragraph, one column per centroid.
cosine_similarity(example_embedding, centroids).shape

(4, 5)

In [30]:
# NOTE(review): this displays the whole dict of embedding arrays, which produces a very
# large output; consider showing only a small sample before sharing the notebook.
embeddings_dict

{'D_2298497': array([[-0.12821189,  0.04971012,  0.19977432, ...,  0.01962287,
         -0.41259784, -0.11253516],
        [-0.30515921,  0.06334347,  0.26476905, ...,  0.19317919,
         -0.3113322 , -0.11021709],
        [-0.41798106,  0.14391242,  0.30698317, ...,  0.16588725,
         -0.30324835, -0.05334198],
        [-0.3382591 ,  0.10470629,  0.2951532 , ...,  0.09454307,
         -0.31594926, -0.10104408]]),
 'D_2298401': array([[-2.76633769e-01,  1.07304685e-01,  2.70012796e-01,
          4.23992351e-02, -2.79958505e-04, -6.34310901e-01,
         -8.39758292e-03, -9.54674661e-01, -3.27428840e-02,
          1.68590143e-01,  2.74170369e-01,  1.58608496e-01,
         -1.40138343e-01,  2.50500709e-01, -1.56263545e-01,
          1.10131994e-01, -2.16990232e-01, -2.96362769e-02,
          1.92195810e-02, -4.13691178e-02, -5.61120473e-02,
         -5.95838763e-02, -1.92649946e-01,  3.33001405e-01,
          2.94563323e-02,  4.67413962e-02, -6.02369457e-02,
         -8.47788006e-02