# Bringing it all together!
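This notebook brings the pipeline together: it computes topic centroids, featurizes each document by its similarity to those centroids, and looks up each document's nearest neighbour.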

In [34]:
import random
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from umap import UMAP
from bertopic import BERTopic
from pathlib import Path

In [35]:
def weighted_mean(X, weights):
    """ Weighted mean over the rows of X

    Arguments:
        X: array with one observation per row
        weights: one weight per row of X
    returns:
        The weight-scaled sum of the rows divided by the sum of the weights
    """
    weighted_sum = np.dot(X.T, weights)
    total_weight = np.sum(weights)
    return weighted_sum / total_weight


def get_unique_topics(topic_model):
    """ Returns the distinct values of the "Topic" column of the model's topic info """
    topic_column = topic_model.get_topic_info()["Topic"]
    return topic_column.unique()


def get_topic_range(topic_model):
    """ Returns the list [0, 1, ..., max_topic], where max_topic is the largest
    value in the "Topic" column of the model's topic info.
    Negative topic ids are therefore never part of the range.
    """
    highest_topic = np.max(topic_model.get_topic_info()["Topic"])
    return list(range(highest_topic + 1))
    

def find_centroid(embeddings: np.ndarray, topics: np.ndarray, probs: np.ndarray, target_topic: int):
    """
    Finds the probability-weighted centroid of the embeddings assigned to a topic.

    Arguments:
        embeddings: 2d with dimensions (num_documents, num_dimensions)
        topics: array-like of length num_documents with the assigned topic per document
        probs: array-like of length num_documents showing the probability of the assigned topic
        target_topic: the topic, we want to find the centroid for
    returns: 
        The centroid for the cluster (1d, length num_dimensions).
        If nothing is assigned to target_topic the weights sum to zero,
        so the division in weighted_mean yields NaN values.
    """
    # Coerce to arrays: on a plain list `topics == target_topic` is a single
    # bool instead of an element-wise mask, which silently breaks the filtering.
    embeddings = np.asarray(embeddings)
    topics = np.asarray(topics)
    probs = np.asarray(probs)

    # Build the mask once and reuse it for both the embeddings and the weights
    in_topic = topics == target_topic

    # Calculating the centroid
    return weighted_mean(embeddings[in_topic, :], probs[in_topic])

def calc_cosine_sim(centroids, embedding):
    """ 
    Cosine similarity of one embedding against every centroid.
    The embedding is reshaped into a single row before the comparison.
    """
    embedding_row = embedding.reshape(1, -1)
    return cosine_similarity(centroids, embedding_row)

def read_pickle(file_path):
    """ Loads and returns the object stored in the pickle file at file_path.

    NOTE: unpickling can execute arbitrary code - only use this on trusted files.
    """
    with open(file_path, mode="rb") as handle:
        loaded = pickle.load(handle)
    return loaded

def flatten_embeddings(embedding_dict):
    """ Creates a big matrix with all the embeddings from the dict

    Arguments:
        embedding_dict (dict): maps keys to arrays that share the same number of columns
    returns:
        np.array with the dict's arrays stacked row-wise, in dict iteration order
    """
    # np.vstack requires a real sequence; a dict view is a non-sequence iterable,
    # which is deprecated since NumPy 1.16 and rejected by recent releases.
    return np.vstack(list(embedding_dict.values()))

def featurize_document(doc_embeddings, centroids):
    """ Calculates the mean similarity to each centroid for a document
    Arguments: 
        doc_embeddings (np.array): the paragraph embeddings for the document with shape (n_paragraphs, embedding_dim)
        centroids (np.array): centroid embeddings with shape (n_topics, embedding_dim)
    returns: 
        np.array of size (n_topics, ) with the cosine similarity to each centroid,
        averaged over the document's paragraphs
    """
    # One row per paragraph, one column per centroid
    paragraph_similarities = cosine_similarity(doc_embeddings, centroids)
    return paragraph_similarities.mean(axis=0)

def remove_empty_embeddings(embedding_dict):
    """ Returns a new dict without the entries whose array has zero rows """
    non_empty = {}
    for doc_id, doc_embeddings in embedding_dict.items():
        if doc_embeddings.shape[0] == 0:
            continue
        non_empty[doc_id] = doc_embeddings
    return non_empty

    
def find_closest_index(featurized_doc, tree):
    """ Returns the tree index of nearest neighbour of a document

    The two closest points are queried and the second one is returned,
    i.e. the closest hit is skipped (presumably because it is the document itself).
    """
    _, neighbour_indices = tree.query(featurized_doc, 2)
    return neighbour_indices[1]

def index_to_key(idx, index_dict):
    """ Looks up the key stored for a tree index in index_dict """
    key = index_dict[idx]
    return key


def get_best_match(feautrized_doc, tree, index_dict):
    """ Returns the key of the nearest neighbour of a featurized document

    Arguments:
        feautrized_doc: feature vector of the query document
        tree: tree that is queried for the nearest neighbour
        index_dict: maps tree indices to keys
    """
    return index_to_key(find_closest_index(feautrized_doc, tree), index_dict)

In [36]:
# Loading data
DATA_DIR = Path("../../BscThesisData/data")
# Table of topic assignments; its "topic" and "prob" columns are read further down
doc_topics = pd.read_csv(DATA_DIR / "doc_topics.csv")
# Dict keyed by document id; the values are arrays of paragraph embeddings
embeddings_dict = read_pickle(DATA_DIR / "embedding_dict.pkl")

# Stack all arrays of the dict into one big matrix.
# NOTE(review): find_centroid masks this matrix with the topics from doc_topics,
# so its rows are assumed to line up with the rows of doc_topics - confirm.
embeddings = flatten_embeddings(embeddings_dict)

In [37]:
# Loading topic model
MODEL_PATH = Path("../models/topic_model")
# The path is converted to a plain string and the embedding model is passed by name
# (presumably because it is not stored together with the saved model - TODO confirm)
topic_model = BERTopic.load(str(MODEL_PATH), embedding_model="Maltehb/-l-ctra-danish-electra-small-cased")

Some weights of the model checkpoint at C:\Users\jhr/.cache\torch\sentence_transformers\Maltehb_-l-ctra-danish-electra-small-cased were not used when initializing ElectraModel: ['generator.encoder.layer.8.attention.self.key.weight', 'generator.encoder.layer.1.attention.self.key.weight', 'generator.encoder.layer.9.output.dense.bias', 'generator.embeddings_project.bias', 'generator.encoder.layer.8.output.dense.bias', 'generator.encoder.layer.10.attention.self.key.bias', 'generator.encoder.layer.5.attention.output.LayerNorm.bias', 'generator.encoder.layer.5.output.LayerNorm.bias', 'generator.encoder.layer.9.attention.output.LayerNorm.weight', 'generator.encoder.layer.6.attention.output.LayerNorm.weight', 'discriminator_predictions.dense.weight', 'generator.encoder.layer.0.attention.output.dense.bias', 'generator.encoder.layer.3.attention.output.dense.bias', 'generator.encoder.layer.4.output.dense.bias', 'discriminator_predictions.LayerNorm.weight', 'generator.encoder.layer.4.attention.out

In [38]:
# Pull the assigned topic and its probability out as plain numpy arrays
topics = doc_topics["topic"].to_numpy()
probs = doc_topics["prob"].to_numpy()

In [39]:
# Topic ids run from 0 to the highest topic id (see get_topic_range),
# so a topic id doubles as the row index into `centroids`.
unique_topics = get_topic_range(topic_model)
centroids = np.zeros((len(unique_topics),embeddings.shape[1])) # Centroids need dimensions (number of topics, embedding-dimensionality)
for i in unique_topics:
    # The row starts out as zeros, so `+=` simply stores the centroid of topic i
    centroids[i, :] += find_centroid(embeddings, topics, probs, i)

In [40]:
# Sanity check: should be (number of topics, embedding dimensionality)
centroids.shape

(5, 256)

### Pipeline for calculating features
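1. Calculate the centroid of each topic
2. Calculate the paragraph-centroid similarity for each paragraph (in `embeddings_dict`)
3. Average the similarities over the paragraphs of each document, giving one feature vector per document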

## Steps 2 and 3

In [41]:
# add centroid similarity array to dict
# Entries whose embedding array has zero rows are dropped first
filtered_embedding_dict = remove_empty_embeddings(embeddings_dict)
# Per document: "raw" keeps the paragraph embeddings, "dist" holds the output of
# featurize_document (a mean cosine *similarity* per centroid, despite the key name)
embeddings_dict_new = {k: {"raw": v, "dist": featurize_document(v, centroids)} for k, v in filtered_embedding_dict.items()}

In [42]:
# Spot check on a single document
example_key = "D_2298497"
example_embedding = embeddings_dict[example_key]
# Shape of the document's feature vector (one value per centroid)
print(embeddings_dict_new[example_key]["dist"].shape)
# Shape of the document's raw paragraph embeddings
print(example_embedding.shape)

(5,)
(4, 256)


In [43]:
# Similarity matrix before averaging: one row per paragraph, one column per centroid
cosine_similarity(example_embedding, centroids).shape

(4, 5)

### Finding nearest neighbours 
Following [this SO](https://stackoverflow.com/a/32446753)

In [44]:
# Create flat feature structure: one row per document, one column per topic.
# np.vstack needs a real sequence (list/tuple); passing a generator is
# deprecated since NumPy 1.16 and raises a TypeError on recent releases.
doc_features = np.vstack([v["dist"] for v in embeddings_dict_new.values()])
assert doc_features.shape[1] == len(unique_topics)

# create KDTree for nearest-neighbour lookups in the feature space
tree = spatial.KDTree(doc_features)

### Note on structure!
It's becoming a bit of a pain not having proper indices, so I might have to change the embedding dict to have some numerical keys.
This could feasibly be an extra key, or possibly the dict could be changed to a pd.DataFrame. Alternatively, I could just have a doc-index dict (which is easier to create).

In [45]:
# Row number -> document id. The rows of doc_features were built by iterating
# embeddings_dict_new in this same order, so row i belongs to the i-th key.
doc_index_dict = dict(enumerate(embeddings_dict_new))

In [46]:
# Document id of the nearest neighbour of the first row of doc_features
get_best_match(doc_features[0], tree, doc_index_dict)

'D_2242034'

In [47]:
# Feature vector of the second document (one similarity value per topic)
doc_features[1]

array([0.96871278, 0.07983996, 0.96061683, 0.86443179, 0.91594289])

### Experimenting with the shizzle
NB: The embeddings haven't been updated yet, so results might vary

Let's sanity check some of these delicious matches

In [48]:
# Loading data
# Dict keyed by document id, used to look up the text behind a match
paragraph_dict = read_pickle(DATA_DIR / "paragraph_dict.pkl")
# Show the paragraphs of the example document
print(paragraph_dict[example_key])

['Sagen angik en skønsmæssig forhøjelse af sagsøgerens skattepligtige indkomst for indkomstårene 2013 og 2014, som skattemyndighederne havde foretaget på grundlag af en privatforbrugsopgørelse for sagsøgerens husstand, der udviste et negativt privatforbrug for husstanden i de pågældende indkomstår. ', 'Retten fandt det ikke godtgjort, at sagsøgeren i årene 2000-2012 havde sparet 600.000-700.000 kr. op i kontanter, som han havde brugt til at finansiere sit privatforbrug i 2013 og 2014. ', 'Retten fandt det heller ikke godtgjort, at SKATs skønsmæssige ansættelse af husstandens privatforbrug, som var baseret på dels faktiske konstaterede udgifter, dels et skøn, var fastsat for højt. ', 'Endelig fandt retten det ikke godtgjort, at en del af den skønsmæssige forhøjelse, som var foretaget på baggrund af en opgørelse af husstandens privatforbrug, skulle henføres til sagsøgerens ægtefælle og ikke som sket udelukkende til sagsøgeren. ']


In [49]:
# Print the best match for the first three documents.
# The per-document vector gets its own name: rebinding `doc_features` here would
# overwrite the global feature matrix that `tree` was built from and break
# re-running the earlier cells that index into it.
for doc_id, items in list(embeddings_dict_new.items())[:3]:
    query_features = items["dist"]
    print(f"query text: {paragraph_dict[doc_id]}")
    best_match_id = get_best_match(query_features, tree, doc_index_dict)
    print(f"matches with {best_match_id}")
    print(paragraph_dict[best_match_id])
    print("\n")

query text: ['Sagen angik en skønsmæssig forhøjelse af sagsøgerens skattepligtige indkomst for indkomstårene 2013 og 2014, som skattemyndighederne havde foretaget på grundlag af en privatforbrugsopgørelse for sagsøgerens husstand, der udviste et negativt privatforbrug for husstanden i de pågældende indkomstår. ', 'Retten fandt det ikke godtgjort, at sagsøgeren i årene 2000-2012 havde sparet 600.000-700.000 kr. op i kontanter, som han havde brugt til at finansiere sit privatforbrug i 2013 og 2014. ', 'Retten fandt det heller ikke godtgjort, at SKATs skønsmæssige ansættelse af husstandens privatforbrug, som var baseret på dels faktiske konstaterede udgifter, dels et skøn, var fastsat for højt. ', 'Endelig fandt retten det ikke godtgjort, at en del af den skønsmæssige forhøjelse, som var foretaget på baggrund af en opgørelse af husstandens privatforbrug, skulle henføres til sagsøgerens ægtefælle og ikke som sket udelukkende til sagsøgeren. ']
matches with D_2242034
['Sagsøgerne et selskab