## Sentence pairing

In [None]:
import pandas as pd
from bertopic import BERTopic

In [None]:
# Load the snakeviz IPython extension; it provides the %%snakeviz profiling
# cell magic that appears (commented out) in the pairing cell further down.
%load_ext snakeviz

In [None]:
# Restore variables previously persisted with IPython's %store magic.
# Below, tp_mo / tp_es are used as topic models (get_document_info,
# get_topic_info) and mo_tekster / tekster as the documents passed to them.
%store -r tp_mo
%store -r tp_es
%store -r mo_tekster
%store -r tekster

In [None]:
# Using the tp_es that includes stopwords.
# Per-document topic information from each model; the columns used later in
# this notebook are 'Document', 'Name' and 'Probability'.
df_mo = tp_mo.get_document_info(mo_tekster)
df_es = tp_es.get_document_info(tekster)

# Stack both corpora. ignore_index=True gives the combined frame a fresh,
# unique row index instead of two overlapping 0..n ranges.
df_tekster = pd.concat([df_mo, df_es], ignore_index=True)

# Keep only documents assigned to their topic with probability > 0.85.
# .copy() makes the result an independent frame, so later column assignments
# do not trigger pandas' SettingWithCopyWarning.
df_tekster = df_tekster.loc[df_tekster['Probability'] > 0.85].copy()

topics_es = tp_es.topic_labels_
topics_mo = tp_mo.topic_labels_

# Combined number of topic labels across both models (displayed as cell output).
n_topics = len(topics_es) + len(topics_mo)
n_topics

In [None]:
# Topic names ("Name" column of each model's topic overview), ES model first.
es_topic_info = tp_es.get_topic_info()
mo_topic_info = tp_mo.get_topic_info()

es_topics = es_topic_info["Name"].tolist()
mo_topics = mo_topic_info["Name"].tolist()

# Single list of every topic name to iterate over when pairing sentences.
topics = [*es_topics, *mo_topics]

In [None]:
# Empty result table that the pairing loop appends to.
df = pd.DataFrame(
    columns=[
        's1',     # first sentence of the pair
        's2',     # second sentence of the pair
        'sim',    # cosine similarity between s1 and s2
        'topic',  # topic the pair belongs to
    ],
)

In [None]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
# Sentence-embedding model used to encode sentences before scoring their
# pairwise cosine similarity.
model = SentenceTransformer(
    'NbAiLab/nb-sbert-base',
)

In [None]:
# Keep only the columns needed for pairing and replace each document's text
# with the list of fragments obtained by splitting it on ".".
df_tekster = df_tekster[["Document", "Name", "Probability"]].assign(
    Document=lambda frame: frame["Document"].str.split(".")
)

In [None]:
import numpy as np

In [None]:
#%%snakeviz

# Maximum number of highest-scoring sentence pairs kept per topic.
TOP_K = 100

# Per-topic result frames; concatenated once after the loop instead of
# growing `df` inside it (repeated pd.concat copies all earlier rows).
topic_frames = []

for topic in tqdm(topics):
    df_topic = df_tekster.loc[df_tekster['Name'] == topic]

    # Flatten the per-document fragment lists into one list of sentences.
    # Splitting on "." leaves empty / whitespace-only fragments; drop them,
    # otherwise they pair with each other at similarity 1.0.
    sentences = [
        fragment.strip()
        for fragments in df_topic["Document"]
        for fragment in fragments
        if fragment.strip()
    ]

    # At least two sentences are needed to form a pair.
    n_sentences = len(sentences)
    if n_sentences < 2:
        continue

    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Pairwise cosine similarity scores. .cpu() is a no-op for CPU tensors
    # and is required before .numpy() when the model runs on a GPU.
    scores = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

    # The matrix is symmetric: mask the diagonal (self-similarity) and the
    # lower triangle so every unordered pair is considered exactly once.
    scores[np.tri(n_sentences, dtype=bool)] = -np.inf

    # Never ask for more pairs than exist (argpartition raises otherwise).
    k = min(TOP_K, n_sentences * (n_sentences - 1) // 2)

    # Indices of the k highest scores, then ordered by decreasing score.
    flat_idx = np.argpartition(scores, -k, axis=None)[-k:]
    flat_idx = flat_idx[np.argsort(scores.ravel()[flat_idx])[::-1]]
    first_idx, second_idx = np.unravel_index(flat_idx, scores.shape)

    topic_frames.append(
        pd.DataFrame(
            {
                "s1": [sentences[a] for a in first_idx],
                "s2": [sentences[b] for b in second_idx],
                "sim": scores[first_idx, second_idx].astype(float),
                # The topic name, not a sentence index.
                "topic": topic,
            }
        )
    )

# Append the new rows to whatever `df` already holds. Empty frames are left
# out of the concat, and ignore_index gives a clean 0..n-1 index without
# adding a stray "index" column to the exported CSV.
non_empty_frames = [frame for frame in (df, *topic_frames) if not frame.empty]
if non_empty_frames:
    df = pd.concat(non_empty_frames, ignore_index=True)

In [None]:
# Serialize the sentence pairs as CSV. No path is given, so to_csv returns the
# CSV text (shown as the cell output) rather than writing a file.
# `header` expects a bool or a list of column aliases, hence True, not 'true'.
df.to_csv(index=False, encoding='utf-8', header=True)