# Quantitative Evaluations of Topic Models
This notebook quantitatively evaluates the quality of the topic models. It mainly serves as a running benchmark of my models, plus a comparison to LDA (once that is set up).

### Learnings:
- It is a bit of a hassle to convert sklearn / BERTopic objects into the corpus and dictionary formats that gensim expects.
- Remember to check that the corpus and the topics are consistent with each other (if they are not, re-run the pipeline)!
- 

In [110]:
import pickle
from bertopic import BERTopic
from pathlib import Path
from gensim.models.coherencemodel import CoherenceModel
from gensim.matutils import Sparse2Corpus
from gensim.corpora import Dictionary

In [111]:
def read_pickle(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and closed again before returning.
    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(file_path, mode="rb") as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object

def pickle_object(obj, file_path):
    """Pickle ``obj`` and write it to ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    location is overwritten. Returns ``None``.
    """
    with open(file_path, mode="wb") as pickle_file:
        pickle.dump(obj, pickle_file)

def flatten_list(lst):
    """Flatten one level of nesting: concatenate the elements of every sub-iterable in ``lst``.

    Deeper levels of nesting are left untouched.
    """
    flattened = []
    for sublist in lst:
        flattened.extend(sublist)
    return flattened
    
def get_paragraphs(paragraph_dict):
    """Return all values of ``paragraph_dict`` flattened into a single list.

    Each value is expected to be an iterable of paragraphs; the keys are ignored.
    """
    paragraphs_per_key = list(paragraph_dict.values())
    return flatten_list(paragraphs_per_key)
    
def is_lower(s):
    """Return True when lowercasing ``s`` leaves it unchanged.

    Unlike ``str.islower``, strings without any cased characters
    (e.g. ``""`` or ``"123"``) also count as lower.
    """
    lowered = s.lower()
    return lowered == s

In [112]:
# Relative locations of the saved model artifacts and of the thesis data
MODEL_PATH = Path("../models/")
DATA_DIR = Path("../../BscThesisData/data")

# Load the saved BERTopic model, passing the embedding model explicitly by its identifier
topic_model = BERTopic.load(
    str(MODEL_PATH / "topic_model"),
    embedding_model="Maltehb/-l-ctra-danish-electra-small-cased",
)

Some weights of the model checkpoint at C:\Users\jhr/.cache\torch\sentence_transformers\Maltehb_-l-ctra-danish-electra-small-cased were not used when initializing ElectraModel: ['generator.encoder.layer.7.attention.output.LayerNorm.bias', 'generator.encoder.layer.8.attention.self.value.weight', 'generator.encoder.layer.7.attention.output.dense.bias', 'generator.encoder.layer.9.attention.self.key.bias', 'generator.encoder.layer.1.intermediate.dense.bias', 'generator.encoder.layer.5.attention.self.query.weight', 'generator.encoder.layer.3.output.dense.bias', 'generator.encoder.layer.10.attention.output.dense.bias', 'generator.encoder.layer.9.intermediate.dense.bias', 'generator.encoder.layer.2.output.LayerNorm.bias', 'generator.encoder.layer.4.attention.self.value.bias', 'generator.encoder.layer.9.attention.self.query.bias', 'generator.encoder.layer.2.attention.self.key.bias', 'discriminator_predictions.LayerNorm.bias', 'generator.encoder.layer.11.attention.output.LayerNorm.bias', 'gener

In [113]:
# Load the pickled paragraph dict and flatten its values into one list of documents
clean_paragraphs = read_pickle(DATA_DIR / "paragraph_dict.pkl")
paragraph_list = get_paragraphs(clean_paragraphs)
# Load the pickled vectorizer (presumably the sklearn vectorizer used by the topic model — TODO confirm)
vectorizer = read_pickle(MODEL_PATH / "vectorizer.pkl")
# NOTE(review): fit_transform re-fits the loaded vectorizer on paragraph_list, so any vocabulary
# stored in the pickle is replaced — confirm this is intended rather than a plain transform().
X = vectorizer.fit_transform(paragraph_list)

In [114]:
# Wrap the sparse matrix as a gensim corpus; documents_columns=False means documents are the rows
corpus_vect_gensim = Sparse2Corpus(X, documents_columns=False)
# The vectorizer stores word -> id, while gensim needs id -> word, so the mapping is inverted
dictionary = Dictionary.from_corpus(
    corpus_vect_gensim,
    id2word={index: term for term, index in vectorizer.vocabulary_.items()},
)

In [123]:
# Tokenize every paragraph with the vectorizer's own tokenizer, lowercase the tokens
# and drop the vectorizer's stop words.
tokenizer = vectorizer.build_tokenizer()
# get_stop_words() resolves the configured stop words once into a set-like collection
# (or None when no stop words are configured), giving constant-time lookups and
# avoiding a membership test against None or against a plain string.
stop_word_set = vectorizer.get_stop_words() or frozenset()
tokenized_docs = [
    [token for token in (word.lower() for word in tokenizer(doc)) if token not in stop_word_set]
    for doc in paragraph_list
]

In [128]:
# Getting the topics: each topic is reduced to the first item of its entries (the word),
# and topic -1 (BERTopic's outlier topic) is skipped.
topics = topic_model.get_topics()
topic_list = [[item[0] for item in topic] for key, topic in topics.items() if key != -1]

# Sanity checks: topic_list must be a list of lists of strings.
# Only topic -1 is dropped, and only when the model actually has one.
assert len(topic_list) == len(topics) - int(-1 in topics)
assert isinstance(topic_list, list)
assert all(isinstance(topic_words, list) for topic_words in topic_list)
assert all(isinstance(word, str) for topic_words in topic_list for word in topic_words)

In [129]:
# Coherence model with the "c_v" measure, built from the topic words and the tokenized paragraphs
cm = CoherenceModel(
    topics=topic_list,
    coherence="c_v",
    dictionary=dictionary,
    texts=tokenized_docs,
)

In [133]:
# Show the coherence score of each topic.
# NOTE(review): the saved output of this cell lists 5 scores, while the later cells of this
# notebook report 6 topics — the outputs look like they come from different runs; re-run top to bottom.
cm.get_coherence_per_topic()

[0.5138103769278116,
 0.6713135994442715,
 0.6199477050234451,
 0.4090339291266286,
 0.47907132277369946]

In [122]:
# Lowercased vocabulary of the tokenized corpus
all_words = {word.lower() for doc in tokenized_docs for word in doc}

# For every topic, print the topic words that do not occur anywhere in the corpus vocabulary
for topic in topic_list:
    topic_words = set(topic)
    print(f"superflous words: {topic_words - all_words}")

superflous words: set()
superflous words: set()
superflous words: set()
superflous words: set()
superflous words: set()
superflous words: set()


In [109]:
# Check that coherence can still be computed after dropping topic words missing from the corpus
clean_topic_list = [
    [word for word in topic if word in all_words]
    for topic in topic_list
]
cm = CoherenceModel(
    topics=clean_topic_list,
    coherence="c_v",
    dictionary=dictionary,
    texts=tokenized_docs,
)
# Per-topic scores first, then the aggregated coherence
print(cm.get_coherence_per_topic())
print(cm.get_coherence())

[0.3043144840452862, 0.27015012488366497, 0.4865898101777429, 0.6728787468629828, 0.41105431905355916, 0.416226571950254]
0.4268690094955816


It does!