# Measures of Topic Coherence for NMF feature space

In [1]:
import os
import sys
# Put the parent of the current working directory on sys.path (once) so that
# packages living next to this notebook's folder can be imported.
parent_dr = os.path.dirname(os.getcwd())
already_on_path = parent_dr in sys.path
if not already_on_path:
    sys.path.append(parent_dr)

In [2]:
from sklearn.decomposition import NMF
from core.data.training_data import *

In [3]:
# Path to the serialized training data: <parent_dr>/core/resources/<file_name>.
# NOTE(review): the file name suggests a pickled, tokenized arXiv subset of
# 15,540 documents -- confirm against the data-preparation step.
file_name = "tokenized_arxiv_subset_15540.pkl"
full_path = os.path.join(parent_dr, "core", "resources", file_name)

# Wrapper for the training data and computed matrices. Later cells read
# train_df, data_df, tfidf_train_matrix and index_to_word from this object
# and call its compute_coherence method.
data_obj = TrainingData(full_path)

In [4]:
# Lift pandas' display limits (None = unlimited) so that wide tables and long
# term lists are rendered in full instead of being truncated.
for _display_option in ('display.max_columns',
                        'display.max_rows',
                        'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [5]:
# Checking train split: the training frame should hold 80% of all rows.
# The comparison allows a one-row tolerance instead of exact float equality,
# so a dataset whose size is not a multiple of 5 (where the split has to
# round) does not trip the check. The message reports the actual counts.
n_train, n_total = len(data_obj.train_df), len(data_obj.data_df)
assert abs(n_train / n_total - 0.8) <= 1 / n_total, (
    f"train split is {n_train}/{n_total} rows, expected about 80%"
)

# Do NMF

In [6]:
def compute_nmf(k, A, *, init='nndsvd', max_iter=1000, random_state=1):
    """Fit an NMF model with ``k`` components to the matrix ``A``.

    Parameters
    ----------
    k : int
        Number of components (topics) to extract.
    A : array-like or sparse matrix
        Non-negative matrix to factorize; handed to ``NMF.fit_transform``
        unchanged.
    init : str, keyword-only
        Initialisation scheme forwarded to ``NMF``.
    max_iter : int, keyword-only
        Iteration cap forwarded to ``NMF``.
    random_state : int or None, keyword-only
        Seed forwarded to ``NMF``.

    The keyword-only defaults reproduce the values that used to be
    hard-coded, so existing ``compute_nmf(k, A)`` calls behave as before.

    Returns
    -------
    tuple
        ``(nmf_model, W, H)``: the fitted estimator, the output of
        ``fit_transform(A)``, and the estimator's ``components_``.
    """
    nmf_model = NMF(
        n_components=k,
        init=init,
        max_iter=max_iter,
        random_state=random_state,
    )
    W = nmf_model.fit_transform(A)
    H = nmf_model.components_

    return nmf_model, W, H

In [7]:
def generate_topics(topic_model, index_to_word, top_n_words=15, print_out=False):
    """Build a table of the highest-weighted terms for every topic.

    Parameters
    ----------
    topic_model
        Fitted model exposing ``components_``; each row is treated as one
        topic's weights over the vocabulary.
    index_to_word
        Mapping (or sequence) from a column index of ``components_`` to the
        corresponding term.
    top_n_words : int
        How many terms to keep per topic, strongest first. ``0`` yields an
        empty term list for every topic.
    print_out : bool
        When true, also print each topic's terms as they are generated.

    Returns
    -------
    pandas.DataFrame
        One row per topic with columns ``"Topic"`` (topic index) and
        ``"Terms"`` (list of terms in descending weight order).
    """
    topic_list = []
    for topic_idx, topic in enumerate(topic_model.components_):
        # Reverse to descending order first and truncate afterwards. The
        # previous form, argsort()[-top_n_words:], returned the entire
        # vocabulary for top_n_words == 0 because [-0:] is the full slice.
        ranked_indices = topic.argsort()[::-1][:top_n_words]
        top_n = [index_to_word[i] for i in ranked_indices]
        topic_list.append([topic_idx, top_n])
        if print_out:
            print(f"Topic {topic_idx}:\n{top_n}\n")
    return pd.DataFrame(topic_list, columns=["Topic", "Terms"])

### NMF with k = 7 (i.e. known number of categories)

In [8]:
# Fit a 7-component NMF on data_obj.tfidf_train_matrix (presumably the TF-IDF
# matrix of the training split -- confirm in TrainingData). compute_nmf returns
# the fitted model, the fit_transform output (W_7) and components_ (H_7).
model_7, W_7, H_7 = compute_nmf(k=7, A=data_obj.tfidf_train_matrix)

In [9]:
# generate_topics takes the fitted model as its first argument and the
# index-to-word mapping as its second. The earlier call passed only the
# mapping, which bound it to `topic_model` and left `index_to_word` missing
# (TypeError). Pass the 7-topic model explicitly.
topic_terms = generate_topics(model_7, data_obj.index_to_word, top_n_words=15)

In [10]:
# Display the topic table (columns "Topic" and "Terms") as the cell output.
topic_terms

Unnamed: 0,Topic,Terms
0,0,"[problem, algorithm, search, solution, constraint, solve, method, heuristic, planning, approach, optimization, optimal, plan, time, instance]"
1,1,"[number, study, proceeding, conference, time, result, hold, covid, bind, et, al, frac, word, dataset, set]"
2,2,"[datum, system, technology, student, ai, research, user, information, service, social, study, use, development, paper, design]"
3,3,"[network, brain, neuron, model, neural, spike, activity, dynamic, stimulus, input, connectivity, functional, cell, neuronal, synaptic]"
4,4,"[agent, game, learn, learning, environment, reinforcement, action, task, human, reward, policy, model, state, rl, ai]"
5,5,"[graph, vertex, edge, free, tree, class, problem, polynomial, coloring, set, color, np, bipartite, complete, chordal]"
6,6,"[belief, logic, probability, knowledge, rule, model, theory, reasoning, set, inference, probabilistic, fuzzy, semantic, base, decision]"


In [11]:
# Pull the "Terms" column out as a plain Python list: one list of terms per
# topic, in the same order as the rows of topic_terms.
topics = topic_terms['Terms'].tolist()

In [12]:
# Score the per-topic term lists with data_obj.compute_coherence.
# NOTE(review): the coherence measure and reference corpus are defined inside
# TrainingData and are not visible here -- confirm which metric this reports.
data_obj.compute_coherence(topics)

0.6110867524097108