### Imports

In [8]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

import pyLDAvis.gensim_models

import numpy as np
import logging
import warnings
warnings.filterwarnings('ignore')

from numpy import array

from nlp import *

### Setting up Logger

In [9]:
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

DEBUG:root:test


### Set up corpus

In [10]:
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]

vocab = Dictionary(texts)
bow_corpus = [vocab.doc2bow(text) for text in texts]

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary<0 unique tokens: []>
INFO:gensim.corpora.dictionary:built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
DEBUG:gensim.utils:starting a new internal lifecycle event log for Dictionary
INFO:gensim.utils:Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2023-01-31T12:29:20.027856', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]', 'platform': 'Linux-5.15.0-58-generic-x86_64-with-glibc2.35', 'event': 'created'}


### train LDA models

In [11]:
goodLdaModel = LdaModel(corpus=bow_corpus, id2word=vocab, iterations=50, num_topics=2)
badLdaModel = LdaModel(corpus=bow_corpus, id2word=vocab, iterations=1, num_topics=2)

INFO:gensim.models.ldamodel:using symmetric alpha at 0.5
INFO:gensim.models.ldamodel:using symmetric eta at 0.5
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (single-pass) LDA training, 2 topics, 1 passes over the supplied corpus of 9 documents, updating model once every 9 documents, evaluating perplexity every 9 documents, iterating 50x with a convergence threshold of 0.001000
DEBUG:gensim.models.ldamodel:bound: at document #0
INFO:gensim.models.ldamodel:-3.305 per-word bound, 9.9 perplexity estimate based on a held-out corpus of 9 documents with 29 words
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #9/9
DEBUG:gensim.models.ldamodel:performing inference on a chunk of 9 documents
DEBUG:gensim.models.ldamodel:2/9 documents converged within 50 iterations
DEBUG:gensim.models.ldamodel:updating topics
INFO:gensim.models.ldamodel:topic #0 (0.500): 0.149*"graph" + 0.116*"minors" + 0.091*"human" + 0.090*"trees" + 0.082

### Get Coherence Model - `u_mass`

In [12]:
good_cm = CoherenceModel(model=goodLdaModel, corpus=bow_corpus, dictionary=vocab, coherence='u_mass')
bad_cm = CoherenceModel(model=badLdaModel, corpus=bow_corpus, dictionary=vocab, coherence='u_mass')

DEBUG:gensim.models.coherencemodel:Setting topics to those of the model: LdaModel<num_terms=12, num_topics=2, decay=0.5, chunksize=2000>
DEBUG:gensim.models.coherencemodel:Setting topics to those of the model: LdaModel<num_terms=12, num_topics=2, decay=0.5, chunksize=2000>


### Get Coherence Model - `c_v`

In [13]:
good_cm = CoherenceModel(model=goodLdaModel, texts=texts, dictionary=vocab, coherence='c_v')
bad_cm = CoherenceModel(model=badLdaModel, texts=texts, dictionary=vocab, coherence='c_v')

DEBUG:gensim.models.coherencemodel:Setting topics to those of the model: LdaModel<num_terms=12, num_topics=2, decay=0.5, chunksize=2000>
DEBUG:gensim.models.coherencemodel:Setting topics to those of the model: LdaModel<num_terms=12, num_topics=2, decay=0.5, chunksize=2000>


### Print Coherence Values

In [None]:
print(good_cm.get_coherence())
print(bad_cm.get_coherence())

### Visualise Model

In [15]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(goodLdaModel, bow_corpus, vocab)

DEBUG:gensim.models.ldamodel:performing inference on a chunk of 9 documents
DEBUG:gensim.models.ldamodel:9/9 documents converged within 50 iterations


In [16]:
pyLDAvis.gensim_models.prepare(badLdaModel, bow_corpus, vocab)

DEBUG:gensim.models.ldamodel:performing inference on a chunk of 9 documents
DEBUG:gensim.models.ldamodel:0/9 documents converged within 1 iterations
