### Critical note:
Topic modelling was optimised on the NON-DEDUPLICATED corpus; however, this resulted in more "reasonable" topics than implemented on the deduplicated one, so the model used below comes from the full cleaned but not deduplicated corpus. It is being APPLIED in this document to the deduplicated corpus.

The optimal model on the deduplicated corpus had 18 topics and is also provided.

In [1]:
import pathlib
from utils import get_projectpaths
(projectroot, rawdatapath, cleandatapath, processeddatapath) = get_projectpaths()
import re
import numpy as np
import pandas as pd
# silence annoying warning
pd.options.mode.chained_assignment = None  # default='warn'
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
# %matplotlib inline
# Plotting tools
import pyLDAvis
# may need more from here https://stackoverflow.com/questions/66759852/no-module-named-pyldavis
import pyLDAvis.gensim_models as gensimvis
# Enable logging for gensim - optional
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import pickle
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# spacy for lemmatization
import spacy
from pprint import pprint

In [2]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatize(texts, allowed_postags=['PROPN','NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations



# WORK BLOCK ------------------------------------------------
# %% load data and obesity names
corpusdf = pd.read_pickle(processeddatapath/'corpusdf_deduped_by_source.pickle')



# %% 
# Convert body to list
bodies = corpusdf.body.values.tolist()
# Remove new lines
bodies = [re.sub('\s+', ' ', sent) for sent in bodies]
# Remove single quotes
bodies = [re.sub("\'", "", sent) for sent in bodies]
# Remove double quotes
bodies = [re.sub('"', "", sent) for sent in bodies]
bodies_words = list(sent_to_words(bodies))
# Remove Stop Words
bodies_words_nostops = remove_stopwords(bodies_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun/propn, adj, vb, adv
bodies_lemmatized = lemmatize(bodies_words_nostops, allowed_postags=['PROPN','NOUN', 'ADJ', 'VERB', 'ADV'])



# %% Create Dictionary
corpusdict = corpora.Dictionary(bodies_lemmatized)
# print how many words are in the dictionary
# gensim mem usage will be 24 * num_topics * this
print(corpusdict)


# Term Document Frequency
corpus = [corpusdict.doc2bow(text) for text in bodies_lemmatized]


# %%
lda_model =  gensim.models.LdaModel.load(str(processeddatapath/'topicmodels/full_m_17_model'))


# %% Print the Keyword in the topics
pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]


# %% Compute Perplexity
# Perplexity:  -8.685147984244903
# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.




Dictionary(95737 unique tokens: ['accord', 'adult', 'ask', 'authoritie', 'balance']...)
[(0,
  '0.010*"people" + 0.007*"year" + 0.006*"time" + 0.006*"work" + 0.005*"say" + '
  '0.005*"need" + 0.005*"many" + 0.005*"well" + 0.005*"do" + 0.005*"life"'),
 (1,
  '0.022*"study" + 0.016*"say" + 0.013*"people" + 0.011*"find" + '
  '0.011*"researcher" + 0.010*"research" + 0.009*"university" + 0.008*"brain" '
  '+ 0.008*"exercise" + 0.008*"eat"'),
 (2,
  '0.023*"say" + 0.021*"patient" + 0.011*"pain" + 0.010*"treatment" + '
  '0.009*"liver" + 0.009*"condition" + 0.008*"surgery" + 0.008*"year" + '
  '0.008*"dr" + 0.008*"replacement"'),
 (3,
  '0.025*"food" + 0.018*"diet" + 0.016*"eat" + 0.016*"fat" + 0.013*"sugar" + '
  '0.011*"vitamin" + 0.011*"fruit" + 0.011*"say" + 0.009*"low" + '
  '0.008*"cancer"'),
 (4,
  '0.015*"get" + 0.014*"say" + 0.013*"dog" + 0.011*"do" + 0.009*"go" + '
  '0.008*"exercise" + 0.008*"people" + 0.008*"time" + 0.008*"day" + '
  '0.007*"run"'),
 (5,
  '0.008*"get" + 0.007*"s

In [3]:
# %%
import pyLDAvis.gensim_models as gensim_models
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensim_models.prepare(lda_model, corpus, corpusdict)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
