# Apply Topic Coherence

The purpose of this notebook is to validate the LDA model using topic coherence.

In [1]:
import pandas as pd
from nltk.corpus import stopwords, words
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
import string
import gensim
import numpy as np
from sklearn.model_selection import train_test_split

#### Change Setting Variables

Edit the variables below to configure how the notebook runs.

In [2]:
# Set to True for a quick test run (get_coherences then uses fewer seeds and
# different sampling settings). Set to False to run the script fully.
testing = False
# Path to the gzip-compressed CSV that get_coherences reads (it uses the
# 'comments' and 'best topic' columns). Originally described as the output of
# the Apply VADER Model notebook.
# NOTE(review): the file name suggests it is LDA output -- confirm which notebook produces it.
lda_file = "data/ldaoutput.csv.gz"
# Path to the saved LDA model that is loaded for the coherence evaluation.
lda_model = "final.lda"

#### Generate Dictionary

In [3]:
#Instantiating the WordNet lemmatizer (used by clean() to lemmatize each token)
wnl = WordNetLemmatizer()

In [4]:
#Translation table for str.translate: every ASCII punctuation code point maps
#to None, so those characters are deleted from the text
#https://stackoverflow.com/questions/11692199/string-translate-with-unicode-data-in-python
translate_table = str.maketrans("", "", string.punctuation)

In [5]:
#Part-of-speech tagging NLTK's English word list; the (word, tag) pairs are
#filtered on their tag in a later cell to drop proper nouns
#https://stackoverflow.com/questions/62602646/removing-the-non-english-data
#https://stackoverflow.com/questions/39634222/is-there-a-way-to-remove-proper-nouns-from-a-sentence-using-python/39635503
#NOTE(review): the whole word list is passed to pos_tag as one token sequence --
#confirm the resulting tags are reliable for words taken out of context
tagged_set = pos_tag(words.words("en"))

In [6]:
#Number of (word, tag) pairs produced by the tagger
len(tagged_set)

235886

In [7]:
#Lower-cased vocabulary, skipping every entry tagged as a proper noun (NNP / NNPS)
word_set = {token.lower() for token, pos in tagged_set if pos not in ("NNP", "NNPS")}

In [8]:
#Manually adding three extra terms so that clean() does not filter them out
word_set.update(("covid", "corona", "coronavirus"))

In [9]:
#Vocabulary size after removing proper nouns and adding the extra terms
len(word_set)

209964

In [10]:
#Building the gensim Dictionary from the vocabulary; word_set is wrapped in a
#list so that it is passed as a single document
gensimdict = gensim.corpora.Dictionary([word_set])

In [11]:
#Master stop-word set: a handful of custom entries merged with NLTK's English list
master_stop = {"would", "also", "even", "u", "one", "could"}.union(stopwords.words("english"))

In [12]:
#Creating function to clean docs
def clean(doc):
    """Turn a raw document into a list of normalized tokens.

    Punctuation is removed with ``translate_table``, the text is lower-cased and
    split on whitespace, and each token is lemmatized with ``wnl``. A lemma is
    kept only if it is longer than two characters, is present in ``word_set``
    and is not in ``master_stop``.

    Parameters
    ----------
    doc : str
        The raw text. Any non-string value (for example a missing cell) is
        treated as an empty document.

    Returns
    -------
    list of str
        The retained lemmas, in their original order.
    """
    # Explicit type check instead of wrapping the whole body in
    # `except AttributeError: pass`: non-string input still yields an empty
    # list, but an AttributeError raised while lemmatizing is no longer
    # silently swallowed (which would also have returned a truncated result).
    if not isinstance(doc, str):
        return []
    # split() without arguments already discards leading/trailing/repeated
    # whitespace, so no extra strip() calls are required.
    tokens = doc.translate(translate_table).lower().split()
    lemmas = (wnl.lemmatize(token) for token in tokens)
    return [
        lemma
        for lemma in lemmas
        if len(lemma) > 2 and lemma in word_set and lemma not in master_stop
    ]

#### Getting Model Coherence

In [13]:
def get_coherences(model_path, dic, lda_path):
    """Compute the u_mass coherence of a saved LDA model on repeated stratified samples.

    A fixed master seed generates one random seed per repetition. For every
    repetition the CSV at ``lda_path`` is streamed in chunks, a sample
    stratified on the 'best topic' column is drawn from each chunk, the sampled
    comments are cleaned with ``clean`` and converted to bag-of-words with
    ``dic``, and the coherence of the model on that corpus is recorded.

    The number of repetitions, the sampled fraction and the chunk size depend
    on the notebook-level ``testing`` flag.

    Parameters
    ----------
    model_path : str
        Path to the saved LDA model (loaded with ``gensim.models.LdaModel.load``).
    dic : gensim.corpora.Dictionary
        Dictionary used both to build the bag-of-words corpus and by the
        coherence model.
    lda_path : str
        Path to a gzip-compressed CSV containing the columns 'comments' and
        'best topic'.

    Returns
    -------
    tuple (list of float, list of int)
        The coherence score and the number of sampled documents for each
        repetition, in the order the seeds were processed.
    """
    if testing is True:
        size = 5
        train_size = 0.3
        chunksize = 100000
    else:
        size = 40
        train_size = 0.025
        chunksize = 250000
    coherences = []
    sample_sizes = []
    # Fixed master seed so the same per-repetition seeds are drawn on every run
    np.random.seed(8283)
    seeds = np.random.randint(low = 1, high=999999, size = size)
    model = gensim.models.LdaModel.load(model_path)
    for index, seed in enumerate(seeds):
        print("Processing seed %s of %s" % (index+1, size))
        # The file is streamed again for every seed so only one chunk is held at a time
        chunks = pd.read_csv(lda_path, compression = "gzip", chunksize = chunksize, usecols = ['comments', 'best topic'])
        corpus = []
        for chunk in chunks:
            # Rows with a missing comment or topic cannot be stratified/cleaned
            chunk = chunk.dropna()
            # Only the sampled part is kept; the remainder is discarded immediately
            sample = train_test_split(chunk, train_size = train_size, random_state = seed, stratify = chunk['best topic'])[0]
            # Use the dictionary passed in as `dic` (not the notebook-level
            # global) so the corpus ids always match the dictionary handed to
            # the coherence model below.
            corpus.extend(dic.doc2bow(clean(doc)) for doc in sample["comments"].to_list())
        sample_sizes.append(len(corpus))
        #https://radimrehurek.com/gensim/models/coherencemodel.html
        cm = gensim.models.coherencemodel.CoherenceModel(model = model, corpus = corpus, dictionary = dic, coherence = "u_mass")
        coherences.append(cm.get_coherence())
    return coherences, sample_sizes

In [14]:
#Running the coherence evaluation with the model, dictionary and data file configured above
coherences, sample_sizes = get_coherences(lda_model, gensimdict, lda_file)

Processing seed 1 of 40
Processing seed 2 of 40
Processing seed 3 of 40
Processing seed 4 of 40
Processing seed 5 of 40
Processing seed 6 of 40
Processing seed 7 of 40
Processing seed 8 of 40
Processing seed 9 of 40
Processing seed 10 of 40
Processing seed 11 of 40
Processing seed 12 of 40
Processing seed 13 of 40
Processing seed 14 of 40
Processing seed 15 of 40
Processing seed 16 of 40
Processing seed 17 of 40
Processing seed 18 of 40
Processing seed 19 of 40
Processing seed 20 of 40
Processing seed 21 of 40
Processing seed 22 of 40
Processing seed 23 of 40
Processing seed 24 of 40
Processing seed 25 of 40
Processing seed 26 of 40
Processing seed 27 of 40
Processing seed 28 of 40
Processing seed 29 of 40
Processing seed 30 of 40
Processing seed 31 of 40
Processing seed 32 of 40
Processing seed 33 of 40
Processing seed 34 of 40
Processing seed 35 of 40
Processing seed 36 of 40
Processing seed 37 of 40
Processing seed 38 of 40
Processing seed 39 of 40
Processing seed 40 of 40


In [15]:
#Coherence score obtained for each seed
coherences

[-3.1154050880184756,
 -3.102062077021193,
 -3.1101989039722278,
 -3.126496501857887,
 -3.1150928401828235,
 -3.1262551378979095,
 -3.0877197918933232,
 -3.121917245466807,
 -3.0851024029663923,
 -3.0889992058043387,
 -3.0882556633270775,
 -3.1504983219830454,
 -3.1014787493018146,
 -3.1459057761632456,
 -3.138697578483115,
 -3.1451226536393433,
 -3.113505082479159,
 -3.1542921403681476,
 -3.1144359814136293,
 -3.1042901945308805,
 -3.056579616588082,
 -3.107275824284513,
 -3.1560037787710784,
 -3.1118898120826723,
 -3.129518230658376,
 -3.1178083120944846,
 -3.0927170067512897,
 -3.1068986310191375,
 -3.1650571301025825,
 -3.1198842901116963,
 -3.0728145067106687,
 -3.105452911675316,
 -3.1359747107644393,
 -3.122473751836836,
 -3.1249414511922184,
 -3.100564835261313,
 -3.0868108979833266,
 -3.0902831460723252,
 -3.105505910605413,
 -3.133365817624767]

In [16]:
#Minimum, mean and maximum coherence across all seeds
np.min(coherences), np.mean(coherences), np.max(coherences)

(-3.1650571301025825, -3.114438797724034, -3.056579616588082)

In [17]:
#Number of sampled documents used for each seed
sample_sizes

[254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782,
 254782]

In [18]:
#Total number of sampled documents summed over all seeds
sum(sample_sizes)

10191280