# Coherence across model hyper-parameters

This script generates a plot of model coherence metrics across a range of values for a single model hyper-parameter. The most obvious use is comparing different numbers of topics.

This script generates Figure 1 in the paper.

In [None]:
#This loads individual processed tweets, removes required tokens, and creates a corpus for all tweets
#IMPORTS
from os import listdir
import json
import logging
from gensim import corpora
from nltk.corpus import stopwords

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#This path should contain processed tweets
path = ''
tweets = []
#NLTK's Dutch stopword list, kept as a set for fast membership tests
stops = set(stopwords.words('dutch'))

#The directory listing does not depend on the month, so it is read only once
filenames = listdir(path)

#Files are visited month by month, in the order listed below, so tweets enter the
#corpus in that order. The month is the part of the file name before the first '_'.
for month in ['08', '09', '10', '11', '12', '01', '02', '03', '04', '05', '06', '07']: #controls for month
    for filename in filenames:
        if filename.split('_')[0] != month:
            continue
        with open(path + '/' + filename, 'r') as infile:
            data = json.load(infile)
        #Each value in the file is one processed tweet; its 'full_frog' entry is a
        #sequence of tokens, each carrying a 'dep' and a 'lemma' field
        for record in data.values():
            tweet = []
            for token in record['full_frog']:
                #punctuation removal
                if token['dep'] == 'punct':
                    continue
                #lowercasing the entire token; this is done BEFORE the stopword
                #check so that capitalised stopwords are filtered out as well
                lemma = token['lemma'].lower()
                #stopword removal
                if lemma not in stops:
                    tweet.append(lemma)
            #a tweet is kept even if all of its tokens were filtered out
            tweets.append(tweet)

#converting the tweets into format gensim works with
dictionary = corpora.Dictionary(tweets)
corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

In [None]:
#This computes the actual coherence metrics
#IMPORTS
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from collections import defaultdict
from gensim import corpora
import datetime
import logging
import json

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#This is a list of values to test, the script is set up for topic numbers
numbers = []

#Models
from gensim.models import LdaModel

for number in numbers:
    print('Starting for '+ str(number) +' topics')

    #This trains the model itself, here training parameters can be adjusted to balance speed and sufficient training.
    #It might also be desirable to control for randomness by assigning a constant random seed.
    #If training for something else than topic number assign 'number' to a different parameter
    lda = LdaModel(corpus, num_topics=number, id2word=dictionary, alpha='auto', eta='auto',
                   passes=1, iterations=100000000, gamma_threshold=0.001, chunksize=25000)

    #This determines where model files get saved (useful for manual inspection)
    temp_file = r"path" + str(number)
    lda.save(temp_file)

    #This computes all four coherence metrics available in gensim - some of these can be removed to speed up computation
    #Each metric is stored under its own name (previously the c_v value was also saved as c_uci)
    coherences = {}
    for metric in ('u_mass', 'c_v', 'c_uci', 'c_npmi'):
        timestamp = datetime.datetime.now().strftime("%I:%M")
        print(timestamp + "   Starting " + metric)
        if metric == 'u_mass':
            #u_mass is computed from the bag-of-words corpus
            cm = CoherenceModel(model=lda, corpus=corpus, coherence=metric)
        else:
            #the remaining metrics are computed from the tokenised texts
            cm = CoherenceModel(model=lda, texts=tweets, coherence=metric)
        #converted to a plain float so the value is always JSON serialisable
        coherences[metric] = float(cm.get_coherence())

    #This saves the coherence metrics in a simple .txt
    #Adjust the path as needed
    #One JSON object is written per line so that each record can be parsed on its own
    #when the file is read back
    with open(r'path', 'a') as outfile:
        outfile.write(json.dumps({number: coherences}) + '\n')

    print('Finished for '+ str(number) +' topics')



In [1]:
#This plots the coherence metrics
#IMPORTS
import json 
import matplotlib.pylab as plt
from matplotlib.pyplot import figure
from matplotlib.pyplot import savefig
from sklearn import preprocessing
import numpy as np

coherence = {}

#path corresponding to where the coherence .txt is located
with open(r'path', 'r') as infile:
    raw = infile.read()

#The file holds a sequence of JSON objects of the form {topics: {metric: value}}.
#They are decoded one after another, so it does not matter whether they are
#separated by newlines, other whitespace, or written back-to-back.
decoder = json.JSONDecoder()
position = 0
while position < len(raw):
    if raw[position].isspace():
        position += 1
        continue
    record, position = decoder.raw_decode(raw, position)
    coherence.update(record)

if not coherence:
    raise ValueError('No coherence records found in the coherence file')

plt.figure(figsize=(16,8))
for metric in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
    #JSON keys are strings; convert to int so the x-axis is sorted numerically
    by_topics = {int(topics): scores[metric] for topics, scores in coherence.items()}

    pairs = sorted(by_topics.items())
    x, y = zip(*pairs) # unpack a list of pairs into two tuples
    #standardise each metric so all four can share one y-axis
    y = preprocessing.scale(y)
    plt.plot(x, y, label = metric)

plt.xlabel('Number of topics', fontsize = 15)
plt.ylabel('Coherence (rescaled)', fontsize = 15)
plt.legend(prop={'size':20})
savefig(r'PATH')