# LDA Baseline
## Training an LDA model on speaker documents from the 111th session

In [1]:
import numpy as np
import pandas as pd

import gensim
import os
import time


os.chdir("../../scripts/assembly")
from session_speaker_assembly import *

In [13]:
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides any warnings raised while the
# models below are trained — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Select the 111th session and load its two result objects.
# session_phrases is not defined in this notebook; presumably it is provided
# by the star-import of session_speaker_assembly — verify.
s = 111
speaker_map_df, session_phrase_df = session_phrases(s)

In [3]:
# Filter for bigrams that are considered vocabulary (phrase class 'vocab')
valid_phrase_df = select_phrase_classes(session_phrase_df, classes = ['vocab'])

In [34]:
def make_bow_doc(df):
    """
    Build a single speaker's document from their phrase rows.

    Takes a dataframe belonging to a single speaker with the fields
    'speakerid' and 'phrase_code' and returns a dictionary containing
    their speakerid and the list of their phrase codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one speaker; must have 'speakerid' and 'phrase_code' columns.

    Returns
    -------
    dict
        {'speakerid': <speakerid of the first row>,
         'phrase_code': <list of every phrase_code value, in row order>}
    """
    # Assumes every row in the df has the same speaker, so the first row's
    # speakerid stands for the whole document.
    bow_doc = {'speakerid': df['speakerid'].values[0],
               'phrase_code': list(df['phrase_code'].values)}

    return bow_doc

In [35]:
# Create BOW docs according to the global dictionary that
# was imported in the speaker assembly module.
# `time` is already imported in the first cell, so it is not re-imported here.
# perf_counter is the clock intended for measuring short elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments.
start = time.perf_counter()
bow_docs = speaker_bow_docs(valid_phrase_df)
end = time.perf_counter()
elapsed = end - start
print(round(elapsed, 2), " seconds")

10.42  seconds


In [31]:
# Visualize it: wrap the BOW docs in a DataFrame and show the first rows
bow_docs_df = pd.DataFrame(bow_docs)
bow_docs_df.head()

Unnamed: 0,speakerid,phrase_code
0,111113931,"[(423216, 1), (423439, 1), (423558, 1), (42408..."
1,111113951,"[(452460, 1), (453117, 1), (472778, 3), (47298..."
2,111113981,"[(425414, 4), (438903, 1), (453117, 4), (45747..."
3,111114011,"[(425414, 2), (429414, 1), (440135, 1), (45311..."
4,111114021,"[(440332, 1), (451618, 1), (462774, 9), (47279..."


## LDA Model

In [48]:
from gensim import models

# Pull every speaker's BOW vector out of the frame as a plain Python list,
# then fit a TF-IDF weighting on that collection of documents.
corpus = bow_docs_df["phrase_code"].tolist()
tfidf_model = models.TfidfModel(corpus)

In [50]:
# Apply the fitted TF-IDF model to the BOW corpus by indexing the model with it
corpus_tfidf = tfidf_model[corpus]

In [51]:
# Inspect what kind of object the TF-IDF transform returned
type(corpus_tfidf)

gensim.interfaces.TransformedCorpus

In [53]:
# Baseline LDA: 10 topics, a single pass over the TF-IDF-weighted corpus,
# trained with 2 worker processes.
# random_state seeds the model's random number generator so that re-running
# this cell starts from the same initialisation (multi-worker training may
# still introduce small run-to-run differences — TODO confirm).
# NOTE(review): the model is fed TF-IDF weights rather than the raw counts in
# `corpus` — confirm this is intended for LDA.
lda = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=global_dct,
                                 passes=1, workers=2, random_state=0)

In [57]:
# Show each topic id next to its formatted "weight*word + ..." string
for idx, topic in lda.print_topics():
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.000*"passag jonescostigan" + 0.000*"passag jordan" + 0.000*"passag join" + 0.000*"passag joint" + 0.000*"passag jone" + 0.000*"passag john" + 0.000*"passag joneswhit" + 0.000*"passag judici" + 0.000*"passag judgment" + 0.000*"passag joelson"
Topic: 1 
Words: 0.000*"passag jonescostigan" + 0.000*"passag jordan" + 0.000*"passag join" + 0.000*"passag joint" + 0.000*"passag jone" + 0.000*"passag john" + 0.000*"passag joneswhit" + 0.000*"passag judici" + 0.000*"passag judgment" + 0.000*"passag joelson"
Topic: 2 
Words: 0.000*"passag jonescostigan" + 0.000*"passag jordan" + 0.000*"passag join" + 0.000*"passag joint" + 0.000*"passag jone" + 0.000*"passag john" + 0.000*"passag joneswhit" + 0.000*"passag judici" + 0.000*"passag judgment" + 0.000*"passag joelson"
Topic: 3 
Words: 0.000*"passag jonescostigan" + 0.000*"passag jordan" + 0.000*"passag join" + 0.000*"passag joint" + 0.000*"passag jone" + 0.000*"passag john" + 0.000*"passag joneswhit" + 0.000*"passag judici" + 0.000

Since the topics all have the same top words, I suspect the model hasn't trained enough.

One way to evaluate a model is through its coherence score. Let's calculate that now.



In [65]:
from gensim.models import CoherenceModel
# Compute Coherence Score ('u_mass') for the baseline model. Note that the
# raw BOW `corpus` is passed here, not the TF-IDF-transformed corpus the
# model was trained on.
cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
coherence = cm.get_coherence()
print('\nCoherence Score: ', coherence)


Coherence Score:  0.0


Unsurprisingly the model has a coherence score of 0.

We can try to retrain it with more passes

In [66]:
# Retrain with 10 passes over the corpus instead of 1; all other model
# settings match the baseline except the worker count.
# random_state seeds the model's random number generator so that re-running
# this cell starts from the same initialisation (multi-worker training may
# still introduce small run-to-run differences — TODO confirm).
# NOTE(review): workers=7 is hard-coded; check it suits the machine's cores.
lda_2 = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=global_dct,
                                   passes=10, workers=7, random_state=0)

In [67]:
# Show each topic id next to its formatted "weight*word + ..." string
# for the retrained model
for idx, topic in lda_2.print_topics():
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.000*"passag jonescostigan" + 0.000*"passag jordan" + 0.000*"passag join" + 0.000*"passag joint" + 0.000*"passag jone" + 0.000*"passag john" + 0.000*"passag joneswhit" + 0.000*"passag judici" + 0.000*"passag judgment" + 0.000*"passag joelson"
Topic: 1 
Words: 0.000*"passag jonescostigan" + 0.000*"passag jordan" + 0.000*"passag join" + 0.000*"passag joint" + 0.000*"passag jone" + 0.000*"passag john" + 0.000*"passag joneswhit" + 0.000*"passag judici" + 0.000*"passag judgment" + 0.000*"passag joelson"
Topic: 2 
Words: 0.000*"passag jonescostigan" + 0.000*"passag jordan" + 0.000*"passag join" + 0.000*"passag joint" + 0.000*"passag jone" + 0.000*"passag john" + 0.000*"passag joneswhit" + 0.000*"passag judici" + 0.000*"passag judgment" + 0.000*"passag joelson"
Topic: 3 
Words: 0.000*"passag jonescostigan" + 0.000*"passag jordan" + 0.000*"passag join" + 0.000*"passag joint" + 0.000*"passag jone" + 0.000*"passag john" + 0.000*"passag joneswhit" + 0.000*"passag judici" + 0.000

In [68]:
# Compute the 'u_mass' coherence score for the retrained model against the
# raw BOW corpus; this rebinds `cm` and `coherence` from the earlier cell.
cm = CoherenceModel(model=lda_2, corpus=corpus, coherence='u_mass')
coherence = cm.get_coherence()
print('\nCoherence Score: ', coherence)


Coherence Score:  0.0


Same thing.

In [72]:
# Number of (phrase id, count) entries in the first speaker's BOW document
len(bow_docs_df.phrase_code.values[0])

3776

The output above means that the first speaker's document alone contains 3776 distinct bigrams. Perhaps this is too many, and the documents are saturated with too many words:

