# LDA Baseline
## Training an LDA model on speaker documents from the 111th session

In [1]:
import numpy as np
import pandas as pd

import gensim
import os
import time


# Switch the process working directory to the assembly scripts folder so the
# local module below can be imported. This affects every relative path used
# by later cells.
os.chdir("../../scripts/assembly")
# Star import: helper names used later in this notebook that are not defined
# in any cell (e.g. session_phrases, select_phrase_classes, speaker_bow_docs)
# presumably come from here — confirm against the module.
from session_speaker_assembly import *

In [2]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides numerical warnings raised by
# numpy/gensim, which may be relevant to the NaN topic weights in the recorded
# LDA output further down — consider narrowing this filter.
warnings.filterwarnings('ignore')

In [3]:
# Session number to analyse (the 111th session).
s = 111
# session_phrases comes from outside this notebook; it returns two objects
# that are bound here as a speaker map and the session's phrase table.
speaker_map_df, session_phrase_df = session_phrases(s)

In [4]:
# Filter the session phrases down to the 'vocab' class, requesting the unigram form.
valid_phrase_df = select_phrase_classes(session_phrase_df, classes = ['vocab'], ngram='unigram')

In [5]:
import time

In [6]:
# Expand the 'phrase' column so each list element gets its own row, and report
# how long the expansion takes. perf_counter() is used because it is a
# monotonic, high-resolution clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
unigram_df = valid_phrase_df.explode('phrase')
end = time.perf_counter()
elapsed = end - start
print(f"{elapsed:.3f} seconds")

14.263  seconds


In [7]:
# Apply the same 'vocab' / unigram selection to `phrases_classes`, then expand
# the 'phrase' column to one row per element.
# NOTE(review): `phrases_classes` is not defined in any cell shown in this
# notebook; presumably it is provided by the star import from
# session_speaker_assembly — confirm.
unigrams = select_phrase_classes(phrases_classes, classes = ['vocab'], ngram = 'unigram').explode('phrase')

In [8]:
# Display the exploded table; the recorded output shows 'phrase' and
# '_classify' columns.
unigrams

Unnamed: 0,phrase,_classify
2296875,aa,vocab
2296875,aaa,vocab
2296876,aa,vocab
2296876,act,vocab
2296877,aa,vocab
...,...,...
6822115,bodi,vocab
6822116,zyuganov,vocab
6822116,liber,vocab
6822117,zz,vocab


In [9]:
# Build the gensim Dictionary (token <-> integer id mapping) from the master
# vocabulary. Dictionary takes an iterable of tokenised documents, so the
# entire vocabulary column is passed as one single "document".
# The class is reached through the already-imported `gensim` package so this
# cell does not depend on the star import happening to expose `Dictionary`.
global_uni_dct = gensim.corpora.Dictionary([unigrams['phrase'].tolist()])

In [10]:
# Draw 50 random rows for quick inspection. A fixed random_state makes the
# same rows come back on every Restart & Run All.
unigram_df_sample = unigram_df.sample(50, random_state=0)

In [11]:
# Turn the exploded session phrases into per-speaker documents encoded with
# the master-vocabulary dictionary.
# speaker_bow_docs is defined outside this notebook; judging by the recorded
# output of the next cell, each item is a dict with 'speakerid' and
# 'phrase_code' (a list of (id, count) pairs) — confirm against the helper.
bow_docs = speaker_bow_docs(unigram_df, dct=global_uni_dct)

In [12]:
# Peek at the first ten speaker documents.
# NOTE(review): in the recorded output the same token id appears several times
# within one document (e.g. 204, 226, 476 each repeat). gensim's bag-of-words
# convention is normally one (id, count) pair per id per document — verify
# that speaker_bow_docs is producing what the downstream models expect.
bow_docs[:10]

[{'speakerid': 111113931,
  'phrase_code': [(204, 1),
   (96897, 1),
   (204, 1),
   (108372, 1),
   (204, 1),
   (117260, 1),
   (226, 1),
   (7937, 1),
   (226, 1),
   (25644, 1),
   (226, 1),
   (35843, 1),
   (226, 1),
   (97628, 1),
   (226, 1),
   (129659, 1),
   (268, 1),
   (103248, 1),
   (375, 1),
   (18730, 1),
   (441, 1),
   (80934, 1),
   (443, 5),
   (95509, 5),
   (443, 1),
   (95750, 1),
   (443, 1),
   (117713, 1),
   (476, 1),
   (58546, 1),
   (476, 2),
   (79943, 2),
   (476, 2),
   (91985, 2),
   (476, 1),
   (101091, 1),
   (476, 1),
   (115413, 1),
   (476, 1),
   (117947, 1),
   (477, 1),
   (39967, 1),
   (506, 1),
   (86212, 1),
   (506, 1),
   (95974, 1),
   (511, 1),
   (21326, 1),
   (516, 1),
   (81287, 1),
   (535, 1),
   (54706, 1),
   (552, 1),
   (6232, 1),
   (552, 1),
   (8250, 1),
   (552, 1),
   (16856, 1),
   (552, 3),
   (61863, 3),
   (552, 1),
   (61889, 1),
   (552, 1),
   (122467, 1),
   (552, 1),
   (136579, 1),
   (566, 1),
   (19424, 1),


In [13]:
# Visualize it: load the list of speaker documents into a DataFrame
# (columns 'speakerid' and 'phrase_code' in the recorded output) and show the
# first rows.
bow_docs_df = pd.DataFrame(bow_docs)
bow_docs_df.head()

Unnamed: 0,speakerid,phrase_code
0,111113931,"[(204, 1), (96897, 1), (204, 1), (108372, 1), ..."
1,111113951,"[(552, 1), (122467, 1), (566, 1), (19424, 1), ..."
2,111113981,"[(226, 4), (97628, 4), (443, 1), (95509, 1), (..."
3,111114011,"[(226, 2), (97628, 2), (268, 1), (103248, 1), ..."
4,111114021,"[(476, 1), (115413, 1), (552, 1), (61889, 1), ..."


## LDA Model

In [14]:
from gensim import models

# One entry per speaker: that speaker's list of (token_id, count) pairs.
corpus = bow_docs_df['phrase_code'].tolist()

# Fit the TF-IDF weighting on the full speaker corpus.
tfidf_model = models.TfidfModel(corpus)

In [15]:
# Apply the fitted TF-IDF model to the corpus; this is the input the LDA
# model is trained on.
corpus_tfidf = tfidf_model[corpus]

In [16]:
# Train a 50-topic LDA model on the TF-IDF-weighted speaker corpus, using 7
# worker processes, 15 passes over the corpus and chunks of 50 documents.
# random_state seeds the model's random initialisation so repeated runs start
# from the same state (with multiple workers some run-to-run variation may
# remain — see the gensim docs).
# NOTE(review): the recorded topic printout for this model shows NaN for every
# word weight, so the model as trained is not usable. Warnings are globally
# silenced earlier in this notebook, which may be hiding the numerical warning
# that explains it. Things to check: the TF-IDF-weighted input versus raw
# counts, the repeated token ids inside each document, and empty documents.
lda = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=50,
    id2word=global_uni_dct,
    passes=15,
    workers=7,
    chunksize=50,
    random_state=0,
)

In [17]:
# Print every (topic id, word-weight summary) pair returned by print_topics(),
# one topic per two-line entry.
topic_line = 'Topic: {} \nWords: {}'
for idx, topic in lda.print_topics():
    print(topic_line.format(idx, topic))

Topic: 48 
Words: nan*"pfau" + nan*"pfc" + nan*"pfafftown" + nan*"pfalmer" + nan*"pfaltzgraff" + nan*"pfander" + nan*"pfann" + nan*"pfaender" + nan*"pfeil" + nan*"pfel"
Topic: 40 
Words: nan*"pfau" + nan*"pfc" + nan*"pfafftown" + nan*"pfalmer" + nan*"pfaltzgraff" + nan*"pfander" + nan*"pfann" + nan*"pfaender" + nan*"pfeil" + nan*"pfel"
Topic: 17 
Words: nan*"pfau" + nan*"pfc" + nan*"pfafftown" + nan*"pfalmer" + nan*"pfaltzgraff" + nan*"pfander" + nan*"pfann" + nan*"pfaender" + nan*"pfeil" + nan*"pfel"
Topic: 39 
Words: nan*"pfau" + nan*"pfc" + nan*"pfafftown" + nan*"pfalmer" + nan*"pfaltzgraff" + nan*"pfander" + nan*"pfann" + nan*"pfaender" + nan*"pfeil" + nan*"pfel"
Topic: 16 
Words: nan*"pfau" + nan*"pfc" + nan*"pfafftown" + nan*"pfalmer" + nan*"pfaltzgraff" + nan*"pfander" + nan*"pfann" + nan*"pfaender" + nan*"pfeil" + nan*"pfel"
Topic: 14 
Words: nan*"pfau" + nan*"pfc" + nan*"pfafftown" + nan*"pfalmer" + nan*"pfaltzgraff" + nan*"pfander" + nan*"pfann" + nan*"pfaender" + nan*"pfeil"