In [2]:
# for text preprocessing
import re
import spacy
import os
import json
import nltk


from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import numpy for matrix operation
import numpy as np

# Importing Gensim
import gensim
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

# for data visualisation
import pyLDAvis.gensim

In [3]:
os.chdir('/home/ubuntu/transcription_test/output')

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
# Read data into papers
with open('M1S3-Text.json') as f:
  corpus  = json.load(f)

In [6]:
corpus

["hey guys a and a welcome back to our achievement or i i'm donny and i'm a very and we are to almost three americans living in germany and sharing all of our experiences living in traveling throughout europe as many of you guys are already you were we",
 "we're expecting our first baby and that explains the mustache because behind every good mustache of course is a great father as my hope but of course having a baby no matter where you live is exciting and thrilling and kind of daunting and could be a little bit scary but whenever you are foreigners living in a foreign country",
 "the and giving birth that as a whole another element to the experience so we wanted to share some of our experiences that we've had so far as americans living in germany and going through this process aid that is exactly what we're going to talk about today in our video",
 "incoming your mama what's that hey bomber",
 "as always we are from a southern state in the us called oklahoma and we now live in the so

In [7]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Apply Preprocessing on the Corpus

# stop loss words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized

# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus] 

In [9]:
clean_corpus

[['hey',
  'guy',
  'welcome',
  'back',
  'achievement',
  'im',
  'donny',
  'im',
  'almost',
  'three',
  'american',
  'living',
  'germany',
  'sharing',
  'experience',
  'living',
  'traveling',
  'throughout',
  'europe',
  'many',
  'guy',
  'already'],
 ['were',
  'expecting',
  'first',
  'baby',
  'explains',
  'mustache',
  'behind',
  'every',
  'good',
  'mustache',
  'course',
  'great',
  'father',
  'hope',
  'course',
  'baby',
  'matter',
  'live',
  'exciting',
  'thrilling',
  'kind',
  'daunting',
  'could',
  'little',
  'bit',
  'scary',
  'whenever',
  'foreigner',
  'living',
  'foreign',
  'country'],
 ['giving',
  'birth',
  'whole',
  'another',
  'element',
  'experience',
  'wanted',
  'share',
  'experience',
  'weve',
  'far',
  'american',
  'living',
  'germany',
  'going',
  'process',
  'aid',
  'exactly',
  'were',
  'going',
  'talk',
  'today',
  'video'],
 ['incoming', 'mama', 'whats', 'hey', 'bomber'],
 ['always',
  'southern',
  'state',
  '

In [10]:
# Creating the term dictionary of our courpus that is of all the words (Sepcific to Genism syntax perspective), 
# where every unique term is assigned an index. 

dict_ = corpora.Dictionary(clean_corpus)

print(dict_)

Dictionary<581 unique tokens: ['achievement', 'almost', 'already', 'american', 'back']...>


In [11]:
# The dictionary had 52 unqiue words in the cleaned corpus.

for i in dict_.values():
    print(i)

achievement
almost
already
american
back
donny
europe
experience
germany
guy
hey
im
living
many
sharing
three
throughout
traveling
welcome
baby
behind
bit
could
country
course
daunting
every
exciting
expecting
explains
father
first
foreign
foreigner
good
great
hope
kind
little
live
matter
mustache
scary
thrilling
were
whenever
aid
another
birth
element
exactly
far
giving
going
process
share
talk
today
video
wanted
weve
whole
bomber
incoming
mama
whats
always
anything
area
called
compare
comparing
definitely
false
german
oklahoma
really
rhineland
southern
southwestern
specific
state
two
u
also
comment
either
funny
guide
leave
maybe
observation
pregnancy
simply
story
surfer
tell
thing
thrown
time
want
know
let
like
reason
although
barrier
biggest
concern
english
issue
language
lot
much
non
one
people
pretty
run
something
speak
speaker
well
able
come
fairly
fear
general
gonna
high
however
ive
learn
learned
medical
overall
situation
someone
stress
telling
thankful
thats
think
understand
vo

In [12]:
# Converting list of documents (corpus) into Document Term Matrix using the dictionary 
doc_term_matrix = [dict_.doc2bow(i) for i in clean_corpus]
doc_term_matrix

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 2),
  (10, 1),
  (11, 2),
  (12, 2),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1)],
 [(12, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1)],
 [(3, 1),
  (7, 2),
  (8, 1),
  (12, 1),
  (44, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 2),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1)],
 [(10, 1), (62, 1), (63, 1), (64, 1), (65, 1)],
 [(7, 2),
  (19, 1),
  (31, 1),
  (39, 1),
  (44, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80,

In [13]:
# Creating the object for LDA model using gensim library

Lda = gensim.models.ldamodel.LdaModel

In [14]:
# Running and Training LDA model on the document term matrix.

ldamodel = Lda(doc_term_matrix, num_topics=6, id2word = dict_, passes=1, random_state=0, eval_every=None)

In [15]:
# Prints the topics with the indexes: 0,1,2 :

ldamodel.print_topics()

# we need to manually check whethere the topics are different from one another or not

[(0,
  '0.027*"midwife" + 0.019*"germany" + 0.015*"like" + 0.014*"birth" + 0.014*"state" + 0.013*"u" + 0.012*"really" + 0.011*"guy" + 0.011*"hey" + 0.010*"im"'),
 (1,
  '0.018*"paternity" + 0.018*"maternity" + 0.017*"leave" + 0.012*"paid" + 0.012*"job" + 0.011*"know" + 0.010*"german" + 0.009*"word" + 0.009*"time" + 0.009*"jewelry"'),
 (2,
  '0.020*"like" + 0.018*"baby" + 0.018*"really" + 0.018*"kind" + 0.016*"u" + 0.015*"thing" + 0.015*"germany" + 0.013*"also" + 0.012*"get" + 0.011*"know"'),
 (3,
  '0.018*"week" + 0.017*"kind" + 0.017*"day" + 0.016*"get" + 0.015*"u" + 0.014*"mother" + 0.011*"baby" + 0.009*"like" + 0.009*"leave" + 0.009*"thats"'),
 (4,
  '0.025*"baby" + 0.022*"german" + 0.016*"really" + 0.015*"come" + 0.015*"experience" + 0.015*"u" + 0.014*"thing" + 0.012*"also" + 0.011*"birth" + 0.011*"shower"'),
 (5,
  '0.019*"day" + 0.017*"going" + 0.013*"father" + 0.013*"baby" + 0.012*"like" + 0.011*"experience" + 0.011*"mother" + 0.011*"germany" + 0.010*"midwife" + 0.010*"even"')]

In [16]:
print(ldamodel.print_topics(num_topics=6, num_words=5))

# num_topics mean: how many topics want to extract 
# num_words: the number of words that want per topic

[(0, '0.027*"midwife" + 0.019*"germany" + 0.015*"like" + 0.014*"birth" + 0.014*"state"'), (1, '0.018*"paternity" + 0.018*"maternity" + 0.017*"leave" + 0.012*"paid" + 0.012*"job"'), (2, '0.020*"like" + 0.018*"baby" + 0.018*"really" + 0.018*"kind" + 0.016*"u"'), (3, '0.018*"week" + 0.017*"kind" + 0.017*"day" + 0.016*"get" + 0.015*"u"'), (4, '0.025*"baby" + 0.022*"german" + 0.016*"really" + 0.015*"come" + 0.015*"experience"'), (5, '0.019*"day" + 0.017*"going" + 0.013*"father" + 0.013*"baby" + 0.012*"like"')]


In [17]:
# printing the topic associations with the documents
count = 0
for i in ldamodel[doc_term_matrix]:
    print("doc : ",count,i)
    count += 1

doc :  0 [(0, 0.9636772)]
doc :  1 [(2, 0.44170123), (3, 0.5374015)]
doc :  2 [(5, 0.96514976)]
doc :  3 [(0, 0.8610151), (1, 0.027786488), (2, 0.027785048), (3, 0.027855061), (4, 0.02777909), (5, 0.02777926)]
doc :  4 [(0, 0.20658712), (4, 0.76862293)]
doc :  5 [(4, 0.96899784)]
doc :  6 [(0, 0.018589357), (1, 0.018545182), (2, 0.90707225), (3, 0.018576533), (4, 0.018631117), (5, 0.018585522)]
doc :  7 [(0, 0.75741625), (5, 0.22223887)]
doc :  8 [(0, 0.97670954)]
doc :  9 [(3, 0.9720621)]
doc :  10 [(2, 0.21166743), (3, 0.77110356)]
doc :  11 [(1, 0.37638083), (2, 0.59960383)]
doc :  12 [(4, 0.97908103)]
doc :  13 [(0, 0.03347798), (1, 0.8322715), (2, 0.03363149), (3, 0.033434622), (4, 0.03370282), (5, 0.033481553)]
doc :  14 [(0, 0.9730411)]
doc :  15 [(5, 0.98052484)]
doc :  16 [(0, 0.9745653)]
doc :  17 [(5, 0.97733665)]
doc :  18 [(4, 0.9800418)]
doc :  19 [(0, 0.9700693)]
doc :  20 [(5, 0.9738552)]
doc :  21 [(2, 0.8215503), (5, 0.16031249)]
doc :  22 [(3, 0.8366295), (5, 0.14940

In [18]:
model = LdaMulticore(corpus=doc_term_matrix,id2word = dict_, num_topics = 6,
                     alpha=.1, eta=0.1, random_state = 0)

coherence = CoherenceModel(model = model, texts = clean_corpus, dictionary = dict_, coherence = 'u_mass')

print(coherence.get_coherence())
print(model.show_topics())

-6.355395593811912
[(0, '0.028*"midwife" + 0.021*"germany" + 0.016*"state" + 0.016*"u" + 0.016*"like" + 0.015*"birth" + 0.015*"really" + 0.012*"hey" + 0.012*"guy" + 0.011*"german"'), (1, '0.024*"paternity" + 0.024*"maternity" + 0.024*"leave" + 0.016*"paid" + 0.016*"job" + 0.014*"know" + 0.012*"german" + 0.011*"word" + 0.011*"jewelry" + 0.011*"time"'), (2, '0.022*"like" + 0.019*"really" + 0.019*"kind" + 0.019*"baby" + 0.018*"u" + 0.016*"germany" + 0.016*"thing" + 0.014*"also" + 0.013*"get" + 0.012*"know"'), (3, '0.020*"week" + 0.019*"kind" + 0.019*"get" + 0.019*"day" + 0.017*"u" + 0.016*"mother" + 0.012*"baby" + 0.012*"like" + 0.011*"thats" + 0.010*"leave"'), (4, '0.028*"baby" + 0.024*"german" + 0.017*"really" + 0.017*"come" + 0.016*"thing" + 0.015*"experience" + 0.015*"u" + 0.013*"also" + 0.012*"shower" + 0.012*"birth"'), (5, '0.021*"day" + 0.018*"going" + 0.014*"father" + 0.013*"baby" + 0.012*"germany" + 0.012*"midwife" + 0.012*"experience" + 0.012*"like" + 0.012*"mother" + 0.011*"wer

In [19]:
lda_display = pyLDAvis.gensim.prepare(model, doc_term_matrix, dict_, sort_topics = False)
pyLDAvis.display(lda_display)