In [77]:
# Let's read in our document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,ai,algorithm,analysis,application,approximate,assignment,attented,aug,awesome,bayes,...,vector,video,visualize,waiting,wasnnto,week,word,write,younes,youtube
0,0,1,1,0,1,0,0,0,0,1,...,2,1,1,0,0,0,3,1,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,1,1,2,1,0,...,1,0,0,1,1,1,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [79]:
# One of the required inputs is a term-document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
ai,0,0,0,1,0,0,0,0,0,0
algorithm,1,0,0,0,0,0,0,0,0,0
analysis,1,1,0,0,0,1,0,0,1,0
application,0,1,0,0,0,0,0,0,0,0
approximate,1,0,0,0,0,0,0,0,0,0


In [80]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [81]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [82]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

2022-02-24 17:27:44,887 : INFO : using symmetric alpha at 0.5
2022-02-24 17:27:44,890 : INFO : using symmetric eta at 0.5
2022-02-24 17:27:44,891 : INFO : using serial LDA version on this node
2022-02-24 17:27:44,893 : INFO : running online (multi-pass) LDA training, 2 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 17:27:44,911 : INFO : -5.475 per-word bound, 44.5 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 17:27:44,912 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 17:27:44,923 : INFO : topic #0 (0.500): 0.030*"specialization" + 0.026*"course" + 0.025*"word" + 0.022*"instructor" + 0.022*"vector" + 0.022*"learning" + 0.020*"using" + 0.019*"logistic" + 0.018*"use" + 0.018*"language"
2022-02-24 17:27:44,924 : INFO : topic #1 (0.500): 0.030*"course" + 0.027*"analysis" + 0.019*

[(0,
  '0.032*"specialization" + 0.024*"word" + 0.024*"course" + 0.024*"vector" + 0.024*"using" + 0.024*"learning" + 0.024*"instructor" + 0.024*"logistic" + 0.018*"language" + 0.017*"use"'),
 (1,
  '0.033*"course" + 0.024*"analysis" + 0.023*"aug" + 0.014*"machine" + 0.014*"designed" + 0.014*"text" + 0.014*"vector" + 0.014*"embeddings" + 0.014*"localitysensitive" + 0.014*"translation"')]

In [83]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

2022-02-24 17:27:45,750 : INFO : using symmetric alpha at 0.25
2022-02-24 17:27:45,751 : INFO : using symmetric eta at 0.25
2022-02-24 17:27:45,752 : INFO : using serial LDA version on this node
2022-02-24 17:27:45,753 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 17:27:45,768 : INFO : -6.847 per-word bound, 115.2 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 17:27:45,769 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 17:27:45,781 : INFO : topic #0 (0.250): 0.053*"instructor" + 0.036*"specialization" + 0.025*"helped" + 0.022*"build" + 0.021*"text" + 0.021*"analysis" + 0.020*"learning" + 0.020*"deep" + 0.020*"bensouda" + 0.020*"university"
2022-02-24 17:27:45,783 : INFO : topic #1 (0.250): 0.042*"course" + 0.039*"vector" + 

2022-02-24 17:27:45,888 : INFO : topic #2 (0.250): 0.044*"course" + 0.031*"learning" + 0.031*"machine" + 0.031*"aug" + 0.017*"taught" + 0.017*"expert" + 0.017*"trax" + 0.017*"tensorflow" + 0.017*"paper" + 0.017*"kaiser"
2022-02-24 17:27:45,890 : INFO : topic #3 (0.250): 0.035*"model" + 0.034*"straight" + 0.034*"lecture" + 0.034*"detailed" + 0.034*"helped" + 0.034*"youtube" + 0.034*"little" + 0.034*"forward" + 0.034*"hard" + 0.034*"exciting"
2022-02-24 17:27:45,892 : INFO : topic diff=0.013665, rho=0.353553
2022-02-24 17:27:45,899 : INFO : -5.125 per-word bound, 34.9 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 17:27:45,900 : INFO : PROGRESS: pass 7, at document #10/10
2022-02-24 17:27:45,904 : INFO : topic #0 (0.250): 0.056*"instructor" + 0.039*"specialization" + 0.022*"helped" + 0.022*"build" + 0.022*"deep" + 0.022*"text" + 0.022*"designed" + 0.022*"learning" + 0.022*"bensouda" + 0.022*"university"
2022-02-24 17:27:45,905 : INFO : topic #1 (

[(0,
  '0.056*"instructor" + 0.039*"specialization" + 0.022*"deep" + 0.022*"designed" + 0.022*"helped" + 0.022*"build" + 0.022*"text" + 0.022*"learning" + 0.022*"bensouda" + 0.022*"university"'),
 (1,
  '0.039*"course" + 0.039*"vector" + 0.039*"word" + 0.039*"logistic" + 0.039*"using" + 0.027*"analysis" + 0.027*"specialization" + 0.027*"natural" + 0.027*"language" + 0.027*"tweet"'),
 (2,
  '0.045*"course" + 0.031*"learning" + 0.031*"machine" + 0.031*"aug" + 0.017*"taught" + 0.017*"expert" + 0.017*"trax" + 0.017*"tensorflow" + 0.017*"paper" + 0.017*"kaiser"'),
 (3,
  '0.035*"model" + 0.035*"helped" + 0.035*"straight" + 0.035*"lecture" + 0.035*"detailed" + 0.035*"youtube" + 0.035*"little" + 0.035*"forward" + 0.035*"hard" + 0.035*"exciting"')]

In [84]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [85]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
0,rating â â students enrolled course nat...
1,by end specialization designed nlp applicatio...
2,this specialization designed taught two expert...
3,younes bensouda mourri instructor ai stanford ...
4,åukasz kaiser staff research scientist google...
5,machine translation word embeddings locality...
6,the lecture exciting detailed though little h...
7,other i informative fun
8,from lesson sentiment analysis logistic regres...
9,instructor instructor senior curriculum developer


In [86]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
0,rating â students course language processing s...
1,specialization application perform sentiment a...
2,specialization machine learning
3,younes instructor ai stanford university learn...
4,åukasz staff research scientist google brain ...
5,machine translation word embeddings sentiment ...
6,lecture straight regression model
7,fun
8,sentiment analysis regression learn feature ve...
9,instructor instructor curriculum developer


In [87]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,ai,analysis,application,assignment,aug,awesome,bayes,brain,chatbot,coauthor,...,tweet,ukasz,university,use,vector,video,visualize,week,word,younes
0,0,1,0,0,0,0,1,0,0,0,...,1,0,0,2,2,1,1,0,3,0
1,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,0,0,0,0
5,0,1,0,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [89]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

2022-02-24 17:27:50,780 : INFO : using symmetric alpha at 0.25
2022-02-24 17:27:50,781 : INFO : using symmetric eta at 0.25
2022-02-24 17:27:50,782 : INFO : using serial LDA version on this node
2022-02-24 17:27:50,784 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 17:27:50,796 : INFO : -6.247 per-word bound, 75.9 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 17:27:50,798 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 17:27:50,806 : INFO : topic #0 (0.250): 0.043*"specialization" + 0.043*"sentiment" + 0.043*"chatbot" + 0.043*"tool" + 0.043*"translate" + 0.043*"analysis" + 0.043*"language" + 0.043*"perform" + 0.043*"application" + 0.043*"text"
2022-02-24 17:27:50,807 : INFO : topic #1 (0.250): 0.050*"regression" + 0.037*"uka

2022-02-24 17:27:50,919 : INFO : topic #2 (0.250): 0.086*"course" + 0.034*"vector" + 0.034*"sentiment" + 0.034*"analysis" + 0.033*"machine" + 0.033*"word" + 0.033*"fun" + 0.033*"space" + 0.033*"embeddings" + 0.033*"rating"
2022-02-24 17:27:50,920 : INFO : topic #3 (0.250): 0.057*"specialization" + 0.044*"course" + 0.044*"word" + 0.044*"instructor" + 0.030*"vector" + 0.030*"space" + 0.030*"learning" + 0.030*"processing" + 0.030*"relationship" + 0.030*"model"
2022-02-24 17:27:50,921 : INFO : topic diff=0.009128, rho=0.353553
2022-02-24 17:27:50,926 : INFO : -4.770 per-word bound, 27.3 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 17:27:50,927 : INFO : PROGRESS: pass 7, at document #10/10
2022-02-24 17:27:50,933 : INFO : topic #0 (0.250): 0.047*"specialization" + 0.047*"sentiment" + 0.047*"analysis" + 0.047*"perform" + 0.047*"chatbot" + 0.047*"tool" + 0.047*"translate" + 0.047*"application" + 0.047*"text" + 0.047*"language"
2022-02-24 17:27:50,93

[(0,
  '0.047*"specialization" + 0.047*"sentiment" + 0.047*"analysis" + 0.047*"perform" + 0.047*"chatbot" + 0.047*"tool" + 0.047*"translate" + 0.047*"application" + 0.047*"text" + 0.047*"language"'),
 (1,
  '0.065*"regression" + 0.036*"ukasz" + 0.036*"coauthor" + 0.036*"scientist" + 0.036*"trax" + 0.036*"google" + 0.036*"brain" + 0.036*"paper" + 0.036*"transformer" + 0.036*"research"'),
 (2,
  '0.086*"course" + 0.034*"machine" + 0.034*"sentiment" + 0.034*"vector" + 0.033*"analysis" + 0.033*"embeddings" + 0.033*"word" + 0.033*"rating" + 0.033*"space" + 0.033*"fun"'),
 (3,
  '0.057*"specialization" + 0.044*"course" + 0.044*"word" + 0.044*"instructor" + 0.030*"vector" + 0.030*"space" + 0.030*"processing" + 0.030*"language" + 0.030*"learning" + 0.030*"relationship"')]

In [90]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [91]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
0,rating â â students course natural language pr...
1,end specialization nlp application perform sen...
2,specialization expert nlp machine deep learning
3,younes mourri instructor ai stanford universit...
4,åukasz staff research scientist google brain ...
5,machine translation word embeddings localityse...
6,lecture detailed little hard straight helped r...
7,other i informative fun
8,lesson sentiment analysis logistic regression ...
9,instructor instructor senior curriculum developer


In [92]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,ai,analysis,application,approximate,assignment,aug,awesome,bayes,best,binary,...,tweet,ukasz,university,use,vector,video,visualize,week,word,younes
0,0,1,0,1,0,0,0,1,0,0,...,1,0,0,2,2,1,1,0,3,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,0,1,0,0,1,1,1,0,1,0,...,0,0,0,0,1,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [94]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=80)
ldana.print_topics()

2022-02-24 17:27:56,064 : INFO : using symmetric alpha at 0.2
2022-02-24 17:27:56,066 : INFO : using symmetric eta at 0.2
2022-02-24 17:27:56,067 : INFO : using serial LDA version on this node
2022-02-24 17:27:56,069 : INFO : running online (multi-pass) LDA training, 5 topics, 80 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 17:27:56,086 : INFO : -7.534 per-word bound, 185.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:56,087 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 17:27:56,096 : INFO : topic #0 (0.200): 0.058*"regression" + 0.058*"logistic" + 0.032*"analysis" + 0.032*"vector" + 0.032*"text" + 0.032*"sentiment" + 0.032*"tweet" + 0.032*"extract" + 0.032*"feature" + 0.032*"binary"
2022-02-24 17:27:56,097 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.031*"course" + 0

2022-02-24 17:27:56,186 : INFO : topic #2 (0.200): 0.032*"machine" + 0.032*"deep" + 0.032*"learning" + 0.032*"transformer" + 0.032*"paper" + 0.032*"ukasz" + 0.032*"google" + 0.032*"brain" + 0.032*"staff" + 0.032*"scientist"
2022-02-24 17:27:56,186 : INFO : topic #3 (0.200): 0.063*"course" + 0.043*"instructor" + 0.024*"word" + 0.024*"space" + 0.024*"vector" + 0.024*"embeddings" + 0.024*"localitysensitive" + 0.024*"translation" + 0.024*"sentiment" + 0.024*"rating"
2022-02-24 17:27:56,188 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"language" + 0.038*"perform" + 0.038*"sentiment" + 0.038*"analysis" + 0.037*"nlp" + 0.037*"chatbot" + 0.037*"application" + 0.037*"tool" + 0.037*"end"
2022-02-24 17:27:56,189 : INFO : topic diff=0.025381, rho=0.377964
2022-02-24 17:27:56,196 : INFO : -5.191 per-word bound, 36.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:56,196 : INFO : PROGRESS: pass 6, at document #10/10
2022-02-24 17:27:56,200 

2022-02-24 17:27:56,283 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"analysis" + 0.032*"sentiment" + 0.032*"tweet" + 0.032*"extract" + 0.032*"binary" + 0.032*"feature" + 0.032*"classifier" + 0.032*"learn"
2022-02-24 17:27:56,284 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"model" + 0.025*"vector" + 0.025*"regression" + 0.025*"relationship" + 0.025*"language" + 0.025*"processing" + 0.025*"natural"
2022-02-24 17:27:56,285 : INFO : topic #2 (0.200): 0.032*"deep" + 0.032*"learning" + 0.032*"transformer" + 0.032*"paper" + 0.032*"ukasz" + 0.032*"google" + 0.032*"brain" + 0.032*"staff" + 0.032*"scientist" + 0.032*"tensorflow"
2022-02-24 17:27:56,286 : INFO : topic #3 (0.200): 0.063*"course" + 0.043*"instructor" + 0.024*"word" + 0.024*"space" + 0.024*"vector" + 0.024*"embeddings" + 0.024*"localitysensitive" + 0.024*"translation" + 0.024*"sentiment" + 0.024*"rating"
2022-02-24 17:27:56,287 : INFO : topic #4 (0.200): 0.03

2022-02-24 17:27:56,369 : INFO : topic diff=0.000373, rho=0.235702
2022-02-24 17:27:56,376 : INFO : -5.190 per-word bound, 36.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:56,377 : INFO : PROGRESS: pass 17, at document #10/10
2022-02-24 17:27:56,382 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"analysis" + 0.032*"sentiment" + 0.032*"tweet" + 0.032*"binary" + 0.032*"classifier" + 0.032*"learn" + 0.032*"extract" + 0.032*"feature"
2022-02-24 17:27:56,383 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"model" + 0.025*"language" + 0.025*"relationship" + 0.025*"space" + 0.025*"processing" + 0.025*"natural"
2022-02-24 17:27:56,384 : INFO : topic #2 (0.200): 0.032*"deep" + 0.032*"learning" + 0.032*"transformer" + 0.032*"ukasz" + 0.032*"paper" + 0.032*"google" + 0.032*"staff" + 0.032*"brain" + 0.032*"research" + 0.032*"tensorflow"
2022-02-24 17:27:56,385 : I

2022-02-24 17:27:56,466 : INFO : topic #3 (0.200): 0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"embeddings" + 0.024*"localitysensitive" + 0.024*"translation" + 0.024*"rating" + 0.024*"sentiment" + 0.024*"analysis" + 0.024*"oa"
2022-02-24 17:27:56,467 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"nlp" + 0.038*"sentiment" + 0.038*"analysis" + 0.038*"perform" + 0.038*"text" + 0.038*"language" + 0.038*"chatbot" + 0.038*"application" + 0.038*"end"
2022-02-24 17:27:56,468 : INFO : topic diff=0.000072, rho=0.204124
2022-02-24 17:27:56,474 : INFO : -5.190 per-word bound, 36.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:56,475 : INFO : PROGRESS: pass 23, at document #10/10
2022-02-24 17:27:56,479 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"numerical" + 0.032*"binary" + 0.032*"learn" + 0.032*"feature" + 0.032*"extract"
2022-02-24 1

2022-02-24 17:27:56,581 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"model" + 0.025*"relationship" + 0.025*"processing" + 0.025*"natural"
2022-02-24 17:27:56,582 : INFO : topic #2 (0.200): 0.032*"deep" + 0.032*"learning" + 0.032*"brain" + 0.032*"research" + 0.032*"paper" + 0.032*"coauthor" + 0.032*"staff" + 0.032*"scientist" + 0.032*"transformer" + 0.032*"tensorflow"
2022-02-24 17:27:56,582 : INFO : topic #3 (0.200): 0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"embeddings" + 0.024*"localitysensitive" + 0.024*"translation" + 0.024*"rating" + 0.024*"oa" + 0.024*"cover" + 0.024*"good"
2022-02-24 17:27:56,583 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"nlp" + 0.038*"sentiment" + 0.038*"analysis" + 0.038*"perform" + 0.038*"text" + 0.038*"chatbot" + 0.038*"application" + 0.038*"end" + 0.038*"tool"
2022-02-24 17:27:56,585 : INFO : topic diff=0.000018, rho=0.182574

2022-02-24 17:27:56,717 : INFO : PROGRESS: pass 34, at document #10/10
2022-02-24 17:27:56,726 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"classifier" + 0.032*"learn" + 0.032*"numerical" + 0.032*"lesson" + 0.032*"binary"
2022-02-24 17:27:56,728 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"relationship" + 0.025*"processing" + 0.025*"natural" + 0.025*"use"
2022-02-24 17:27:56,729 : INFO : topic #2 (0.200): 0.032*"deep" + 0.032*"learning" + 0.032*"staff" + 0.032*"paper" + 0.032*"research" + 0.032*"coauthor" + 0.032*"scientist" + 0.032*"brain" + 0.032*"google" + 0.032*"tensorflow"
2022-02-24 17:27:56,730 : INFO : topic #3 (0.200): 0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"embeddings" + 0.024*"localitysensitive" + 0.024*"translation" + 0.024*"rating" + 0.024*"oa" + 0.024*"good" + 0.024*"cov

2022-02-24 17:27:56,854 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"nlp" + 0.038*"sentiment" + 0.038*"analysis" + 0.038*"perform" + 0.038*"text" + 0.038*"chatbot" + 0.038*"application" + 0.038*"tool" + 0.038*"translate"
2022-02-24 17:27:56,855 : INFO : topic diff=0.000002, rho=0.156174
2022-02-24 17:27:56,865 : INFO : -5.190 per-word bound, 36.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:56,866 : INFO : PROGRESS: pass 40, at document #10/10
2022-02-24 17:27:56,874 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"classifier" + 0.032*"learn" + 0.032*"numerical" + 0.032*"lesson" + 0.032*"binary"
2022-02-24 17:27:56,877 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"relationship" + 0.025*"processing" + 0.025*"natural" + 0.025*"use"
2022-02-24 17:27:56

2022-02-24 17:27:56,992 : INFO : topic #3 (0.200): 0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"rating" + 0.024*"embeddings" + 0.024*"translation" + 0.024*"localitysensitive" + 0.024*"oa" + 0.024*"good" + 0.024*"ha"
2022-02-24 17:27:56,994 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"nlp" + 0.038*"sentiment" + 0.038*"analysis" + 0.038*"perform" + 0.038*"text" + 0.038*"application" + 0.038*"chatbot" + 0.038*"tool" + 0.038*"translate"
2022-02-24 17:27:56,994 : INFO : topic diff=0.000001, rho=0.145865
2022-02-24 17:27:57,001 : INFO : -5.190 per-word bound, 36.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:57,001 : INFO : PROGRESS: pass 46, at document #10/10
2022-02-24 17:27:57,005 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"classifier" + 0.032*"learn" + 0.032*"numerical" + 0.032*"lesson" + 0.032*"binary"
2022-02-24 17:27:57

2022-02-24 17:27:57,103 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"processing" + 0.025*"natural" + 0.025*"relationship" + 0.025*"use"
2022-02-24 17:27:57,105 : INFO : topic #2 (0.200): 0.032*"learning" + 0.032*"deep" + 0.032*"brain" + 0.032*"paper" + 0.032*"library" + 0.032*"research" + 0.032*"coauthor" + 0.032*"scientist" + 0.032*"google" + 0.032*"staff"
2022-02-24 17:27:57,106 : INFO : topic #3 (0.200): 0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"localitysensitive" + 0.024*"rating" + 0.024*"embeddings" + 0.024*"translation" + 0.024*"solve" + 0.024*"deeplearnigai" + 0.024*"cover"
2022-02-24 17:27:57,107 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"nlp" + 0.038*"sentiment" + 0.038*"analysis" + 0.038*"perform" + 0.038*"text" + 0.038*"application" + 0.038*"chatbot" + 0.038*"tool" + 0.038*"translate"
2022-02-24 17:27:57,108 : INFO : topic diff=0.000000, rho=

2022-02-24 17:27:57,212 : INFO : topic diff=0.000000, rho=0.131306
2022-02-24 17:27:57,221 : INFO : -5.190 per-word bound, 36.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:57,223 : INFO : PROGRESS: pass 57, at document #10/10
2022-02-24 17:27:57,229 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"classifier" + 0.032*"learn" + 0.032*"numerical" + 0.032*"lesson" + 0.032*"binary"
2022-02-24 17:27:57,231 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"use" + 0.025*"processing" + 0.025*"natural" + 0.025*"relationship"
2022-02-24 17:27:57,232 : INFO : topic #2 (0.200): 0.032*"learning" + 0.032*"deep" + 0.032*"brain" + 0.032*"paper" + 0.032*"library" + 0.032*"research" + 0.032*"coauthor" + 0.032*"scientist" + 0.032*"google" + 0.032*"staff"
2022-02-24 17:27:57,233 : INFO

2022-02-24 17:27:57,336 : INFO : topic #3 (0.200): 0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"localitysensitive" + 0.024*"rating" + 0.024*"embeddings" + 0.024*"translation" + 0.024*"solve" + 0.024*"deeplearnigai" + 0.024*"cover"
2022-02-24 17:27:57,337 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"nlp" + 0.038*"sentiment" + 0.038*"analysis" + 0.038*"perform" + 0.038*"text" + 0.038*"application" + 0.038*"chatbot" + 0.038*"tool" + 0.038*"translate"
2022-02-24 17:27:57,338 : INFO : topic diff=0.000000, rho=0.125000
2022-02-24 17:27:57,346 : INFO : -5.190 per-word bound, 36.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:57,347 : INFO : PROGRESS: pass 63, at document #10/10
2022-02-24 17:27:57,353 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"classifier" + 0.032*"learn" + 0.032*"numerical" + 0.032*"lesson" + 0.032*"binary"
2022

2022-02-24 17:27:57,456 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"use" + 0.025*"processing" + 0.025*"natural" + 0.025*"relationship"
2022-02-24 17:27:57,457 : INFO : topic #2 (0.200): 0.032*"learning" + 0.032*"deep" + 0.032*"brain" + 0.032*"paper" + 0.032*"library" + 0.032*"research" + 0.032*"coauthor" + 0.032*"scientist" + 0.032*"google" + 0.032*"staff"
2022-02-24 17:27:57,458 : INFO : topic #3 (0.200): 0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"localitysensitive" + 0.024*"rating" + 0.024*"embeddings" + 0.024*"translation" + 0.024*"solve" + 0.024*"deeplearnigai" + 0.024*"cover"
2022-02-24 17:27:57,460 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"nlp" + 0.038*"sentiment" + 0.038*"analysis" + 0.038*"perform" + 0.038*"text" + 0.038*"application" + 0.038*"chatbot" + 0.038*"tool" + 0.038*"translate"
2022-02-24 17:27:57,460 : INFO : topic diff=0.000000, rho=

2022-02-24 17:27:57,558 : INFO : topic diff=0.000000, rho=0.115470
2022-02-24 17:27:57,564 : INFO : -5.190 per-word bound, 36.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 17:27:57,565 : INFO : PROGRESS: pass 74, at document #10/10
2022-02-24 17:27:57,570 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"classifier" + 0.032*"learn" + 0.032*"numerical" + 0.032*"lesson" + 0.032*"binary"
2022-02-24 17:27:57,571 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"use" + 0.025*"processing" + 0.025*"natural" + 0.025*"relationship"
2022-02-24 17:27:57,572 : INFO : topic #2 (0.200): 0.032*"learning" + 0.032*"deep" + 0.032*"brain" + 0.032*"paper" + 0.032*"library" + 0.032*"research" + 0.032*"coauthor" + 0.032*"scientist" + 0.032*"google" + 0.032*"staff"
2022-02-24 17:27:57,573 : INFO

2022-02-24 17:27:57,676 : INFO : topic #3 (0.200): 0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"localitysensitive" + 0.024*"rating" + 0.024*"embeddings" + 0.024*"translation" + 0.024*"solve" + 0.024*"deeplearnigai" + 0.024*"cover"
2022-02-24 17:27:57,677 : INFO : topic #4 (0.200): 0.038*"specialization" + 0.038*"nlp" + 0.038*"analysis" + 0.038*"sentiment" + 0.038*"perform" + 0.038*"text" + 0.038*"application" + 0.038*"chatbot" + 0.038*"tool" + 0.038*"translate"
2022-02-24 17:27:57,678 : INFO : topic diff=0.000000, rho=0.111111
2022-02-24 17:27:57,681 : INFO : topic #0 (0.200): 0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"classifier" + 0.032*"learn" + 0.032*"numerical" + 0.032*"lesson" + 0.032*"binary"
2022-02-24 17:27:57,681 : INFO : topic #1 (0.200): 0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"use" + 0.025*"processing" + 0.025*"natural" + 0

[(0,
  '0.059*"regression" + 0.059*"logistic" + 0.032*"sentiment" + 0.032*"analysis" + 0.032*"tweet" + 0.032*"classifier" + 0.032*"learn" + 0.032*"numerical" + 0.032*"lesson" + 0.032*"binary"'),
 (1,
  '0.037*"specialization" + 0.037*"word" + 0.037*"course" + 0.025*"vector" + 0.025*"language" + 0.025*"space" + 0.025*"use" + 0.025*"processing" + 0.025*"natural" + 0.025*"relationship"'),
 (2,
  '0.032*"learning" + 0.032*"deep" + 0.032*"brain" + 0.032*"paper" + 0.032*"library" + 0.032*"research" + 0.032*"coauthor" + 0.032*"scientist" + 0.032*"google" + 0.032*"staff"'),
 (3,
  '0.063*"course" + 0.044*"instructor" + 0.024*"machine" + 0.024*"localitysensitive" + 0.024*"rating" + 0.024*"embeddings" + 0.024*"translation" + 0.024*"solve" + 0.024*"deeplearnigai" + 0.024*"cover"'),
 (4,
  '0.038*"specialization" + 0.038*"nlp" + 0.038*"analysis" + 0.038*"sentiment" + 0.038*"perform" + 0.038*"text" + 0.038*"application" + 0.038*"chatbot" + 0.038*"tool" + 0.038*"translate"')]

In [95]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for a in corpus_transformed], data_dtmna.index))

[([(1, 0.9842049)], 0),
 ([(0, 0.015472394),
   (1, 0.015496536),
   (2, 0.015443179),
   (3, 0.015427861),
   (4, 0.93816)],
  1),
 ([(0, 0.02857671),
   (1, 0.028900608),
   (2, 0.88494927),
   (3, 0.028670197),
   (4, 0.028903188)],
  2),
 ([(0, 0.020007774),
   (1, 0.91934437),
   (2, 0.020343369),
   (3, 0.020250553),
   (4, 0.020053955)],
  3),
 ([(0, 0.015387398),
   (1, 0.015385863),
   (2, 0.9384521),
   (3, 0.015386697),
   (4, 0.015387925)],
  4),
 ([(3, 0.9702209)], 5),
 ([(0, 0.022410458),
   (1, 0.9108991),
   (2, 0.022230314),
   (3, 0.022228278),
   (4, 0.022231828)],
  6),
 ([(0, 0.73328745),
   (1, 0.06667272),
   (2, 0.06668023),
   (3, 0.0666768),
   (4, 0.06668278)],
  7),
 ([(0, 0.9527546),
   (1, 0.011833893),
   (2, 0.011766516),
   (3, 0.011808598),
   (4, 0.011836318)],
  8),
 ([(0, 0.033339832),
   (1, 0.033427913),
   (2, 0.033339832),
   (3, 0.8665514),
   (4, 0.03334105)],
  9)]

In [96]:
sent_topics_df = pd.DataFrame()
for i, row_list in enumerate(ldana[corpusna]):
        row = row_list[0] if ldana.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldana.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
sent_topics_df.columns = ['Dom_Topic', 'Topic_Contri', 'Keywords']
print(sent_topics_df)

   Dom_Topic  Topic_Contri                                           Keywords
0        1.0        0.9842  specialization, word, course, vector, language...
1        4.0        0.9382  specialization, nlp, analysis, sentiment, perf...
2        2.0        0.8850  learning, deep, brain, paper, library, researc...
3        1.0        0.9193  specialization, word, course, vector, language...
4        2.0        0.9385  learning, deep, brain, paper, library, researc...
5        3.0        0.9702  course, instructor, machine, localitysensitive...
6        1.0        0.9109  specialization, word, course, vector, language...
7        0.0        0.7333  regression, logistic, sentiment, analysis, twe...
8        0.0        0.9528  regression, logistic, sentiment, analysis, twe...
9        3.0        0.8666  course, instructor, machine, localitysensitive...
