In [1]:
# Let's read in our document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,ai,algorithm,analysis,application,approximate,assignment,attented,aug,awesome,bayes,...,vector,video,visualize,waiting,wasnnto,week,word,write,younes,youtube
0,0,1,1,0,1,0,0,0,0,1,...,2,1,1,0,0,0,3,1,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,1,1,2,1,0,...,1,0,0,1,1,1,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# One of the required inputs is a term-document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
ai,0,0,0,1,0,0,0,0,0,0
algorithm,1,0,0,0,0,0,0,0,0,0
analysis,1,1,0,0,0,1,0,0,1,0
application,0,1,0,0,0,0,0,0,0,0
approximate,1,0,0,0,0,0,0,0,0,0


In [4]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [5]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [6]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

2022-02-24 12:21:46,411 : INFO : using symmetric alpha at 0.5
2022-02-24 12:21:46,414 : INFO : using symmetric eta at 0.5
2022-02-24 12:21:46,415 : INFO : using serial LDA version on this node
2022-02-24 12:21:46,423 : INFO : running online (multi-pass) LDA training, 2 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:46,446 : INFO : -5.478 per-word bound, 44.6 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 12:21:46,447 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:46,463 : INFO : topic #0 (0.500): 0.036*"course" + 0.028*"analysis" + 0.027*"vector" + 0.024*"word" + 0.019*"aug" + 0.019*"logistic" + 0.018*"specialization" + 0.016*"using" + 0.016*"localitysensitive" + 0.016*"translation"
2022-02-24 12:21:46,464 : INFO : topic #1 (0.500): 0.030*"specialization" + 0.025*"

[(0,
  '0.042*"course" + 0.029*"analysis" + 0.029*"vector" + 0.029*"word" + 0.023*"specialization" + 0.023*"logistic" + 0.023*"using" + 0.023*"language" + 0.016*"aug" + 0.016*"localitysensitive"'),
 (1,
  '0.036*"learning" + 0.036*"instructor" + 0.026*"specialization" + 0.026*"helped" + 0.026*"deep" + 0.016*"build" + 0.016*"designed" + 0.016*"model" + 0.015*"straight" + 0.015*"detailed"')]

In [7]:
# LDA for num_topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

2022-02-24 12:21:46,694 : INFO : using symmetric alpha at 0.3333333333333333
2022-02-24 12:21:46,696 : INFO : using symmetric eta at 0.3333333333333333
2022-02-24 12:21:46,697 : INFO : using serial LDA version on this node
2022-02-24 12:21:46,698 : INFO : running online (multi-pass) LDA training, 3 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:46,733 : INFO : -6.115 per-word bound, 69.3 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 12:21:46,734 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:46,751 : INFO : topic #0 (0.333): 0.037*"course" + 0.026*"aug" + 0.023*"build" + 0.020*"analysis" + 0.019*"helped" + 0.018*"vector" + 0.018*"specialization" + 0.017*"instructor" + 0.017*"learning" + 0.017*"deep"
2022-02-24 12:21:46,752 : INFO : topic #1 (0.333): 0.027*"analysi

2022-02-24 12:21:46,918 : INFO : PROGRESS: pass 8, at document #10/10
2022-02-24 12:21:46,925 : INFO : topic #0 (0.333): 0.045*"course" + 0.031*"aug" + 0.018*"instructor" + 0.018*"build" + 0.018*"helped" + 0.018*"analysis" + 0.018*"machine" + 0.018*"vector" + 0.018*"ai" + 0.018*"stanford"
2022-02-24 12:21:46,925 : INFO : topic #1 (0.333): 0.027*"specialization" + 0.027*"vector" + 0.027*"analysis" + 0.027*"word" + 0.027*"language" + 0.027*"logistic" + 0.027*"course" + 0.027*"using" + 0.019*"text" + 0.019*"model"
2022-02-24 12:21:46,926 : INFO : topic #2 (0.333): 0.044*"learning" + 0.025*"specialization" + 0.025*"designed" + 0.025*"deep" + 0.025*"taught" + 0.025*"expert" + 0.025*"coauthor" + 0.025*"kaiser" + 0.025*"google" + 0.025*"staff"
2022-02-24 12:21:46,927 : INFO : topic diff=0.006084, rho=0.316228
2022-02-24 12:21:46,936 : INFO : -5.120 per-word bound, 34.8 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 12:21:46,937 : INFO : PROGRESS: pass

[(0,
  '0.045*"course" + 0.031*"aug" + 0.018*"instructor" + 0.018*"build" + 0.018*"helped" + 0.018*"analysis" + 0.018*"machine" + 0.018*"vector" + 0.018*"translation" + 0.018*"localitysensitive"'),
 (1,
  '0.027*"specialization" + 0.027*"vector" + 0.027*"analysis" + 0.027*"word" + 0.027*"language" + 0.027*"logistic" + 0.027*"course" + 0.027*"using" + 0.019*"text" + 0.019*"model"'),
 (2,
  '0.044*"learning" + 0.025*"specialization" + 0.025*"designed" + 0.025*"deep" + 0.025*"taught" + 0.025*"expert" + 0.025*"coauthor" + 0.025*"kaiser" + 0.025*"google" + 0.025*"staff"')]

In [8]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

2022-02-24 12:21:46,977 : INFO : using symmetric alpha at 0.25
2022-02-24 12:21:46,979 : INFO : using symmetric eta at 0.25
2022-02-24 12:21:46,980 : INFO : using serial LDA version on this node
2022-02-24 12:21:46,981 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:47,001 : INFO : -6.846 per-word bound, 115.0 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 12:21:47,002 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:47,015 : INFO : topic #0 (0.250): 0.034*"language" + 0.031*"analysis" + 0.031*"specialization" + 0.026*"perform" + 0.026*"course" + 0.025*"vector" + 0.025*"word" + 0.024*"using" + 0.023*"text" + 0.023*"natural"
2022-02-24 12:21:47,016 : INFO : topic #1 (0.250): 0.032*"informative" + 0.031*"fun" + 0.029*

2022-02-24 12:21:47,163 : INFO : topic #2 (0.250): 0.065*"learning" + 0.065*"instructor" + 0.045*"specialization" + 0.045*"deep" + 0.025*"machine" + 0.025*"designed" + 0.025*"helped" + 0.025*"build" + 0.025*"expert" + 0.025*"taught"
2022-02-24 12:21:47,164 : INFO : topic #3 (0.250): 0.037*"logistic" + 0.021*"vector" + 0.021*"using" + 0.021*"model" + 0.020*"analysis" + 0.020*"tweet" + 0.020*"library" + 0.020*"youtube" + 0.020*"tensorflow" + 0.020*"trax"
2022-02-24 12:21:47,165 : INFO : topic diff=0.023351, rho=0.353553
2022-02-24 12:21:47,174 : INFO : -5.085 per-word bound, 33.9 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 12:21:47,175 : INFO : PROGRESS: pass 7, at document #10/10
2022-02-24 12:21:47,187 : INFO : topic #0 (0.250): 0.039*"language" + 0.039*"specialization" + 0.039*"course" + 0.039*"word" + 0.027*"analysis" + 0.027*"vector" + 0.027*"perform" + 0.027*"using" + 0.027*"natural" + 0.027*"use"
2022-02-24 12:21:47,188 : INFO : topic #

[(0,
  '0.039*"language" + 0.039*"specialization" + 0.039*"word" + 0.039*"course" + 0.027*"analysis" + 0.027*"vector" + 0.027*"using" + 0.027*"perform" + 0.027*"natural" + 0.027*"use"'),
 (1,
  '0.058*"course" + 0.040*"aug" + 0.022*"informative" + 0.022*"fun" + 0.022*"machine" + 0.022*"embeddings" + 0.022*"analysis" + 0.022*"translation" + 0.022*"localitysensitive" + 0.022*"vector"'),
 (2,
  '0.065*"learning" + 0.065*"instructor" + 0.045*"specialization" + 0.045*"deep" + 0.025*"designed" + 0.025*"machine" + 0.025*"helped" + 0.025*"build" + 0.025*"expert" + 0.025*"taught"'),
 (3,
  '0.037*"logistic" + 0.021*"vector" + 0.021*"using" + 0.021*"model" + 0.021*"analysis" + 0.021*"tweet" + 0.020*"library" + 0.020*"tensorflow" + 0.020*"trax" + 0.020*"youtube"')]

In [9]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [10]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
0,rating â â students enrolled course nat...
1,by end specialization designed nlp applicatio...
2,this specialization designed taught two expert...
3,younes bensouda mourri instructor ai stanford ...
4,åukasz kaiser staff research scientist google...
5,machine translation word embeddings locality...
6,the lecture exciting detailed though little h...
7,other i informative fun
8,from lesson sentiment analysis logistic regres...
9,instructor instructor senior curriculum developer


In [11]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
0,rating â students course language processing s...
1,specialization application perform sentiment a...
2,specialization machine learning
3,younes instructor ai stanford university learn...
4,åukasz staff research scientist google brain ...
5,machine translation word embeddings sentiment ...
6,lecture straight regression model
7,fun
8,sentiment analysis regression learn feature ve...
9,instructor instructor curriculum developer


In [12]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,ai,analysis,application,assignment,aug,awesome,bayes,brain,chatbot,coauthor,...,tweet,ukasz,university,use,vector,video,visualize,week,word,younes
0,0,1,0,0,0,0,1,0,0,0,...,1,0,0,2,2,1,1,0,3,0
1,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,0,0,0,0
5,0,1,0,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [14]:
# Let's start with 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

2022-02-24 12:21:49,011 : INFO : using symmetric alpha at 0.5
2022-02-24 12:21:49,014 : INFO : using symmetric eta at 0.5
2022-02-24 12:21:49,016 : INFO : using serial LDA version on this node
2022-02-24 12:21:49,018 : INFO : running online (multi-pass) LDA training, 2 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:49,047 : INFO : -5.007 per-word bound, 32.2 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 12:21:49,048 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:49,070 : INFO : topic #0 (0.500): 0.041*"regression" + 0.038*"specialization" + 0.035*"course" + 0.034*"analysis" + 0.034*"sentiment" + 0.033*"instructor" + 0.025*"vector" + 0.023*"language" + 0.022*"word" + 0.021*"perform"
2022-02-24 12:21:49,071 : INFO : topic #1 (0.500): 0.047*"course" + 0.037*"word" + 

2022-02-24 12:21:49,299 : INFO : topic #1 (0.500): 0.069*"course" + 0.048*"word" + 0.038*"vector" + 0.037*"space" + 0.027*"specialization" + 0.027*"sentiment" + 0.027*"analysis" + 0.027*"language" + 0.026*"processing" + 0.026*"translation"


[(0,
  '0.044*"instructor" + 0.044*"regression" + 0.044*"specialization" + 0.032*"learning" + 0.031*"analysis" + 0.031*"sentiment" + 0.019*"text" + 0.019*"translate" + 0.019*"application" + 0.019*"tool"'),
 (1,
  '0.069*"course" + 0.048*"word" + 0.038*"vector" + 0.037*"space" + 0.027*"specialization" + 0.027*"sentiment" + 0.027*"analysis" + 0.027*"language" + 0.026*"processing" + 0.026*"translation"')]

In [15]:
# Let's try topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

2022-02-24 12:21:49,322 : INFO : using symmetric alpha at 0.3333333333333333
2022-02-24 12:21:49,324 : INFO : using symmetric eta at 0.3333333333333333
2022-02-24 12:21:49,325 : INFO : using serial LDA version on this node
2022-02-24 12:21:49,327 : INFO : running online (multi-pass) LDA training, 3 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:49,344 : INFO : -5.572 per-word bound, 47.6 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 12:21:49,345 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:49,372 : INFO : topic #0 (0.333): 0.061*"instructor" + 0.050*"specialization" + 0.046*"course" + 0.043*"learning" + 0.033*"machine" + 0.025*"university" + 0.025*"ai" + 0.025*"younes" + 0.025*"stanford" + 0.024*"curriculum"
2022-02-24 12:21:49,373 : INFO : topic #1 (0.333): 0.0

2022-02-24 12:21:49,529 : INFO : topic diff=0.009806, rho=0.333333
2022-02-24 12:21:49,540 : INFO : -4.697 per-word bound, 25.9 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 12:21:49,542 : INFO : PROGRESS: pass 8, at document #10/10
2022-02-24 12:21:49,548 : INFO : topic #0 (0.333): 0.065*"instructor" + 0.046*"course" + 0.045*"learning" + 0.045*"specialization" + 0.045*"machine" + 0.026*"university" + 0.026*"ai" + 0.026*"younes" + 0.026*"stanford" + 0.026*"curriculum"
2022-02-24 12:21:49,550 : INFO : topic #1 (0.333): 0.048*"regression" + 0.048*"course" + 0.045*"vector" + 0.045*"word" + 0.033*"space" + 0.033*"sentiment" + 0.033*"analysis" + 0.026*"specialization" + 0.026*"language" + 0.026*"tweet"
2022-02-24 12:21:49,551 : INFO : topic #2 (0.333): 0.042*"specialization" + 0.042*"sentiment" + 0.042*"analysis" + 0.042*"perform" + 0.042*"application" + 0.042*"chatbot" + 0.042*"translate" + 0.042*"tool" + 0.042*"text" + 0.042*"language"
2022-02-24

[(0,
  '0.065*"instructor" + 0.046*"course" + 0.045*"learning" + 0.045*"machine" + 0.045*"specialization" + 0.026*"university" + 0.026*"ai" + 0.026*"younes" + 0.026*"stanford" + 0.026*"curriculum"'),
 (1,
  '0.048*"regression" + 0.048*"course" + 0.046*"vector" + 0.045*"word" + 0.033*"space" + 0.033*"sentiment" + 0.033*"analysis" + 0.026*"specialization" + 0.026*"language" + 0.026*"tweet"'),
 (2,
  '0.042*"specialization" + 0.042*"sentiment" + 0.042*"analysis" + 0.042*"perform" + 0.042*"application" + 0.042*"chatbot" + 0.042*"translate" + 0.042*"tool" + 0.042*"text" + 0.042*"language"')]

In [16]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

2022-02-24 12:21:49,606 : INFO : using symmetric alpha at 0.25
2022-02-24 12:21:49,608 : INFO : using symmetric eta at 0.25
2022-02-24 12:21:49,609 : INFO : using serial LDA version on this node
2022-02-24 12:21:49,610 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:49,625 : INFO : -6.260 per-word bound, 76.6 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 12:21:49,626 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:49,645 : INFO : topic #0 (0.250): 0.039*"instructor" + 0.037*"word" + 0.035*"course" + 0.030*"language" + 0.028*"specialization" + 0.027*"use" + 0.026*"space" + 0.025*"relationship" + 0.025*"vector" + 0.023*"processing"
2022-02-24 12:21:49,647 : INFO : topic #1 (0.250): 0.071*"regression" + 0.065*"course

2022-02-24 12:21:49,842 : INFO : topic #2 (0.250): 0.081*"learning" + 0.081*"specialization" + 0.046*"instructor" + 0.045*"machine" + 0.045*"ai" + 0.045*"stanford" + 0.045*"university" + 0.045*"younes" + 0.045*"fun" + 0.009*"course"
2022-02-24 12:21:49,842 : INFO : topic #3 (0.250): 0.047*"specialization" + 0.047*"language" + 0.047*"sentiment" + 0.047*"perform" + 0.047*"analysis" + 0.046*"tool" + 0.046*"text" + 0.046*"translate" + 0.046*"chatbot" + 0.046*"application"
2022-02-24 12:21:49,843 : INFO : topic diff=0.015724, rho=0.353553
2022-02-24 12:21:49,854 : INFO : -4.786 per-word bound, 27.6 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 12:21:49,855 : INFO : PROGRESS: pass 7, at document #10/10
2022-02-24 12:21:49,861 : INFO : topic #0 (0.250): 0.046*"word" + 0.046*"course" + 0.032*"instructor" + 0.032*"language" + 0.032*"specialization" + 0.032*"use" + 0.032*"space" + 0.032*"vector" + 0.032*"relationship" + 0.032*"processing"
2022-02-24 12:

[(0,
  '0.046*"word" + 0.046*"course" + 0.032*"language" + 0.032*"specialization" + 0.032*"space" + 0.032*"use" + 0.032*"vector" + 0.032*"relationship" + 0.032*"processing" + 0.032*"instructor"'),
 (1,
  '0.067*"regression" + 0.067*"course" + 0.046*"sentiment" + 0.046*"analysis" + 0.046*"vector" + 0.026*"machine" + 0.026*"learn" + 0.026*"feature" + 0.026*"lecture" + 0.026*"straight"'),
 (2,
  '0.082*"learning" + 0.082*"specialization" + 0.046*"instructor" + 0.045*"ai" + 0.045*"stanford" + 0.045*"university" + 0.045*"younes" + 0.045*"machine" + 0.045*"fun" + 0.009*"course"'),
 (3,
  '0.047*"specialization" + 0.047*"language" + 0.047*"sentiment" + 0.047*"analysis" + 0.047*"perform" + 0.047*"tool" + 0.047*"text" + 0.047*"translate" + 0.047*"chatbot" + 0.047*"application"')]

In [17]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [18]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
0,rating â â students course natural language pr...
1,end specialization nlp application perform sen...
2,specialization expert nlp machine deep learning
3,younes mourri instructor ai stanford universit...
4,åukasz staff research scientist google brain ...
5,machine translation word embeddings localityse...
6,lecture detailed little hard straight helped r...
7,other i informative fun
8,lesson sentiment analysis logistic regression ...
9,instructor instructor senior curriculum developer


In [19]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,ai,analysis,application,approximate,assignment,aug,awesome,bayes,best,binary,...,tweet,ukasz,university,use,vector,video,visualize,week,word,younes
0,0,1,0,1,0,0,0,1,0,0,...,1,0,0,2,2,1,1,0,3,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,0,1,0,0,1,1,1,0,1,0,...,0,0,0,0,1,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [21]:
# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

2022-02-24 12:21:50,180 : INFO : using symmetric alpha at 0.5
2022-02-24 12:21:50,184 : INFO : using symmetric eta at 0.5
2022-02-24 12:21:50,187 : INFO : using serial LDA version on this node
2022-02-24 12:21:50,190 : INFO : running online (multi-pass) LDA training, 2 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:50,223 : INFO : -5.415 per-word bound, 42.7 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 12:21:50,224 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:50,246 : INFO : topic #0 (0.500): 0.040*"specialization" + 0.023*"language" + 0.022*"word" + 0.022*"course" + 0.021*"learning" + 0.020*"nlp" + 0.020*"deep" + 0.019*"analysis" + 0.019*"perform" + 0.018*"instructor"
2022-02-24 12:21:50,247 : INFO : topic #1 (0.500): 0.035*"course" + 0.026*"regression" + 0.02

[(0,
  '0.041*"specialization" + 0.026*"word" + 0.026*"course" + 0.026*"language" + 0.019*"vector" + 0.019*"space" + 0.019*"perform" + 0.019*"nlp" + 0.019*"learning" + 0.019*"deep"'),
 (1,
  '0.032*"course" + 0.023*"regression" + 0.023*"sentiment" + 0.023*"analysis" + 0.023*"instructor" + 0.023*"logistic" + 0.023*"vector" + 0.014*"machine" + 0.014*"text" + 0.014*"translation"')]

In [22]:
# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

2022-02-24 12:21:50,492 : INFO : using symmetric alpha at 0.3333333333333333
2022-02-24 12:21:50,493 : INFO : using symmetric eta at 0.3333333333333333
2022-02-24 12:21:50,495 : INFO : using serial LDA version on this node
2022-02-24 12:21:50,497 : INFO : running online (multi-pass) LDA training, 3 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:50,522 : INFO : -6.002 per-word bound, 64.1 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 12:21:50,523 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:50,538 : INFO : topic #0 (0.333): 0.052*"course" + 0.036*"word" + 0.028*"analysis" + 0.028*"sentiment" + 0.027*"specialization" + 0.027*"vector" + 0.027*"language" + 0.027*"space" + 0.019*"rating" + 0.019*"localitysensitive"
2022-02-24 12:21:50,539 : INFO : topic #1 (0.333): 0

2022-02-24 12:21:50,691 : INFO : topic diff=0.001993, rho=0.333333
2022-02-24 12:21:50,702 : INFO : -5.015 per-word bound, 32.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 12:21:50,703 : INFO : PROGRESS: pass 8, at document #10/10
2022-02-24 12:21:50,708 : INFO : topic #0 (0.333): 0.052*"course" + 0.036*"word" + 0.028*"specialization" + 0.028*"vector" + 0.028*"analysis" + 0.028*"sentiment" + 0.028*"space" + 0.028*"language" + 0.019*"processing" + 0.019*"rating"
2022-02-24 12:21:50,709 : INFO : topic #1 (0.333): 0.059*"instructor" + 0.034*"senior" + 0.033*"curriculum" + 0.033*"developer" + 0.033*"fun" + 0.033*"informative" + 0.009*"coauthor" + 0.009*"transformer" + 0.008*"scientist" + 0.008*"brain"
2022-02-24 12:21:50,710 : INFO : topic #2 (0.333): 0.040*"regression" + 0.028*"logistic" + 0.028*"learning" + 0.028*"deep" + 0.028*"specialization" + 0.016*"text" + 0.016*"learn" + 0.016*"numerical" + 0.016*"lesson" + 0.016*"binary"
2022-02-24 12:2

[(0,
  '0.052*"course" + 0.036*"word" + 0.028*"specialization" + 0.028*"vector" + 0.028*"analysis" + 0.028*"sentiment" + 0.028*"space" + 0.028*"language" + 0.019*"processing" + 0.019*"natural"'),
 (1,
  '0.059*"instructor" + 0.034*"senior" + 0.034*"curriculum" + 0.034*"developer" + 0.033*"fun" + 0.033*"informative" + 0.008*"coauthor" + 0.008*"transformer" + 0.008*"scientist" + 0.008*"brain"'),
 (2,
  '0.040*"regression" + 0.028*"logistic" + 0.028*"learning" + 0.028*"deep" + 0.028*"specialization" + 0.016*"text" + 0.016*"learn" + 0.016*"numerical" + 0.016*"lesson" + 0.016*"binary"')]

In [23]:
# Let's try 4 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

2022-02-24 12:21:50,757 : INFO : using symmetric alpha at 0.25
2022-02-24 12:21:50,759 : INFO : using symmetric eta at 0.25
2022-02-24 12:21:50,760 : INFO : using serial LDA version on this node
2022-02-24 12:21:50,761 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 12:21:50,782 : INFO : -6.734 per-word bound, 106.4 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 12:21:50,784 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 12:21:50,808 : INFO : topic #0 (0.250): 0.040*"course" + 0.030*"specialization" + 0.025*"space" + 0.025*"word" + 0.022*"instructor" + 0.022*"learning" + 0.022*"deep" + 0.022*"younes" + 0.022*"university" + 0.022*"mourri"
2022-02-24 12:21:50,809 : INFO : topic #1 (0.250): 0.037*"helped" + 0.037*"hard" + 0.037*

2022-02-24 12:21:51,032 : INFO : topic #1 (0.250): 0.038*"model" + 0.038*"regression" + 0.038*"helped" + 0.038*"hard" + 0.038*"lecture" + 0.038*"straight" + 0.038*"detailed" + 0.038*"little" + 0.008*"language" + 0.008*"nlp"
2022-02-24 12:21:51,034 : INFO : topic #2 (0.250): 0.059*"course" + 0.040*"word" + 0.031*"specialization" + 0.030*"vector" + 0.030*"space" + 0.021*"machine" + 0.021*"translation" + 0.021*"rating" + 0.021*"localitysensitive" + 0.021*"processing"
2022-02-24 12:21:51,035 : INFO : topic #3 (0.250): 0.031*"logistic" + 0.031*"analysis" + 0.031*"regression" + 0.031*"sentiment" + 0.031*"text" + 0.031*"instructor" + 0.018*"vector" + 0.018*"language" + 0.018*"specialization" + 0.018*"tweet"
2022-02-24 12:21:51,036 : INFO : topic diff=0.023364, rho=0.353553
2022-02-24 12:21:51,047 : INFO : -5.110 per-word bound, 34.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 12:21:51,048 : INFO : PROGRESS: pass 7, at document #10/10
2022-02-24 12:

[(0,
  '0.037*"learning" + 0.037*"deep" + 0.037*"specialization" + 0.037*"instructor" + 0.037*"younes" + 0.037*"university" + 0.037*"mourri" + 0.037*"stanford" + 0.037*"ai" + 0.008*"course"'),
 (1,
  '0.038*"model" + 0.038*"regression" + 0.038*"helped" + 0.038*"hard" + 0.038*"lecture" + 0.038*"straight" + 0.038*"detailed" + 0.038*"little" + 0.008*"nlp" + 0.008*"language"'),
 (2,
  '0.059*"course" + 0.040*"word" + 0.031*"specialization" + 0.031*"vector" + 0.030*"space" + 0.021*"machine" + 0.021*"translation" + 0.021*"rating" + 0.021*"language" + 0.021*"localitysensitive"'),
 (3,
  '0.031*"logistic" + 0.031*"regression" + 0.031*"analysis" + 0.031*"sentiment" + 0.031*"text" + 0.031*"instructor" + 0.018*"vector" + 0.018*"language" + 0.017*"tweet" + 0.017*"perform"')]

In [37]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=80)
ldana.print_topics()

2022-02-24 15:25:28,897 : INFO : using symmetric alpha at 0.2
2022-02-24 15:25:28,899 : INFO : using symmetric eta at 0.2
2022-02-24 15:25:28,900 : INFO : using serial LDA version on this node
2022-02-24 15:25:28,901 : INFO : running online (multi-pass) LDA training, 5 topics, 80 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 15:25:28,907 : INFO : -7.566 per-word bound, 189.5 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:28,908 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 15:25:28,915 : INFO : topic #0 (0.200): 0.027*"nlp" + 0.027*"sentiment" + 0.027*"perform" + 0.027*"analysis" + 0.027*"end" + 0.027*"chatbot" + 0.027*"tool" + 0.027*"application" + 0.027*"language" + 0.027*"specialization"
2022-02-24 15:25:28,916 : INFO : topic #1 (0.200): 0.053*"instructor" + 0.037*"specializat

2022-02-24 15:25:28,976 : INFO : topic #2 (0.200): 0.055*"informative" + 0.055*"fun" + 0.010*"course" + 0.010*"vector" + 0.010*"space" + 0.010*"specialization" + 0.010*"natural" + 0.010*"relationship" + 0.010*"language" + 0.010*"word"
2022-02-24 15:25:28,977 : INFO : topic #3 (0.200): 0.064*"course" + 0.044*"word" + 0.033*"space" + 0.033*"vector" + 0.023*"use" + 0.023*"natural" + 0.023*"relationship" + 0.023*"embeddings" + 0.023*"specialization" + 0.023*"sentiment"
2022-02-24 15:25:28,977 : INFO : topic #4 (0.200): 0.061*"logistic" + 0.061*"regression" + 0.033*"vector" + 0.033*"tweet" + 0.033*"sentiment" + 0.033*"analysis" + 0.033*"numerical" + 0.033*"lesson" + 0.033*"extract" + 0.033*"feature"
2022-02-24 15:25:28,978 : INFO : topic diff=0.049538, rho=0.377964
2022-02-24 15:25:28,983 : INFO : -5.104 per-word bound, 34.4 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:28,984 : INFO : PROGRESS: pass 6, at document #10/10
2022-02-24 15:25:28,

2022-02-24 15:25:29,037 : INFO : topic #1 (0.200): 0.081*"instructor" + 0.055*"specialization" + 0.055*"deep" + 0.055*"learning" + 0.030*"machine" + 0.030*"stanford" + 0.030*"ai" + 0.030*"university" + 0.030*"mourri" + 0.030*"younes"
2022-02-24 15:25:29,037 : INFO : topic #2 (0.200): 0.055*"informative" + 0.055*"fun" + 0.009*"course" + 0.009*"vector" + 0.009*"specialization" + 0.009*"space" + 0.009*"natural" + 0.009*"relationship" + 0.009*"language" + 0.009*"word"
2022-02-24 15:25:29,038 : INFO : topic #3 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"space" + 0.033*"vector" + 0.023*"use" + 0.023*"natural" + 0.023*"specialization" + 0.023*"relationship" + 0.023*"language" + 0.023*"embeddings"
2022-02-24 15:25:29,038 : INFO : topic #4 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"vector" + 0.034*"tweet" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"numerical" + 0.034*"lesson" + 0.034*"extract"
2022-02-24 15:25:29,038 : INFO : topic diff=0.003999, rho=0.277350

2022-02-24 15:25:29,092 : INFO : PROGRESS: pass 17, at document #10/10
2022-02-24 15:25:29,094 : INFO : topic #0 (0.200): 0.023*"nlp" + 0.023*"end" + 0.023*"chatbot" + 0.023*"tool" + 0.023*"application" + 0.023*"translate" + 0.023*"perform" + 0.023*"ukasz" + 0.023*"trax" + 0.023*"google"
2022-02-24 15:25:29,094 : INFO : topic #1 (0.200): 0.081*"instructor" + 0.056*"specialization" + 0.056*"deep" + 0.056*"learning" + 0.030*"machine" + 0.030*"stanford" + 0.030*"younes" + 0.030*"university" + 0.030*"mourri" + 0.030*"ai"
2022-02-24 15:25:29,095 : INFO : topic #2 (0.200): 0.056*"informative" + 0.056*"fun" + 0.009*"course" + 0.009*"vector" + 0.009*"specialization" + 0.009*"space" + 0.009*"natural" + 0.009*"relationship" + 0.009*"language" + 0.009*"word"
2022-02-24 15:25:29,096 : INFO : topic #3 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"sentiment" + 0.023*"use" + 0.023*"natural" + 0.023*"analysis"
2022-02-24 15

2022-02-24 15:25:29,149 : INFO : topic diff=0.000150, rho=0.204124
2022-02-24 15:25:29,153 : INFO : -5.101 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:29,153 : INFO : PROGRESS: pass 23, at document #10/10
2022-02-24 15:25:29,156 : INFO : topic #0 (0.200): 0.023*"nlp" + 0.023*"end" + 0.023*"chatbot" + 0.023*"tool" + 0.023*"application" + 0.023*"translate" + 0.023*"ukasz" + 0.023*"perform" + 0.023*"trax" + 0.023*"google"
2022-02-24 15:25:29,156 : INFO : topic #1 (0.200): 0.081*"instructor" + 0.056*"specialization" + 0.056*"deep" + 0.056*"learning" + 0.030*"machine" + 0.030*"younes" + 0.030*"university" + 0.030*"stanford" + 0.030*"mourri" + 0.030*"ai"
2022-02-24 15:25:29,156 : INFO : topic #2 (0.200): 0.056*"informative" + 0.056*"fun" + 0.009*"course" + 0.009*"specialization" + 0.009*"vector" + 0.009*"space" + 0.009*"language" + 0.009*"natural" + 0.009*"relationship" + 0.009*"word"
2022-02-24 15:25:29,157 : INFO : top

2022-02-24 15:25:29,202 : INFO : topic #4 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"tweet" + 0.034*"numerical" + 0.034*"lesson" + 0.034*"extract" + 0.034*"feature"
2022-02-24 15:25:29,203 : INFO : topic diff=0.000037, rho=0.182574
2022-02-24 15:25:29,206 : INFO : -5.101 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:29,207 : INFO : PROGRESS: pass 29, at document #10/10
2022-02-24 15:25:29,209 : INFO : topic #0 (0.200): 0.023*"nlp" + 0.023*"end" + 0.023*"tool" + 0.023*"application" + 0.023*"chatbot" + 0.023*"translate" + 0.023*"ukasz" + 0.023*"trax" + 0.023*"google" + 0.023*"scientist"
2022-02-24 15:25:29,209 : INFO : topic #1 (0.200): 0.081*"instructor" + 0.056*"specialization" + 0.056*"deep" + 0.056*"learning" + 0.030*"machine" + 0.030*"younes" + 0.030*"university" + 0.030*"stanford" + 0.030*"mourri" + 0.030*"ai"
2022-02-24 15:25:29,210 : INFO : topi

2022-02-24 15:25:29,255 : INFO : topic #3 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"sentiment" + 0.023*"analysis" + 0.023*"use" + 0.023*"natural"
2022-02-24 15:25:29,255 : INFO : topic #4 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"tweet" + 0.034*"numerical" + 0.034*"lesson" + 0.034*"extract" + 0.034*"feature"
2022-02-24 15:25:29,256 : INFO : topic diff=0.000011, rho=0.166667
2022-02-24 15:25:29,259 : INFO : -5.101 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:29,259 : INFO : PROGRESS: pass 35, at document #10/10
2022-02-24 15:25:29,263 : INFO : topic #0 (0.200): 0.023*"nlp" + 0.023*"translate" + 0.023*"chatbot" + 0.023*"tool" + 0.023*"end" + 0.023*"application" + 0.023*"ukasz" + 0.023*"trax" + 0.023*"google" + 0.023*"scientist"
2022-02-24 15:25:29,263 : INFO : topic #1 

2022-02-24 15:25:29,307 : INFO : topic #2 (0.200): 0.056*"informative" + 0.056*"fun" + 0.009*"specialization" + 0.009*"nlp" + 0.009*"regression" + 0.009*"machine" + 0.009*"analysis" + 0.009*"sentiment" + 0.009*"model" + 0.009*"curriculum"
2022-02-24 15:25:29,308 : INFO : topic #3 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"sentiment" + 0.023*"analysis" + 0.023*"use" + 0.023*"natural"
2022-02-24 15:25:29,308 : INFO : topic #4 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"tweet" + 0.034*"numerical" + 0.034*"lesson" + 0.034*"extract" + 0.034*"feature"
2022-02-24 15:25:29,308 : INFO : topic diff=0.000003, rho=0.154303
2022-02-24 15:25:29,311 : INFO : -5.101 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:29,312 : INFO : PROGRESS: pass 41, at document #10/10
2022-02-24 15:25:29,315

2022-02-24 15:25:29,358 : INFO : topic #1 (0.200): 0.081*"instructor" + 0.056*"specialization" + 0.056*"deep" + 0.056*"learning" + 0.030*"machine" + 0.030*"younes" + 0.030*"university" + 0.030*"stanford" + 0.030*"mourri" + 0.030*"ai"
2022-02-24 15:25:29,358 : INFO : topic #2 (0.200): 0.056*"fun" + 0.056*"informative" + 0.009*"specialization" + 0.009*"nlp" + 0.009*"regression" + 0.009*"machine" + 0.009*"analysis" + 0.009*"sentiment" + 0.009*"model" + 0.009*"senior"
2022-02-24 15:25:29,359 : INFO : topic #3 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"sentiment" + 0.023*"analysis" + 0.023*"use" + 0.023*"natural"
2022-02-24 15:25:29,359 : INFO : topic #4 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"tweet" + 0.034*"numerical" + 0.034*"extract" + 0.034*"lesson" + 0.034*"classifier"
2022-02-24 15:25:29,359 : INFO : topic diff=0.000001, rho=0.144338


2022-02-24 15:25:29,406 : INFO : PROGRESS: pass 52, at document #10/10
2022-02-24 15:25:29,409 : INFO : topic #0 (0.200): 0.023*"nlp" + 0.023*"ukasz" + 0.023*"trax" + 0.023*"staff" + 0.023*"library" + 0.023*"brain" + 0.023*"google" + 0.023*"coauthor" + 0.023*"research" + 0.023*"paper"
2022-02-24 15:25:29,409 : INFO : topic #1 (0.200): 0.081*"instructor" + 0.056*"specialization" + 0.056*"deep" + 0.056*"learning" + 0.030*"machine" + 0.030*"younes" + 0.030*"university" + 0.030*"stanford" + 0.030*"mourri" + 0.030*"ai"
2022-02-24 15:25:29,410 : INFO : topic #2 (0.200): 0.056*"fun" + 0.056*"informative" + 0.009*"nlp" + 0.009*"specialization" + 0.009*"regression" + 0.009*"machine" + 0.009*"analysis" + 0.009*"sentiment" + 0.009*"model" + 0.009*"senior"
2022-02-24 15:25:29,410 : INFO : topic #3 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"analysis" + 0.023*"sentiment" + 0.023*"use" + 0.023*"natural"
2022-02-24 15:25

2022-02-24 15:25:29,456 : INFO : topic diff=0.000000, rho=0.130189
2022-02-24 15:25:29,460 : INFO : -5.101 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:29,461 : INFO : PROGRESS: pass 58, at document #10/10
2022-02-24 15:25:29,463 : INFO : topic #0 (0.200): 0.023*"nlp" + 0.023*"ukasz" + 0.023*"trax" + 0.023*"staff" + 0.023*"library" + 0.023*"brain" + 0.023*"google" + 0.023*"coauthor" + 0.023*"research" + 0.023*"paper"
2022-02-24 15:25:29,463 : INFO : topic #1 (0.200): 0.081*"instructor" + 0.056*"specialization" + 0.056*"deep" + 0.056*"learning" + 0.030*"machine" + 0.030*"younes" + 0.030*"university" + 0.030*"stanford" + 0.030*"mourri" + 0.030*"ai"
2022-02-24 15:25:29,464 : INFO : topic #2 (0.200): 0.056*"fun" + 0.056*"informative" + 0.009*"nlp" + 0.009*"specialization" + 0.009*"machine" + 0.009*"regression" + 0.009*"sentiment" + 0.009*"analysis" + 0.009*"model" + 0.009*"senior"
2022-02-24 15:25:29,464 : INFO : topic 

2022-02-24 15:25:29,506 : INFO : topic #4 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"tweet" + 0.034*"classifier" + 0.034*"feature" + 0.034*"numerical" + 0.034*"extract"
2022-02-24 15:25:29,506 : INFO : topic diff=0.000000, rho=0.124035
2022-02-24 15:25:29,510 : INFO : -5.101 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:29,510 : INFO : PROGRESS: pass 64, at document #10/10
2022-02-24 15:25:29,513 : INFO : topic #0 (0.200): 0.023*"nlp" + 0.023*"library" + 0.023*"brain" + 0.023*"research" + 0.023*"staff" + 0.023*"paper" + 0.023*"google" + 0.023*"coauthor" + 0.023*"tensorflow" + 0.023*"scientist"
2022-02-24 15:25:29,513 : INFO : topic #1 (0.200): 0.081*"instructor" + 0.056*"specialization" + 0.056*"deep" + 0.056*"learning" + 0.030*"machine" + 0.030*"younes" + 0.030*"university" + 0.030*"stanford" + 0.030*"mourri" + 0.030*"ai"
2022-02-24 15:25:29,514 : IN

2022-02-24 15:25:29,557 : INFO : topic #3 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"analysis" + 0.023*"sentiment" + 0.023*"use" + 0.023*"processing"
2022-02-24 15:25:29,557 : INFO : topic #4 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"tweet" + 0.034*"classifier" + 0.034*"feature" + 0.034*"numerical" + 0.034*"extract"
2022-02-24 15:25:29,558 : INFO : topic diff=0.000000, rho=0.118678
2022-02-24 15:25:29,561 : INFO : -5.101 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:29,561 : INFO : PROGRESS: pass 70, at document #10/10
2022-02-24 15:25:29,564 : INFO : topic #0 (0.200): 0.023*"nlp" + 0.023*"library" + 0.023*"brain" + 0.023*"research" + 0.023*"staff" + 0.023*"paper" + 0.023*"google" + 0.023*"coauthor" + 0.023*"tensorflow" + 0.023*"scientist"
2022-02-24 15:25:29,564 : INFO

2022-02-24 15:25:29,610 : INFO : topic #2 (0.200): 0.056*"fun" + 0.056*"informative" + 0.009*"nlp" + 0.009*"specialization" + 0.009*"machine" + 0.009*"regression" + 0.009*"sentiment" + 0.009*"analysis" + 0.009*"model" + 0.009*"senior"
2022-02-24 15:25:29,610 : INFO : topic #3 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"analysis" + 0.023*"sentiment" + 0.023*"use" + 0.023*"processing"
2022-02-24 15:25:29,611 : INFO : topic #4 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"tweet" + 0.034*"classifier" + 0.034*"feature" + 0.034*"numerical" + 0.034*"extract"
2022-02-24 15:25:29,611 : INFO : topic diff=0.000000, rho=0.113961
2022-02-24 15:25:29,614 : INFO : -5.101 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 15:25:29,615 : INFO : PROGRESS: pass 76, at document #10/10
2022-02-24 15:25:29,

[(0,
  '0.023*"nlp" + 0.023*"library" + 0.023*"brain" + 0.023*"research" + 0.023*"staff" + 0.023*"paper" + 0.023*"google" + 0.023*"coauthor" + 0.023*"tensorflow" + 0.023*"scientist"'),
 (1,
  '0.081*"instructor" + 0.056*"specialization" + 0.056*"deep" + 0.056*"learning" + 0.030*"machine" + 0.030*"younes" + 0.030*"university" + 0.030*"stanford" + 0.030*"mourri" + 0.030*"ai"'),
 (2,
  '0.056*"fun" + 0.056*"informative" + 0.009*"nlp" + 0.009*"specialization" + 0.009*"machine" + 0.009*"regression" + 0.009*"sentiment" + 0.009*"analysis" + 0.009*"model" + 0.009*"senior"'),
 (3,
  '0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"analysis" + 0.023*"sentiment" + 0.023*"use" + 0.023*"processing"'),
 (4,
  '0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"text" + 0.034*"tweet" + 0.034*"classifier" + 0.034*"feature" + 0.034*"numerical" + 0.034*"extract"')]

In [73]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for a in corpus_transformed], data_dtmna.index))

[([(3, 0.98423713)], 0),
 ([(0, 0.93794733),
   (1, 0.015538729),
   (2, 0.015391254),
   (3, 0.015581874),
   (4, 0.01554082)],
  1),
 ([(0, 0.028716104),
   (1, 0.8854514),
   (2, 0.02857835),
   (3, 0.028678542),
   (4, 0.028575635)],
  2),
 ([(0, 0.020021288),
   (1, 0.91994584),
   (2, 0.02000486),
   (3, 0.020025084),
   (4, 0.020002956)],
  3),
 ([(0, 0.9384454),
   (1, 0.015388263),
   (2, 0.015391453),
   (3, 0.015386183),
   (4, 0.01538868)],
  4),
 ([(3, 0.97030735)], 5),
 ([(0, 0.91074777),
   (1, 0.022227537),
   (2, 0.022231929),
   (3, 0.02230375),
   (4, 0.022489058)],
  6),
 ([(0, 0.06667232),
   (1, 0.06667399),
   (2, 0.73330903),
   (3, 0.0666698),
   (4, 0.0666748)],
  7),
 ([(0, 0.011816977),
   (1, 0.011766334),
   (2, 0.011767677),
   (3, 0.0118445745),
   (4, 0.95280445)],
  8),
 ([(0, 0.033336755),
   (1, 0.8666483),
   (2, 0.033341426),
   (3, 0.03333524),
   (4, 0.03333825)],
  9)]

In [76]:
sent_topics_df = pd.DataFrame()
for i, row_list in enumerate(ldana[corpusna]):
        row = row_list[0] if ldana.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldana.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
sent_topics_df.columns = ['Dom_Topic', 'Topic_Contri', 'Keywords']
print(sent_topics_df)

   Dom_Topic  Topic_Contri                                           Keywords
0        3.0        0.9842  course, word, vector, space, specialization, l...
1        0.0        0.9379  nlp, library, brain, research, staff, paper, g...
2        1.0        0.8855  instructor, specialization, deep, learning, ma...
3        1.0        0.9199  instructor, specialization, deep, learning, ma...
4        0.0        0.9384  nlp, library, brain, research, staff, paper, g...
5        3.0        0.9703  course, word, vector, space, specialization, l...
6        0.0        0.9107  nlp, library, brain, research, staff, paper, g...
7        2.0        0.7333  fun, informative, nlp, specialization, machine...
8        4.0        0.9528  regression, logistic, sentiment, analysis, tex...
9        1.0        0.8666  instructor, specialization, deep, learning, ma...
