In [1]:
# Let's read in our document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,algorithm,analysis,appears,application,applies,approximate,assignment,associated,attented,aug,...,video,visualize,vocabulary,waiting,want,wasnnto,week,write,younes,youtube
0,1,1,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
1,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,1,0,1,2,...,0,0,0,1,0,1,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# One of the required inputs is a term-document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
algorithm,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
analysis,1,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
appears,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
application,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
applies,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [4]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [5]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [6]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

2022-02-25 11:50:08,762 : INFO : using symmetric alpha at 0.5
2022-02-25 11:50:08,765 : INFO : using symmetric eta at 0.5
2022-02-25 11:50:08,766 : INFO : using serial LDA version on this node
2022-02-25 11:50:08,772 : INFO : running online (multi-pass) LDA training, 2 topics, 10 passes over the supplied corpus of 37 documents, updating model once every 37 documents, evaluating perplexity every 37 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-25 11:50:08,810 : INFO : -5.824 per-word bound, 56.6 perplexity estimate based on a held-out corpus of 37 documents with 233 words
2022-02-25 11:50:08,811 : INFO : PROGRESS: pass 0, at document #37/37
2022-02-25 11:50:08,838 : INFO : topic #0 (0.500): 0.021*"vocabulary" + 0.019*"appears" + 0.018*"specialization" + 0.018*"corpus" + 0.017*"instructor" + 0.016*"example" + 0.015*"number" + 0.014*"dictionary" + 0.014*"use" + 0.013*"count"
2022-02-25 11:50:08,839 : INFO : topic #1 (0.500): 0.028*"course" + 0.021*"using" + 0.0

[(0,
  '0.025*"vocabulary" + 0.020*"instructor" + 0.020*"appears" + 0.020*"learning" + 0.020*"specialization" + 0.020*"example" + 0.015*"dictionary" + 0.015*"corpus" + 0.014*"count" + 0.014*"number"'),
 (1,
  '0.032*"course" + 0.022*"using" + 0.022*"vector" + 0.022*"table" + 0.019*"analysis" + 0.018*"set" + 0.018*"use" + 0.018*"feature" + 0.017*"space" + 0.017*"entire"')]

In [7]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

2022-02-25 11:50:14,744 : INFO : using symmetric alpha at 0.25
2022-02-25 11:50:14,746 : INFO : using symmetric eta at 0.25
2022-02-25 11:50:14,746 : INFO : using serial LDA version on this node
2022-02-25 11:50:14,747 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 37 documents, updating model once every 37 documents, evaluating perplexity every 37 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-25 11:50:14,764 : INFO : -7.106 per-word bound, 137.7 perplexity estimate based on a held-out corpus of 37 documents with 233 words
2022-02-25 11:50:14,765 : INFO : PROGRESS: pass 0, at document #37/37
2022-02-25 11:50:14,778 : INFO : topic #0 (0.250): 0.036*"feature" + 0.028*"using" + 0.026*"use" + 0.025*"classifier" + 0.025*"extract" + 0.025*"example" + 0.025*"appears" + 0.016*"vector" + 0.015*"analysis" + 0.015*"video"
2022-02-25 11:50:14,779 : INFO : topic #1 (0.250): 0.058*"number" + 0.031*"instructor" + 0.031*"c

2022-02-25 11:50:14,910 : INFO : topic #3 (0.250): 0.044*"course" + 0.023*"specialization" + 0.023*"space" + 0.023*"vector" + 0.016*"use" + 0.016*"belong" + 0.016*"entire" + 0.016*"learning" + 0.016*"model" + 0.016*"analysis"
2022-02-25 11:50:14,911 : INFO : topic diff=0.004785, rho=0.353553
2022-02-25 11:50:14,921 : INFO : -5.544 per-word bound, 46.6 perplexity estimate based on a held-out corpus of 37 documents with 233 words
2022-02-25 11:50:14,921 : INFO : PROGRESS: pass 7, at document #37/37
2022-02-25 11:50:14,927 : INFO : topic #0 (0.250): 0.038*"feature" + 0.027*"example" + 0.027*"appears" + 0.027*"using" + 0.027*"classifier" + 0.027*"extract" + 0.027*"use" + 0.015*"analysis" + 0.015*"feel" + 0.015*"check"
2022-02-25 11:50:14,927 : INFO : topic #1 (0.250): 0.059*"number" + 0.031*"vocabulary" + 0.031*"count" + 0.031*"instructor" + 0.031*"want" + 0.031*"given" + 0.031*"track" + 0.031*"map" + 0.031*"corresponding" + 0.017*"dictionary"
2022-02-25 11:50:14,928 : INFO : topic #2 (0.2

[(0,
  '0.038*"feature" + 0.027*"example" + 0.027*"appears" + 0.027*"using" + 0.027*"classifier" + 0.027*"extract" + 0.027*"use" + 0.015*"analysis" + 0.015*"feel" + 0.015*"check"'),
 (1,
  '0.059*"number" + 0.031*"vocabulary" + 0.031*"count" + 0.031*"instructor" + 0.031*"want" + 0.031*"given" + 0.031*"track" + 0.031*"map" + 0.031*"corresponding" + 0.017*"dictionary"'),
 (2,
  '0.029*"corpus" + 0.029*"table" + 0.029*"set" + 0.029*"specialization" + 0.029*"let" + 0.029*"analysis" + 0.029*"associated" + 0.016*"vocabulary" + 0.016*"unique" + 0.016*"designed"'),
 (3,
  '0.044*"course" + 0.023*"vector" + 0.023*"specialization" + 0.023*"space" + 0.016*"entire" + 0.016*"use" + 0.016*"analysis" + 0.016*"belong" + 0.016*"model" + 0.016*"language"')]

In [8]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [9]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
0,rating â â students enrolled course nat...
1,by end specialization designed nlp applicatio...
2,this specialization designed taught two expert...
3,younes bensouda mourri instructor ai stanford ...
4,åukasz kaiser staff research scientist google...
5,machine translation word embeddings locality...
6,the lecture exciting detailed though little h...
7,other i informative fun
8,from lesson sentiment analysis logistic regres...
9,instructor instructor senior curriculum develo...


In [10]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
0,rating â students course language processing s...
1,specialization application perform sentiment a...
2,specialization machine learning
3,younes instructor ai stanford university learn...
4,åukasz staff research scientist google brain ...
5,machine translation word embeddings sentiment ...
6,lecture straight regression model
7,fun
8,sentiment analysis regression learn feature ve...
9,instructor instructor curriculum developer gen...


In [11]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,ai,analysis,application,applies,assignment,aug,awesome,bayes,brain,chatbot,...,university,use,value,vector,video,visualize,vocabulary,week,word,younes
0,0,1,0,0,0,0,0,1,0,0,...,0,2,0,2,1,1,0,0,3,0
1,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,1,1,1,0,0,0,...,0,0,0,1,0,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [13]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

2022-02-25 11:50:31,763 : INFO : using symmetric alpha at 0.25
2022-02-25 11:50:31,764 : INFO : using symmetric eta at 0.25
2022-02-25 11:50:31,765 : INFO : using serial LDA version on this node
2022-02-25 11:50:31,766 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 37 documents, updating model once every 37 documents, evaluating perplexity every 37 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-25 11:50:31,782 : INFO : -6.329 per-word bound, 80.4 perplexity estimate based on a held-out corpus of 37 documents with 198 words
2022-02-25 11:50:31,783 : INFO : PROGRESS: pass 0, at document #37/37
2022-02-25 11:50:31,796 : INFO : topic #0 (0.250): 0.054*"sentiment" + 0.054*"analysis" + 0.054*"example" + 0.031*"class" + 0.031*"word" + 0.030*"specialization" + 0.030*"chatbot" + 0.030*"perform" + 0.030*"application" + 0.030*"tool"
2022-02-25 11:50:31,796 : INFO : topic #1 (0.250): 0.059*"tweet" + 0.059*"sentiment" + 

2022-02-25 11:50:31,899 : INFO : topic #2 (0.250): 0.104*"word" + 0.098*"class" + 0.097*"frequency" + 0.081*"tweet" + 0.040*"number" + 0.031*"feature" + 0.031*"corpus" + 0.021*"instructor" + 0.021*"classifier" + 0.021*"count"
2022-02-25 11:50:31,899 : INFO : topic #3 (0.250): 0.059*"course" + 0.040*"word" + 0.040*"vector" + 0.040*"specialization" + 0.031*"regression" + 0.031*"space" + 0.031*"sentiment" + 0.031*"analysis" + 0.031*"look" + 0.024*"tweet"
2022-02-25 11:50:31,900 : INFO : topic diff=0.005848, rho=0.353553
2022-02-25 11:50:31,911 : INFO : -4.782 per-word bound, 27.5 perplexity estimate based on a held-out corpus of 37 documents with 198 words
2022-02-25 11:50:31,914 : INFO : PROGRESS: pass 7, at document #37/37
2022-02-25 11:50:31,920 : INFO : topic #0 (0.250): 0.055*"example" + 0.055*"sentiment" + 0.055*"analysis" + 0.031*"vocabulary" + 0.030*"specialization" + 0.030*"perform" + 0.030*"chatbot" + 0.030*"application" + 0.030*"tool" + 0.030*"translate"
2022-02-25 11:50:31,921

[(0,
  '0.055*"example" + 0.055*"sentiment" + 0.055*"analysis" + 0.031*"vocabulary" + 0.030*"specialization" + 0.030*"perform" + 0.030*"chatbot" + 0.030*"application" + 0.030*"tool" + 0.030*"translate"'),
 (1,
  '0.065*"sentiment" + 0.037*"check" + 0.036*"regression" + 0.036*"lecture" + 0.036*"model" + 0.036*"straight" + 0.036*"pause" + 0.036*"entry" + 0.036*"clarity" + 0.036*"example"'),
 (2,
  '0.105*"word" + 0.097*"class" + 0.097*"frequency" + 0.083*"tweet" + 0.040*"number" + 0.031*"feature" + 0.031*"corpus" + 0.021*"instructor" + 0.021*"classifier" + 0.021*"count"'),
 (3,
  '0.059*"course" + 0.040*"word" + 0.040*"vector" + 0.040*"specialization" + 0.031*"regression" + 0.031*"space" + 0.031*"analysis" + 0.031*"sentiment" + 0.031*"look" + 0.024*"tweet"')]

In [14]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [15]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
0,rating â â students course natural language pr...
1,end specialization nlp application perform sen...
2,specialization expert nlp machine deep learning
3,younes mourri instructor ai stanford universit...
4,åukasz staff research scientist google brain ...
5,machine translation word embeddings localityse...
6,lecture detailed little hard straight helped r...
7,other i informative fun
8,lesson sentiment analysis logistic regression ...
9,instructor instructor senior curriculum develo...


In [16]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,ai,analysis,application,applies,approximate,assignment,aug,awesome,bayes,belong,...,university,use,value,vector,video,visualize,vocabulary,week,word,younes
0,0,1,0,0,1,0,0,0,1,0,...,0,2,0,2,1,1,0,0,3,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,1,1,1,0,0,...,0,0,0,1,0,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [17]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [18]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=80)
ldana.print_topics()

2022-02-25 11:52:12,577 : INFO : using symmetric alpha at 0.2
2022-02-25 11:52:12,578 : INFO : using symmetric eta at 0.2
2022-02-25 11:52:12,578 : INFO : using serial LDA version on this node
2022-02-25 11:52:12,579 : INFO : running online (multi-pass) LDA training, 5 topics, 80 passes over the supplied corpus of 37 documents, updating model once every 37 documents, evaluating perplexity every 37 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-25 11:52:12,595 : INFO : -7.312 per-word bound, 158.9 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:12,596 : INFO : PROGRESS: pass 0, at document #37/37
2022-02-25 11:52:12,609 : INFO : topic #0 (0.200): 0.041*"sentiment" + 0.036*"class" + 0.030*"use" + 0.028*"tweet" + 0.027*"specialization" + 0.025*"frequency" + 0.025*"dictionary" + 0.025*"instructor" + 0.023*"nlp" + 0.020*"word"
2022-02-25 11:52:12,611 : INFO : topic #1 (0.200): 0.084*"class" + 0.084*"word" + 0.068*"fre

2022-02-25 11:52:12,717 : INFO : topic #2 (0.200): 0.037*"feature" + 0.037*"free" + 0.037*"check" + 0.021*"use" + 0.020*"logistic" + 0.020*"regression" + 0.020*"count" + 0.020*"classifier" + 0.020*"extract" + 0.020*"coauthor"
2022-02-25 11:52:12,718 : INFO : topic #3 (0.200): 0.083*"tweet" + 0.056*"positive" + 0.047*"sentiment" + 0.039*"word" + 0.030*"analysis" + 0.029*"course" + 0.029*"example" + 0.024*"vocabulary" + 0.020*"look" + 0.020*"second"
2022-02-25 11:52:12,718 : INFO : topic #4 (0.200): 0.032*"specialization" + 0.032*"word" + 0.032*"course" + 0.022*"regression" + 0.022*"model" + 0.022*"vector" + 0.022*"processing" + 0.022*"space" + 0.022*"natural" + 0.022*"relationship"
2022-02-25 11:52:12,719 : INFO : topic diff=0.048285, rho=0.377964
2022-02-25 11:52:12,727 : INFO : -5.225 per-word bound, 37.4 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:12,728 : INFO : PROGRESS: pass 6, at document #37/37
2022-02-25 11:52:12,733 : INFO : t

2022-02-25 11:52:12,838 : INFO : topic #1 (0.200): 0.117*"class" + 0.103*"word" + 0.096*"frequency" + 0.076*"negative" + 0.054*"positive" + 0.045*"number" + 0.034*"corpus" + 0.033*"table" + 0.023*"map" + 0.023*"track"
2022-02-25 11:52:12,839 : INFO : topic #2 (0.200): 0.038*"feature" + 0.038*"free" + 0.038*"check" + 0.021*"use" + 0.021*"count" + 0.021*"look" + 0.021*"logistic" + 0.021*"regression" + 0.021*"extract" + 0.021*"classifier"
2022-02-25 11:52:12,839 : INFO : topic #3 (0.200): 0.087*"tweet" + 0.050*"positive" + 0.049*"sentiment" + 0.035*"word" + 0.030*"analysis" + 0.030*"course" + 0.030*"example" + 0.021*"vocabulary" + 0.021*"second" + 0.021*"regression"
2022-02-25 11:52:12,840 : INFO : topic #4 (0.200): 0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"regression" + 0.023*"model" + 0.023*"vector" + 0.023*"use" + 0.023*"space" + 0.023*"processing" + 0.023*"language"
2022-02-25 11:52:12,840 : INFO : topic diff=0.008627, rho=0.277350
2022-02-25 11:52:12,851 : INFO 

2022-02-25 11:52:12,939 : INFO : PROGRESS: pass 17, at document #37/37
2022-02-25 11:52:12,943 : INFO : topic #0 (0.200): 0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"sentiment" + 0.018*"classifier" + 0.018*"learn" + 0.018*"text"
2022-02-25 11:52:12,944 : INFO : topic #1 (0.200): 0.122*"class" + 0.102*"word" + 0.095*"frequency" + 0.075*"negative" + 0.056*"positive" + 0.044*"number" + 0.033*"corpus" + 0.033*"table" + 0.025*"vocabulary" + 0.023*"map"
2022-02-25 11:52:12,945 : INFO : topic #2 (0.200): 0.039*"feature" + 0.038*"free" + 0.038*"check" + 0.021*"count" + 0.021*"look" + 0.021*"extract" + 0.021*"classifier" + 0.021*"paper" + 0.021*"library" + 0.021*"transformer"
2022-02-25 11:52:12,946 : INFO : topic #3 (0.200): 0.088*"tweet" + 0.050*"sentiment" + 0.048*"positive" + 0.035*"word" + 0.031*"analysis" + 0.031*"course" + 0.031*"example" + 0.021*"regression" + 0.021*"logistic" + 0.021*"second"
2022-02-25 11:52

2022-02-25 11:52:13,042 : INFO : topic diff=0.001352, rho=0.204124
2022-02-25 11:52:13,051 : INFO : -5.199 per-word bound, 36.7 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:13,051 : INFO : PROGRESS: pass 23, at document #37/37
2022-02-25 11:52:13,056 : INFO : topic #0 (0.200): 0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"classifier" + 0.018*"learn" + 0.018*"text" + 0.018*"deep"
2022-02-25 11:52:13,057 : INFO : topic #1 (0.200): 0.125*"class" + 0.102*"word" + 0.095*"frequency" + 0.074*"negative" + 0.057*"positive" + 0.043*"number" + 0.033*"corpus" + 0.033*"table" + 0.026*"vocabulary" + 0.023*"map"
2022-02-25 11:52:13,058 : INFO : topic #2 (0.200): 0.039*"feature" + 0.039*"free" + 0.039*"check" + 0.021*"count" + 0.021*"look" + 0.021*"extract" + 0.021*"classifier" + 0.021*"paper" + 0.021*"staff" + 0.021*"coauthor"
2022-02-25 11:52:13,059 : INFO : topic #3 (0.200):

2022-02-25 11:52:13,147 : INFO : topic #4 (0.200): 0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"use" + 0.023*"regression" + 0.023*"language" + 0.023*"space" + 0.023*"vector" + 0.023*"model" + 0.023*"processing"
2022-02-25 11:52:13,147 : INFO : topic diff=0.000641, rho=0.182574
2022-02-25 11:52:13,157 : INFO : -5.198 per-word bound, 36.7 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:13,157 : INFO : PROGRESS: pass 29, at document #37/37
2022-02-25 11:52:13,162 : INFO : topic #0 (0.200): 0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"classifier" + 0.018*"deep" + 0.018*"learning" + 0.018*"learn"
2022-02-25 11:52:13,163 : INFO : topic #1 (0.200): 0.125*"class" + 0.101*"word" + 0.094*"frequency" + 0.074*"negative" + 0.058*"positive" + 0.043*"number" + 0.033*"table" + 0.033*"corpus" + 0.026*"vocabulary" + 0.023*"map"
2022-02-25 11:52:13,163 : INFO : to

2022-02-25 11:52:13,260 : INFO : topic #3 (0.200): 0.089*"tweet" + 0.050*"sentiment" + 0.045*"positive" + 0.034*"word" + 0.031*"analysis" + 0.031*"course" + 0.031*"example" + 0.021*"regression" + 0.021*"logistic" + 0.021*"second"
2022-02-25 11:52:13,261 : INFO : topic #4 (0.200): 0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"use" + 0.023*"regression" + 0.023*"language" + 0.023*"space" + 0.023*"vector" + 0.023*"model" + 0.023*"processing"
2022-02-25 11:52:13,261 : INFO : topic diff=0.000297, rho=0.166667
2022-02-25 11:52:13,269 : INFO : -5.198 per-word bound, 36.7 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:13,270 : INFO : PROGRESS: pass 35, at document #37/37
2022-02-25 11:52:13,276 : INFO : topic #0 (0.200): 0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"classifier" + 0.018*"deep" + 0.018*"learning" + 0.018*"learn"
2022-02-25 11:52:13,277 : IN

2022-02-25 11:52:13,362 : INFO : topic #2 (0.200): 0.039*"feature" + 0.039*"free" + 0.039*"check" + 0.021*"count" + 0.021*"look" + 0.021*"extract" + 0.021*"classifier" + 0.021*"tensorflow" + 0.021*"ukasz" + 0.021*"coauthor"
2022-02-25 11:52:13,363 : INFO : topic #3 (0.200): 0.089*"tweet" + 0.050*"sentiment" + 0.045*"positive" + 0.034*"word" + 0.031*"analysis" + 0.031*"course" + 0.031*"example" + 0.021*"regression" + 0.021*"logistic" + 0.021*"second"
2022-02-25 11:52:13,364 : INFO : topic #4 (0.200): 0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"use" + 0.023*"language" + 0.023*"regression" + 0.023*"space" + 0.023*"vector" + 0.023*"processing" + 0.023*"relationship"
2022-02-25 11:52:13,364 : INFO : topic diff=0.000142, rho=0.154303
2022-02-25 11:52:13,372 : INFO : -5.198 per-word bound, 36.7 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:13,372 : INFO : PROGRESS: pass 41, at document #37/37
2022-02-25 11:52:13,378 : INFO :

2022-02-25 11:52:13,473 : INFO : topic #1 (0.200): 0.125*"class" + 0.101*"word" + 0.094*"frequency" + 0.074*"negative" + 0.059*"positive" + 0.043*"number" + 0.033*"table" + 0.033*"corpus" + 0.027*"vocabulary" + 0.023*"map"
2022-02-25 11:52:13,474 : INFO : topic #2 (0.200): 0.039*"feature" + 0.039*"check" + 0.039*"free" + 0.021*"count" + 0.021*"look" + 0.021*"extract" + 0.021*"classifier" + 0.021*"coauthor" + 0.021*"research" + 0.021*"google"
2022-02-25 11:52:13,475 : INFO : topic #3 (0.200): 0.089*"tweet" + 0.050*"sentiment" + 0.045*"positive" + 0.034*"word" + 0.031*"analysis" + 0.031*"course" + 0.031*"example" + 0.021*"regression" + 0.021*"logistic" + 0.021*"second"
2022-02-25 11:52:13,476 : INFO : topic #4 (0.200): 0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"use" + 0.023*"language" + 0.023*"regression" + 0.023*"space" + 0.023*"vector" + 0.023*"processing" + 0.023*"natural"
2022-02-25 11:52:13,477 : INFO : topic diff=0.000075, rho=0.144338
2022-02-25 11:52:13,489 :

2022-02-25 11:52:13,595 : INFO : PROGRESS: pass 52, at document #37/37
2022-02-25 11:52:13,603 : INFO : topic #0 (0.200): 0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"classifier" + 0.018*"learning" + 0.018*"deep" + 0.018*"learn"
2022-02-25 11:52:13,604 : INFO : topic #1 (0.200): 0.125*"class" + 0.101*"word" + 0.094*"frequency" + 0.074*"negative" + 0.059*"positive" + 0.043*"number" + 0.033*"table" + 0.033*"corpus" + 0.027*"vocabulary" + 0.023*"map"
2022-02-25 11:52:13,605 : INFO : topic #2 (0.200): 0.039*"feature" + 0.039*"free" + 0.039*"check" + 0.021*"count" + 0.021*"look" + 0.021*"extract" + 0.021*"classifier" + 0.021*"coauthor" + 0.021*"scientist" + 0.021*"research"
2022-02-25 11:52:13,606 : INFO : topic #3 (0.200): 0.089*"tweet" + 0.050*"sentiment" + 0.045*"positive" + 0.034*"word" + 0.031*"analysis" + 0.031*"course" + 0.031*"example" + 0.021*"regression" + 0.021*"logistic" + 0.021*"second"
2022-02-25 11:5

2022-02-25 11:52:13,728 : INFO : topic diff=0.000030, rho=0.130189
2022-02-25 11:52:13,737 : INFO : -5.198 per-word bound, 36.7 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:13,737 : INFO : PROGRESS: pass 58, at document #37/37
2022-02-25 11:52:13,743 : INFO : topic #0 (0.200): 0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"classifier" + 0.018*"learning" + 0.018*"deep" + 0.018*"learn"
2022-02-25 11:52:13,744 : INFO : topic #1 (0.200): 0.125*"class" + 0.101*"word" + 0.094*"frequency" + 0.074*"negative" + 0.060*"positive" + 0.043*"number" + 0.033*"table" + 0.033*"corpus" + 0.028*"vocabulary" + 0.023*"map"
2022-02-25 11:52:13,745 : INFO : topic #2 (0.200): 0.039*"feature" + 0.039*"free" + 0.039*"check" + 0.021*"count" + 0.021*"look" + 0.021*"extract" + 0.021*"classifier" + 0.021*"coauthor" + 0.021*"scientist" + 0.021*"research"
2022-02-25 11:52:13,745 : INFO : topic 

2022-02-25 11:52:13,853 : INFO : topic #4 (0.200): 0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"use" + 0.023*"language" + 0.023*"regression" + 0.023*"space" + 0.023*"vector" + 0.023*"relationship" + 0.023*"processing"
2022-02-25 11:52:13,854 : INFO : topic diff=0.000020, rho=0.124035
2022-02-25 11:52:13,868 : INFO : -5.198 per-word bound, 36.7 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:13,868 : INFO : PROGRESS: pass 64, at document #37/37
2022-02-25 11:52:13,874 : INFO : topic #0 (0.200): 0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"classifier" + 0.018*"learning" + 0.018*"deep" + 0.018*"learn"
2022-02-25 11:52:13,874 : INFO : topic #1 (0.200): 0.125*"class" + 0.101*"word" + 0.094*"frequency" + 0.074*"negative" + 0.060*"positive" + 0.043*"number" + 0.033*"table" + 0.033*"corpus" + 0.028*"vocabulary" + 0.023*"map"
2022-02-25 11:52:13,875 : IN

2022-02-25 11:52:13,980 : INFO : topic #3 (0.200): 0.089*"tweet" + 0.050*"sentiment" + 0.044*"positive" + 0.034*"word" + 0.031*"analysis" + 0.031*"course" + 0.031*"example" + 0.021*"regression" + 0.021*"logistic" + 0.021*"second"
2022-02-25 11:52:13,981 : INFO : topic #4 (0.200): 0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"use" + 0.023*"language" + 0.023*"regression" + 0.023*"space" + 0.023*"vector" + 0.023*"relationship" + 0.023*"processing"
2022-02-25 11:52:13,981 : INFO : topic diff=0.000014, rho=0.118678
2022-02-25 11:52:13,992 : INFO : -5.198 per-word bound, 36.7 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:13,993 : INFO : PROGRESS: pass 70, at document #37/37
2022-02-25 11:52:13,999 : INFO : topic #0 (0.200): 0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"classifier" + 0.018*"learning" + 0.018*"deep" + 0.018*"learn"
2022-02-25 11:52:14,0

2022-02-25 11:52:14,096 : INFO : topic #2 (0.200): 0.039*"feature" + 0.039*"free" + 0.039*"check" + 0.021*"count" + 0.021*"look" + 0.021*"extract" + 0.021*"classifier" + 0.021*"coauthor" + 0.021*"scientist" + 0.021*"research"
2022-02-25 11:52:14,097 : INFO : topic #3 (0.200): 0.089*"tweet" + 0.050*"sentiment" + 0.044*"positive" + 0.034*"word" + 0.031*"analysis" + 0.031*"course" + 0.031*"example" + 0.021*"regression" + 0.021*"logistic" + 0.021*"second"
2022-02-25 11:52:14,097 : INFO : topic #4 (0.200): 0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"use" + 0.023*"language" + 0.023*"regression" + 0.023*"space" + 0.023*"vector" + 0.023*"relationship" + 0.023*"processing"
2022-02-25 11:52:14,097 : INFO : topic diff=0.000009, rho=0.113961
2022-02-25 11:52:14,106 : INFO : -5.198 per-word bound, 36.7 perplexity estimate based on a held-out corpus of 37 documents with 286 words
2022-02-25 11:52:14,106 : INFO : PROGRESS: pass 76, at document #37/37
2022-02-25 11:52:14,112 : INFO

[(0,
  '0.033*"instructor" + 0.033*"use" + 0.033*"nlp" + 0.033*"specialization" + 0.024*"dictionary" + 0.019*"frequency" + 0.018*"classifier" + 0.018*"learning" + 0.018*"deep" + 0.018*"learn"'),
 (1,
  '0.125*"class" + 0.101*"word" + 0.094*"frequency" + 0.074*"negative" + 0.060*"positive" + 0.043*"number" + 0.033*"table" + 0.033*"corpus" + 0.028*"vocabulary" + 0.023*"map"'),
 (2,
  '0.039*"feature" + 0.039*"free" + 0.039*"check" + 0.021*"count" + 0.021*"look" + 0.021*"extract" + 0.021*"classifier" + 0.021*"coauthor" + 0.021*"scientist" + 0.021*"research"'),
 (3,
  '0.089*"tweet" + 0.050*"sentiment" + 0.044*"positive" + 0.034*"word" + 0.031*"analysis" + 0.031*"course" + 0.031*"example" + 0.021*"regression" + 0.021*"logistic" + 0.021*"second"'),
 (4,
  '0.033*"word" + 0.033*"specialization" + 0.033*"course" + 0.023*"use" + 0.023*"language" + 0.023*"regression" + 0.023*"space" + 0.023*"vector" + 0.023*"relationship" + 0.023*"processing"')]

In [19]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for a in corpus_transformed], data_dtmna.index))

[([(4, 0.98417056)], 0),
 ([(0, 0.9380261),
   (1, 0.015386427),
   (2, 0.0153875975),
   (3, 0.015648859),
   (4, 0.0155510735)],
  1),
 ([(0, 0.8852855),
   (1, 0.028574489),
   (2, 0.028576506),
   (3, 0.028656222),
   (4, 0.028907303)],
  2),
 ([(0, 0.020482922),
   (1, 0.020003501),
   (2, 0.020005811),
   (3, 0.020003323),
   (4, 0.91950446)],
  3),
 ([(0, 0.015387056),
   (1, 0.015386317),
   (2, 0.9384541),
   (3, 0.015386228),
   (4, 0.015386323)],
  4),
 ([(3, 0.9702059)], 5),
 ([(0, 0.022274164),
   (1, 0.022225868),
   (2, 0.022282915),
   (3, 0.022296114),
   (4, 0.9109209)],
  6),
 ([(0, 0.066678554),
   (1, 0.06667493),
   (2, 0.7332971),
   (3, 0.0666745),
   (4, 0.06667496)],
  7),
 ([(0, 0.011974327),
   (1, 0.011766281),
   (2, 0.011999774),
   (3, 0.9524016),
   (4, 0.011858029)],
  8),
 ([(0, 0.9379204),
   (1, 0.015386319),
   (2, 0.015628453),
   (3, 0.015556975),
   (4, 0.015507896)],
  9),
 ([(0, 0.033335067),
   (1, 0.86641765),
   (2, 0.03333533),
   (3, 0.03

In [20]:
sent_topics_df = pd.DataFrame()
for i, row_list in enumerate(ldana[corpusna]):
        row = row_list[0] if ldana.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldana.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
sent_topics_df.columns = ['Dom_Topic', 'Topic_Contri', 'Keywords']
print(sent_topics_df)

    Dom_Topic  Topic_Contri                                           Keywords
0         4.0        0.9842  word, specialization, course, use, language, r...
1         0.0        0.9380  instructor, use, nlp, specialization, dictiona...
2         0.0        0.8853  instructor, use, nlp, specialization, dictiona...
3         4.0        0.9195  word, specialization, course, use, language, r...
4         2.0        0.9385  feature, free, check, count, look, extract, cl...
5         3.0        0.9702  tweet, sentiment, positive, word, analysis, co...
6         4.0        0.9109  word, specialization, course, use, language, r...
7         2.0        0.7333  feature, free, check, count, look, extract, cl...
8         3.0        0.9524  tweet, sentiment, positive, word, analysis, co...
9         0.0        0.9379  instructor, use, nlp, specialization, dictiona...
10        1.0        0.8664  class, word, frequency, negative, positive, nu...
11        1.0        0.8856  class, word, frequency,

In [21]:
data = {}
sentences = ""
corpus = pd.read_pickle("corpus.pkl")
corpus
len(sent_topics_df)
i=0
a=0
while(a<len(sent_topics_df)-1):
    sentences = corpus.loc[a].at['transcript']
    if(sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"]):
        while((a<len(sent_topics_df)-1) and (sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"])):
            sentences += corpus.loc[a+1].at['transcript']
            a+=1
    data[i] = sentences
    i+=1
    a+=1
data[i] = sentences = corpus.loc[a].at['transcript']
data

{0: '4.6 ( 3,348 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Natural Language Processing Specialization , : ) Perform sentiment analysis tweet using logistic regression naÃ¯ve Bayes , b ) Use vector space model discover relationship word use PCA reduce dimensionality vector space visualize relationship , c ) Write simple English French translation algorithm using pre-computed word embeddings locality-sensitive hashing relate word via approximate k-nearest neighbor search .',
 1: 'By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , even built chatbot !This Specialization designed taught two expert NLP , machine learning , deep learning .',
 2: 'Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .',
 3: 'Å\x81ukasz Kaiser Staff Research Scientist Google Brai

In [22]:
# We are going to change this to key: sentence_id, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ''.join(list_of_text)
    return combined_text

In [23]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}
data_combined

{0: ['4.6 ( 3,348 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Natural Language Processing Specialization , : ) Perform sentiment analysis tweet using logistic regression naÃ¯ve Bayes , b ) Use vector space model discover relationship word use PCA reduce dimensionality vector space visualize relationship , c ) Write simple English French translation algorithm using pre-computed word embeddings locality-sensitive hashing relate word via approximate k-nearest neighbor search .'],
 1: ['By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , even built chatbot !This Specialization designed taught two expert NLP , machine learning , deep learning .'],
 2: ['Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .'],
 3: ['Å\x81ukasz Kaiser Staff Research Scientist Goog

In [24]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

combined_sent = pd.DataFrame.from_dict(data_combined).transpose()
combined_sent.columns = ['transcript']
combined_sent = combined_sent.sort_index()
combined_sent

Unnamed: 0,transcript
0,"4.6 ( 3,348 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Nat..."
1,"By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , ..."
2,Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .
3,"Åukasz Kaiser Staff Research Scientist Google Brain co-author Tensorflow , Tensor2Tensor Trax library , Transformer paper ."
4,"Machine Translation , Word Embeddings , Locality-Sensitive Hashing , Sentiment Analysis , Vector Space Models 4.6 ( 3,348 rating ) HA Aug 9 , 2020..."
5,"The lecture exciting detailed , though little hard straight forward sometimes , Youtube helped Regression model ."
6,"Other , I informative fun ."
7,"From lesson Sentiment Analysis Logistic Regression Learn extract feature text numerical vector , build binary classifier tweet using logistic regr..."
8,"Instructor Instructor Senior Curriculum Developer We 'll learn generates count , use feature logistic regression classifier ."
9,"Specifically , given word , want keep track number time , 's show positive class .Given another word want keep track number time word showed negat..."


In [25]:
# pickle it for later use
combined_sent.to_pickle("Combined_wrt_topics.pkl")