In [3]:
# Let's read in our document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,ai,algorithm,analysis,application,approximate,assignment,attented,aug,awesome,bayes,...,vector,video,visualize,waiting,wasnnto,week,word,write,younes,youtube
0,0,1,1,0,1,0,0,0,0,1,...,2,1,1,0,0,0,3,1,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,1,1,2,1,0,...,1,0,0,1,1,1,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
# One of the required inputs is a term-document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
ai,0,0,0,1,0,0,0,0,0,0
algorithm,1,0,0,0,0,0,0,0,0,0
analysis,1,1,0,0,0,1,0,0,1,0
application,0,1,0,0,0,0,0,0,0,0
approximate,1,0,0,0,0,0,0,0,0,0


In [6]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [7]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [8]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

2022-02-24 21:43:55,832 : INFO : using symmetric alpha at 0.5
2022-02-24 21:43:55,834 : INFO : using symmetric eta at 0.5
2022-02-24 21:43:55,834 : INFO : using serial LDA version on this node
2022-02-24 21:43:55,835 : INFO : running online (multi-pass) LDA training, 2 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 21:43:55,847 : INFO : -5.480 per-word bound, 44.6 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 21:43:55,848 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 21:43:55,855 : INFO : topic #0 (0.500): 0.032*"specialization" + 0.023*"learning" + 0.021*"word" + 0.020*"analysis" + 0.020*"course" + 0.020*"vector" + 0.019*"logistic" + 0.018*"language" + 0.016*"deep" + 0.016*"using"
2022-02-24 21:43:55,856 : INFO : topic #1 (0.500): 0.037*"course" + 0.023*"instructor" + 0.020

[(0,
  '0.035*"specialization" + 0.022*"word" + 0.022*"learning" + 0.022*"vector" + 0.022*"logistic" + 0.022*"language" + 0.022*"analysis" + 0.022*"using" + 0.022*"course" + 0.016*"natural"'),
 (1,
  '0.037*"course" + 0.027*"instructor" + 0.027*"aug" + 0.016*"machine" + 0.016*"helped" + 0.016*"analysis" + 0.016*"vector" + 0.016*"localitysensitive" + 0.016*"embeddings" + 0.016*"hashing"')]

In [9]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

2022-02-24 21:43:55,957 : INFO : using symmetric alpha at 0.25
2022-02-24 21:43:55,957 : INFO : using symmetric eta at 0.25
2022-02-24 21:43:55,958 : INFO : using serial LDA version on this node
2022-02-24 21:43:55,959 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 21:43:55,965 : INFO : -6.827 per-word bound, 113.6 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 21:43:55,966 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 21:43:55,971 : INFO : topic #0 (0.250): 0.049*"learning" + 0.030*"machine" + 0.027*"deep" + 0.027*"specialization" + 0.027*"designed" + 0.027*"taught" + 0.027*"expert" + 0.027*"informative" + 0.027*"fun" + 0.020*"youtube"
2022-02-24 21:43:55,971 : INFO : topic #1 (0.250): 0.034*"google" + 0.034*"brain" + 0.03

2022-02-24 21:43:56,036 : INFO : topic #2 (0.250): 0.056*"course" + 0.039*"instructor" + 0.039*"aug" + 0.022*"curriculum" + 0.022*"developer" + 0.022*"vector" + 0.022*"embeddings" + 0.022*"word" + 0.021*"wasnnto" + 0.021*"good"
2022-02-24 21:43:56,037 : INFO : topic #3 (0.250): 0.039*"specialization" + 0.030*"course" + 0.030*"analysis" + 0.030*"word" + 0.030*"vector" + 0.029*"logistic" + 0.029*"using" + 0.029*"language" + 0.020*"text" + 0.020*"build"
2022-02-24 21:43:56,037 : INFO : topic diff=0.014893, rho=0.353553
2022-02-24 21:43:56,043 : INFO : -5.152 per-word bound, 35.6 perplexity estimate based on a held-out corpus of 10 documents with 147 words
2022-02-24 21:43:56,043 : INFO : PROGRESS: pass 7, at document #10/10
2022-02-24 21:43:56,046 : INFO : topic #0 (0.250): 0.049*"learning" + 0.027*"deep" + 0.027*"designed" + 0.027*"machine" + 0.027*"taught" + 0.027*"expert" + 0.027*"informative" + 0.027*"fun" + 0.027*"specialization" + 0.027*"helped"
2022-02-24 21:43:56,046 : INFO : topi

[(0,
  '0.049*"learning" + 0.027*"deep" + 0.027*"designed" + 0.027*"machine" + 0.027*"helped" + 0.027*"taught" + 0.027*"expert" + 0.027*"informative" + 0.027*"fun" + 0.027*"specialization"'),
 (1,
  '0.034*"google" + 0.034*"brain" + 0.034*"coauthor" + 0.034*"library" + 0.034*"ukasz" + 0.034*"staff" + 0.034*"tensorflow" + 0.034*"paper" + 0.034*"trax" + 0.034*"kaiser"'),
 (2,
  '0.056*"course" + 0.039*"instructor" + 0.039*"aug" + 0.022*"curriculum" + 0.022*"developer" + 0.022*"vector" + 0.022*"embeddings" + 0.022*"word" + 0.022*"localitysensitive" + 0.022*"hashing"'),
 (3,
  '0.039*"specialization" + 0.030*"course" + 0.030*"analysis" + 0.030*"word" + 0.030*"vector" + 0.030*"logistic" + 0.030*"using" + 0.030*"language" + 0.020*"text" + 0.020*"build"')]

In [10]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [11]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
0,rating â â students enrolled course nat...
1,by end specialization designed nlp applicatio...
2,this specialization designed taught two expert...
3,younes bensouda mourri instructor ai stanford ...
4,åukasz kaiser staff research scientist google...
5,machine translation word embeddings locality...
6,the lecture exciting detailed though little h...
7,other i informative fun
8,from lesson sentiment analysis logistic regres...
9,instructor instructor senior curriculum developer


In [12]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
0,rating â students course language processing s...
1,specialization application perform sentiment a...
2,specialization machine learning
3,younes instructor ai stanford university learn...
4,åukasz staff research scientist google brain ...
5,machine translation word embeddings sentiment ...
6,lecture straight regression model
7,fun
8,sentiment analysis regression learn feature ve...
9,instructor instructor curriculum developer


In [13]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,ai,analysis,application,assignment,aug,awesome,bayes,brain,chatbot,coauthor,...,tweet,ukasz,university,use,vector,video,visualize,week,word,younes
0,0,1,0,0,0,0,1,0,0,0,...,1,0,0,2,2,1,1,0,3,0
1,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,0,0,0,0
5,0,1,0,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [15]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

2022-02-24 21:43:56,678 : INFO : using symmetric alpha at 0.25
2022-02-24 21:43:56,679 : INFO : using symmetric eta at 0.25
2022-02-24 21:43:56,679 : INFO : using serial LDA version on this node
2022-02-24 21:43:56,680 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 21:43:56,687 : INFO : -6.257 per-word bound, 76.5 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 21:43:56,688 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 21:43:56,694 : INFO : topic #0 (0.250): 0.041*"instructor" + 0.035*"course" + 0.035*"word" + 0.028*"relationship" + 0.028*"vector" + 0.028*"processing" + 0.027*"space" + 0.026*"language" + 0.025*"specialization" + 0.023*"google"
2022-02-24 21:43:56,695 : INFO : topic #1 (0.250): 0.078*"regression" + 0.056*"sen

2022-02-24 21:43:56,758 : INFO : topic #2 (0.250): 0.080*"course" + 0.055*"machine" + 0.031*"word" + 0.031*"space" + 0.031*"vector" + 0.031*"translation" + 0.031*"rating" + 0.031*"sentiment" + 0.031*"embeddings" + 0.031*"analysis"
2022-02-24 21:43:56,759 : INFO : topic #3 (0.250): 0.054*"specialization" + 0.053*"learning" + 0.053*"instructor" + 0.052*"university" + 0.052*"stanford" + 0.052*"ai" + 0.052*"younes" + 0.011*"word" + 0.011*"course" + 0.011*"language"
2022-02-24 21:43:56,760 : INFO : topic diff=0.016731, rho=0.353553
2022-02-24 21:43:56,764 : INFO : -4.801 per-word bound, 27.9 perplexity estimate based on a held-out corpus of 10 documents with 107 words
2022-02-24 21:43:56,765 : INFO : PROGRESS: pass 7, at document #10/10
2022-02-24 21:43:56,768 : INFO : topic #0 (0.250): 0.046*"word" + 0.046*"course" + 0.032*"specialization" + 0.032*"instructor" + 0.032*"vector" + 0.032*"relationship" + 0.032*"processing" + 0.032*"space" + 0.032*"language" + 0.032*"use"
2022-02-24 21:43:56,7

[(0,
  '0.046*"word" + 0.046*"course" + 0.032*"specialization" + 0.032*"vector" + 0.032*"language" + 0.032*"relationship" + 0.032*"space" + 0.032*"processing" + 0.032*"use" + 0.032*"instructor"'),
 (1,
  '0.084*"regression" + 0.058*"sentiment" + 0.058*"analysis" + 0.032*"specialization" + 0.032*"straight" + 0.032*"lecture" + 0.032*"learn" + 0.032*"feature" + 0.032*"text" + 0.032*"tweet"'),
 (2,
  '0.080*"course" + 0.055*"machine" + 0.031*"word" + 0.031*"vector" + 0.031*"space" + 0.031*"translation" + 0.031*"rating" + 0.031*"sentiment" + 0.031*"embeddings" + 0.031*"analysis"'),
 (3,
  '0.054*"specialization" + 0.053*"learning" + 0.053*"instructor" + 0.053*"university" + 0.053*"stanford" + 0.053*"ai" + 0.053*"younes" + 0.011*"word" + 0.011*"course" + 0.011*"language"')]

In [16]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [17]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
0,rating â â students course natural language pr...
1,end specialization nlp application perform sen...
2,specialization expert nlp machine deep learning
3,younes mourri instructor ai stanford universit...
4,åukasz staff research scientist google brain ...
5,machine translation word embeddings localityse...
6,lecture detailed little hard straight helped r...
7,other i informative fun
8,lesson sentiment analysis logistic regression ...
9,instructor instructor senior curriculum developer


In [18]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,ai,analysis,application,approximate,assignment,aug,awesome,bayes,best,binary,...,tweet,ukasz,university,use,vector,video,visualize,week,word,younes
0,0,1,0,1,0,0,0,1,0,0,...,1,0,0,2,2,1,1,0,3,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,0,1,0,0,1,1,1,0,1,0,...,0,0,0,0,1,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [20]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=80)
ldana.print_topics()

2022-02-24 21:44:00,600 : INFO : using symmetric alpha at 0.2
2022-02-24 21:44:00,602 : INFO : using symmetric eta at 0.2
2022-02-24 21:44:00,602 : INFO : using serial LDA version on this node
2022-02-24 21:44:00,603 : INFO : running online (multi-pass) LDA training, 5 topics, 80 passes over the supplied corpus of 10 documents, updating model once every 10 documents, evaluating perplexity every 10 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-24 21:44:00,610 : INFO : -7.529 per-word bound, 184.7 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:00,611 : INFO : PROGRESS: pass 0, at document #10/10
2022-02-24 21:44:00,616 : INFO : topic #0 (0.200): 0.054*"specialization" + 0.041*"nlp" + 0.035*"language" + 0.029*"perform" + 0.029*"analysis" + 0.029*"sentiment" + 0.022*"text" + 0.022*"tool" + 0.022*"translate" + 0.022*"application"
2022-02-24 21:44:00,617 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"learning

2022-02-24 21:44:00,682 : INFO : topic #2 (0.200): 0.036*"google" + 0.036*"scientist" + 0.036*"staff" + 0.036*"ukasz" + 0.036*"tensorflow" + 0.036*"library" + 0.036*"brain" + 0.036*"coauthor" + 0.036*"research" + 0.036*"transformer"
2022-02-24 21:44:00,682 : INFO : topic #3 (0.200): 0.061*"regression" + 0.061*"logistic" + 0.034*"vector" + 0.033*"tweet" + 0.033*"analysis" + 0.033*"sentiment" + 0.033*"text" + 0.033*"lesson" + 0.033*"learn" + 0.033*"binary"
2022-02-24 21:44:00,683 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.034*"space" + 0.033*"vector" + 0.023*"translation" + 0.023*"rating" + 0.023*"embeddings" + 0.023*"localitysensitive" + 0.023*"sentiment" + 0.023*"analysis"
2022-02-24 21:44:00,683 : INFO : topic diff=0.037354, rho=0.377964
2022-02-24 21:44:00,688 : INFO : -5.074 per-word bound, 33.7 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:00,689 : INFO : PROGRESS: pass 6, at document #10/10
2022-02-24 21:44:00,691 

2022-02-24 21:44:00,740 : INFO : topic #0 (0.200): 0.059*"specialization" + 0.058*"nlp" + 0.032*"language" + 0.032*"perform" + 0.032*"analysis" + 0.032*"sentiment" + 0.032*"machine" + 0.032*"tool" + 0.032*"translate" + 0.032*"application"
2022-02-24 21:44:00,740 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"learning" + 0.029*"deep" + 0.029*"university" + 0.029*"younes" + 0.029*"stanford" + 0.029*"ai" + 0.029*"detailed" + 0.029*"mourri" + 0.029*"little"
2022-02-24 21:44:00,741 : INFO : topic #2 (0.200): 0.036*"google" + 0.036*"scientist" + 0.036*"staff" + 0.036*"ukasz" + 0.036*"tensorflow" + 0.036*"library" + 0.036*"coauthor" + 0.036*"research" + 0.036*"brain" + 0.036*"transformer"
2022-02-24 21:44:00,741 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"vector" + 0.034*"tweet" + 0.034*"analysis" + 0.034*"sentiment" + 0.034*"text" + 0.034*"lesson" + 0.034*"learn" + 0.034*"extract"
2022-02-24 21:44:00,742 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*

2022-02-24 21:44:00,786 : INFO : topic diff=0.000600, rho=0.235702
2022-02-24 21:44:00,789 : INFO : -5.072 per-word bound, 33.6 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:00,789 : INFO : PROGRESS: pass 17, at document #10/10
2022-02-24 21:44:00,792 : INFO : topic #0 (0.200): 0.059*"specialization" + 0.058*"nlp" + 0.032*"language" + 0.032*"perform" + 0.032*"analysis" + 0.032*"sentiment" + 0.032*"machine" + 0.032*"translate" + 0.032*"tool" + 0.032*"application"
2022-02-24 21:44:00,792 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"deep" + 0.029*"learning" + 0.029*"university" + 0.029*"younes" + 0.029*"stanford" + 0.029*"ai" + 0.029*"mourri" + 0.029*"detailed" + 0.029*"straight"
2022-02-24 21:44:00,793 : INFO : topic #2 (0.200): 0.036*"google" + 0.036*"scientist" + 0.036*"staff" + 0.036*"tensorflow" + 0.036*"library" + 0.036*"ukasz" + 0.036*"brain" + 0.036*"coauthor" + 0.036*"research" + 0.036*"transformer"
2022-02-24 21:44:00,79

2022-02-24 21:44:00,838 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"translation" + 0.023*"rating" + 0.023*"embeddings" + 0.023*"localitysensitive" + 0.023*"sentiment" + 0.023*"analysis"
2022-02-24 21:44:00,838 : INFO : topic diff=0.000116, rho=0.204124
2022-02-24 21:44:00,842 : INFO : -5.072 per-word bound, 33.6 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:00,842 : INFO : PROGRESS: pass 23, at document #10/10
2022-02-24 21:44:00,844 : INFO : topic #0 (0.200): 0.059*"specialization" + 0.059*"nlp" + 0.032*"perform" + 0.032*"language" + 0.032*"machine" + 0.032*"translate" + 0.032*"tool" + 0.032*"application" + 0.032*"end" + 0.032*"chatbot"
2022-02-24 21:44:00,845 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"deep" + 0.029*"learning" + 0.029*"university" + 0.029*"younes" + 0.029*"stanford" + 0.029*"mourri" + 0.029*"ai" + 0.029*"detailed" + 0.029*"straight"
2022-02-24 21:44:00,8

2022-02-24 21:44:00,890 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"analysis" + 0.034*"sentiment" + 0.034*"tweet" + 0.034*"text" + 0.034*"numerical" + 0.034*"lesson" + 0.034*"classifier" + 0.034*"learn"
2022-02-24 21:44:00,891 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"translation" + 0.023*"rating" + 0.023*"embeddings" + 0.023*"localitysensitive"
2022-02-24 21:44:00,891 : INFO : topic diff=0.000028, rho=0.182574
2022-02-24 21:44:00,896 : INFO : -5.072 per-word bound, 33.6 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:00,896 : INFO : PROGRESS: pass 29, at document #10/10
2022-02-24 21:44:00,899 : INFO : topic #0 (0.200): 0.059*"specialization" + 0.059*"nlp" + 0.032*"machine" + 0.032*"perform" + 0.032*"application" + 0.032*"tool" + 0.032*"end" + 0.032*"translate" + 0.032*"chatbot" + 0.032*"expert"
2022-02-24 21:

2022-02-24 21:44:00,946 : INFO : topic #2 (0.200): 0.036*"tensorflow" + 0.036*"research" + 0.036*"paper" + 0.036*"coauthor" + 0.036*"staff" + 0.036*"brain" + 0.036*"scientist" + 0.036*"google" + 0.036*"ukasz" + 0.036*"library"
2022-02-24 21:44:00,946 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"analysis" + 0.034*"sentiment" + 0.034*"tweet" + 0.034*"text" + 0.034*"feature" + 0.034*"learn" + 0.034*"lesson" + 0.034*"classifier"
2022-02-24 21:44:00,947 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"translation" + 0.023*"rating" + 0.023*"embeddings" + 0.023*"localitysensitive"
2022-02-24 21:44:00,947 : INFO : topic diff=0.000008, rho=0.166667
2022-02-24 21:44:00,950 : INFO : -5.072 per-word bound, 33.6 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:00,951 : INFO : PROGRESS: pass 35, at document #10/10
2022-02-24 21:44:00

2022-02-24 21:44:00,998 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"deep" + 0.029*"learning" + 0.029*"younes" + 0.029*"university" + 0.029*"stanford" + 0.029*"mourri" + 0.029*"ai" + 0.029*"hard" + 0.029*"lecture"
2022-02-24 21:44:00,999 : INFO : topic #2 (0.200): 0.036*"tensorflow" + 0.036*"transformer" + 0.036*"brain" + 0.036*"scientist" + 0.036*"research" + 0.036*"coauthor" + 0.036*"paper" + 0.036*"google" + 0.036*"ukasz" + 0.036*"library"
2022-02-24 21:44:00,999 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"tweet" + 0.034*"text" + 0.034*"classifier" + 0.034*"lesson" + 0.034*"extract" + 0.034*"feature"
2022-02-24 21:44:01,000 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"translation" + 0.023*"embeddings" + 0.023*"rating" + 0.023*"localitysensitive"
2022-02-24 21:44:01,001 : INFO : topic diff=0.000003, rho=0.1543

2022-02-24 21:44:01,050 : INFO : -5.072 per-word bound, 33.6 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:01,050 : INFO : PROGRESS: pass 46, at document #10/10
2022-02-24 21:44:01,054 : INFO : topic #0 (0.200): 0.059*"specialization" + 0.059*"nlp" + 0.032*"machine" + 0.032*"perform" + 0.032*"tool" + 0.032*"translate" + 0.032*"chatbot" + 0.032*"end" + 0.032*"application" + 0.032*"expert"
2022-02-24 21:44:01,054 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"deep" + 0.029*"learning" + 0.029*"younes" + 0.029*"university" + 0.029*"stanford" + 0.029*"mourri" + 0.029*"ai" + 0.029*"hard" + 0.029*"lecture"
2022-02-24 21:44:01,055 : INFO : topic #2 (0.200): 0.036*"tensorflow" + 0.036*"transformer" + 0.036*"brain" + 0.036*"scientist" + 0.036*"research" + 0.036*"coauthor" + 0.036*"paper" + 0.036*"google" + 0.036*"ukasz" + 0.036*"library"
2022-02-24 21:44:01,055 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"senti

2022-02-24 21:44:01,102 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"natural" + 0.023*"processing" + 0.023*"use" + 0.023*"relationship"
2022-02-24 21:44:01,103 : INFO : topic diff=0.000001, rho=0.137361
2022-02-24 21:44:01,108 : INFO : -5.072 per-word bound, 33.6 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:01,108 : INFO : PROGRESS: pass 52, at document #10/10
2022-02-24 21:44:01,111 : INFO : topic #0 (0.200): 0.059*"specialization" + 0.059*"nlp" + 0.032*"machine" + 0.032*"perform" + 0.032*"chatbot" + 0.032*"tool" + 0.032*"end" + 0.032*"translate" + 0.032*"application" + 0.032*"expert"
2022-02-24 21:44:01,111 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"deep" + 0.029*"learning" + 0.029*"younes" + 0.029*"university" + 0.029*"stanford" + 0.029*"mourri" + 0.029*"ai" + 0.029*"hard" + 0.029*"lecture"
2022-02-24 21:44:01,112 : INFO : to

2022-02-24 21:44:01,160 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"tweet" + 0.034*"text" + 0.034*"lesson" + 0.034*"feature" + 0.034*"numerical" + 0.034*"classifier"
2022-02-24 21:44:01,160 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"use" + 0.023*"processing" + 0.023*"natural" + 0.023*"relationship"
2022-02-24 21:44:01,161 : INFO : topic diff=0.000000, rho=0.130189
2022-02-24 21:44:01,165 : INFO : -5.072 per-word bound, 33.6 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:01,165 : INFO : PROGRESS: pass 58, at document #10/10
2022-02-24 21:44:01,168 : INFO : topic #0 (0.200): 0.059*"specialization" + 0.059*"nlp" + 0.032*"machine" + 0.032*"perform" + 0.032*"end" + 0.032*"application" + 0.032*"translate" + 0.032*"tool" + 0.032*"chatbot" + 0.032*"expert"
2022-02-24 21:44:01,169 

2022-02-24 21:44:01,215 : INFO : topic #2 (0.200): 0.036*"tensorflow" + 0.036*"transformer" + 0.036*"brain" + 0.036*"scientist" + 0.036*"research" + 0.036*"coauthor" + 0.036*"paper" + 0.036*"google" + 0.036*"ukasz" + 0.036*"library"
2022-02-24 21:44:01,215 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"tweet" + 0.034*"text" + 0.034*"lesson" + 0.034*"feature" + 0.034*"numerical" + 0.034*"classifier"
2022-02-24 21:44:01,216 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"processing" + 0.023*"use" + 0.023*"natural" + 0.023*"relationship"
2022-02-24 21:44:01,216 : INFO : topic diff=0.000000, rho=0.124035
2022-02-24 21:44:01,220 : INFO : -5.072 per-word bound, 33.6 perplexity estimate based on a held-out corpus of 10 documents with 146 words
2022-02-24 21:44:01,221 : INFO : PROGRESS: pass 64, at document #10/10
2022-02-24 21:44:01,2

2022-02-24 21:44:01,268 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"deep" + 0.029*"learning" + 0.029*"younes" + 0.029*"university" + 0.029*"stanford" + 0.029*"mourri" + 0.029*"ai" + 0.029*"hard" + 0.029*"lecture"
2022-02-24 21:44:01,269 : INFO : topic #2 (0.200): 0.036*"tensorflow" + 0.036*"transformer" + 0.036*"brain" + 0.036*"scientist" + 0.036*"research" + 0.036*"coauthor" + 0.036*"paper" + 0.036*"google" + 0.036*"ukasz" + 0.036*"library"
2022-02-24 21:44:01,270 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"tweet" + 0.034*"text" + 0.034*"lesson" + 0.034*"feature" + 0.034*"numerical" + 0.034*"classifier"
2022-02-24 21:44:01,270 : INFO : topic #4 (0.200): 0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"processing" + 0.023*"use" + 0.023*"natural" + 0.023*"relationship"
2022-02-24 21:44:01,271 : INFO : topic diff=0.000000, rho=0.118678
2022-02

2022-02-24 21:44:01,324 : INFO : PROGRESS: pass 75, at document #10/10
2022-02-24 21:44:01,327 : INFO : topic #0 (0.200): 0.059*"specialization" + 0.059*"nlp" + 0.032*"machine" + 0.032*"perform" + 0.032*"end" + 0.032*"application" + 0.032*"translate" + 0.032*"tool" + 0.032*"chatbot" + 0.032*"expert"
2022-02-24 21:44:01,327 : INFO : topic #1 (0.200): 0.077*"instructor" + 0.029*"deep" + 0.029*"learning" + 0.029*"younes" + 0.029*"university" + 0.029*"stanford" + 0.029*"mourri" + 0.029*"ai" + 0.029*"hard" + 0.029*"lecture"
2022-02-24 21:44:01,328 : INFO : topic #2 (0.200): 0.036*"tensorflow" + 0.036*"transformer" + 0.036*"brain" + 0.036*"scientist" + 0.036*"research" + 0.036*"coauthor" + 0.036*"paper" + 0.036*"google" + 0.036*"ukasz" + 0.036*"library"
2022-02-24 21:44:01,328 : INFO : topic #3 (0.200): 0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"tweet" + 0.034*"text" + 0.034*"lesson" + 0.034*"feature" + 0.034*"numerical" + 0.034*"classifier"
2022-02

[(0,
  '0.059*"specialization" + 0.059*"nlp" + 0.032*"machine" + 0.032*"perform" + 0.032*"end" + 0.032*"application" + 0.032*"translate" + 0.032*"tool" + 0.032*"chatbot" + 0.032*"expert"'),
 (1,
  '0.077*"instructor" + 0.029*"deep" + 0.029*"learning" + 0.029*"younes" + 0.029*"university" + 0.029*"stanford" + 0.029*"mourri" + 0.029*"ai" + 0.029*"hard" + 0.029*"lecture"'),
 (2,
  '0.036*"tensorflow" + 0.036*"transformer" + 0.036*"brain" + 0.036*"scientist" + 0.036*"research" + 0.036*"coauthor" + 0.036*"paper" + 0.036*"google" + 0.036*"ukasz" + 0.036*"library"'),
 (3,
  '0.062*"regression" + 0.062*"logistic" + 0.034*"sentiment" + 0.034*"analysis" + 0.034*"tweet" + 0.034*"text" + 0.034*"lesson" + 0.034*"feature" + 0.034*"numerical" + 0.034*"classifier"'),
 (4,
  '0.065*"course" + 0.044*"word" + 0.033*"vector" + 0.033*"space" + 0.023*"specialization" + 0.023*"language" + 0.023*"processing" + 0.023*"use" + 0.023*"natural" + 0.023*"relationship"')]

In [21]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for a in corpus_transformed], data_dtmna.index))

[([(4, 0.98423046)], 0),
 ([(0, 0.93821067),
   (1, 0.015399863),
   (2, 0.015387423),
   (3, 0.015496068),
   (4, 0.015505998)],
  1),
 ([(0, 0.8853018),
   (1, 0.028872894),
   (2, 0.028576214),
   (3, 0.02857595),
   (4, 0.028673114)],
  2),
 ([(0, 0.020304438),
   (1, 0.91962755),
   (2, 0.020004138),
   (3, 0.020003907),
   (4, 0.020059945)],
  3),
 ([(0, 0.01538711),
   (1, 0.01538687),
   (2, 0.93845314),
   (3, 0.015387241),
   (4, 0.015385632)],
  4),
 ([(4, 0.9703012)], 5),
 ([(0, 0.022226714),
   (1, 0.9108184),
   (2, 0.02222724),
   (3, 0.022440098),
   (4, 0.022287546)],
  6),
 ([(0, 0.06667876),
   (1, 0.06667762),
   (2, 0.7332926),
   (3, 0.06667943),
   (4, 0.06667159)],
  7),
 ([(0, 0.011821607),
   (1, 0.011780262),
   (2, 0.01176662),
   (3, 0.95278245),
   (4, 0.011849039)],
  8),
 ([(0, 0.033338226),
   (1, 0.8666491),
   (2, 0.0333388),
   (3, 0.033338495),
   (4, 0.033335328)],
  9)]

In [55]:
sent_topics_df = pd.DataFrame()
for i, row_list in enumerate(ldana[corpusna]):
        row = row_list[0] if ldana.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldana.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
sent_topics_df.columns = ['Dom_Topic', 'Topic_Contri', 'Keywords']
print(sent_topics_df)

   Dom_Topic  Topic_Contri                                           Keywords
0        4.0        0.9842  course, word, vector, space, specialization, l...
1        0.0        0.9382  specialization, nlp, machine, perform, end, ap...
2        0.0        0.8853  specialization, nlp, machine, perform, end, ap...
3        1.0        0.9196  instructor, deep, learning, younes, university...
4        2.0        0.9385  tensorflow, transformer, brain, scientist, res...
5        4.0        0.9703  course, word, vector, space, specialization, l...
6        1.0        0.9108  instructor, deep, learning, younes, university...
7        2.0        0.7333  tensorflow, transformer, brain, scientist, res...
8        3.0        0.9528  regression, logistic, sentiment, analysis, twe...
9        1.0        0.8666  instructor, deep, learning, younes, university...


In [84]:
data = {}
sentences = ""
corpus = pd.read_pickle("corpus.pkl")
corpus
len(sent_topics_df)
i=0
a=0
while(a<len(sent_topics_df)-1):
    sentences = corpus.loc[a].at['transcript']
    if(sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"]):
        while((a<len(sent_topics_df)-1) and (sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"])):
            sentences += corpus.loc[a+1].at['transcript']
            a+=1
    data[i] = sentences
    i+=1
    a+=1
data[i] = sentences = corpus.loc[a].at['transcript']
data

{0: '4.6 ( 3,347 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Natural Language Processing Specialization , : ) Perform sentiment analysis tweet using logistic regression naÃ¯ve Bayes , b ) Use vector space model discover relationship word use PCA reduce dimensionality vector space visualize relationship , c ) Write simple English French translation algorithm using pre-computed word embeddings locality-sensitive hashing relate word via approximate k-nearest neighbor search .',
 1: 'By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , even built chatbot !This Specialization designed taught two expert NLP , machine learning , deep learning .',
 2: 'Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .',
 3: 'Å\x81ukasz Kaiser Staff Research Scientist Google Brai

In [88]:
# We are going to change this to key: sentence_id, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ''.join(list_of_text)
    return combined_text

In [89]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}
data_combined

{0: ['4.6 ( 3,347 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Natural Language Processing Specialization , : ) Perform sentiment analysis tweet using logistic regression naÃ¯ve Bayes , b ) Use vector space model discover relationship word use PCA reduce dimensionality vector space visualize relationship , c ) Write simple English French translation algorithm using pre-computed word embeddings locality-sensitive hashing relate word via approximate k-nearest neighbor search .'],
 1: ['By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , even built chatbot !This Specialization designed taught two expert NLP , machine learning , deep learning .'],
 2: ['Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .'],
 3: ['Å\x81ukasz Kaiser Staff Research Scientist Goog

In [90]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

combined_sent = pd.DataFrame.from_dict(data_combined).transpose()
combined_sent.columns = ['transcript']
combined_sent = combined_sent.sort_index()
combined_sent

Unnamed: 0,transcript
0,"4.6 ( 3,347 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Nat..."
1,"By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , ..."
2,Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .
3,"Åukasz Kaiser Staff Research Scientist Google Brain co-author Tensorflow , Tensor2Tensor Trax library , Transformer paper ."
4,"Machine Translation , Word Embeddings , Locality-Sensitive Hashing , Sentiment Analysis , Vector Space Models 4.6 ( 3,347 rating ) HA Aug 9 , 2020..."
5,"The lecture exciting detailed , though little hard straight forward sometimes , Youtube helped Regression model ."
6,"Other , I informative fun ."
7,"From lesson Sentiment Analysis Logistic Regression Learn extract feature text numerical vector , build binary classifier tweet using logistic regr..."
8,Instructor Instructor Senior Curriculum Developer
