In [1]:
# Let's read in our document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,accepted,actually,afar,africa,aging,ago,air,alcoholic,align,allah,...,world,worried,worse,worst,write,writer,written,wrote,yes,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
171,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# One of the required inputs is a term-document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
accepted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
actually,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
afar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
africa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aging,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [5]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [6]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

2022-05-06 23:31:09,319 : INFO : using symmetric alpha at 0.5
2022-05-06 23:31:09,322 : INFO : using symmetric eta at 0.5
2022-05-06 23:31:09,322 : INFO : using serial LDA version on this node
2022-05-06 23:31:09,328 : INFO : running online (multi-pass) LDA training, 2 topics, 10 passes over the supplied corpus of 175 documents, updating model once every 175 documents, evaluating perplexity every 175 documents, iterating 50x with a convergence threshold of 0.001000
2022-05-06 23:31:09,437 : INFO : -7.248 per-word bound, 152.0 perplexity estimate based on a held-out corpus of 175 documents with 902 words
2022-05-06 23:31:09,438 : INFO : PROGRESS: pass 0, at document #175/175
2022-05-06 23:31:09,518 : INFO : topic #0 (0.500): 0.013*"ve" + 0.007*"allah" + 0.007*"artist" + 0.007*"dance" + 0.006*"idea" + 0.006*"tom" + 0.005*"divine" + 0.005*"started" + 0.005*"century" + 0.005*"called"
2022-05-06 23:31:09,520 : INFO : topic #1 (0.500): 0.009*"god" + 0.007*"make" + 0.006*"writer" + 0.006*"anc

[(0,
  '0.015*"ve" + 0.008*"artist" + 0.008*"dance" + 0.007*"allah" + 0.007*"tom" + 0.006*"century" + 0.006*"believed" + 0.006*"divine" + 0.006*"talking" + 0.006*"called"'),
 (1,
  '0.009*"god" + 0.009*"make" + 0.008*"question" + 0.008*"ancient" + 0.008*"believe" + 0.007*"feel" + 0.006*"somebody" + 0.006*"writer" + 0.006*"anybody" + 0.006*"chemical"')]

In [7]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

2022-05-06 23:31:10,119 : INFO : using symmetric alpha at 0.25
2022-05-06 23:31:10,121 : INFO : using symmetric eta at 0.25
2022-05-06 23:31:10,121 : INFO : using serial LDA version on this node
2022-05-06 23:31:10,122 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 175 documents, updating model once every 175 documents, evaluating perplexity every 175 documents, iterating 50x with a convergence threshold of 0.001000
2022-05-06 23:31:10,193 : INFO : -8.607 per-word bound, 389.8 perplexity estimate based on a held-out corpus of 175 documents with 902 words
2022-05-06 23:31:10,193 : INFO : PROGRESS: pass 0, at document #175/175
2022-05-06 23:31:10,254 : INFO : topic #0 (0.250): 0.009*"believed" + 0.009*"idea" + 0.009*"lit" + 0.009*"question" + 0.007*"unknowable" + 0.007*"source" + 0.007*"believe" + 0.006*"ancient" + 0.006*"started" + 0.006*"extraordinary"
2022-05-06 23:31:10,255 : INFO : topic #1 (0.250): 0.018*"ve" + 0.011*"job" + 0.009

2022-05-06 23:31:10,612 : INFO : topic #3 (0.250): 0.018*"god" + 0.013*"allah" + 0.011*"make" + 0.011*"love" + 0.011*"ve" + 0.011*"applause" + 0.008*"chemical" + 0.008*"career" + 0.006*"dance" + 0.006*"question"
2022-05-06 23:31:10,613 : INFO : topic diff=0.004789, rho=0.353553
2022-05-06 23:31:10,649 : INFO : -6.982 per-word bound, 126.4 perplexity estimate based on a held-out corpus of 175 documents with 902 words
2022-05-06 23:31:10,650 : INFO : PROGRESS: pass 7, at document #175/175
2022-05-06 23:31:10,673 : INFO : topic #0 (0.250): 0.010*"idea" + 0.010*"lit" + 0.010*"question" + 0.007*"started" + 0.007*"extraordinary" + 0.007*"believed" + 0.007*"believe" + 0.007*"great" + 0.007*"source" + 0.007*"unknowable"
2022-05-06 23:31:10,674 : INFO : topic #1 (0.250): 0.019*"ve" + 0.011*"job" + 0.008*"somebody" + 0.008*"talking" + 0.008*"idea" + 0.008*"doomed" + 0.008*"hell" + 0.008*"internalized" + 0.006*"dance" + 0.006*"better"
2022-05-06 23:31:10,674 : INFO : topic #2 (0.250): 0.012*"anci

[(0,
  '0.010*"idea" + 0.010*"lit" + 0.010*"question" + 0.007*"started" + 0.007*"extraordinary" + 0.007*"believe" + 0.007*"great" + 0.007*"source" + 0.007*"believed" + 0.007*"unknowable"'),
 (1,
  '0.019*"ve" + 0.011*"job" + 0.008*"somebody" + 0.008*"idea" + 0.008*"talking" + 0.008*"doomed" + 0.008*"hell" + 0.008*"internalized" + 0.006*"mystery" + 0.006*"better"'),
 (2,
  '0.012*"ancient" + 0.012*"artist" + 0.009*"century" + 0.009*"got" + 0.009*"allah" + 0.009*"brilliant" + 0.009*"reputation" + 0.009*"continue" + 0.009*"changed" + 0.009*"writer"'),
 (3,
  '0.018*"god" + 0.013*"allah" + 0.011*"make" + 0.011*"love" + 0.011*"ve" + 0.011*"applause" + 0.008*"chemical" + 0.008*"career" + 0.006*"dance" + 0.006*"dancer"')]

In [8]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [9]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
0,i writer
1,writing book profession s course
2,it also great lifelong love fascination
3,and i nt expect s ever going change
4,but said something kind peculiar happened re...
...,...
170,nonetheless sheer human love stubbornness ke...
171,thank
172,applause thank
173,applause june cohen olé


In [10]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
0,writer
1,book profession s course
2,love fascination
3,nt change
4,something kind life career relationship work
...,...
170,sheer love stubbornness showing
171,thank
172,applause thank
173,applause june cohen olé


In [11]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,afar,afraid,air,allah,ancient,angeles,anguish,answer,anxiety,anybody,...,way,wisdom,wonderment,word,work,world,worried,writer,year,yes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
171,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [13]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

2022-05-06 23:31:11,539 : INFO : using symmetric alpha at 0.25
2022-05-06 23:31:11,542 : INFO : using symmetric eta at 0.25
2022-05-06 23:31:11,548 : INFO : using serial LDA version on this node
2022-05-06 23:31:11,550 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 175 documents, updating model once every 175 documents, evaluating perplexity every 175 documents, iterating 50x with a convergence threshold of 0.001000
2022-05-06 23:31:11,614 : INFO : -7.798 per-word bound, 222.5 perplexity estimate based on a held-out corpus of 175 documents with 567 words
2022-05-06 23:31:11,615 : INFO : PROGRESS: pass 0, at document #175/175
2022-05-06 23:31:11,663 : INFO : topic #0 (0.250): 0.034*"work" + 0.026*"way" + 0.025*"sort" + 0.025*"life" + 0.025*"olé" + 0.017*"kind" + 0.013*"allah" + 0.013*"artist" + 0.013*"divine" + 0.009*"house"
2022-05-06 23:31:11,664 : INFO : topic #1 (0.250): 0.036*"thing" + 0.021*"god" + 0.021*"year" + 0.016*"book" + 0

2022-05-06 23:31:12,020 : INFO : topic diff=0.005981, rho=0.353553
2022-05-06 23:31:12,057 : INFO : -6.257 per-word bound, 76.5 perplexity estimate based on a held-out corpus of 175 documents with 567 words
2022-05-06 23:31:12,058 : INFO : PROGRESS: pass 7, at document #175/175
2022-05-06 23:31:12,080 : INFO : topic #0 (0.250): 0.043*"work" + 0.026*"way" + 0.026*"olé" + 0.026*"sort" + 0.026*"life" + 0.018*"kind" + 0.013*"artist" + 0.013*"allah" + 0.013*"divine" + 0.009*"genius"
2022-05-06 23:31:12,081 : INFO : topic #1 (0.250): 0.045*"thing" + 0.025*"god" + 0.021*"year" + 0.020*"sort" + 0.016*"genius" + 0.016*"somebody" + 0.016*"book" + 0.016*"allah" + 0.016*"applause" + 0.011*"mind"
2022-05-06 23:31:12,081 : INFO : topic #2 (0.250): 0.029*"book" + 0.028*"nt" + 0.027*"kind" + 0.024*"thing" + 0.023*"dance" + 0.022*"laughter" + 0.018*"success" + 0.016*"life" + 0.015*"sort" + 0.015*"idea"
2022-05-06 23:31:12,082 : INFO : topic #3 (0.250): 0.030*"process" + 0.024*"work" + 0.019*"laughter" 

[(0,
  '0.044*"work" + 0.026*"way" + 0.026*"olé" + 0.026*"sort" + 0.026*"life" + 0.018*"kind" + 0.013*"artist" + 0.013*"allah" + 0.013*"divine" + 0.009*"genius"'),
 (1,
  '0.045*"thing" + 0.026*"god" + 0.021*"year" + 0.020*"sort" + 0.016*"genius" + 0.016*"somebody" + 0.016*"book" + 0.016*"allah" + 0.016*"applause" + 0.011*"mind"'),
 (2,
  '0.031*"book" + 0.029*"nt" + 0.027*"kind" + 0.024*"thing" + 0.023*"dance" + 0.020*"laughter" + 0.019*"success" + 0.017*"life" + 0.015*"sort" + 0.015*"idea"'),
 (3,
  '0.032*"process" + 0.026*"work" + 0.021*"laughter" + 0.017*"anybody" + 0.017*"poem" + 0.017*"creativity" + 0.017*"chemical" + 0.017*"question" + 0.012*"idea" + 0.012*"engineer"')]

In [14]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [15]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
0,i writer
1,book profession s course
2,great lifelong love fascination
3,nt s change
4,something kind peculiar life career recalibrat...
...,...
170,sheer human love stubbornness showing
171,thank
172,applause thank
173,applause june cohen olé


In [16]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,able,afar,afraid,africa,air,alcoholic,allah,aloud,american,ancient,...,wondrous,word,work,world,worried,worst,writer,year,yes,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
171,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [18]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=80)
ldana.print_topics()

2022-05-06 23:31:12,487 : INFO : using symmetric alpha at 0.2
2022-05-06 23:31:12,489 : INFO : using symmetric eta at 0.2
2022-05-06 23:31:12,491 : INFO : using serial LDA version on this node
2022-05-06 23:31:12,492 : INFO : running online (multi-pass) LDA training, 5 topics, 80 passes over the supplied corpus of 175 documents, updating model once every 175 documents, evaluating perplexity every 175 documents, iterating 50x with a convergence threshold of 0.001000
2022-05-06 23:31:12,552 : INFO : -8.841 per-word bound, 458.5 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:12,553 : INFO : PROGRESS: pass 0, at document #175/175
2022-05-06 23:31:12,606 : INFO : topic #0 (0.200): 0.028*"creative" + 0.020*"creativity" + 0.018*"sort" + 0.017*"human" + 0.015*"laughter" + 0.015*"century" + 0.013*"work" + 0.011*"life" + 0.011*"reputation" + 0.011*"spirit"
2022-05-06 23:31:12,607 : INFO : topic #1 (0.200): 0.017*"chemical" + 0.017*"example" + 0.01

2022-05-06 23:31:12,907 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"life" + 0.017*"question" + 0.017*"job" + 0.012*"work" + 0.012*"genius" + 0.012*"true" + 0.012*"applause" + 0.012*"individual" + 0.012*"god"
2022-05-06 23:31:12,907 : INFO : topic diff=0.014211, rho=0.377964
2022-05-06 23:31:12,941 : INFO : -6.636 per-word bound, 99.5 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:12,942 : INFO : PROGRESS: pass 6, at document #175/175
2022-05-06 23:31:12,962 : INFO : topic #0 (0.200): 0.035*"creative" + 0.028*"laughter" + 0.021*"human" + 0.019*"sort" + 0.018*"creativity" + 0.015*"century" + 0.013*"work" + 0.011*"life" + 0.011*"reputation" + 0.011*"spirit"
2022-05-06 23:31:12,963 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"example" + 0.017*"dance" + 0.016*"artist" + 0.015*"year" + 0.012*"thing" + 0.012*"process" + 0.012*"success"
2022-05-06 23:31:12,963 : INFO : topic #2 (0.200): 0.033*"work" +

2022-05-06 23:31:13,244 : INFO : topic diff=0.002372, rho=0.277350
2022-05-06 23:31:13,275 : INFO : -6.630 per-word bound, 99.0 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:13,276 : INFO : PROGRESS: pass 12, at document #175/175
2022-05-06 23:31:13,296 : INFO : topic #0 (0.200): 0.037*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"sort" + 0.018*"creativity" + 0.015*"century" + 0.013*"work" + 0.011*"life" + 0.011*"reputation" + 0.011*"spirit"
2022-05-06 23:31:13,296 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"example" + 0.017*"dance" + 0.017*"artist" + 0.016*"year" + 0.012*"process" + 0.012*"thing" + 0.012*"success"
2022-05-06 23:31:13,297 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:13,297 : INFO : topic #3 (0.200): 0.035*"thing" + 0.019*"al

2022-05-06 23:31:13,607 : INFO : PROGRESS: pass 18, at document #175/175
2022-05-06 23:31:13,628 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"sort" + 0.018*"creativity" + 0.015*"century" + 0.013*"work" + 0.012*"life" + 0.011*"divine" + 0.011*"reputation"
2022-05-06 23:31:13,629 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"example" + 0.017*"dance" + 0.017*"artist" + 0.016*"year" + 0.012*"process" + 0.012*"success" + 0.012*"thing"
2022-05-06 23:31:13,629 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:13,630 : INFO : topic #3 (0.200): 0.035*"thing" + 0.019*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"pencil" + 0.010*"paper" + 0.010*"feel"
2022-05-06 23:31:13,631 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question"

2022-05-06 23:31:13,966 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"example" + 0.017*"artist" + 0.017*"dance" + 0.016*"year" + 0.012*"process" + 0.012*"success" + 0.012*"genius"
2022-05-06 23:31:13,967 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:13,967 : INFO : topic #3 (0.200): 0.035*"thing" + 0.020*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"pencil" + 0.010*"paper" + 0.010*"feel"
2022-05-06 23:31:13,968 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individual" + 0.012*"genius" + 0.012*"work" + 0.012*"big"
2022-05-06 23:31:13,969 : INFO : topic diff=0.000456, rho=0.196116
2022-05-06 23:31:14,000 : INFO : -6.628 per-word bound, 98.9 perplexity estimate based on a held-out cor

2022-05-06 23:31:14,303 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:14,303 : INFO : topic #3 (0.200): 0.035*"thing" + 0.020*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"pencil" + 0.010*"paper" + 0.010*"feel"
2022-05-06 23:31:14,304 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individual" + 0.012*"genius" + 0.012*"work" + 0.012*"big"
2022-05-06 23:31:14,305 : INFO : topic diff=0.000146, rho=0.176777
2022-05-06 23:31:14,335 : INFO : -6.628 per-word bound, 98.9 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:14,336 : INFO : PROGRESS: pass 31, at document #175/175
2022-05-06 23:31:14,356 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018

2022-05-06 23:31:14,634 : INFO : topic #3 (0.200): 0.035*"thing" + 0.020*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"pencil" + 0.010*"paper" + 0.010*"feel"
2022-05-06 23:31:14,635 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individual" + 0.012*"genius" + 0.012*"work" + 0.012*"big"
2022-05-06 23:31:14,635 : INFO : topic diff=0.000070, rho=0.162221
2022-05-06 23:31:14,666 : INFO : -6.628 per-word bound, 98.9 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:14,667 : INFO : PROGRESS: pass 37, at document #175/175
2022-05-06 23:31:14,688 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"creativity" + 0.018*"sort" + 0.014*"century" + 0.013*"work" + 0.012*"life" + 0.011*"divine" + 0.011*"unknowable"
2022-05-06 23:31:14,689 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.

2022-05-06 23:31:14,970 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individual" + 0.012*"genius" + 0.012*"work" + 0.012*"big"
2022-05-06 23:31:14,971 : INFO : topic diff=0.000074, rho=0.150756
2022-05-06 23:31:15,001 : INFO : -6.628 per-word bound, 98.9 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:15,002 : INFO : PROGRESS: pass 43, at document #175/175
2022-05-06 23:31:15,023 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"creativity" + 0.018*"sort" + 0.014*"century" + 0.013*"work" + 0.012*"life" + 0.011*"divine" + 0.011*"unknowable"
2022-05-06 23:31:15,024 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"artist" + 0.017*"example" + 0.017*"dance" + 0.016*"year" + 0.012*"process" + 0.012*"success" + 0.012*"genius"
2022-05-06 23:31:15,024 : INFO : topic #2 (0.200): 0.034*"work" + 0.

2022-05-06 23:31:15,302 : INFO : topic diff=0.000603, rho=0.141421
2022-05-06 23:31:15,332 : INFO : -6.627 per-word bound, 98.8 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:15,333 : INFO : PROGRESS: pass 49, at document #175/175
2022-05-06 23:31:15,353 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"creativity" + 0.018*"sort" + 0.014*"century" + 0.013*"work" + 0.012*"life" + 0.011*"divine" + 0.011*"unknowable"
2022-05-06 23:31:15,354 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"artist" + 0.017*"example" + 0.017*"dance" + 0.016*"year" + 0.012*"process" + 0.012*"success" + 0.012*"genius"
2022-05-06 23:31:15,354 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:15,355 : INFO : topic #3 (0.200): 0.035*"thing" + 0.020*"a

2022-05-06 23:31:15,677 : INFO : PROGRESS: pass 55, at document #175/175
2022-05-06 23:31:15,699 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"creativity" + 0.018*"sort" + 0.014*"century" + 0.013*"work" + 0.012*"life" + 0.011*"source" + 0.011*"divine"
2022-05-06 23:31:15,699 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"artist" + 0.017*"example" + 0.017*"dance" + 0.017*"year" + 0.012*"process" + 0.012*"success" + 0.012*"genius"
2022-05-06 23:31:15,700 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:15,701 : INFO : topic #3 (0.200): 0.035*"thing" + 0.020*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"paper" + 0.010*"pencil" + 0.010*"feel"
2022-05-06 23:31:15,701 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 

2022-05-06 23:31:16,054 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"artist" + 0.017*"example" + 0.017*"dance" + 0.017*"year" + 0.012*"process" + 0.012*"success" + 0.012*"genius"
2022-05-06 23:31:16,055 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:16,056 : INFO : topic #3 (0.200): 0.035*"thing" + 0.020*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"paper" + 0.010*"pencil" + 0.010*"feel"
2022-05-06 23:31:16,057 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individual" + 0.012*"genius" + 0.012*"work" + 0.012*"big"
2022-05-06 23:31:16,058 : INFO : topic diff=0.000122, rho=0.125988
2022-05-06 23:31:16,091 : INFO : -6.625 per-word bound, 98.7 perplexity estimate based on a held-out cor

2022-05-06 23:31:16,408 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:16,408 : INFO : topic #3 (0.200): 0.035*"thing" + 0.020*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"paper" + 0.010*"pencil" + 0.010*"feel"
2022-05-06 23:31:16,408 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individual" + 0.012*"genius" + 0.012*"work" + 0.012*"big"
2022-05-06 23:31:16,409 : INFO : topic diff=0.000070, rho=0.120386
2022-05-06 23:31:16,442 : INFO : -6.625 per-word bound, 98.7 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:16,443 : INFO : PROGRESS: pass 68, at document #175/175
2022-05-06 23:31:16,463 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018

2022-05-06 23:31:16,744 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individual" + 0.012*"genius" + 0.012*"work" + 0.012*"big"
2022-05-06 23:31:16,744 : INFO : topic diff=0.000037, rho=0.115470
2022-05-06 23:31:16,776 : INFO : -6.625 per-word bound, 98.7 perplexity estimate based on a held-out corpus of 175 documents with 825 words
2022-05-06 23:31:16,777 : INFO : PROGRESS: pass 74, at document #175/175
2022-05-06 23:31:16,797 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"creativity" + 0.018*"sort" + 0.014*"century" + 0.013*"work" + 0.012*"life" + 0.011*"source" + 0.011*"divine"
2022-05-06 23:31:16,797 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"year" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"artist" + 0.017*"example" + 0.017*"dance" + 0.012*"process" + 0.012*"success" + 0.012*"genius"
2022-05-06 23:31:16,798 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*

2022-05-06 23:31:17,103 : INFO : topic #0 (0.200): 0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"creativity" + 0.018*"sort" + 0.014*"century" + 0.013*"work" + 0.012*"life" + 0.011*"source" + 0.011*"divine"
2022-05-06 23:31:17,103 : INFO : topic #1 (0.200): 0.017*"tom" + 0.017*"year" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"artist" + 0.017*"example" + 0.017*"dance" + 0.012*"process" + 0.012*"success" + 0.012*"genius"
2022-05-06 23:31:17,104 : INFO : topic #2 (0.200): 0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"
2022-05-06 23:31:17,105 : INFO : topic #3 (0.200): 0.035*"thing" + 0.020*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"paper" + 0.010*"pencil" + 0.010*"feel"
2022-05-06 23:31:17,106 : INFO : topic #4 (0.200): 0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individu

[(0,
  '0.038*"creative" + 0.028*"laughter" + 0.021*"human" + 0.018*"creativity" + 0.018*"sort" + 0.014*"century" + 0.013*"work" + 0.012*"life" + 0.011*"source" + 0.011*"divine"'),
 (1,
  '0.017*"tom" + 0.017*"year" + 0.017*"chemical" + 0.017*"ancient" + 0.017*"artist" + 0.017*"example" + 0.017*"dance" + 0.012*"process" + 0.012*"success" + 0.012*"genius"'),
 (2,
  '0.034*"work" + 0.031*"nt" + 0.025*"book" + 0.025*"sort" + 0.018*"olé" + 0.011*"genius" + 0.011*"kind" + 0.011*"life" + 0.011*"moment" + 0.011*"point"'),
 (3,
  '0.035*"thing" + 0.020*"allah" + 0.019*"afraid" + 0.016*"god" + 0.014*"nt" + 0.013*"way" + 0.010*"olé" + 0.010*"paper" + 0.010*"pencil" + 0.010*"feel"'),
 (4,
  '0.023*"kind" + 0.017*"question" + 0.017*"job" + 0.016*"life" + 0.012*"idea" + 0.012*"god" + 0.012*"individual" + 0.012*"genius" + 0.012*"work" + 0.012*"big"')]

In [19]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for a in corpus_transformed], data_dtmna.index))

[([(0, 0.10001757),
   (1, 0.100025475),
   (2, 0.10266737),
   (3, 0.10001644),
   (4, 0.5972732)],
  0),
 ([(0, 0.050008),
   (1, 0.050011504),
   (2, 0.7997865),
   (3, 0.050182223),
   (4, 0.050011728)],
  1),
 ([(0, 0.041195203),
   (1, 0.040014274),
   (2, 0.040325277),
   (3, 0.83795357),
   (4, 0.040511653)],
  2),
 ([(0, 0.06667127),
   (1, 0.7269902),
   (2, 0.07145717),
   (3, 0.06820793),
   (4, 0.06667343)],
  3),
 ([(0, 0.025269017),
   (1, 0.02502202),
   (2, 0.025569938),
   (3, 0.025201015),
   (4, 0.89893806)],
  4),
 ([(0, 0.012525681),
   (1, 0.01251511),
   (2, 0.012757372),
   (3, 0.94957083),
   (4, 0.012630969)],
  5),
 ([(0, 0.10001293),
   (1, 0.10001875),
   (2, 0.100013204),
   (3, 0.599936),
   (4, 0.10001912)],
  6),
 ([(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)], 7),
 ([(0, 0.040005766),
   (1, 0.040360942),
   (2, 0.040855214),
   (3, 0.8387696),
   (4, 0.04000844)],
  8),
 ([(0, 0.022302562),
   (1, 0.022629883),
   (2, 0.9101332),
   (3, 0.022603

In [32]:
sent_topics_df = pd.DataFrame()
for i, row_list in enumerate(ldana[corpusna]):
        row = row_list[0] if ldana.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldana.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
sent_topics_df.columns = ['Dom_Topic', 'Topic_Contri', 'Keywords']
print(sent_topics_df)

     Dom_Topic  Topic_Contri  \
0          4.0        0.5974   
1          2.0        0.7998   
2          3.0        0.8380   
3          1.0        0.7272   
4          4.0        0.8989   
..         ...           ...   
170        0.0        0.8662   
171        0.0        0.5999   
172        0.0        0.7315   
173        4.0        0.8378   
174        4.0        0.5974   

                                                                             Keywords  
0                 kind, question, job, life, idea, god, individual, genius, work, big  
1                        work, nt, book, sort, olé, genius, kind, life, moment, point  
2                        thing, allah, afraid, god, nt, way, olé, paper, pencil, feel  
3      tom, year, chemical, ancient, artist, example, dance, process, success, genius  
4                 kind, question, job, life, idea, god, individual, genius, work, big  
..                                                                                ...  

In [21]:
# data = {}
# sentences = ""
# corpus = pd.read_pickle("corpus.pkl")
# corpus
# i=0
# a=0
# z=0
# x=0
# while(a<len(sent_topics_df)-1):
#     sentences = corpus.loc[a].at['transcript']
#     if(sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"]):
#         while((a<len(sent_topics_df)-1) and (sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"])):
#             sentences += corpus.loc[a+1].at['transcript']
#             a+=1
#     data[i] = sentences
#     i+=1
#     a+=1
# if(a<len(sent_topics_df)):
#     data[i] = sentences = corpus.loc[a].at['transcript']
# data

In [22]:
# data = {}
# sentences = ""
# corpus = pd.read_pickle("corpus.pkl")
# corpus
# i=0
# a=0
# z=0
# x=0
# y=1
# j=1
# time_and_sentences = pd.read_pickle('time_and_sentences.pkl')
# keys = list(time_and_sentences.keys())
# values = list(time_and_sentences.values())

# while(a<len(sent_topics_df)-1):
#     sentences = corpus.loc[a].at['transcript']
#     if(sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"]):
#         while((a<len(sent_topics_df)-1) and (sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"])):
#             sentences += corpus.loc[a+1].at['transcript']
#             if(y < len(values) and (values[y]-j) >= (a+1)):
#                 values[y] = values[y]-j
#                 y += 1
                
#             else:
#                 while(y < len(values) and (values[y]-j) < (a+1)):
#                     values[y] = values[y]-j
#                     y += 1
                    
#             j += 1
#             a+=1
#     data[i] = sentences
#     i+=1
#     a+=1
# if(a<len(sent_topics_df)):
#     data[i] = sentences = corpus.loc[a].at['transcript']

# values.sort()

# time_and_sentences = dict(zip(keys, values))
# pickle.dump(time_and_sentences, open("time_and_sentences.pkl", "wb" ))

# data

In [31]:
data = {}
sentences = ""
corpus = pd.read_pickle("corpus.pkl")
corpus
i=0
a=0
j=1
b=0
merged_sentences = []
unique_topics = []
time_and_sentences = pd.read_pickle('time_and_sentences.pkl')
keys = list(time_and_sentences.keys())
values = list(time_and_sentences.values())
to_minus = []


while(a<len(sent_topics_df)-1):
    sentences = data_clean.loc[a].at['transcript']
    if(sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"]):
        while((a<len(sent_topics_df)-1) and (sent_topics_df.loc[a].at["Dom_Topic"] == sent_topics_df.loc[a+1].at["Dom_Topic"])):
            sentences += data_clean.loc[a+1].at['transcript']
            b+=1
            j+=1
            a+=1
        merged_sentences.append(a)
        to_minus.append(b)
    unique_topics.append(sent_topics_df.loc[a].at['Dom_Topic'])
    data[i] = sentences
    i+=1
    a+=1
if(a<len(sent_topics_df)):
    data[i] = sentences = data_clean.loc[a].at['transcript']
    unique_topics.append(sent_topics_df.loc[a].at['Dom_Topic'])


values.sort()

pickle.dump(time_and_sentences, open("time_and_sentences.pkl", "wb" ))

time_and_sentences = dict(zip(keys, values))
pickle.dump(time_and_sentences, open("time_and_sentences.pkl", "wb" ))

data

{0: ' i writer ',
 1: 'writing book profession s  course ',
 2: 'it also great lifelong love fascination ',
 3: 'and i nt expect s ever going change ',
 4: 'but  said  something kind peculiar happened recently life career  caused recalibrate whole relationship work ',
 5: 'and peculiar thing i recently wrote book  memoir called  eat  pray  love   decidedly unlike previous book  went world reason  became big  megasensation  international bestseller thing the result everywhere i go  people treat like i m doomed ',
 6: 'seriously  doomed  doomed ',
 7: 'like  come  worried  say   are nt afraid re never going able top ',
 8: 'are nt afraid re going keep writing whole life re never going create book anybody world care  ever  ',
 9: ' so s reassuring  know ',
 10: 'but would worse  except i happen remember  year ago  i teenager  i first started telling people i wanted writer  i met sort fearbased reaction ',
 11: 'and people would say   are nt afraid re never going success ',
 12: 'are nt af

In [24]:
i=1
j=0
a=0
print(values)
while(i < len(values)):
    while(j < len(merged_sentences) and merged_sentences[j] <= values[i]):
        a+=1
        j+=1
        
    values[i] = values[i] - a
    a=0
    j=0
    i+=1

time_and_sentences = dict(zip(keys, values))
print(values)

[0, 10, 15, 16, 28, 34, 40, 47, 48, 52, 58, 61, 67, 71, 76, 83, 88, 99, 100, 107, 109, 112, 117, 121, 125, 137, 144, 150, 158, 159, 171, 172, 173, 174]
[0, 9, 14, 15, 26, 30, 34, 40, 41, 45, 50, 53, 58, 62, 67, 72, 75, 84, 84, 91, 93, 96, 99, 103, 107, 116, 122, 127, 134, 134, 146, 146, 147, 147]


In [25]:
# We are going to change this to key: sentence_id, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ''.join(list_of_text)
    return combined_text

In [26]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}
data_combined

{0: [' i writer '],
 1: ['writing book profession s  course '],
 2: ['it also great lifelong love fascination '],
 3: ['and i nt expect s ever going change '],
 4: ['but  said  something kind peculiar happened recently life career  caused recalibrate whole relationship work '],
 5: ['and peculiar thing i recently wrote book  memoir called  eat  pray  love   decidedly unlike previous book  went world reason  became big  megasensation  international bestseller thing the result everywhere i go  people treat like i m doomed '],
 6: ['seriously  doomed  doomed '],
 7: ['like  come  worried  say   are nt afraid re never going able top '],
 8: ['are nt afraid re going keep writing whole life re never going create book anybody world care  ever  '],
 9: [' so s reassuring  know '],
 10: ['but would worse  except i happen remember  year ago  i teenager  i first started telling people i wanted writer  i met sort fearbased reaction '],
 11: ['and people would say   are nt afraid re never going suc

In [27]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

combined_sent = pd.DataFrame.from_dict(data_combined).transpose()
combined_sent.columns = ['transcript']
combined_sent = combined_sent.sort_index()
combined_sent

Unnamed: 0,transcript
0,i writer
1,writing book profession s course
2,it also great lifelong love fascination
3,and i nt expect s ever going change
4,but said something kind peculiar happened recently life career caused recalibrate whole relationship work
...,...
132,nonetheless
133,i believe i feel must teach
134,olé
135,nonetheless sheer human love stubbornness keep showing thank applause thank


In [28]:
# pickle it for later use
combined_sent.to_pickle("Combined_wrt_topics.pkl")

In [29]:
time_and_sentences

{'00:00': 0,
 '01:07': 9,
 '01:39': 14,
 '01:42': 15,
 '02:53': 26,
 '03:53': 30,
 '04:17': 34,
 '05:11': 40,
 '05:14': 41,
 '05:58': 45,
 '06:35': 50,
 '07:05': 53,
 '07:34': 58,
 '08:10': 62,
 '08:47': 67,
 '09:30': 72,
 '09:59': 75,
 '11:31': 84,
 '11:40': 84,
 '12:17': 91,
 '12:41': 93,
 '13:05': 96,
 '13:26': 99,
 '13:44': 103,
 '14:14': 107,
 '15:17': 116,
 '16:14': 122,
 '16:58': 127,
 '17:55': 134,
 '18:09': 134,
 '18:51': 146,
 '18:53': 146,
 '18:56': 147,
 '19:01': 147}