In [1]:
import ast
import os
import pickle
import string
from pprint import pprint

import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import spacy
import tqdm
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenenke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Topic Modeling

## Base LDA Model

### Real Tweets

In [None]:
def _parse_token_list(cell):
    """Turn the CSV text of a token list (e.g. "['covid', 'mask']") back into a list."""
    # The converter receives the raw cell text; an empty cell means "no tokens".
    return ast.literal_eval(cell) if cell else []

# `index` is a DataFrame.to_csv keyword, not a read_csv one: passing index=False
# here raised TypeError before any data was loaded, so it has been removed.
# The converter restores `preproc_pipe` to Python lists so that indexing such as
# preproc_pipe[0][0] returns a token rather than the character '['.
# NOTE(review): assumes the column was written by to_csv from Python lists - confirm.
df_preproc = pd.read_csv('Data/tweets_preprocessed.csv',
                         converters={'preproc_pipe': _parse_token_list})

In [346]:
# Preview the first two rows; the spaCy-processed tokens live in `preproc_pipe`
df_preproc.head(n=2)

Unnamed: 0,date,name,tweet,hashtags,tweet_clean,length,clean,preproc_pipe
0,2021-01-22,Conan O'Brien,"After Covid, people with soul patches should still have to wear a mask.",[],"After Covid, people with soul patches should still have to wear a mask.",71,After Covid people with soul patches should still have wear mask,"[covid, people, soul, patch, still, wear, mask]"
1,2021-01-21,Conan O'Brien,I’ll never get used to having a First Lady who is not from Slovenia,[],Ill never get used to having a First Lady who is not from Slovenia,66,never get used having First Lady who not from Slovenia,"[never, get, use, first, lady, slovenia]"


In [347]:
# Token list stored for the first tweet (index label 0)
df_preproc['preproc_pipe'][0]

['covid', 'people', 'soul', 'patch', 'still', 'wear', 'mask']

In [348]:
# First element of the first tweet's token list
df_preproc['preproc_pipe'][0][0]

'covid'

In [349]:
# English stop words plus single punctuation characters and a few tweet artefacts.
# The list is stored as `stop_words`: the original assigned it to `stopwords`,
# which replaced the imported nltk corpus reader with a plain list and made every
# later `stopwords.words('english')` call fail with AttributeError.
stop_words = stopwords.words('english') + list(string.punctuation)
stop_words += ["''", '""', '...', '``', 'im', '\ufeff1', "'s", "n't", 'https', 'http', 'amp']

In [264]:
# Reach the corpus reader through `nltk.corpus` rather than the bare `stopwords`
# name, so this cell keeps working even if `stopwords` has been rebound to
# something else in the running kernel.
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(["''", '""', '...', '``', 'im', '\ufeff1', "'s", "n't", 'https', 'http', 'amp'])
def sent_to_words(sentences):
    """Lazily tokenize every item of `sentences`.

    Each item is converted with str() and handed to gensim's
    simple_preprocess with deacc=True (accent stripping, per the gensim docs);
    one list of tokens is yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(item), deacc=True)
        for item in sentences
    )
def remove_stopwords(texts):
    """Re-tokenize each document and drop the tokens listed in the global `stop_words`.

    Every document is converted with str() and passed through simple_preprocess
    before filtering. Returns one list of surviving tokens per input document,
    in input order.
    """
    # Build the lookup set once per call: membership tests against the original
    # list scanned it for every single token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked]
            for doc in texts]
# Pull the stored token lists out of the frame as a plain Python list
data = df_preproc['preproc_pipe'].values.tolist()
# Tokenize, then filter stop words
data_words = remove_stopwords(list(sent_to_words(data)))
# Show at most the first 30 tokens of the first document
print(data_words[:1][0][:30])

  and should_run_async(code)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenenke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['covid', 'people', 'soul', 'patch', 'still', 'wear', 'mask']


In [265]:
# Vocabulary built from the tokenized documents
id2word = corpora.Dictionary(data_words)
# The corpus is derived from the same token lists
texts = data_words
# One bag-of-words (doc2bow) entry per document
corpus = list(map(id2word.doc2bow, texts))
# Peek at the first document (up to 30 entries)
print(corpus[:1][0][:30])

  and should_run_async(code)


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]


In [268]:
# number of topics
num_topics = 4
# Build LDA model.
# random_state fixes the training seed (the same value the tuned models in this
# notebook use); without it the topics printed below change from run to run.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100)
# Print the top keywords of each of the `num_topics` topics
pprint(lda_model.print_topics())
# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

  and should_run_async(code)


[(0,
  '0.010*"know" + 0.008*"today" + 0.007*"time" + 0.007*"new" + 0.005*"bite" + '
  '0.005*"need" + 0.005*"back" + 0.004*"start" + 0.004*"live" + 0.004*"get"'),
 (1,
  '0.010*"thank" + 0.007*"like" + 0.007*"love" + 0.007*"day" + 0.006*"work" + '
  '0.006*"make" + 0.006*"well" + 0.005*"one" + 0.005*"think" + 0.005*"right"'),
 (2,
  '0.010*"like" + 0.009*"good" + 0.009*"make" + 0.006*"people" + 0.006*"want" '
  '+ 0.005*"look" + 0.005*"great" + 0.005*"new" + 0.005*"say" + 0.004*"get"'),
 (3,
  '0.015*"get" + 0.009*"one" + 0.009*"love" + 0.007*"think" + 0.007*"people" + '
  '0.007*"say" + 0.007*"see" + 0.007*"like" + 0.005*"know" + 0.005*"time"')]


In [269]:
# Visualize the topics
pyLDAvis.enable_notebook()
# Cache file for the prepared visualization; the name ends with the topic count.
# (os.path.join with a single argument returns it unchanged, so plain
# concatenation yields the same path.)
LDAvis_data_filepath = '/Users/stephenenke/Documents/Flatiron/Phase-4/P4-Project' + str(num_topics)
# The preparation step is slow. Leave this switch True to recompute it and
# refresh the cache; set it False to reuse the file already on disk.
if True:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, mode='wb') as f:
        pickle.dump(LDAvis_prepared, f)
# Read the cached visualization data back from disk
with open(LDAvis_data_filepath, mode='rb') as f:
    LDAvis_prepared = pickle.load(f)
# Also export a standalone HTML page next to the cache file
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared

  and should_run_async(code)


## LDA Model Tuning

### Real Tweets

In [274]:
import warnings
# Hide DeprecationWarning messages from here on
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [276]:
# Read the cleaned tweet text and preview the first five rows
tweets = pd.read_csv(filepath_or_buffer='Data/all_tweets_cleaned.csv')
tweets.head(5)

Unnamed: 0,tweet_clean
0,"After Covid, people with soul patches should still have to wear a mask."
1,Ill never get used to having a First Lady who is not from Slovenia
2,"6.8 miles below the sea is the Marianas Trench, and 6.8 miles below that is the bar Trump set for Biden."
3,Im looking forward to sweating the small stuff again.
4,Easy compromise: Lower the maximum wage to $15/hour.


In [277]:
# Load the regular expression library
import re
# Characters to strip: comma, period, exclamation mark, question mark.
# Written as a raw string; the original '[,\.!?]' relied on the invalid escape
# sequence '\.' inside a normal string literal, which Python warns about.
punctuation_pattern = re.compile(r'[,.!?]')
# Remove the punctuation, then lowercase the tweets. The .str accessor lets
# missing values pass through as NaN instead of raising inside a lambda.
tweets['tweet_clean'] = (
    tweets['tweet_clean']
    .str.replace(punctuation_pattern, '', regex=True)
    .str.lower()
)
# Preview the first rows of the cleaned tweets
tweets['tweet_clean'].head()

0    after covid people with soul patches should still have to wear a mask                               
1    ill never get used to having a first lady who is not from slovenia                                  
2    68 miles below the sea is the marianas trench and 68 miles below that is the bar trump set for biden
3    im looking forward to sweating the small stuff again                                                
4    easy compromise: lower the maximum wage to $15/hour                                                 
Name: tweet_clean, dtype: object

In [289]:
def sent_to_words(sentences):
    """Yield one token list per item of `sentences`.

    Items are stringified with str() and tokenized by gensim's
    simple_preprocess with deacc=True (accent stripping, per the gensim docs).
    """
    for raw in sentences:
        text = str(raw)
        yield gensim.utils.simple_preprocess(text, deacc=True)
# Tokenize every cleaned tweet
data = tweets['tweet_clean'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]
# Show at most the first 30 tokens of the first tweet
print(data_words[:1][0][:30])

['after', 'covid', 'people', 'with', 'soul', 'patches', 'should', 'still', 'have', 'to', 'wear', 'mask']


In [290]:
# Phrase detectors: bigrams are learned from the raw tokens, trigrams from the
# bigram-merged tokens. A higher threshold produces fewer phrases.
bigram = gensim.models.Phrases(data_words, threshold=100, min_count=5)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Phraser wrappers, used below for faster application to documents
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model) for phrase_model in (bigram, trigram)
)

In [291]:
# English stop words, punctuation characters, and corpus-specific noise tokens.
# `string` was never imported in the original notebook, so `string.punctuation`
# raised NameError on a fresh kernel. The corpus reader is reached through
# `nltk.corpus` so the cell also works if the bare name `stopwords` has been
# rebound in the running kernel.
stop_words = nltk.corpus.stopwords.words('english') + list(string.punctuation)
stop_words += ['get', 'com', 'tinyurl', "''", '""', '...', '``', 'im', 'pourmecoffee', 
               '\ufeff1', "'s", "n't", 'https', 'http', 'amp', 'from', 'subject', 're', 'edu', 'use']

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop the tokens listed in the global `stop_words`.

    Every document is converted with str() and passed through simple_preprocess
    before filtering. Returns one list of surviving tokens per input document,
    in input order.
    """
    # Build the lookup set once per call: membership tests against the original
    # list scanned it for every single token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked]
            for doc in texts]
def make_bigrams(texts):
    """Apply the global `bigram_mod` phraser to every document; returns a list, one entry per document."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged
def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` (both globals) to every document; returns a list."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and run through the
    global spaCy pipeline `nlp`; the lemma of every token whose `pos_` is in
    `allowed_postags` is kept. Tag reference: https://spacy.io/api/annotation

    Returns one list of lemmas per input document, in input order.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in texts]

In [292]:
import spacy
# Load the small English pipeline with the parser and NER components disabled (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Stop-word removal followed by bigram merging
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
# Lemmatize, keeping nouns, adjectives, verbs and adverbs only
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)
# Show the first lemmatized document
print(data_lemmatized[:1])

[['covid', 'people', 'soul', 'patch', 'still', 'wear', 'mask']]


In [295]:
import gensim.corpora as corpora
# Vocabulary built from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)
# The corpus is derived from the same lemma lists
texts = data_lemmatized
# One bag-of-words (doc2bow) entry per document
corpus = list(map(id2word.doc2bow, texts))
# Show the first document's entries
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]


In [296]:
# Train a 10-topic LDA model on the lemmatized corpus
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    chunksize=100,
    random_state=100,  # fixed seed
    per_word_topics=True,
)

In [297]:
# Apply the trained model to the corpus
doc_lda = lda_model[corpus]
# Show the top keywords of each of the 10 topics
pprint(lda_model.print_topics())

[(0,
  '0.021*"find" + 0.021*"bad" + 0.020*"way" + 0.019*"keep" + 0.017*"good" + '
  '0.013*"people" + 0.013*"work" + 0.013*"need" + 0.012*"make" + 0.012*"long"'),
 (1,
  '0.019*"today" + 0.018*"share" + 0.018*"night" + 0.017*"photo" + '
  '0.017*"help" + 0.016*"check" + 0.014*"feel" + 0.011*"also" + 0.010*"enjoy" '
  '+ 0.009*"look"'),
 (2,
  '0.050*"realdonaldtrump" + 0.030*"look" + 0.027*"tell" + 0.019*"must" + '
  '0.018*"hear" + 0.017*"never" + 0.015*"maybe" + 0.013*"wait" + '
  '0.012*"believe" + 0.011*"people"'),
 (3,
  '0.033*"see" + 0.030*"great" + 0.022*"right" + 0.021*"work" + 0.021*"start" '
  '+ 0.018*"people" + 0.018*"really" + 0.017*"may" + 0.015*"lot" + 0.014*"go"'),
 (4,
  '0.042*"good" + 0.026*"say" + 0.022*"make" + 0.022*"thing" + 0.021*"know" + '
  '0.021*"time" + 0.018*"think" + 0.017*"go" + 0.016*"give" + 0.013*"want"'),
 (5,
  '0.041*"leader" + 0.018*"tonight" + 0.016*"open" + 0.013*"nice" + 0.013*"do" '
  '+ 0.013*"amazing" + 0.012*"follow" + 0.011*"last" + 0.01

In [298]:
from gensim.models import CoherenceModel
# Score the trained model with the c_v coherence measure
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.24831914189787402


In [313]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train an LDA model with the given hyperparameters and return its c_v coherence.

    Parameters
    ----------
    corpus : bag-of-words corpus the model is trained on
    dictionary : id-to-word mapping used for training and for scoring
    k : number of topics
    a : value passed to LdaMulticore as `alpha`
    b : value passed to LdaMulticore as `eta`
    texts : tokenized documents used for the coherence estimate; when omitted,
        the global `data_lemmatized` is used (the original behaviour)

    Returns
    -------
    The value of CoherenceModel.get_coherence() for the trained model.
    """
    if texts is None:
        texts = data_lemmatized

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Score with the dictionary that was passed in. The original used the global
    # `id2word` here, silently ignoring the `dictionary` argument.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [322]:
# grid = {}
# grid['Validation_Set'] = {}
# # Topics range
# min_topics = 2
# max_topics = 11
# step_size = 1
# topics_range = range(min_topics, max_topics, step_size)
# # Alpha parameter
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append('symmetric')
# alpha.append('asymmetric')
# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append('symmetric')
# # Validation sets
# num_of_docs = int(len(corpus))
# corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
#                # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
#                gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), corpus]
# corpus_title = ['75% Corpus', '100% Corpus']
# model_results = {'Validation_Set': [],
#                  'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence': []
#                 }
# # Can take a long time to run
# if 1 == 1:
#     pbar = tqdm.tqdm(total=540)
    
#     # iterate through validation corpora
#     for i in range(len(corpus_sets)):
#         # iterate through number of topics
#         for k in topics_range:
#             # iterate through alpha values
#             for a in alpha:
#                 # iterate through beta values
#                 for b in beta:
#                     # get the coherence score for the given parameters
#                     cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, k=k, a=a, b=b)
#                     # Save the model results
#                     model_results['Validation_Set'].append(corpus_title[i])
#                     model_results['Topics'].append(k)
#                     model_results['Alpha'].append(a)
#                     model_results['Beta'].append(b)
#                     model_results['Coherence'].append(cv)
                    
#                     pbar.update(1)
#     pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
#     pbar.close()

  0%|          | 0/540 [02:44<?, ?it/s]








  0%|          | 0/540 [35:27<?, ?it/s][A[A[A[A[A[A[A[A
  0%|          | 0/540 [21:41<?, ?it/s]
  0%|          | 0/540 [19:33<?, ?it/s]
  0%|          | 0/540 [12:09<?, ?it/s]
  0%|          | 0/540 [07:02<?, ?it/s]
  0%|          | 0/540 [04:23<?, ?it/s]
  0%|          | 0/540 [01:55<?, ?it/s]








  0%|          | 1/540 [01:22<12:24:48, 82.91s/it][A[A[A[A[A[A[A[A







  0%|          | 2/540 [02:47<12:30:33, 83.71s/it][A[A[A[A[A[A[A[A







  1%|          | 3/540 [04:10<12:27:48, 83.55s/it][A[A[A[A[A[A[A[A







  1%|          | 4/540 [05:31<12:16:22, 82.43s/it][A[A[A[A[A[A[A[A







  1%|          | 5/540 [06:49<12:02:16, 81.00s/it][A[A[A[A[A[A[A[A







  1%|          | 6/540 [08:10<12:00:40, 80.97s/it][A[A[A[A[A[A[A[A







  1%|▏         | 7/540 [09:27<11:47:48, 79.68s/it][A[A[A[A[A[A[A[A







  1%|▏         | 8/540 [10:45<11:40:42, 79.03s/it][A[A[A[A[A

 17%|█▋        | 94/540 [2:31:09<10:36:34, 85.64s/it][A[A[A[A[A[A[A[A







 18%|█▊        | 95/540 [2:32:36<10:38:18, 86.06s/it][A[A[A[A[A[A[A[A







 18%|█▊        | 96/540 [2:34:04<10:41:27, 86.68s/it][A[A[A[A[A[A[A[A







 18%|█▊        | 97/540 [2:35:34<10:46:27, 87.56s/it][A[A[A[A[A[A[A[A







 18%|█▊        | 98/540 [2:37:04<10:50:01, 88.24s/it][A[A[A[A[A[A[A[A







 18%|█▊        | 99/540 [2:38:32<10:49:17, 88.34s/it][A[A[A[A[A[A[A[A







 19%|█▊        | 100/540 [2:39:59<10:45:19, 88.00s/it][A[A[A[A[A[A[A[A







 19%|█▊        | 101/540 [2:41:29<10:47:30, 88.50s/it][A[A[A[A[A[A[A[A







 19%|█▉        | 102/540 [2:42:58<10:46:55, 88.62s/it][A[A[A[A[A[A[A[A







 19%|█▉        | 103/540 [2:44:26<10:43:41, 88.38s/it][A[A[A[A[A[A[A[A







 19%|█▉        | 104/540 [2:45:53<10:39:50, 88.05s/it][A[A[A[A[A[A[A[A







 19%|█▉        | 105/540 [2:47:21<10:38:07, 88.02s/it][A[A

 52%|█████▏    | 282/540 [10:09:02<13:48:37, 192.70s/it][A[A[A[A[A[A[A[A







 52%|█████▏    | 283/540 [10:12:14<13:44:57, 192.60s/it][A[A[A[A[A[A[A[A







 53%|█████▎    | 284/540 [10:15:26<13:40:19, 192.27s/it][A[A[A[A[A[A[A[A







 53%|█████▎    | 285/540 [10:18:39<13:37:48, 192.43s/it][A[A[A[A[A[A[A[A







 53%|█████▎    | 286/540 [10:21:51<13:34:11, 192.33s/it][A[A[A[A[A[A[A[A







 53%|█████▎    | 287/540 [10:25:08<13:36:55, 193.74s/it][A[A[A[A[A[A[A[A







 53%|█████▎    | 288/540 [10:28:18<13:28:58, 192.61s/it][A[A[A[A[A[A[A[A







 54%|█████▎    | 289/540 [10:31:27<13:21:09, 191.51s/it][A[A[A[A[A[A[A[A







 54%|█████▎    | 290/540 [10:34:41<13:20:43, 192.17s/it][A[A[A[A[A[A[A[A







 54%|█████▍    | 291/540 [10:37:51<13:14:52, 191.54s/it][A[A[A[A[A[A[A[A







 54%|█████▍    | 292/540 [10:41:02<13:10:57, 191.36s/it][A[A[A[A[A[A[A[A







 54%|█████▍    | 293/540 [10:44:

 87%|████████▋ | 469/540 [19:07:55<2:48:48, 142.66s/it][A[A[A[A[A[A[A[A







 87%|████████▋ | 470/540 [19:10:07<2:42:28, 139.27s/it][A[A[A[A[A[A[A[A







 87%|████████▋ | 471/540 [19:12:17<2:37:05, 136.60s/it][A[A[A[A[A[A[A[A







 87%|████████▋ | 472/540 [19:14:25<2:32:03, 134.17s/it][A[A[A[A[A[A[A[A







 88%|████████▊ | 473/540 [19:16:47<2:32:08, 136.25s/it][A[A[A[A[A[A[A[A







 88%|████████▊ | 474/540 [19:21:06<3:10:38, 173.31s/it][A[A[A[A[A[A[A[A







 88%|████████▊ | 475/540 [19:25:52<3:44:12, 206.97s/it][A[A[A[A[A[A[A[A







 88%|████████▊ | 476/540 [19:30:28<4:02:54, 227.72s/it][A[A[A[A[A[A[A[A







 88%|████████▊ | 477/540 [19:34:51<4:10:19, 238.40s/it][A[A[A[A[A[A[A[A







 89%|████████▊ | 478/540 [19:39:20<4:15:48, 247.56s/it][A[A[A[A[A[A[A[A







 89%|████████▊ | 479/540 [19:43:50<4:18:31, 254.29s/it][A[A[A[A[A[A[A[A







 89%|████████▉ | 480/540 [19:48:18<4:18:27,

In [324]:
# Load the saved grid-search results
lda_tune = pd.read_csv(filepath_or_buffer='Data/lda_tuning_results.csv')

In [329]:
# First ten grid-search rows
lda_tune.head(n=10)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.085531
1,75% Corpus,2,0.01,0.31,0.10994
2,75% Corpus,2,0.01,0.61,0.09469
3,75% Corpus,2,0.01,0.9099999999999999,0.082928
4,75% Corpus,2,0.01,symmetric,0.096602
5,75% Corpus,2,0.31,0.01,0.064359
6,75% Corpus,2,0.31,0.31,0.07187
7,75% Corpus,2,0.31,0.61,0.073953
8,75% Corpus,2,0.31,0.9099999999999999,0.064328
9,75% Corpus,2,0.31,symmetric,0.069776


In [333]:
# Summary statistics of the coherence scores across the whole grid
lda_tune['Coherence'].describe()

count    540.000000
mean     0.224118  
std      0.109141  
min      0.060430  
25%      0.158204  
50%      0.204224  
75%      0.246516  
max      0.576991  
Name: Coherence, dtype: float64

##### Parameters for the Top LDA Score (Real Texts)

In [340]:
# Five highest-coherence parameter combinations
lda_tune.sort_values(by='Coherence', ascending=False).head(5)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
238,75% Corpus,9,asymmetric,0.91,0.576991
268,75% Corpus,10,asymmetric,0.91,0.569899
538,100% Corpus,10,asymmetric,0.91,0.562504
448,100% Corpus,7,asymmetric,0.91,0.554042
508,100% Corpus,9,asymmetric,0.91,0.550941


In [341]:
# Retrain with the best-scoring grid-search settings: 9 topics, asymmetric alpha.
# eta is set to 0.91, the Beta value the tuning table reports for the top rows
# (0.9099999999999999 in the raw results). The original literal 0.9099 was a
# truncated copy that matched none of the searched values.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=9, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha='asymmetric',
                                           eta=0.91)

In [345]:
# Visualize the topics
pyLDAvis.enable_notebook()
# Name the output files after the model being visualized. The global
# `num_topics` still holds the value set for the base model (4), so using it
# here labelled the tuned model's files with the wrong topic count.
LDAvis_data_filepath = ('/Users/stephenenke/Documents/Flatiron/Phase-4/P4-Project'
                        + str(lda_model.num_topics) + '_tuned')
# The preparation step is slow. Leave this switch True to recompute it and
# refresh the cache; set it False to reuse the file already on disk.
if True:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# Load the prepared pyLDAvis data back from disk.
# NOTE(review): pickle.load executes code from the file - only point this at
# caches this notebook wrote itself.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
# Also export a standalone HTML page next to the cache file
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared

### Generated Tweets

In [362]:
# Read the GPT-2 generated text: no header row, delimiter sniffed (sep=None),
# malformed lines dropped without warnings.
# NOTE(review): error_bad_lines / warn_bad_lines are deprecated in newer pandas
# releases in favour of on_bad_lines - confirm against the pinned pandas version.
generated = pd.read_csv(
    'Data/gpt2_gentext1_20210126.txt',
    header=None,
    sep=None,
    warn_bad_lines=False,
    error_bad_lines=False,
)

In [367]:
# Give the unnamed column 0 the name `tweet_clean`.
# Reassigning instead of using inplace=True avoids mutating the loaded frame in
# place and keeps the step chainable.
generated = generated.rename(columns={0: 'tweet_clean'})

In [368]:
# Keep only the rows whose text does not contain the '\n====================\n' separator
has_separator = generated['tweet_clean'].str.contains('\n====================\n')
generated = generated[~has_separator]

In [369]:
# Summarize the filtered frame (row count, columns, dtypes, memory usage)
generated.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99573 entries, 0 to 103997
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   tweet_clean  99573 non-null  object
dtypes: object(1)
memory usage: 1.5+ MB


In [370]:
# Tokenize the generated tweets and filter stop words, as for the real tweets
data = generated['tweet_clean'].values.tolist()
tokenized_docs = sent_to_words(data)
data_words = remove_stopwords(list(tokenized_docs))
# Show at most the first 30 tokens of the first generated tweet
print(data_words[:1][0][:30])

['waddup', 'inclined', 'expect', 'thee', 'procreate']


In [371]:
# Vocabulary built from the generated-tweet tokens
id2word = corpora.Dictionary(data_words)
# The corpus is derived from the same token lists
texts = data_words
# One bag-of-words (doc2bow) entry per document
corpus = list(map(id2word.doc2bow, texts))
# Peek at the first document (up to 30 entries)
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [373]:
# Fit the generated tweets with the same tuned settings as the real tweets:
# 9 topics, asymmetric alpha. eta is set to 0.91, the Beta value the tuning
# table reports; the original literal 0.9099 was a truncated copy that matched
# none of the searched values.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=9, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha='asymmetric',
                                           eta=0.91)

In [374]:
# Visualize the topics
pyLDAvis.enable_notebook()
# Name the output files after the model being visualized. The global
# `num_topics` still holds the value set for the base model (4), so using it
# here labelled the generated-tweet model's files with the wrong topic count.
LDAvis_data_filepath = ('/Users/stephenenke/Documents/Flatiron/Phase-4/P4-Project'
                        + str(lda_model.num_topics) + '_gen')
# The preparation step is slow. Leave this switch True to recompute it and
# refresh the cache; set it False to reuse the file already on disk.
if True:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# Load the prepared pyLDAvis data back from disk.
# NOTE(review): pickle.load executes code from the file - only point this at
# caches this notebook wrote itself.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
# Also export a standalone HTML page next to the cache file
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared