In [None]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import pandas as pd

# Import preprocessed dataframes
# `%store -r <name>` is IPython's storemagic: it restores a variable that was
# saved with `%store <name>` in an earlier session. The two frames are NOT
# created in this notebook, so whichever notebook stores them must be run first
# (on a fresh kernel these lines otherwise leave the names undefined).
# NOTE(review): `speeches_df` is restored but not referenced by the cells
# visible here; only `speeches_debates_df` is used. Confirm whether it is needed.
%store -r speeches_df
%store -r speeches_debates_df

In [None]:
# Select only debates: keep the rows whose `type` column equals 'debate'.
is_debate = speeches_debates_df['type'] == 'debate'
debates = speeches_debates_df[is_debate]

# Per-row debater labels (one entry per debate row, sharing `debates`' index).
debaters_ls = debates['debater']

# Distinct debater names, lower-cased, in order of first appearance.
# Built in a single comprehension so that `debaters` is only ever a list and
# no exhausted `map` iterator is left behind in the notebook namespace.
debaters = [name.lower() for name in debaters_ls.unique()]

# Define stopwords
# NLTK's English list plus corpus-specific noise. Every extra entry is listed
# once (the previous literal repeated whole runs of entries), grouped by kind.
extra_stop_words = [
    # punctuation and tokenizer artefacts
    '--', '?', ';', ')', ',', '(', ']', '[', '!', "``", "''", '..', '...', ':', '\\',
    'â\x80\x94',  # NOTE(review): looks like a mis-decoded em dash -- confirm
    # clitic fragments such as the tail of "don't" / "we've"
    "n't", "'ve", "'s", "'m", "'re", "'d",
    # transcript boilerplate (presumably page headers/footers of the source PDFs)
    'michael e eidenmuller', 'michael', 'e', 'eidenmuller', 'mauricio', 'garcia',
    'authenticity', 'certified', 'text', 'version', 'transcribed', 'directly',
    'audio', 'page', 'updated', 'update', 'transcription',
    'americanrhetoric.com', 'americanrhetoric', 'com', 'www',
    # garbled / doubled-letter and split-word fragments
    'aa', 'rr', 'mmeerriiccaann', 'hheettoorriiccccoomm', 'hheettoorriicc',
    'hheettoorriicc..ccoomm', 'ccoomm', 'w', 'oodrow', 'ilson',
    # dates and bare numbers
    '12/31/21', '12/11/21', '1', '2', '3', '10', '11', '12', '21',
    # function words
    'and', 'the', 'to', 'of', 'in', 'that', 'is', 'a', 'have', 'with', 'be',
    'this', 'as', 'by', 'those', 'are', 'but', 'your', 'for', 'they', 'here',
    'their', 'were', 'at',
    # high-frequency filler words
    'people', 'would', 'also', 'may', 'let', 'new', 'say', 'said', 'make', 'go',
    'mr', 'come', 'could', 'get', 'many', 'one', 'us', 'right', 'thing', 'think',
    'know',
]

# Debater names are removed from the text as well. Stop words are compared
# against single word tokens, so a multi-word name can never match as a whole
# string; its individual words are therefore added next to the full name.
debater_name_parts = [part for name in debaters for part in name.split()]

# Kept as a list (callers may rely on list methods); dict.fromkeys drops
# duplicates while preserving first-seen order.
stop_words = list(dict.fromkeys(
    stopwords.words('english')
    + extra_stop_words
    + list(debaters)
    + debater_name_parts
))

# Create dataframe: one row per debate entry with the speaker and the
# lemmatized text taken from `debates`.
data_dict = {'debater': debaters_ls, 'text': debates['lemmatized']}
data = pd.DataFrame(data_dict)

# Tokenize: a flat word list per document, and a sentence list per document.
data_words = data['text'].apply(word_tokenize)
data_sent = data['text'].apply(sent_tokenize)

# Remove stop words.
# A set gives constant-time membership tests; `stop_words` itself is a list.
stop_word_set = set(stop_words)

data_words_filter = data_words.apply(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)

# Each sentence is tokenized before filtering. Comparing whole sentence strings
# against the stop-word list (as was done before) removes nothing, because a
# full sentence is never equal to a single stop word.
# Result per document: a list of sentences, each a list of kept word tokens.
data_sent_filter = data_sent.apply(
    lambda sentences: [
        [word for word in word_tokenize(sentence) if word not in stop_word_set]
        for sentence in sentences
    ]
)

# Create new dataframe columns
data['text_cleaned'] = data_words_filter
data['sent_cleaned'] = data_sent_filter

# Preview only the first rows rather than rendering the whole frame.
display(data.head())

In [None]:
# Select a debater by position in the list of distinct debaters.
# `debaters` holds the lower-cased names in the same order, so the same index
# addresses the same person in both lists.
debater_index = 48  # must be < number of distinct debaters

# Read the name with its ORIGINAL casing straight from the data. Rebuilding it
# with `lower().capitalize()` does not round-trip for names such as "McCain" or
# any multi-word name, and the mask below would then silently match no rows.
debater_names = data['debater'].unique()
selected_debater = str(debater_names[debater_index])

selected = data['debater'] == selected_debater
# Independent copy: later cells add columns to `selected_df`, which must not be
# a view/slice of `data` (avoids SettingWithCopyWarning).
selected_df = data[selected].copy()

# Preview text
print(selected_debater)
print(selected_df['text_cleaned'])

In [None]:
# Find the selected debater's most frequent words and strip them from the text.
from nltk import FreqDist

# Flatten the per-document token lists into one list of words.
all_words = [word for tokens in selected_df['text_cleaned'] for word in tokens]
fdist = FreqDist(all_words)  # word -> occurrence count over the selected documents

k = 100  # how many of the most frequent words to treat as personal stop words

# Taking the words with a comprehension (instead of unpacking `zip(*...)`)
# also works when the selection is empty and `most_common` returns [].
top_k_words = tuple(word for word, _count in fdist.most_common(k))

# Stop words for selected debater
personal_stop_words = list(top_k_words)
personal_stop_word_set = set(personal_stop_words)  # fast membership tests

# `assign` returns a new frame, so no column is written onto a slice of `data`.
selected_df = selected_df.assign(
    personal_text=selected_df['text_cleaned'].apply(
        lambda tokens: [word for word in tokens if word not in personal_stop_word_set]
    )
)

In [None]:
# Render a word cloud from the selected debater's cleaned tokens.
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib as mpl


# One space-separated string containing every token of every selected document.
long_string = ' '.join(
    token
    for tokens in selected_df['text_cleaned']
    for token in tokens
)

# Appearance settings collected in one place, then handed to WordCloud.
cloud_options = dict(
    background_color="white",
    max_words=1000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud = WordCloud(**cloud_options)
wordcloud.generate(long_string)

print(wordcloud)

# Draw the cloud on figure 1 without axis ticks or frame.
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

# Documents to model: one token list per row of the selected debater's frame.
texts = selected_df['personal_text']

# Vocabulary: maps every distinct token in `texts` to an integer id.
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# Show the first 30 (token_id, count) pairs of the first document.
first_document_bow = corpus[0]
print(first_document_bow[:30])

In [None]:
from pprint import pprint

# Select number of topics
num_topics = 3

# Fixed seed so that re-running the notebook yields comparable topics.
# LDA training is stochastic; without a seed the topics change on every run.
# NOTE(review): with the multicore trainer, results may still vary slightly
# between runs depending on worker scheduling -- confirm if exact
# reproducibility is required.
lda_random_state = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=lda_random_state,
)

# Print the top keywords of each of the `num_topics` topics
pprint(lda_model.print_topics())

# Per-document topic distribution for every document in the corpus
doc_lda = lda_model[corpus]


In [None]:
import os
import pickle
import pyLDAvis.gensim_models

# Directory that receives the pickled visualisation data and the HTML export.
# The default is the original hard-coded location; set LDAVIS_ASSETS_DIR to
# write elsewhere. The kernel's working directory is deliberately left alone
# (no `%cd`), so running this cell has no effect on relative paths elsewhere.
ASSETS_DIR = os.environ.get("LDAVIS_ASSETS_DIR", "/root/.jupyter/FINAL/assets/")
os.makedirs(ASSETS_DIR, exist_ok=True)
cwd = os.path.abspath(ASSETS_DIR)

# Visualize the topics
pyLDAvis.enable_notebook()

# Output file stem: <assets dir>/<debater name><number of topics>
LDAvis_data_filepath = os.path.join(cwd, str(selected_debater) + str(num_topics))
print(LDAvis_data_filepath)

# True  -> always recompute from the current model and overwrite the cache
#          (matches the previous always-true guard).
# False -> reuse an existing cache file when there is one.
REBUILD_LDAVIS = True

if REBUILD_LDAVIS or not os.path.exists(LDAvis_data_filepath):
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # Load the pre-prepared pyLDAvis data from disk.
    # NOTE: pickle.load executes arbitrary code from the file; only load cache
    # files written by this notebook.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')

# Last expression: renders the interactive visualisation in the notebook.
LDAvis_prepared