In [None]:
import pandas as pd
import glob
import random
import numpy as np

import matplotlib.pyplot as plt
import ast
from wordcloud import WordCloud
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
from gensim import corpora

In [None]:
# Seed both global RNGs by *calling* the seed functions. Assigning to them
# (``np.random.seed = 42``) only replaces the function with an int and leaves
# the generators unseeded.
np.random.seed(42)
random.seed(42)

## 후보/연도별로 연설 합치기

In [None]:
# Read every file in the input folder into its own frame and concatenate once
# at the end (avoids repeatedly copying the growing frame inside the loop).
# sorted() makes the row order independent of the file-system listing order.
frames = []
for file in sorted(glob.glob('./data/to_preprocess/*')):
    df_curr = pd.read_csv(file, index_col=0)
    # The first four characters of the file name are parsed as the year.
    # Split on both separators so this works on Windows ("\\") and POSIX ("/").
    file_name = re.split(r'[\\/]', file)[-1]
    df_curr["Year"] = int(file_name[0:4])
    frames.append(df_curr)

# Keep the original contract: df stays None when no input file was found.
df = pd.concat(frames) if frames else None

In [None]:
# Display the concatenated DataFrame (including the added "Year" column).
df

## 연도 분포 확인

In [None]:


fig, ax = plt.subplots()

# Count rows per year directly. Grouping by "Year" before calling
# value_counts() on the same column only duplicated the key in the index,
# which produced tuple-style tick labels.
df["Year"].value_counts().sort_index().plot.bar(ax=ax)
ax.set_xlabel("Year")
ax.set_ylabel("Number of speeches")
ax.set_title("Speeches per year")

plt.show()

## 후보별 분포 확인

In [None]:
fig, ax = plt.subplots(figsize=(10,10))

# Count rows per candidate name directly; the extra groupby on the same
# column was redundant and duplicated the key in the index.
df["name"].value_counts().sort_index().plot.bar(ax=ax)
ax.set_xlabel("Candidate")
ax.set_ylabel("Number of speeches")
ax.set_title("Speeches per candidate")

# Tilt the tick labels so long names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.show()

In [None]:
# Each "speech" cell holds a stringified Python list; parse it back and glue
# the pieces together with single spaces.
parsed_speeches = df["speech"].map(ast.literal_eval)
df["joined_speech"] = parsed_speeches.map(' '.join)

## 길이 분포 확인

In [None]:
fig, ax = plt.subplots()

# Length of every joined speech, measured as the number of space-separated pieces.
words_per_speech = df["joined_speech"].map(lambda text: len(text.split(' ')))
ax.hist(words_per_speech)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

In [None]:

# Peek at the first "speech" entry after parsing its string form back into a Python object.
first_raw_speech = df["speech"].values[0]
ast.literal_eval(first_raw_speech)

## WordCloud

In [None]:
# !conda install wordcloud -y

df_test = pd.read_csv('./data/orig/2020_Trump_speech.csv', index_col=0)

# Every "speech" cell is a stringified list; parse it and join it into one
# string per speech.
speeches = [' '.join(ast.literal_eval(raw)) for raw in df_test["speech"].values]

# Build the stop-word set inside this cell so it also runs on a fresh kernel
# (the notebook-wide stop_words_list is only created further down).
wordcloud_stop_words = set(stopwords.words('english'))

# Filter word by word. The previous loop compared whole speeches against the
# stop-word list, so nothing was ever removed. Surrounding punctuation is
# ignored for the comparison only; the kept word is left untouched.
speeches_processed = [
    word
    for speech in speeches
    for word in speech.split()
    if word.strip('.,!?;:"()').lower() not in wordcloud_stop_words
]

speeches_combined = ' '.join(speeches_processed)

In [None]:
# Build a 1000x500 word cloud on a black background from the combined text.
cloud_options = dict(background_color='black', width=1000, height=500)
wordcloud = WordCloud(**cloud_options).generate(speeches_combined)

# Show the rendered cloud on a large figure with the axes hidden.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.draw()

In [None]:
# Load the Biden 2020 speech file, using its first column as the index.
biden_speech_csv = './data/to_preprocess/2020_Biden_speech_edited.csv'
df_test = pd.read_csv(biden_speech_csv, index_col=0)

In [None]:
# Every "speech" cell is a stringified list; parse it and join it into one
# string per speech.
speeches = [' '.join(ast.literal_eval(raw)) for raw in df_test["speech"].values]

# Build the stop-word set inside this cell so it also runs on a fresh kernel
# (the notebook-wide stop_words_list is only created further down).
wordcloud_stop_words = set(stopwords.words('english'))

# Filter word by word. The previous loop compared whole speeches against the
# stop-word list, so nothing was ever removed. Surrounding punctuation is
# ignored for the comparison only; the kept word is left untouched.
speeches_processed = [
    word
    for speech in speeches
    for word in speech.split()
    if word.strip('.,!?;:"()').lower() not in wordcloud_stop_words
]

speeches_combined = ' '.join(speeches_processed)

In [None]:
# Configure the generator first, then feed it the combined speech text.
cloud_generator = WordCloud(width=1000, height=500, background_color='black')
wordcloud = cloud_generator.generate(speeches_combined)

# Draw the cloud on a 15x10 figure and hide the axes.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.draw()

## LDA

In [None]:

# The two helpers below used to be bare lambda expressions that were created
# and immediately discarded; they now have names so they can be called.
def remove_punctuation(x):
    """Return ``x`` with commas, periods, exclamation marks and question marks removed."""
    # Raw string: the old '[,\.!?]' literal relied on an invalid escape sequence.
    return re.sub(r'[,.!?]', '', x)


def to_lowercase(x):
    """Return ``x`` converted to lower case."""
    return x.lower()



In [None]:
import gensim
# simple_preprocess is provided by the gensim.utils module; "gensim_utils"
# is not part of gensim.
from gensim.utils import simple_preprocess
import nltk


In [None]:
# !conda install nltk -y

# One-time NLTK data downloads; uncomment if the resources are not installed yet.
# nltk.download('stopwords')
# nltk.download("punkt")
# English stop-word list from the NLTK stopwords corpus; the stop-word
# filtering code in this notebook reads this module-level name.
stop_words_list = stopwords.words('english')

In [None]:
# Number of entries in the English stop-word list.
len(stop_words_list)

In [None]:
def _sentences_to_text(raw_sentences):
    """Parse a stringified list and join its items with single spaces."""
    return ' '.join(ast.literal_eval(raw_sentences))


# One continuous text per row, used as the input of the cleaning steps below.
df["speech_whole"] = df["speech"].apply(_sentences_to_text)

In [None]:
def filter_stopwords(token_list, stop_words=None):
    """Return the tokens of ``token_list`` that are not stop words.

    Args:
        token_list: Iterable of string tokens.
        stop_words: Optional iterable of words to drop. When omitted, the
            module-level ``stop_words_list`` is used, as before.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = stop_words_list
    # A set gives constant-time membership checks instead of scanning the
    # whole stop-word list for every token.
    stop_set = set(stop_words)
    return [token for token in token_list if token not in stop_set]

In [None]:
def filter_nonwords(string):
    """Normalize raw speech text before tokenization.

    Steps, in order:
      1. Hyphens, en/em dashes and semicolons become spaces, so the words
         they join stay separate.
      2. Commas, periods, ``!``, ``?``, quotes and backticks are removed.
      3. Any remaining character that is not an ASCII letter, digit,
         ``-``, ``_``, ``.``, ``/`` or whitespace is removed.
      4. The text is lower-cased.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lower-cased text with word-separating whitespace intact.
    """
    out = string
    out = re.sub(r'[-–—;]', ' ', out)
    out = re.sub(r"""[,.!?"'`]""", '', out)
    # Whitespace must be part of the allowed set: without ``\s`` this step
    # deleted every space and fused the whole text into a single token.
    out = re.sub(r'[^a-zA-Z0-9\-_./\s]', '', out)
    out = out.lower()
    return out

In [None]:
# Step 1 of the cleaning pipeline: run filter_nonwords over every joined speech.
cleaned_text = df["speech_whole"].map(filter_nonwords)
df["speech_processed"] = cleaned_text

In [None]:
# Step 2: replace each cleaned string with its NLTK word tokens.
tokenized_text = df["speech_processed"].map(word_tokenize)
df["speech_processed"] = tokenized_text

In [None]:
# Step 3: drop stop words from every token list.
content_tokens = df["speech_processed"].map(filter_stopwords)
df["speech_processed"] = content_tokens

In [None]:
# Display the frame again, now including the "speech_processed" column.
df

In [None]:
# Inspect the processed tokens of the last row.
last_row = df.iloc[-1]
last_row["speech_processed"]

In [None]:
# !conda install gensim -y

# Build the token <-> id mapping from the processed speeches, then convert
# every speech into its bag-of-words representation.
tokenized_docs = df["speech_processed"]
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

In [None]:
# Show the bag-of-words entry produced by doc2bow for the first document.
corpus[0]

In [None]:
# Dictionary tokens ordered from longest to shortest, to spot suspiciously long tokens.
words = sorted(dictionary.values(), key=len, reverse=True)
words

In [None]:
# Fit an LDA model with 15 topics and print the five highest-weighted terms of each topic.
n_topics = 15
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    passes=15,
    random_state=42,
)
topics = lda_model.print_topics(num_words=5)

for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))

In [None]:
# Refit with 20 topics; this replaces the previous lda_model.
n_topics = 20
lda_params = dict(num_topics=n_topics, id2word=dictionary, passes=15, random_state=42)
lda_model = gensim.models.ldamodel.LdaModel(corpus, **lda_params)
topics = lda_model.print_topics(num_words=5)

for topic_entry in topics:
    print(topic_entry)

In [None]:
# Same model, but list ten terms per topic.
topics = lda_model.print_topics(num_words=10)

for topic_line in map(str, topics):
    print(topic_line)

In [None]:
# Print the topic mixture the model assigns to each document in the corpus.
doc_topic_mixes = lda_model[corpus]
for doc_index, topic_mix in enumerate(doc_topic_mixes):
    print(doc_index, '번째 문서의 topic 비율은', topic_mix)