In [1]:
import numpy as np
import pandas as pd

# LDA
from sklearn.decomposition import LatentDirichletAllocation


In [2]:
# Load the review data and discard every row that contains a missing value.
df = pd.read_csv('construction_review.csv').dropna()

# Keep the two free-text review columns under short names.
dadv = df['dadv']  # disadvantage reviews
adv = df['adv']    # advantage reviews

# Preview the first rows of the cleaned frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'construction_review.csv'

In [None]:
# Word Count: set up the KoNLPy Okt tagger that my_tokenizer uses to split text.
from konlpy.tag import Okt

# Single shared tagger instance; my_tokenizer reads this module-level name.
t = Okt()

def my_tokenizer(doc):
    """Return the nouns of ``doc`` that are longer than one character.

    The module-level tagger ``t`` is run over ``doc``; every token whose
    tag is exactly ``'Noun'`` and whose length exceeds 1 is kept, in the
    order the tagger yields them.
    """
    nouns = []
    for word, tag in t.pos(doc):
        # Skip anything that is not a noun, as well as one-character nouns.
        if tag != 'Noun' or len(word) <= 1:
            continue
        nouns.append(word)
    return nouns

In [None]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the advantage reviews, limited to 1000 features
# and tokenized by my_tokenizer.
# token_pattern=None: the custom tokenizer replaces the default regex-based
# tokenization, and leaving the default pattern in place makes scikit-learn
# emit a "token_pattern will not be used" UserWarning on fit.
cv = CountVectorizer(max_features=1000,
                     tokenizer=my_tokenizer,
                     token_pattern=None)
review_cv = cv.fit_transform(df['adv'])

In [None]:
# Show the first 100 feature names of the fitted vectorizer.
print(cv.get_feature_names_out()[:100])

In [None]:
# LDA
from sklearn.decomposition import LatentDirichletAllocation
np.set_printoptions(precision=3)  # print arrays with 3 decimal places

# First-pass model: 10 topics, 5 iterations, online learning.
lda = LatentDirichletAllocation(n_components=10,
                                max_iter=5,
                                topic_word_prior=0.1, doc_topic_prior=1.0,
                                learning_method='online',
                                n_jobs=-1,
                                random_state=0)

# Fit on the count matrix and get the per-document topic weights.
review_topics = lda.fit_transform(review_cv)
print('#Shape of review_topics:', review_topics.shape)
print('#Sample of review_topics:', review_topics[0])

# Average the topic weights over axis 0 (one value per column of
# review_topics). The label previously said "Sum", but the value is a mean.
gross_topic_weights = np.mean(review_topics, axis=0)
print('#Mean of topic weights of documents:', gross_topic_weights)
print('#shape of topic word distribution:', lda.components_.shape)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` one line is printed in the form
    ``Topic #<index> <word>, <word>, ...`` listing ``n_top_words`` entries of
    ``feature_names`` ordered from the largest weight downwards. A single
    blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort, reversed, then truncated: indices of the
        # largest weights first.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[pos] for pos in ranked]
        print("Topic #%d " % topic_idx + ", ".join(top_words))
    print()
# List the 10 top-weighted words of each topic of the fitted model.
print_top_words(lda, cv.get_feature_names_out(),10)

In [None]:
_import matplotlib.pyplot as plt
%matplotlib inline

def show_perplexity(cv, start=10, end = 30, max_iter=5, topic_word_prior=0.1,
                    doc_topic_prior=1.0, random_state=7):
    """Fit one LDA model per topic count and return the count with the lowest perplexity.

    For every ``n_components`` in ``start..end`` (both inclusive) a
    ``LatentDirichletAllocation`` model is fitted with batch learning, its
    perplexity is printed, and finally perplexity is plotted against the
    number of topics.

    Note that perplexity is evaluated on the same matrix the model was
    fitted on, so it is a training-set score, not a held-out one.

    Parameters
    ----------
    cv
        The matrix handed to ``fit`` and ``perplexity``. Despite the name,
        this is the vectorized data (the caller passes ``review_cv``), not
        the ``CountVectorizer`` object.
    start, end : int
        Inclusive bounds of the ``n_components`` values to try.
    max_iter, topic_word_prior, doc_topic_prior
        Forwarded unchanged to ``LatentDirichletAllocation``.
    random_state
        Seed forwarded to ``LatentDirichletAllocation``; defaults to the
        previously hard-coded value 7.

    Returns
    -------
    int
        The ``n_components`` value with the smallest perplexity (the first
        one if several tie).

    Raises
    ------
    ValueError
        If ``start > end``, i.e. there is nothing to evaluate.
    """
    # Fail early with a clear message instead of drawing an empty plot and
    # then failing on min() of an empty list.
    if start > end:
        raise ValueError(
            f'start ({start}) must not be greater than end ({end})'
        )

    topic_counts = []
    per_value = []

    for n_topics in range(start, end + 1):
        model = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                          topic_word_prior=topic_word_prior,
                                          doc_topic_prior=doc_topic_prior,
                                          learning_method='batch', n_jobs=-1,
                                          random_state=random_state)
        model.fit(cv)
        topic_counts.append(n_topics)
        pv = model.perplexity(cv)
        per_value.append(pv)
        print(f'n_components: {n_topics}, perplexity: {pv:0.3f}')

    # Labelled figure so the plot can be read without the surrounding code.
    fig, ax = plt.subplots()
    ax.plot(topic_counts, per_value, 'g-')
    ax.set_xlabel('n_components')
    ax.set_ylabel('perplexity')
    ax.set_title('LDA perplexity by number of topics')
    plt.show()

    # per_value[k] belongs to n_components == start + k.
    return start + per_value.index(min(per_value))

# Scan 5..15 topics on the review count matrix and report the topic count
# with the lowest perplexity.
print("n_components with minimum perplexity: ",
      show_perplexity(review_cv, start=5, end=15))


In [None]:
# Refit LDA with 6 topics, 20 iterations and batch learning.
lda = LatentDirichletAllocation(
    n_components=6,
    max_iter=20,
    learning_method='batch',
    topic_word_prior=0.1,
    doc_topic_prior=1.0,
    random_state=7,
    n_jobs=-1,
)

# Per-document topic weights from the refit model; this rebinds the
# review_topics name assigned by the earlier 10-topic fit.
review_topics = lda.fit_transform(review_cv)

# List the 4 top-weighted words of each topic.
print_top_words(lda, cv.get_feature_names_out(), 4)