In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Gensim
from gensim.corpora.dictionary import Dictionary

In [None]:
# Categories: names of the review datasets, kept as a bare string literal
# (as the last expression of the cell it is echoed as the cell output).
# NOTE(review): each name presumably matches a '<name>.csv' file — confirm.
# NOTE(review): some names end in '_review' and others in '_reviews'; verify
# them against the actual file names before using them to build paths.
'''
bank_financial_business_review
construction_review
distribution_trade_transport_reviews
education_review
manufacture_chemistry_review
media_design_reviews
organization_association_reviews
medical_medicine_welfare_reviews
service_reviews
'''

IMPORT DATA

In [None]:
# Folder holding the per-category review CSV files.
# NOTE(review): machine-specific absolute path — point it at your own copy of the data.
DATA_DIR = '/Users/myeongseop.kim/Desktop/SCAISCO/csv files'
# Category to analyse; the file read is '<DATA_DIR>/<CATEGORY>.csv'.
CATEGORY = 'medical_medicine_welfare_reviews'

# Keep the untouched frame so this cell can be re-run without losing the raw data.
df_raw = pd.read_csv(f'{DATA_DIR}/{CATEGORY}.csv')
# Remove rows with missing values, then renumber the rows 0..n-1 so that
# positional indices (e.g. from enumerate) match df's index labels.
df = df_raw.dropna().reset_index(drop=True)
df.head()

In [None]:
# # only for IT data
# it1 = pd.read_csv('/Users/myeongseop.kim/Desktop/SCAISCO/csv files/it_web_communication_reviews_1_100.csv')
# it2 = pd.read_csv('/Users/myeongseop.kim/Desktop/SCAISCO/csv files/it_web_communication_reviews_101_200.csv')
# it3 = pd.read_csv('/Users/myeongseop.kim/Desktop/SCAISCO/csv files/it_web_communication_reviews_201_300.csv')
# it4 = pd.read_csv('/Users/myeongseop.kim/Desktop/SCAISCO/csv files/it_web_communication_reviews_300_end.csv')

# df= pd.concat([it1,it2,it3,it4],axis=0)
# df.index= [i for i in range(563)]
# # Remove missing values
# df = df.dropna()
# adv = df['adv'] # Advantage review
# dadv = df['dadv'] # Disadvantage review
# df.head()

TOKENIZING

In [None]:
# Word Count
from konlpy.tag import Okt

# Shared Okt tagger instance used by my_tokenizer.
t = Okt()

def my_tokenizer(doc):
    """Return the noun tokens of ``doc`` that are longer than one character.

    ``doc`` is handed to ``t.pos``; each (token, tag) pair it yields is kept
    only when the tag is 'Noun' and the token has more than one character.
    Tokens are returned in the order ``t.pos`` produced them.
    """
    nouns = []
    for word, tag in t.pos(doc):
        if tag != 'Noun':
            continue
        if len(word) <= 1:
            continue
        nouns.append(word)
    return nouns

In [None]:
# Run my_tokenizer over every value of the 'adv' and 'dadv' columns,
# producing one token list per review.
text_adv = list(map(my_tokenizer, df.adv))
text_dadv = list(map(my_tokenizer, df.dadv))

In [None]:
# Vocabulary pruning thresholds, shared by the adv and dadv dictionaries.
KEEP_N = 2000    # keep at most this many of the most frequent remaining tokens
NO_BELOW = 10    # drop tokens found in fewer than this many documents
NO_ABOVE = 0.5   # drop tokens found in more than this fraction of documents

# One token <-> id mapping per review type.
dictionary_adv = Dictionary(text_adv)
dictionary_dadv = Dictionary(text_dadv)
print('#Number of initial unique words in adv_documents:', len(dictionary_adv))
print('#Number of initial unique words in dadv_documents:', len(dictionary_dadv))

# filter_extremes prunes each dictionary in place.
dictionary_adv.filter_extremes(keep_n = KEEP_N, no_below = NO_BELOW, no_above = NO_ABOVE)
dictionary_dadv.filter_extremes(keep_n = KEEP_N, no_below = NO_BELOW, no_above = NO_ABOVE)
print("#Number of unique words after removing rare and common words in adv_documents:", len(dictionary_adv))
print("#Number of unique words after removing rare and common words in dadv_documents:", len(dictionary_dadv))

# Bag-of-words vectors; tokens no longer in the pruned dictionary are ignored by doc2bow.
corpus_adv = [dictionary_adv.doc2bow(text) for text in text_adv]
corpus_dadv = [dictionary_dadv.doc2bow(text) for text in text_dadv]
print('#Number of unique tokens in adv_documents: %d' % len(dictionary_adv))
print('#Number of documents in adv_documents: %d' % len(corpus_adv))

print('#Number of unique tokens in dadv_documents: %d' % len(dictionary_dadv))
print('#Number of documents in dadv_documents: %d' % len(corpus_dadv))

LDA MODEL

In [None]:
from gensim.models import LdaModel

# Training settings; `passes` is reused later when the dadv model is fitted.
passes = 5
num_topics_adv = 3

# LDA model for the adv corpus, with a fixed random_state for repeatable runs.
model_adv = LdaModel(
    corpus=corpus_adv,
    id2word=dictionary_adv,
    num_topics=num_topics_adv,
    passes=passes,
    random_state=7,
)

In [None]:
# LDA model for the dadv corpus; uses the `passes` value set in the adv model cell.
num_topics_dadv = 3
model_dadv = LdaModel(
    corpus=corpus_dadv,
    id2word=dictionary_dadv,
    num_topics=num_topics_dadv,
    passes=passes,
    random_state=7,
)

In [None]:
# Print a header followed by each model's topics (10 words per topic),
# adv model first, then dadv.
for header, lda in (('ADV', model_adv), ('\n DADV', model_dadv)):
    print(header)
    print(lda.print_topics(num_words=10))
# print('#topic distribution of the first document:',model.get_document_topics(corpus)[0])

TOPIC MODELING COMPARISON (based on the number of topics)

In [None]:
# # topic_modeling csv file
# num_topic_list = [8, 11, 15]
# save = {}
# for i in num_topic_list:
#     model = LdaModel(corpus = corpus, id2word = dictionary,passes = passes, num_topics = i,random_state = 7)
#     save[i] = model.print_topics(num_words=10)
# topic_csv = pd.DataFrame.from_dict(save, orient='index')
# topic_csv.to_csv("it_topic_selection.csv")

TOPIC MODELING OUTPUT

In [None]:
# Result table for the adv model: the 'cmp' column of df plus one
# zero-filled float column per topic, named '0', '1', ...
output_df_adv = pd.DataFrame({'cmp': df.cmp})
for topic_id in range(num_topics_adv):
    output_df_adv[str(topic_id)] = 0.0
# Relabel the rows 0..n-1 so they can be addressed by position.
output_df_adv.index = list(range(len(output_df_adv)))
output_df_adv

In [None]:
# Result table for the dadv model: the 'cmp' column of df plus one
# zero-filled float column per topic, named '0', '1', ...
output_df_dadv = pd.DataFrame({'cmp': df.cmp})
for topic_id in range(num_topics_dadv):
    output_df_dadv[str(topic_id)] = 0.0
# Relabel the rows 0..n-1 so they can be addressed by position.
output_df_dadv.index = list(range(len(output_df_dadv)))
output_df_dadv

In [None]:
# Copy each document's topic probabilities into its row of output_df_adv.
# minimum_probability=0.0 lowers gensim's cut-off as far as it allows; with the
# model's default cut-off, low-probability topics are left out of the result
# and their cells would silently keep the initial 0.0.
doc_topics_adv = model_adv.get_document_topics(corpus_adv, minimum_probability=0.0)
for i, dt in enumerate(doc_topics_adv):
    for topic_id, prob in dt:
        # Topic columns are named by the topic id as a string ('0', '1', ...).
        output_df_adv.at[i, str(topic_id)] = prob

In [None]:
# Copy each document's topic probabilities into its row of output_df_dadv.
# minimum_probability=0.0 lowers gensim's cut-off as far as it allows; with the
# model's default cut-off, low-probability topics are left out of the result
# and their cells would silently keep the initial 0.0.
doc_topics_dadv = model_dadv.get_document_topics(corpus_dadv, minimum_probability=0.0)
for i, dt in enumerate(doc_topics_dadv):
    for topic_id, prob in dt:
        # Topic columns are named by the topic id as a string ('0', '1', ...).
        output_df_dadv.at[i, str(topic_id)] = prob

In [None]:
# to csv file
# output_df.to_csv('construction_topic_modeling.csv')

In [None]:
from gensim.models import CoherenceModel

# u_mass coherence of each fitted model, scored against its own corpus.
cm_adv = CoherenceModel(model=model_adv, corpus=corpus_adv, coherence='u_mass')
coherence_adv = cm_adv.get_coherence()

cm_dadv = CoherenceModel(model=model_dadv, corpus=corpus_dadv, coherence='u_mass')
coherence_dadv = cm_dadv.get_coherence()

# adv score first, then dadv.
print(coherence_adv)
print(coherence_dadv)

In [None]:
def show_coherence(corpus, dictionary, start = 6, end = 15, passes = 1):
    """Fit one LDA model per topic count and report perplexity and coherence.

    For every ``num_topics`` from ``start`` to ``end`` (inclusive) an
    ``LdaModel`` is trained on ``corpus``; its perplexity and u_mass coherence
    on that same corpus are printed, and afterwards two line charts are shown
    (perplexity vs. num_topics, coherence vs. num_topics).

    Args:
        corpus: Bag-of-words corpus used both to train and to score each model.
        dictionary: id2word mapping passed to ``LdaModel``.
        start: Smallest number of topics to try.
        end: Largest number of topics to try (inclusive).
        passes: Training passes per model. The default of 1 is gensim's own
            default; pass the value used for the final models to compare
            like with like.

    Returns:
        None. Results are printed and plotted.
    """
    topic_counts = []
    perplexities = []
    coherences = []

    for n_topics in range(start, end+1):
        model = LdaModel(corpus = corpus, id2word = dictionary, chunksize=1000, passes = passes, num_topics = n_topics, random_state = 7)
        topic_counts.append(n_topics)

        # log_perplexity returns the per-word likelihood bound (higher is
        # better), not the perplexity itself. Perplexity is 2 ** (-bound),
        # so lower values on the "perplexity" chart are better.
        bound = model.log_perplexity(corpus)
        perplexity = 2 ** (-bound)
        perplexities.append(perplexity)

        cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
        coherence = cm.get_coherence()
        coherences.append(coherence)
        print(f'num_topics: {n_topics}, perplexity: {perplexity:0.3f}, coherence: {coherence:0.3f}')

    plt.plot(topic_counts, perplexities, 'g-')
    plt.xlabel("num_topics")
    plt.ylabel("perplexity")
    plt.show()

    plt.plot(topic_counts, coherences, 'r--')
    plt.xlabel("num_topics")
    plt.ylabel("coherence")
    plt.show()
# 5, 14
# NOTE(review): the meaning of "5, 14" is not evident from this cell —
# presumably topic counts noted from an earlier run; confirm or remove.
# Sweep num_topics from 3 to 30 (inclusive) on the adv corpus.
show_coherence(corpus_adv, dictionary_adv, start = 3, end = 30)

In [None]:
# Sweep num_topics from 3 to 30 (inclusive) on the dadv corpus.
show_coherence(corpus_dadv, dictionary_dadv, start = 3, end = 30)
