In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Gensim
from gensim.corpora.dictionary import Dictionary

IMPORT DATA

In [3]:
# Read bow_df.csv and discard every row that contains a missing value.
df = pd.read_csv('bow_df.csv').dropna()

# Write the company names to their own CSV file.
company_name = df['company_name']
company_name.to_csv("company_name.csv")

# Everything except the company name, the two text columns and the
# leftover 'Unnamed: 0' column goes to a second CSV file.
other_var = df.drop(columns=['company_name', 'adv', 'dadv', 'Unnamed: 0'])
other_var.to_csv("other_var.csv")

TOKENIZING

In [4]:
# Tokenizer setup: KoNLPy's Okt part-of-speech tagger.
from konlpy.tag import Okt

# Module-level tagger instance; my_tokenizer calls t.pos() on each document.
t = Okt()

def my_tokenizer(doc):
    """Tokenize `doc` and keep only the content words.

    Runs the module-level tagger `t` over `doc` and returns, in their
    original order, the tokens tagged 'Noun', 'Verb' or 'Adjective' that
    are longer than one character.
    """
    kept = []
    for word, tag in t.pos(doc):
        # Drop every other part of speech.
        if tag not in ('Noun', 'Verb', 'Adjective'):
            continue
        # Single-character tokens are discarded as well.
        if len(word) > 1:
            kept.append(word)
    return kept

In [6]:
# Tokenize every document of the 'adv' and 'dadv' columns
# (one token list per row).
text_adv = list(map(my_tokenizer, df['adv']))
text_dadv = list(map(my_tokenizer, df['dadv']))

In [7]:
# Build one gensim vocabulary per tokenized column.
dictionary_adv, dictionary_dadv = Dictionary(text_adv), Dictionary(text_dadv)
print('#Number of initial unique words in adv_documents:', len(dictionary_adv))
print('#Number of initial unique words in dadv_documents:', len(dictionary_dadv))

# Prune both vocabularies with identical settings: drop tokens found in
# fewer than 10 documents or in more than 50% of them, then keep at most
# the 2000 most frequent of the remainder.
for vocab in (dictionary_adv, dictionary_dadv):
    vocab.filter_extremes(keep_n=2000, no_below=10, no_above=0.5)

# Reported in the same order as above: adv first, then dadv.
for vocab in (dictionary_adv, dictionary_dadv):
    print("#Number of unique words after removing rae and common words:", len(vocab))

# Convert every token list into its bag-of-words representation.
corpus_adv = [dictionary_adv.doc2bow(tokens) for tokens in text_adv]
corpus_dadv = [dictionary_dadv.doc2bow(tokens) for tokens in text_dadv]

# Final vocabulary size and document count, adv first, then dadv.
for vocab, bow in ((dictionary_adv, corpus_adv), (dictionary_dadv, corpus_dadv)):
    print('#Number of unique tokens: %d' % len(vocab))
    print('#Number of documents: %d' % len(bow))


#Number of initial unique words in adv_documents: 44973
#Number of initial unique words in dadv_documents: 75796
#Number of unique words after removing rae and common words: 2000
#Number of unique words after removing rae and common words: 2000
#Number of unique tokens: 2000
#Number of documents: 2503
#Number of unique tokens: 2000
#Number of documents: 2503


In [None]:
# Disabled: would pickle the adv bag-of-words corpus to corpus_adv.pkl.
# import pickle
# with open('corpus_adv.pkl', 'wb') as lf:
#     pickle.dump(corpus_adv, lf)

In [16]:
# Disabled: would pickle the dadv bag-of-words corpus to corpus_dadv.pkl.
# import pickle
# with open('corpus_dadv.pkl', 'wb') as lf:
#     pickle.dump(corpus_dadv, lf)

LDA MODEL

In [8]:
from gensim.models import LdaModel

# Training settings for the adv topic model. `passes` is also read by the
# cell that trains the dadv model.
passes = 5
num_topics_adv = 6

# Fit the LDA model on the adv corpus with a fixed seed, then save it to disk.
model_adv = LdaModel(
    corpus=corpus_adv,
    id2word=dictionary_adv,
    num_topics=num_topics_adv,
    passes=passes,
    random_state=7,
)
model_adv.save("tp_adv_model")

In [9]:
# Fit the LDA model on the dadv corpus (7 topics, same `passes` and seed
# as the adv model), then save it to disk.
num_topics_dadv = 7
model_dadv = LdaModel(
    corpus=corpus_dadv,
    id2word=dictionary_dadv,
    num_topics=num_topics_dadv,
    passes=passes,
    random_state=7,
)
model_dadv.save("tp_dadv_model")

In [10]:
# Show the ten highest-weighted words of every topic, adv model first.
for label, lda_model in (('ADV', model_adv), ('\n DADV', model_dadv)):
    print(label)
    print(lda_model.print_topics(num_words=10))

# Disabled: `model` and `corpus` are not defined in the visible cells, so a
# concrete model/corpus pair would have to be substituted before enabling.
# print('#topic distribution of the first document:',model.get_document_topics(corpus)[0])

ADV
[(0, '0.011*"안정" + 0.009*"공공기관" + 0.009*"서울" + 0.008*"부바" + 0.007*"유연근무제" + 0.007*"육아휴직" + 0.006*"정년" + 0.006*"강도" + 0.006*"높은" + 0.005*"공기업"'), (1, '0.013*"점심" + 0.009*"저녁" + 0.009*"버스" + 0.009*"기숙사" + 0.008*"통근" + 0.008*"높은" + 0.008*"식당" + 0.007*"맛있음" + 0.006*"수당" + 0.006*"아침"'), (2, '0.013*"데이" + 0.013*"금요일" + 0.012*"높은" + 0.008*"포인트" + 0.008*"제도" + 0.008*"리프" + 0.007*"패밀리" + 0.007*"업계" + 0.006*"대비" + 0.006*"여름"'), (3, '0.013*"재택근무" + 0.008*"재택" + 0.008*"대기업" + 0.007*"동료" + 0.007*"포인트" + 0.007*"기회" + 0.006*"업계" + 0.006*"사내" + 0.005*"개인" + 0.005*"교육"'), (4, '0.008*"수당" + 0.006*"강도" + 0.006*"대기업" + 0.005*"따라" + 0.005*"명절" + 0.005*"때문" + 0.005*"나옴" + 0.005*"지급" + 0.005*"사업" + 0.005*"없고"'), (5, '0.012*"카페" + 0.012*"커피" + 0.011*"건물" + 0.010*"점심" + 0.009*"간식" + 0.009*"할인" + 0.009*"사내" + 0.009*"식대" + 0.008*"사옥" + 0.007*"사무실"')]

 DADV
[(0, '0.010*"생산" + 0.009*"진급" + 0.006*"군대" + 0.005*"사원" + 0.005*"공장" + 0.005*"수직" + 0.005*"보수" + 0.005*"출근" + 0.005*"꼰대" + 0.005*"승진"'), (1, '0.026*"계약" 

In [25]:
import re

def extract_korean_words(input_string):
    """Return every run of Hangul syllables found in `input_string`.

    Matches are returned as a list in order of appearance; characters
    outside the range '가'-'힣' act as separators. The list is empty when
    nothing matches.
    """
    return re.findall("[가-힣]+", input_string)

In [34]:
# Collect the distinct Korean words that appear among the 50
# highest-weighted terms of every topic, separately for each model.
adv_topics = model_adv.print_topics(num_words=50)
dadv_topics = model_dadv.print_topics(num_words=50)


def _unique_topic_words(topics):
    """Return the sorted, de-duplicated Korean words found in `topics`.

    `topics` is the list returned by `print_topics`: (topic_id, terms)
    pairs where `terms` is a formatted string such as
    '0.011*"word" + 0.009*"other"'. Only the `terms` string is scanned.

    The result is sorted because a plain `list(set(...))` orders strings
    differently on every interpreter run (string hashing is randomized),
    which would make the saved word lists non-reproducible.
    """
    return sorted({
        word
        for _topic_id, terms in topics
        for word in extract_korean_words(terms)
    })


adv_words = _unique_topic_words(adv_topics)
dadv_words = _unique_topic_words(dadv_topics)

167


In [35]:
import pickle

def save_to_pickle(data, filename):
    """Serialize `data` with pickle into the file at `filename`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(filename, 'wb') as handle:
        pickle.dump(data, handle)

# Target pickle files for the two word lists.
adv_filename, dadv_filename = "adv_word_list.pkl", "dadv_word_list.pkl"

# Write each word list to its own pickle file (adv first, then dadv).
for word_list, target in ((adv_words, adv_filename), (dadv_words, dadv_filename)):
    save_to_pickle(word_list, target)