In [1]:
import pandas as pd
import pymysql
import urllib.request
import nltk


In [3]:
from konlpy.tag import Okt
from nltk.corpus import stopwords


In [4]:
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim import corpora


In [6]:
import gensim
import pyLDAvis.gensim_models
import pyLDAvis.gensim_models as gensimvis

In [7]:

DB_HOST = 'localhost'
DB_USER = 'pyuser'
DB_PASSWD = 'pyuser'
DB_NAME = 'pyuser'

conn = pymysql.connect(host=DB_HOST, user=DB_USER, password=DB_PASSWD,
                   db=DB_NAME, charset='utf8')
curs = conn.cursor()
sql = """
    select * from yhnews
    where newsDate between '20190101' and '20191231'
    union
    select * from newsis
    where newsDate between '20190101' and '20191231'
    union
    select * from sbsnews
    where newsDate between '20190101' and '20191231';

"""
curs.execute(sql)
table_data = curs.fetchall() 
conn.close()

In [10]:

df = pd.DataFrame(table_data,columns=['no','url','Date','Title','subtitle','Contents'])

# df [ 'Contents'   ]  <= Series,   df[ ['Contents'] ]  <== dataframe
text = df[ ['Contents'] ]
# print(type(text))

okt=Okt()

tokenized_doc = text.apply(lambda row: okt.nouns(row['Contents']), axis=1)
# 길이가 1이하인 단어는 제거 (길이가 짧은 단어 제거)
tokenized_doc = tokenized_doc.apply(lambda x: [word for word in x if len(word) > 1])

# 제거할 단어들 기재
stop = ['연합뉴스','무단']
tokenized_doc = tokenized_doc.apply(lambda x: [word for word in x if word not in (stop)])

# print(tokenized_doc)



In [17]:


detokenized_doc = []

for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['Contents'] = detokenized_doc

vectorizer = TfidfVectorizer(max_features= 1000)    # 상위 1,000개의 단어를 보존 
X = vectorizer.fit_transform(text['Contents'])

lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=777,max_iter=1)
lda_top=lda_model.fit_transform(X)

terms = vectorizer.get_feature_names() 

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_,terms)

dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]





Topic 1: [('이륜차', 3.04), ('단속', 2.36), ('위반', 1.73), ('안전', 1.68), ('광주', 1.5)]
Topic 2: [('이륜차', 1.75), ('배달', 1.58), ('종사', 1.41), ('단속', 1.36), ('경찰', 1.28)]
Topic 3: [('배달', 1.45), ('사망자', 1.36), ('단속', 1.32), ('위반', 1.25), ('오토바이', 1.16)]
Topic 4: [('이륜차', 1.01), ('태국', 1.0), ('오토바이', 0.98), ('모두', 0.97), ('상태', 0.96)]
Topic 5: [('사망자', 1.81), ('단속', 1.72), ('이륜차', 1.69), ('교통사고', 1.5), ('감소', 1.49)]
Topic 6: [('단속', 1.63), ('이륜차', 1.58), ('사망자', 1.55), ('교통사고', 1.52), ('교통', 1.49)]
Topic 7: [('경찰', 1.25), ('위반', 1.21), ('사망자', 1.18), ('사고', 1.1), ('이륜차', 1.1)]
Topic 8: [('사고', 1.22), ('오토바이', 1.12), ('학생', 1.1), ('단속', 1.07), ('보행자', 1.07)]
Topic 9: [('사고', 1.19), ('과실', 1.19), ('사망', 1.08), ('횡단보도', 1.06), ('기준', 1.03)]
Topic 10: [('오토바이', 1.46), ('일보', 1.26), ('운전자', 1.18), ('이륜차', 1.12), ('통행', 1.1)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['Contents'] = detokenized_doc


In [18]:
NUM_TOPICS = 6 # n개의 토픽, k=n

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)   # TOPIC별 단어를 4개만
for topic in topics:
    print(topic)


(0, '0.025*"이륜차" + 0.014*"사고" + 0.009*"신고" + 0.008*"사용"')
(1, '0.012*"사고" + 0.008*"장애인" + 0.007*"연합뉴스" + 0.006*"후보"')
(2, '0.020*"티스" + 0.018*"주니어" + 0.012*"오토바이" + 0.010*"샌디에이고"')
(3, '0.027*"이륜차" + 0.023*"사고" + 0.020*"단속" + 0.019*"배달"')
(4, '0.030*"사고" + 0.026*"오토바이" + 0.014*"연합뉴스" + 0.012*"병원"')
(5, '0.007*"시위" + 0.007*"사고" + 0.006*"교통" + 0.006*"안전"')


In [14]:

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)

# https://lovit.github.io/nlp/2018/09/27/pyldavis_lda/
# 시각화는 1- 부터 시작함
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(
