<a href="https://colab.research.google.com/github/RogerHeederer/NLP_entry/blob/master/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference Source : wikidocs.net 유영준님 자료

스스로 학습하면서 필요한 부분에는 추가적 설명, 소스 코드 삽입 및 수정 등이 있습니다. 영리적 목적이 아닌, 자기 계발 목적으로 정리한 자료입니다.


**LDA(Latent Dirichlet Allocation)**

문서의 집합으로부터 어떤 토픽이 존재하는지 알아내는 알고리즘

LSA : DTM을 차원 축소 하여 축소 차원에서 근접 단어들을 토픽으로 묶는다.

LDA : 단어가 특정 토픽에 존재할 확률과 문서에 특정 토픽이 존재할 확률을 결합확률로 추정하여 토픽을 추출한다.

In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [6]:
# 데이터 전처리 하기

news_df = pd.DataFrame({'document':documents})
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ") #특수문자 제거
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3])) #길이가 3이하 단어 제거
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower()) #소문자로 바꾸기
news_df[1:5]

Unnamed: 0,document,clean_doc
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...


In [8]:
# 전처리 후 토크나이징 하기
#토큰화 수행 후 불용어 제거하기
import nltk
nltk.download('stopwords')
  
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc[0:5]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    [well, sure, about, story, seem, biased, what,...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, that, principle, your, str...
3    [notwithstanding, legitimate, fuss, about, thi...
4    [well, will, have, change, scoring, playoff, p...
Name: clean_doc, dtype: object

In [26]:
tokenized_doc.shape

(11314,)

In [27]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
len(dictionary), dictionary[1]

#딕셔너리는 단어 모음집

(64365, 'acts')

In [29]:
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
len(corpus)

11314

In [31]:
print(corpus[1]) # 두번째 뉴스에 대해 정수 인코딩 적용된 값을 출력

[(0, 1), (2, 1), (20, 1), (60, 2), (66, 1), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)]


In [32]:
print(dictionary[72]) #72번째 your라는 단어가 2번 등장함

your


In [20]:
len(dictionary)

64365

**LDA 모델 트레이닝**

In [33]:
import gensim
NUM_TOPICS = 20 # 20개의 토픽 설정
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS,
                                           id2word = dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
  print(topic)

(0, '0.015*"guns" + 0.011*"crime" + 0.010*"control" + 0.009*"firearms"')
(1, '0.008*"ground" + 0.007*"with" + 0.007*"power" + 0.007*"wire"')
(2, '0.013*"church" + 0.009*"matthew" + 0.009*"father" + 0.008*"holy"')
(3, '0.037*"that" + 0.021*"they" + 0.019*"have" + 0.016*"this"')
(4, '0.021*"file" + 0.013*"output" + 0.012*"entry" + 0.010*"your"')
(5, '0.016*"image" + 0.016*"files" + 0.015*"file" + 0.014*"color"')
(6, '0.025*"game" + 0.023*"team" + 0.017*"games" + 0.016*"play"')
(7, '0.010*"picture" + 0.009*"nist" + 0.007*"sleeve" + 0.007*"ncsl"')
(8, '0.046*"that" + 0.018*"this" + 0.013*"have" + 0.011*"with"')
(9, '0.024*"were" + 0.017*"they" + 0.012*"their" + 0.011*"from"')
(10, '0.025*"will" + 0.010*"this" + 0.008*"with" + 0.008*"that"')
(11, '0.008*"with" + 0.007*"price" + 0.007*"condition" + 0.007*"bike"')
(12, '0.022*"chip" + 0.021*"keys" + 0.018*"clipper" + 0.016*"encryption"')
(13, '0.012*"year" + 0.006*"with" + 0.006*"disease" + 0.006*"health"')
(14, '0.022*"with" + 0.021*"have" +

In [34]:
print(ldamodel.print_topics())

[(0, '0.015*"guns" + 0.011*"crime" + 0.010*"control" + 0.009*"firearms" + 0.009*"weapons" + 0.007*"right" + 0.007*"amendment" + 0.007*"militia" + 0.007*"weapon" + 0.006*"police"'), (1, '0.008*"ground" + 0.007*"with" + 0.007*"power" + 0.007*"wire" + 0.005*"will" + 0.005*"used" + 0.005*"high" + 0.005*"more" + 0.005*"current" + 0.005*"this"'), (2, '0.013*"church" + 0.009*"matthew" + 0.009*"father" + 0.008*"holy" + 0.007*"spirit" + 0.007*"catholic" + 0.007*"john" + 0.006*"books" + 0.006*"greek" + 0.006*"pope"'), (3, '0.037*"that" + 0.021*"they" + 0.019*"have" + 0.016*"this" + 0.014*"with" + 0.011*"there" + 0.011*"about" + 0.010*"just" + 0.009*"what" + 0.009*"would"'), (4, '0.021*"file" + 0.013*"output" + 0.012*"entry" + 0.010*"your" + 0.009*"this" + 0.009*"program" + 0.008*"information" + 0.008*"mail" + 0.007*"name" + 0.007*"send"'), (5, '0.016*"image" + 0.016*"files" + 0.015*"file" + 0.014*"color" + 0.010*"jpeg" + 0.009*"printf" + 0.007*"program" + 0.006*"from" + 0.006*"graphics" + 0.006*

In [36]:
# LDA 시각화
!pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 2.7MB/s 
Collecting funcy
[?25l  Downloading https://files.pythonhosted.org/packages/ce/4b/6ffa76544e46614123de31574ad95758c421aae391a1764921b8a81e1eae/funcy-1.14.tar.gz (548kB)
[K     |████████████████████████████████| 552kB 18.8MB/s 
Building wheels for collected packages: pyLDAvis, funcy
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97712 sha256=312f3abf4eacd56fc2f8bd7955bf8eef3069b1340595202f5e3b97eb14413225
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
  Building wheel for funcy (setup.py) ... [?25l[?25hdone
  Created wheel for funcy: filename=funcy-1.14-py2.py3-none-any.whl size=32042 sha256=2a661c1f

In [38]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [39]:
#문서 별 토픽 분포 보기

for i, topic_list in enumerate(ldamodel[corpus]):
  if i==5:
    break
  print(i, '번째 문서의 topic 비율은', topic_list)

0 번째 문서의 topic 비율은 [(3, 0.07246276), (8, 0.35864282), (9, 0.1520031), (10, 0.07243345), (15, 0.33573693)]
1 번째 문서의 topic 비율은 [(3, 0.667297), (8, 0.31505594)]
2 번째 문서의 topic 비율은 [(0, 0.014932848), (3, 0.29900232), (8, 0.43217325), (9, 0.1622421), (15, 0.08239023)]
3 번째 문서의 topic 비율은 [(3, 0.34730807), (8, 0.0737105), (10, 0.14078356), (11, 0.013769619), (12, 0.15687065), (14, 0.060026977), (15, 0.2004654)]
4 번째 문서의 topic 비율은 [(3, 0.4390119), (4, 0.083430804), (6, 0.45099485)]


In [40]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

In [41]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,8.0,0.3586,"[(3, 0.072467625), (8, 0.35863975), (9, 0.1520..."
1,1,3.0,0.6673,"[(3, 0.6672625), (8, 0.31509045)]"
2,2,8.0,0.4321,"[(0, 0.014932739), (3, 0.29896006), (8, 0.4321..."
3,3,3.0,0.3473,"[(3, 0.34733385), (8, 0.07374732), (10, 0.1407..."
4,4,6.0,0.4511,"[(3, 0.43906674), (4, 0.08328347), (6, 0.45108..."
5,5,8.0,0.3232,"[(2, 0.1812524), (3, 0.26873654), (8, 0.323234..."
6,6,5.0,0.6997,"[(3, 0.09541463), (5, 0.69970363), (11, 0.0140..."
7,7,3.0,0.3513,"[(3, 0.35130394), (8, 0.28281087), (9, 0.08363..."
8,8,3.0,0.5358,"[(0, 0.26496187), (3, 0.5358464), (8, 0.050169..."
9,9,3.0,0.5285,"[(1, 0.15018977), (3, 0.5284928), (9, 0.018649..."
