# LDA

- 의미
> LDA(잠재 디리클레 할당)은 토픽 모델링 기법 중 하나임
> 여러 문서들은 토픽들의 혼합으로 구성되어 있으며 이는 확률 분포에 기반하여 토픽들이 단어들을 생성한다고 가정함
> 따라서, 문서들을 투입하면 그 과정을 추적하여 토픽과 그 내부의 단어들을 추정함


- 계산 과정

> 1) 문서 내 단어 정의(텍스트 전처리)
>> - 문장에 존재하는 주제 단어 후보들을 추출함
>> ex)
>> - [1] 마우스 패드 삽니다 -> [1] 마우스, 패드
>> - [2] 마우스가 고장이 났네요. 수리 요청 -> [2] 마우스, 고장, 수리, 요청
>> - [3] 패드에서 냄새가 나요. -> [3] 패드, 냄새

> - 2) 임의의 토픽 번호 부여
>> 1)에서 추출된 단어들을 열 이름으로 사전 정의된 k개의 토픽을 랜덤으로 부여함

> 3) 토픽-문서 단어 분포 계산
>> 이 단계에서는 각 토픽이 각 문서에 포함될 확률을 계산함
>> 토픽-문서 행렬이 생성되며, 각 문서에 각 토픽에 해당하는 단어가 몇 개 있는지 계산됨
>> 사전 정의된 알파 값을 최종적으로 더해주어 각 셀에 0이 되는 값을 방지함

> 4) 토픽-단어 분포 계산
>> 이 단계에서는 각 단어가 각 토픽에 포함될 확률을 계산함
>> 토픽-단어 행렬이 생성되며, 각 단어가 각 토픽에 얼마나 포함되어 있는지 분포를 계산함
>> 사전 정의된 베타 값을 최종적으로 더해주어 각 셀에 0이 되는 값을 방지함

> 5) 토픽 재선정
>> 1)에서 추출된 문서-단어 쌍들을 대상으로 토픽을 재선정하며 재선정하는 과정은 3)문서 내 토픽이 존재할 확률과 4)토픽 내 단어가 존재할 확률을 곱하여 가장 높은 확률을 가진 주제를 해당 문서-단어의 주제로 업데이트함
>> 3)-4)-5) 과정을 설정한 iteration만큼 반복하여 수렴한 토픽을 최종 결과 값으로 도출함

In [1]:
from konlpy.tag import Kkma
kkma = Kkma()

# Kkma 모듈을 통한 명사 추출 함수
def extract_word(sentence):
    lst = kkma.nouns(sentence)
    return lst

In [8]:
import pandas as pd

# 문서 내 주제 단어 후보 정의
def word_box(df):
    wb = []
    cb = []
    wwb = []
    for i in df.iloc[:,0]:
        tmp = extract_word(i)
        cb.append(len(tmp))
        wwb.append(tmp)
        for j in tmp:
            wb.append(j)
        
    return wwb, wb, cb

tmp_df = pd.DataFrame(["마우스 패드 삽니다","마우스가 고장이 났네요 수리 요청","패드에서 냄새가 나요"])
wwb, wb, cb = word_box(tmp_df)
wwb, wb, cb

([['마우스', '패드'], ['마우스', '고장', '수리', '요청'], ['패드', '냄새']],
 ['마우스', '패드', '마우스', '고장', '수리', '요청', '패드', '냄새'],
 [2, 4, 2])

In [39]:
import random
k = 3
# 토픽 임의 할당
def random_topic(wb, k):
    topic_lst = []
    for i in range(len(wb)):
        topic_lst.append(random.randrange(1,k+1))
    a = pd.DataFrame(topic_lst)
    a = a.T
    a.columns = wb
    a.index = ["Topic"]
    return a, topic_lst

randomtopic_df, topic_lst = random_topic(wb, k)
randomtopic_df

Unnamed: 0,마우스,패드,마우스.1,고장,수리,요청,패드.1,냄새
Topic,1,3,2,1,3,3,2,1


In [42]:
# 토픽-문서 단어 분포 계산

def extract_topic_lst(wb, cb, topic_lst):
    lst = []      # 문서 별 토픽 리스트
    tmp = []      # 각 문서 별 토픽 리스트
    a = 0         # 각 문서 내 단어 갯수
    c = 0         # 현재 단어 위치
    time = cb[a]  # 문서당 단어 개수 컷팅

    for i in range(len(wb)):
        if c == time:
            a += 1
            time += cb[a]
            lst.append(tmp)
            tmp = []
        tmp.append(topic_lst[i])
        c+=1
    lst.append(tmp)
    return lst

etl = extract_topic_lst(wb, cb, topic_lst)
etl

[[1, 3], [2, 1, 3, 3], [2, 1]]

In [91]:
# 토픽-문서 단어 분포 데이터 프레임 추출
def topic_docs_matrix(etl, k, alpha):
    lst = []
    for j in range(k):
        c = []
        for i in etl:
            c.append(i.count(j+1))
        lst.append(c)
    return pd.DataFrame(lst, columns=[i for i in range(1,k+1)])+alpha
    
tdm = topic_docs_matrix(etl, k, alpha)
tdm

Unnamed: 0,1,2,3
0,1.01,1.01,1.01
1,0.01,1.01,1.01
2,1.01,2.01,0.01


In [94]:
# 토픽-단어 분포 계산

def topic_word_dist(wb, topic_lst, beta):
    lst = []
    words = list(set(wb))
    tmp = pd.DataFrame(zip(wb,topic_lst))
    for i in range(k):
        a = []
        for n, j in enumerate(tmp.iloc[:, 1]):
            if j == i+1:
                a.append(tmp.iloc[n, 0])
        lst.append(a)

    word_count = []
    for i in lst:
        c = []
        for j in range(len(words)):
            c.append(i.count(words[j]))
        word_count.append(c)
    return words, pd.DataFrame(word_count, columns=words)+beta

words, twd = topic_word_dist(wb, topic_lst, 0.01)
print(words)
twd

['패드', '고장', '마우스', '냄새', '요청', '수리']


Unnamed: 0,패드,고장,마우스,냄새,요청,수리
0,0.01,1.01,1.01,1.01,0.01,0.01
1,1.01,0.01,1.01,0.01,0.01,0.01
2,1.01,0.01,0.01,0.01,1.01,1.01


In [5]:
# 토픽 번호 업데이트
# 토픽 확률 곱 매트릭스 추출

def topic_prob_matrix(cb, randomtopic_df, k):
    k=3
    c=0
    total = []
    for i in cb:
        c+=i
        total.append(c)

    dtdd=[]
    for i in range(k):
        k_lst=[]
        cc=0
        for j in range(len(randomtopic_df.T.iloc[:,0])):
            if total[cc]==j:
                cc+=1
            k_lst.append(tdm.iloc[i,cc]/tdm.iloc[i,].sum())
        dtdd.append(k_lst)
    dtdd = pd.DataFrame(dtdd, columns=wb)

    twdd=[]
    for kk in range(k):
        k_lst=[]
        for i, t in enumerate(randomtopic_df.T.iloc[:,0]):
            tmp_tw = twd[randomtopic_df.columns[i]][kk]/twd.iloc[kk,:].sum()
            k_lst.append(tmp_tw)
        twdd.append(k_lst)
    twdd = pd.DataFrame(twdd, columns=wb)
    new_topic_dist_df = twdd * dtdd
    
    return new_topic_dist_df
        
new_topic_df = topic_prob_matrix(cb, randomtopic_df, k)
new_topic_df

Unnamed: 0,마우스,패드,마우스.1,고장,수리,요청,패드.1,냄새,우리집,마우스.2,...,피스,냄새.1,나,우리,집,마우스.3,고장.1,배변,패드.2,어디
0,0.0,0.000155,0.0,0.0,0.031134,0.031134,0.015644,0.031289,0.0,0.0,...,0.0,0.093247,0.046623,0.046623,0.0,0.0,0.0,0.031134,0.031134,0.0
1,0.061224,0.0,0.000606,0.000202,0.0,0.0,0.0,0.0,0.040614,0.121843,...,0.040614,0.0,0.0,0.0,0.0,0.000606,0.040614,0.0,0.0,0.040614
2,0.055464,0.055464,0.110379,0.055189,0.0,0.0,0.055464,0.0,0.0,0.000549,...,0.0,0.0,0.0,0.0,0.055189,0.110379,0.000275,0.0,0.000549,0.0


In [181]:
# 토픽 업데이트
def topic_update(new_topic_df, wb):
    lst = []
    for i in range(len(new_topic_df.iloc[0,:])):
        lst.append(new_topic_df.T.iloc[i,:].idxmax()+1)
    result = pd.DataFrame(lst, index=wb, columns=["Topic"]).T
    
    return result, lst

updated_topic, lst = topic_update(new_topic_df, wb)
updated_topic

Unnamed: 0,마우스,패드,마우스.1,고장,수리,요청,패드.1,냄새
Topic,1,3,2,1,3,3,2,1


In [9]:
# 통합 버젼
randomtopic_df=0
import pandas as pd
import random
from konlpy.tag import Kkma

kkma = Kkma()

def extract_word(sentence):
    lst = kkma.nouns(sentence)
    return lst

def word_box(df):
    wb = []
    cb = []
    wwb = []
    for i in df.iloc[:,0]:
        tmp = extract_word(i)
        cb.append(len(tmp))
        wwb.append(tmp)
        for j in tmp:
            wb.append(j)
        
    return wwb, wb, cb

def random_topic(wb, k):
    topic_lst = []
    for i in range(len(wb)):
        topic_lst.append(random.randrange(1,k+1))
    a = pd.DataFrame(topic_lst)
    a = a.T
    a.columns = wb
    a.index = ["Topic"]
    return a, topic_lst

def extract_topic_lst(wb, cb, topic_lst):
    lst = []      # 문서 별 토픽 리스트
    tmp = []      # 각 문서 별 토픽 리스트
    a = 0         # 각 문서 내 단어 갯수
    c = 0         # 현재 단어 위치
    time = cb[a]  # 문서당 단어 개수 컷팅

    for i in range(len(wb)):
        if c == time:
            a += 1
            time += cb[a]
            lst.append(tmp)
            tmp = []
        tmp.append(topic_lst[i])
        c+=1
    lst.append(tmp)
    
    return lst

def topic_docs_matrix(etl, k, alpha):
    lst = []
    for j in range(k):
        c = []
        for i in etl:
            c.append(i.count(j+1))
        lst.append(c)
    return pd.DataFrame(lst, columns=[i for i in range(1,len(etl)+1)])+alpha

def topic_word_dist(wb, topic_lst, beta):
    lst = []
    words = list(set(wb))
    tmp = pd.DataFrame(zip(wb,topic_lst))
    for i in range(k):
        a = []
        for n, j in enumerate(tmp.iloc[:, 1]):
            if j == i+1:
                a.append(tmp.iloc[n, 0])
        lst.append(a)

    word_count = []
    for i in lst:
        c = []
        for j in range(len(words)):
            c.append(i.count(words[j]))
        word_count.append(c)
    return words, pd.DataFrame(word_count, columns=words)+beta

def topic_prob_matrix(cb, randomtopic_df, k, tdm, wb, twd):
    c=0
    total = []
    for i in cb:
        c+=i
        total.append(c)

    dtdd=[]
    for i in range(k):
        k_lst=[]
        cc=0
        for j in range(len(randomtopic_df.T.iloc[:,0])):
            if total[cc]==j:
                cc+=1
            k_lst.append(tdm.iloc[i,cc]/tdm.iloc[i,].sum())
        dtdd.append(k_lst)
    dtdd = pd.DataFrame(dtdd, columns=wb)

    twdd=[]
    for kk in range(k):
        k_lst=[]
        for i, t in enumerate(randomtopic_df.T.iloc[:,0]):
            tmp_tw = twd[randomtopic_df.columns[i]][kk]/twd.iloc[kk,:].sum()
            k_lst.append(tmp_tw)
        twdd.append(k_lst)
    twdd = pd.DataFrame(twdd, columns=wb)
    new_topic_dist_df = twdd * dtdd
    
    return new_topic_dist_df

def topic_update(new_topic_df, wb):
    lst = []
    for i in range(len(new_topic_df.iloc[0,:])):
        lst.append(new_topic_df.T.iloc[i,:].idxmax()+1)
    result = pd.DataFrame(lst, index=wb, columns=["Topic"]).T
    
    return result, lst

def main(tmp_df, k, it, alpha, beta):
    wwb, wb, cb = word_box(tmp_df)
    randomtopic_df, topic_lst = random_topic(wb, k)
    print(randomtopic_df)
    print(topic_lst)
    for i in range(it):
        etl = extract_topic_lst(wb, cb, topic_lst)
        tdm = topic_docs_matrix(etl, k, alpha)
        words, twd = topic_word_dist(wb, topic_lst, beta)
        new_topic_df = topic_prob_matrix(cb, randomtopic_df, k, tdm, wb, twd)
        updated_topic, u_lst = topic_update(new_topic_df, wb)
        randomtopic_df = updated_topic
        topic_lst = u_lst
        print(topic_lst)
        
    words, twd = topic_word_dist(wb, topic_lst, 0)
    return updated_topic, twd

def topic_dist(topic_word):
    return (topic_word.T/topic_word.T.sum()).T

In [17]:
# tmp_df = pd.DataFrame(["마우스 패드 삽니다","마우스가 고장이 났네요 수리 요청","패드에서 냄새가 나요"])
tmp_df = pd.DataFrame(["마우스 패드 삽니다","마우스가 고장이 났네요 수리 요청","패드에서 냄새가 나요","우리집 마우스 너무 귀여워",
                      "마우스 피스 팔아요","냄새 나는 우리 집 마우스","고장 배변 패드 어디서 사나요"])
k = 3
alpha = 0.01
beta = 0.01
it = 3
final_topic, topic_word = main(tmp_df, k, it, alpha, beta)
topic_dist(topic_word)

       마우스  패드  마우스  고장  수리  요청  패드  냄새  우리집  마우스  ...  피스  냄새  나  우리  집  마우스  \
Topic    1   3    1   3   1   3   1   1    3    2  ...   2   3  2   3  3    2   

       고장  배변  패드  어디  
Topic   3   2   3   1  

[1 rows x 21 columns]
[1, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 1]
[1, 1, 1, 3, 1, 3, 1, 1, 3, 2, 2, 2, 3, 2, 3, 3, 2, 3, 2, 3, 1]
[1, 1, 1, 3, 1, 3, 1, 1, 3, 2, 2, 2, 3, 2, 3, 3, 2, 3, 2, 1, 1]
[1, 1, 1, 3, 1, 3, 1, 1, 3, 2, 2, 2, 3, 2, 3, 3, 2, 3, 2, 1, 1]


Unnamed: 0,수리,어디,고장,우리집,나,피스,패드,우리,요청,집,배변,마우스,냄새
0,0.125,0.125,0.0,0.0,0.0,0.0,0.375,0.0,0.0,0.0,0.0,0.25,0.125
1,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.166667,0.5,0.0
2,0.0,0.0,0.285714,0.142857,0.0,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.0,0.142857


In [18]:
final_topic

Unnamed: 0,마우스,패드,마우스.1,고장,수리,요청,패드.1,냄새,우리집,마우스.2,...,피스,냄새.1,나,우리,집,마우스.3,고장.1,배변,패드.2,어디
Topic,1,1,1,3,1,3,1,1,3,2,...,2,3,2,3,3,2,3,2,1,1
