[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytextbook/pytextbook/blob/main/7.1.%20seoul-120-LDA.ipynb)

## 토픽 모델링
* 실습을 위해 pyLDAvis 설치
* colab사용시 설치 후에도 제대로 동작하지 않거나 오류가 나면 런타임 재실행!

In [2]:
# pyLDAvis 는 scikit-learn 의존성 문제로 해당 실습은 아래 버전으로 진행을 추천
!pip install -q scikit-learn
!pip install -U -q pyLDAvis

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.6 MB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/2.6 MB[0m [31m17.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

## 라이브러리 로드

In [5]:
# 필요 라이브러리를 로드
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 데이터 로드

In [6]:
df = pd.read_csv("/content/drive/MyDrive/claw/민원분류.csv")
df.shape

(10373, 6)

In [7]:
df.head(3)

Unnamed: 0,Name,Value.age,Value.sex,Value.dep_name,UpdatedContent,UpdatedTitle
0,list,20,1,환경건축과,상오정로 **\n건물의 지붕 기와가 떨어지려고 합니다.\n연립주택이라 수리가 어렵습니다,수리요청바랍니다
1,list,NONE,NONE,건설안전과,경기도 부천시 오정구 오정동 ***-*\n안녕하세요 \n여기는. 오정동 우체국 옆 ...,안녕하세요 여기는. 오정동 우체국 옆 상가 주차장 입
2,list,NONE,NONE,도시미관과,( https://www.safetyreport.go.kr/fileDown/singo/,쓰레기기를 낮애버려놓아 더니기불편해요* 안전신문고


In [8]:
# 결측치가 있다면 제거
df = df.dropna()
df.shape

(10338, 6)

In [20]:
filter_criteria = ['도시미관과', '장애인복지과', '건설안전과', '기후에너지과','차량등록과','대중교통과']
df = df[df['Value.dep_name'].isin(filter_criteria)]

## 문서 만들기
* 제목과 내용을 함께 사용

In [21]:
df["문서"] = df["UpdatedTitle"] + " " + df["UpdatedContent"]

## 벡터화

* [Bag-of-words model - Wikipedia](https://en.wikipedia.org/wiki/Bag-of-words_model)


## CountVectorizer

* analyzer : 단어, 문자 단위의 벡터화 방법 정의
* ngram_range : BOW 단위 수 (1, 3) 이라면 1개~3개까지 토큰을 묶어서 벡터화
* max_df : 어휘를 작성할 때 문서 빈도가 주어진 임계값보다 높은 용어(말뭉치 관련 불용어)는 제외 (기본값=1.0)
    * max_df = 0.90 : 문서의 90% 이상에 나타나는 단어 제외
    * max_df = 10 : 10개 이상의 문서에 나타나는 단어 제외
* min_df : 어휘를 작성할 때 문서 빈도가 주어진 임계값보다 낮은 용어는 제외합니다. 컷오프라고도 합니다.(기본값=1.0)
    * min_df = 0.01 : 문서의 1% 미만으로 나타나는 단어 제외
    * min_df = 10 : 문서에 10개 미만으로 나타나는 단어 제외
* stop_words : 불용어 정의
* API Document: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [22]:
# 단어들의 출현 빈도(frequency)로 여러 문서들을 벡터화하기 위해 CountVectorizer 사용
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words=["돋움", "경우", "또는"])

### 참고: fit, transform, fit_transfrom의 차이점
- fit(): 원시 문서에 있는 모든 토큰의 어휘 사전을 배운다
- transform(): 문서를 문서 용어 매트릭스로 변환, transform 이후엔 매트릭스로 변환되어 숫자형태로 변경
- fit_transform(): 어휘 사전을 배우고 문서 용어 매트릭스를 반환, fit 다음에 변환이 오는 것과 동일하지만 더 효율적으로 구현

* API Document: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.fit_transform

In [23]:
# fit_transform을 사용하여 문장에서 노출되는 feature(특징이 될만한 단어) 수를 합한 변수 Document Term Matrix(이하 dtm)를 생성
dtm_cv = cv.fit_transform(df["문서"])

In [None]:
# cv.vocabulary_ 를 봅니다.
# cv.vocabulary_

In [24]:
cv_cols = cv.get_feature_names_out()

In [25]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 one-hot-vector를 확인
# toarray()로 희소 행렬(sparse matrix, 행렬의 값이 대부분 '0'인 행렬)을 NumPy array 배열로 변환하여 값을 확인

pd.DataFrame(dtm_cv.toarray(), columns=cv_cols).sum().sort_values()

Unnamed: 0,0
힘이듭니다,1
사항입니다,1
사회활동을,1
삭막한,1
산림이,1
...,...
원미구,1565
불법주차,2010
신고입니다,2042
경기도,2467


## 잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)

* API documentation: https://pyldavis.readthedocs.io/en/latest/modules/API.html

In [26]:
# 정답인 '분류'의 유일한 값을 확인하여 주제 수를 확인
df["Value.dep_name"].value_counts()

Unnamed: 0_level_0,count
Value.dep_name,Unnamed: 1_level_1
도시미관과,807
장애인복지과,681
건설안전과,594
기후에너지과,471
차량등록과,267
대중교통과,261


In [38]:
# 주어진 문서에 대하여 각 문서에 어떤 주제들이 존재하는지를 확인하는 잠재 디리클레 분석(LDA)을 불러옴
# n_components에 넣을 하이퍼파라미터 NUM_TOPICS로 주제수를 설정(기본값=10)
# max_iter는 훈련 데이터(epoch라고도 함)에 대한 최대 패스 수(기본값=10)

from sklearn.decomposition import LatentDirichletAllocation

NUM_TOPICS = 6
LDA_model = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=42)

  and should_run_async(code)


In [39]:
# LDA_model 에 dtm_cv 를 넣어 학습
LDA_model.fit(dtm_cv)

  and should_run_async(code)


### pyLDAvis

In [37]:
 !pip install pyLDAvis gensim

  and should_run_async(code)




In [36]:
# 토픽 모델링에 이용되는 LDA 모델의 학습 결과를 시각화하는 Python 라이브러리인 pyLDAvis를 불러옴
# mds(Multi-Dimensional Scaling)는 데이터 포인트 간의 거리를 보존하면서 차원을 축소하는 기법
# t-SNE(t-Stochastic Neighbor Embedding)은 고차원 데이터를 특히 2, 3차원 등으로 줄여 가시화하는데에 유용

import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(LDA_model, dtm_cv, cv, mds='tsne')

  and should_run_async(code)


ModuleNotFoundError: No module named 'pyLDAvis.sklearn'

In [55]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
import re
data = df["문서"]
def preprocess_text(text):
    # 특수기호와 특정 단어를 정규표현식으로 제거
    text = re.sub(r'\*|\(|:|안전신문고|\*\*\*_\*|\*\*\*\*|\*\*\*|\_|\,|\-|\==|\.', '', text)
    return text

# 데이터 전처리
data_ = [preprocess_text(doc) for doc in data]


# 1. CountVectorizer로 문서-단어 행렬 생성
vectorizer = CountVectorizer(stop_words=["돋움", "경우", "또는"])
data_vectorized = vectorizer.fit_transform(data_)
data_words = [text.split() for text in data_]  # 텍스트를 리스트 형태로 변환

# 2. gensim의 Dictionary와 Corpus로 변환
id2word = corpora.Dictionary(data_words)
corpus = [id2word.doc2bow(text) for text in data_words]

# 3. LDA 모델 학습
lda_model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=id2word,
                                   num_topics=6,
                                   random_state=42,
                                   passes=10)

# 4. pyLDAvis로 시각화
pyLDAvis.enable_notebook()  # 주피터 노트북 사용 시
lda_display = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_display)

  and should_run_async(code)


## TF-IDF(Term Frequency - Inverse Document Frequency)

## TfidfVectorizer

TF-IDF 인코딩은 단어를 갯수 그대로 카운트하지 않고 모든 문서에 공통적으로 들어있는 단어(낮은 구별력)의 경우 가중치를 축소하는 방법

매개변수
* norm='l2' 각 문서의 피처 벡터를 어떻게 벡터 정규화 할지 정한다.
    - L2 : 벡터의 각 원소의 제곱의 합이 1이 되도록 만드는 것이고 기본 값
    - L1 : 벡터의 각 원소의 절댓값의 합이 1이 되도록 크기를 조절
* smooth_idf=False
    - 피처를 만들 때 0으로 나오는 항목에 대해 작은 값을 더해서(스무딩을 해서) 피처를 만들지 아니면 그냥 생성할지를 결정
* sublinear_tf=False
* use_idf=True
    - TF-IDF를 사용해 피처를 만들 것인지 아니면 단어 빈도 자체를 사용할 것인지 여부
* API Document: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer


In [56]:
# TF-IDF 방식으로 단어의 가중치를 조정한 BOW 인코딩하여 벡터화하기 위해 TfidfVectorizer를 사용

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words=["돋움", "경우", "또는", "있습니다", "있는", "합니다"])
tfidf

  and should_run_async(code)


In [57]:
# 문장에서 노출되는 feature(특징이 될만한 단어) 수를 합한 변수 Document Term Matrix(이하 dtm)를 생성
dtm_tfidf = tfidf.fit_transform(df["문서"])

  and should_run_async(code)


In [59]:
# tfidf.vocabulary_
cols_tfidf = tfidf.get_feature_names_out()

  and should_run_async(code)


In [60]:
# dtm_tf를 axis=0(수직 방향으로) 기준으로 합계를 낸 dist 변수를 생성
# dist 변수를 vocabulary_ 순으로 정렬하여 비율을 확인
dist = np.sum(dtm_tfidf, axis=0)
pd.DataFrame(dist, columns=cols_tfidf).T.sort_values(by=0).tail(10)

  and should_run_async(code)


Unnamed: 0,0
원미구,153.751424
부천시,170.507163
경기도,171.608916
안전신문고,175.464416
충전구역,188.43407
친환경차,192.966454
전용구역,254.872561
장애인,263.224024
신고입니다,336.537353
불법주차,337.97849


In [61]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 가중치를 적용한 vector를 확인
# toarray()로 희소 행렬(sparse matrix, 행렬의 값이 대부분 '0'인 행렬)을 NumPy array 배열로 변환하여 값을 확인
pd.DataFrame(dtm_tfidf.toarray(), columns=cols_tfidf)

  and should_run_async(code)


Unnamed: 0,aa,ai,allalr,alrno,amp,ars로만,a동,beom,bound,brt,...,힘들어서,힘들어요,힘들지,힘듧니다,힘듬,힘듭니,힘듭니다,힘써,힘써주세요,힘이듭니다
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 코사인 유사도
* API Document: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html

In [62]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_simple_pair = cosine_similarity(dtm_tfidf[0] , dtm_tfidf)
result_list = similarity_simple_pair.tolist()[0]

  and should_run_async(code)


In [64]:
df["유사도"] = result_list
df[["Value.dep_name", "UpdatedTitle", "유사도"]].sort_values(by="유사도", ascending=False).head(10)

  and should_run_async(code)


Unnamed: 0,Value.dep_name,UpdatedTitle,유사도
1,건설안전과,안녕하세요 여기는. 오정동 우체국 옆 상가 주차장 입,1.0
7618,기후에너지과,친환경차 충전구역 불법주차 신고입니다.**너****,0.184303
1436,건설안전과,오정동 도로 파임* 안전신문고 신고파일(사진·동영상,0.167439
2199,건설안전과,촬,0.140731
577,건설안전과,도로시설물불량,0.130824
3036,장애인복지과,장애인 전용구역 불법주차 신고입니다.* 안전신문고,0.127487
7641,장애인복지과,장애인 전용구역 불법주차 신고입니다.* 안전신문고,0.127487
5462,장애인복지과,장애인 전용구역 불법주차 신고입니다.* 안전신문고,0.127487
8623,장애인복지과,장애인 전용구역 불법주차 신고입니다.* 안전신문고,0.127487
10226,장애인복지과,장애인 전용구역 불법주차 신고입니다.* 안전신문고,0.127487
