## 전처리 이후 - Actor 도출

- 토큰화 형태소
- 불용어사전
- 임베딩
- Ward clustering 군집 덴드로그램 
- 실루엣 계수 조정
- 빈도분석
- TF-IDF

In [None]:
!pip install keras_tuner

In [None]:
!pip install gensim

In [None]:
!pip install kss

In [None]:
%matplotlib inline

import os
import pandas as pd
from pandas import DataFrame
from pandas import Series
import numpy as np
import matplotlib.pyplot as plt
import string
import re

from konlpy.tag import Okt
# from konlpy.tag import Mecab 

#pip install kss
from kss import split_sentences   
#from pykospacing import spacing

from gensim.models import Word2Vec

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, Dense,GRU,Flatten, LSTM,Conv1D, GlobalMaxPooling1D, Embedding, Dropout, GlobalAveragePooling1D

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

from kerastuner.tuners import RandomSearch # 랜덤서치를 합니다

from gensim .models import Word2Vec
from sklearn.manifold import TSNE
from matplotlib import font_manager as fm
from matplotlib import rc
from tqdm import tqdm

### 전처리 파일 불러오기

In [None]:
# Read the preprocessed crawl results and discard the "Unnamed: 0" column
# that the CSV carries.
data = pd.read_csv("우리_크롤링_전처리후.csv", encoding='utf-8-sig')
data.drop(columns='Unnamed: 0', inplace=True)

data

In [None]:
# Remove every row that contains a missing value, then renumber the index from 0.
data = data.dropna()
data = data.reset_index(drop=True)

### 토큰화 형태소/불용어 사전 

In [None]:
# Part-of-speech tags whose words are kept after analysis.
_KEPT_POS_TAGS = frozenset(['Noun', 'Adjective', 'Verb', 'KoreanParticle', 'VerbPrefix'])

# Stopword dictionary. Built once as a frozenset so that it is not re-created on
# every call and each membership test is O(1) instead of a linear list scan.
_STOPWORDS = frozenset([
    '하다', '되다', '안', '기', '고', '요', '란', '다음', '요즘', '지금', '앞', '왜', '여기', '후',
    '다른', '함', '등', '동안', '원래', '아주', '날', '더', '진짜', '이', '월', '시간', '오늘', '저',
    '또', '그', '좀', '년', '린지', '정말', '블로그', '그냥', '사실', '이제', '때문', '이번', '제',
    '다시', '정도', '시', '못', '주간', '일기', '하루', '일상', '전', '난', '일', '걸', '뭐', '줄',
    '만', '건', '분', '개', '끝', '잼', '이거', '번', '중', '듯', '때', '게', '내', '말', '나', '수',
    '거', '점', '것',
])


def preprocess_okt(text):
    """Tokenize ``text`` with the global ``okt`` tagger and filter the result.

    The text is tagged with ``okt.pos(text, stem=True)``; a word is kept when its
    tag is one of ``_KEPT_POS_TAGS`` and the word is not in ``_STOPWORDS``.
    Spacing correction is not applied here (it was done in an earlier step).

    Args:
        text: The review text to tokenize.

    Returns:
        list: The kept words, in the order the tagger produced them.
    """
    pos_words = okt.pos(text, stem=True)
    return [
        word
        for word, tag in pos_words
        if tag in _KEPT_POS_TAGS and word not in _STOPWORDS
    ]

In [None]:
# Shared Okt tagger instance; preprocess_okt() reads this module-level name.
okt = Okt()

In [None]:
# Tokenize every review (tqdm shows progress) and replace the column in one
# assignment. The previous form, data['review'].iloc[i] = ..., is a chained
# assignment: it writes into the intermediate object returned by data['review'],
# which pandas does not guarantee to be the frame's own data (it triggers
# SettingWithCopyWarning, and with Copy-on-Write enabled the write is discarded).
tokenized_reviews = [preprocess_okt(text) for text in tqdm(data['review'])]
data['review'] = pd.Series(tokenized_reviews, index=data.index)

In [None]:
# Display the 'review' column to inspect the tokenization result.
data['review']

### 임베딩

In [None]:
EMBEDDING_DIM = 20  # embedding size follows the paper
# sg=0 selects CBOW, sg=1 selects skip-gram.
model_shopping = Word2Vec(
    sentences=data.review,
    sg=1,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
)
w2v_vocab_shopping = list(model_shopping.wv.key_to_index)  # list of embedded words
print('Vocabulary size : ', len(w2v_vocab_shopping))
# Label corrected from the misspelled 'Vecotr'.
print('Vector shape :', model_shopping.wv.vectors.shape)

In [None]:
# Test only: look up the words most similar to '대학생' and print them.
similar_words = model_shopping.wv.most_similar('대학생')
print("shopping :", similar_words)
print()

In [None]:
# Write the embedding vector of every word to a text file
# (word2vec format, non-binary).
filename = 'shopping_word2vec.txt'
model_shopping.wv.save_word2vec_format(fname=filename, binary=False)

### Ward clustering 군집 덴드로그램 

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import silhouette_score

In [None]:
# Ward linkage (the most commonly used one)

def visualize_silhouette_layer1(data, num_cluster):
    """Compute the silhouette score of Ward clusterings over a range of cluster counts.

    Args:
        data: Samples passed unchanged to ``AgglomerativeClustering.fit_predict``
            and ``silhouette_score``.
        num_cluster: Exclusive upper bound; cluster counts from 2 up to
            ``int(num_cluster) - 1`` are evaluated.

    Returns:
        tuple: ``(result, pivot_ac)`` where ``result`` is a DataFrame with the
        columns ``n_clusters`` and ``silhouette_score`` and ``pivot_ac`` is the
        pivot table of ``silhouette_score`` indexed by ``n_clusters``.
    """
    rows = []
    for k in range(2, int(num_cluster)):
        labels = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(data)
        rows.append([k, silhouette_score(data, labels)])

    result = pd.DataFrame(rows, columns=["n_clusters", "silhouette_score"])
    pivot_ac = pd.pivot_table(result, index="n_clusters", values="silhouette_score")
    return result, pivot_ac

In [None]:
# Display the embedding vector stored for the token "있다".
model_shopping.wv["있다"]

In [None]:
# Represent each review by the average of its token embeddings.
# A review with no tokens gets the int sentinel 0; the later cells recognise
# these entries by their int type and filter them out. The empty case is now
# checked explicitly instead of relying on a bare `except:` to swallow the
# ZeroDivisionError, which would also have hidden any unrelated error.
rv = []
for tokens in data["review"]:
    if len(tokens) == 0:
        rv.append(0)
        continue
    token_sum = sum(model_shopping.wv[w] for w in tokens)
    rv.append(token_sum / len(tokens))
data["review vector"] = rv

In [None]:
# Display the per-review vectors.
data["review vector"]

In [None]:
# Added code: drop the reviews that have no embedding vector.
# Such reviews hold an int in "review vector" instead of an array. The earlier
# version collected the cell *value* (always 0) rather than the row position and
# then dropped data.index[0] once per hit, i.e. it removed whatever rows happened
# to be first instead of the empty reviews. Select the rows with a boolean mask.
has_vector = data["review vector"].map(lambda vec: not isinstance(vec, int))
data = data[has_vector].reset_index(drop=True)
data

In [None]:
# Display the review vectors as a plain Python list.
list(data["review vector"])

In [None]:
# Re-store the review vectors as an object-dtype array, then print the column.
vector_column = data["review vector"]
data["review vector"] = np.array(vector_column, dtype=object)
print(data["review vector"])

In [None]:
# Keep only the entries of "review vector" whose type is not int.
rv = [vec for vec in data["review vector"] if type(vec) is not int]
len(rv)

In [None]:
# Ward linkage over the review vectors, drawn as a dendrogram.
linked = linkage(rv, method='ward')

plt.figure(figsize=(15, 9))
dendrogram_options = {
    'orientation': 'top',
    'distance_sort': 'descending',
    'show_leaf_counts': True,
}
dendrogram(linked, **dendrogram_options)
plt.show()

### 실루엣 계수 조정

In [None]:
def visualize_silhouette_layer1(data, num_cluster):
    """Compute the silhouette score of Ward clusterings over a range of cluster counts.

    The previous header used a subscript expression (``data["review vector"]``)
    as a parameter name, which is a SyntaxError, and the body read the global
    frame instead of its argument. The function now clusters whatever is passed
    in as ``data``.

    Args:
        data: Either a 2-D numeric array-like, or a 1-D sequence (for example a
            pandas Series) whose elements are equal-length vectors; the latter
            is stacked into a 2-D matrix before clustering.
        num_cluster: Exclusive upper bound; cluster counts from 2 up to
            ``int(num_cluster) - 1`` are evaluated.

    Returns:
        tuple: ``(result, pivot_ac)`` where ``result`` is a DataFrame with the
        columns ``n_clusters`` and ``silhouette_score`` and ``pivot_ac`` is the
        pivot table of ``silhouette_score`` indexed by ``n_clusters``.
    """
    features = np.asarray(data)
    if features.dtype == object:
        # A column of per-review vectors arrives as a 1-D object array of arrays;
        # stack the rows so the estimator receives one numeric row per review.
        features = np.stack(list(features))

    results = []
    for n_clusters in range(2, int(num_cluster)):
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster_labels = clusterer.fit_predict(features)
        silhouette_avg = silhouette_score(features, cluster_labels)
        results.append([n_clusters, silhouette_avg])

    result = pd.DataFrame(results, columns=["n_clusters", "silhouette_score"])
    pivot_ac = pd.pivot_table(result, index="n_clusters", values="silhouette_score")

    return result, pivot_ac

In [None]:
# Stack the per-review vectors into one 2-D matrix (one row per review) before
# clustering: the "review vector" column itself is a 1-D object Series of arrays,
# which the scikit-learn estimator cannot consume as a feature matrix.
review_matrix = np.stack(data["review vector"].to_list())
# With an upper bound of 10, cluster counts 2 through 9 are evaluated.
result, pivot_ac = visualize_silhouette_layer1(review_matrix, 10)
result

In [None]:
import matplotlib.pyplot as plt

# Plot the silhouette score against the number of clusters.
plt.plot(result["n_clusters"], result["silhouette_score"])

In [None]:
# Sample entries from each cluster so the clusters can be inspected by eye.
# NOTE(review): this samples the "review vector" column (embeddings), not review
# text — confirm which column is meant to be inspected.

cluster_index = range(6)  # put the number of clusters in range()

representative_sentence = {}

for i in cluster_index:
    members = data["review vector"][data['predict'] == i]
    # Series.sample raises ValueError when n exceeds the number of rows, so cap
    # the sample size for clusters with fewer than two members.
    sent_sample = members.sample(n=min(2, len(members)), random_state=26)
    representative_sentence[str(i)+'번 군집'] = sent_sample

In [None]:
# Display the sampled entries per cluster.
representative_sentence

In [None]:
# `predict` is not assigned anywhere in this notebook, so the bare assignment
# raised NameError. Compute the cluster labels here with Ward agglomerative
# clustering (the method used for the silhouette analysis). Six clusters matches
# range(6) and cluster0..cluster5 used by the neighbouring cells.
# NOTE(review): confirm the cluster count against the silhouette plot.
predict = AgglomerativeClustering(n_clusters=6, linkage='ward').fit_predict(
    np.stack(data["review vector"].to_list())
)
data['predict'] = predict

In [None]:
# Gather the tokens of every review into one list per cluster (labels 0-5).
# The earlier version read from `df_pos18`, a name that is never defined in this
# notebook (NameError); the rows come from `data`. Tokens are taken from the
# "review" column because the following cells count these entries and label
# them "단어" (word); the "review vector" column holds embedding components.
# NOTE(review): confirm "review" is the intended source column.
cluster_tokens = [[] for _ in range(6)]
for label, tokens in zip(data['predict'], data['review']):
    # Labels outside 0-5 are ignored, as in the original if/elif chain.
    if 0 <= label < len(cluster_tokens):
        cluster_tokens[int(label)].extend(tokens)

cluster0, cluster1, cluster2, cluster3, cluster4, cluster5 = cluster_tokens

In [None]:
# Frequency analysis: the ten most frequent entries of cluster 3.
cluster3_counts = pd.Series(cluster3).value_counts()
korean = cluster3_counts.head(10)
print(korean)

In [None]:
# Frequency table (entry -> occurrence count) for each cluster.
counts0 = pd.Series(cluster0).value_counts()
counts1 = pd.Series(cluster1).value_counts()
counts2 = pd.Series(cluster2).value_counts()
counts3 = pd.Series(cluster3).value_counts()
counts4 = pd.Series(cluster4).value_counts()
counts5 = pd.Series(cluster5).value_counts()

# The scratch name `korean` ends up bound to the last table.
korean = counts5

# For a seventh cluster, add: counts6 = pd.Series(cluster6).value_counts()

In [None]:
count_list = [counts0, counts1, counts2, counts3, counts4, counts5]

# For every word, count in how many clusters it appears. Each cluster's
# frequency table lists a word once, so counting the concatenated indexes gives
# the number of clusters that contain the word.
list_list = [w for counts in count_list for w in counts.index]
# Series.value_counts keeps a flat index keyed by the word, so dic[w] is a plain
# scalar lookup. DataFrame.value_counts would key the result by a MultiIndex.
dic = pd.Series(list_list).value_counts()
dic

### TF-IDF

In [None]:
# Exclude tokens that appear in all N clusters: they get a score of 0.
# Every other token is scored as (count in this cluster) / exp(number of
# clusters containing the token).

# Derived from count_list instead of a hard-coded 6, so the cut-off follows the
# number of clusters actually listed.
n_clusters_total = len(count_list)

tfidf = []
for counts in count_list:
    scores = []
    for w, term_count in counts.items():
        cluster_freq = dic[w]
        if cluster_freq == n_clusters_total:
            scores.append(0)
        else:
            scores.append(term_count / np.exp(cluster_freq))
    tfidf.append(scores)
tfidf

In [None]:
# Build one table per cluster with the columns "단어" and "tfidf", sorted by
# descending score.
tfidf_frames = [
    pd.DataFrame({"단어": counts.index, "tfidf": tfidf[k]}).sort_values(
        by="tfidf", ascending=False
    )
    for k, counts in enumerate(
        [counts0, counts1, counts2, counts3, counts4, counts5]
    )
]
tfidf0, tfidf1, tfidf2, tfidf3, tfidf4, tfidf5 = tfidf_frames

# Save each table as CSV: shopping_tfidf0.csv ... shopping_tfidf5.csv.
for k, frame in enumerate(tfidf_frames):
    frame.to_csv(f'shopping_tfidf{k}.csv', encoding="utf-8-sig")

# For a seventh cluster, append counts6 to the list above and unpack tfidf6 as well.