### OTT별 토픽모델링

- 토큰화 함수화시키기
- LDA 적용 및 시각화까지 추출
- 2020 ~ 2023에 대한 토픽 변화 확인

\

+향후 진행
1. 사전학습모델 완료되면 OTT별 리뷰에 감성분석 라벨링 달기 (수작업으로 어느 정도 라벨링 붙인 후 정확도 비교해야할 듯)
2. 긍부정 나눈 뒤 토픽 모델링 진행
3. 토픽별로 임베딩
4. 시간에 따라 임베딩

## 설치

In [None]:
# Scraper installation (disabled; the review CSV is read from Drive instead)
# !pip install app_store_scraper
# !pip install google-play-scraper
!pip install pyLDAvis # LDA visualisation
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git # Korean word-spacing package (pykospacing)
!pip install git+https://github.com/ssut/py-hanspell.git # Korean normalisation: spell checking (hanspell)

# Install Mecab on Colab
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh
!pip install konlpy

! pip install MeCab
# If an error occurs, click [Restart runtime] and continue without reinstalling.

## 라이브러리

In [None]:
import re
import time
import numpy as np
import pandas as pd
import datetime
import warnings # 경고 메시지 무시
import matplotlib.pyplot as plt

from tqdm import tqdm 
warnings.filterwarnings(action='ignore')

# Tokenize
# from pykospacing import Spacing # 띄어쓰기
from gensim import corpora # 단어 빈도수 계산 패키지
from sklearn.decomposition import LatentDirichletAllocation
import gensim # LDA 모델 활용 목적
import pyLDAvis.gensim_models # LDA 시각화용 패키지
from collections import Counter # 단어 등장 횟수 카운트

# LDA
from gensim.models.ldamodel import LdaModel
from gensim.models.callbacks import CoherenceMetric
from gensim import corpora
from gensim.models.callbacks import PerplexityMetric
from gensim.models import CoherenceModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# 한국어 형태소 분석기 중 성능이 가장 우수한 Mecab 사용
from konlpy.tag import *
mecab = Mecab()

# Data 불러오기
from google.colab import drive
drive.mount('/content/drive')

  self._read_thread.setDaemon(True)


Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/

/content/drive/MyDrive


  and should_run_async(code)


## 함수

In [None]:
## ------------------ 데이터 전처리 ------------------ ##
def date_extract(dataset, ago_year=3, printing=False):
  """Keep only the reviews from the most recent years that have text.

  The cut-off is relative to the newest year found in the data: rows whose
  year is >= (latest year - ``ago_year``) are kept, so ``ago_year=3`` spans
  four calendar years (e.g. 2020-2023).

  Args:
    dataset: DataFrame with an ``at`` column (anything ``pd.to_datetime``
      accepts) and a ``content`` column holding the review text.
    ago_year: How many years back from the latest year to keep.
    printing: If true, print the number of kept rows for each year.

  Returns:
    A new DataFrame with ``at`` converted to datetime, an added ``year``
    column, and rows with missing ``content`` removed. The frame passed in
    is not modified.
  """
  # assign() builds a new frame, so the caller's DataFrame (often a slice of
  # a bigger one) is never written to.
  timestamps = pd.to_datetime(dataset['at'])
  dataset = dataset.assign(at=timestamps, year=timestamps.dt.year)

  # Series.max() returns NaN for an empty frame, which simply yields an empty
  # result below instead of raising like ndarray.max() does.
  oldest_year = dataset['year'].max() - ago_year
  dataset = dataset[dataset['year'] >= oldest_year]
  dataset = dataset.dropna(subset=['content'])

  if printing:
    for y in dataset['year'].unique():
      n_rows = int((dataset['year'] == y).sum())
      print(f'{y}: {n_rows} rows\n')
  return dataset


class Data_processing:
  '''
  Text pre-processing pipeline that turns raw reviews into token lists for LDA.

  Steps (see get_token): strip non-Hangul characters -> apply the word
  replacement dictionary -> extract nouns with Mecab -> drop stopwords ->
  keep only reviews whose token count is inside a given range.

  Attributes:
    dataset: DataFrame being processed; must contain a 'content' column.
    replace_list: DataFrame read from ./replace_list.xlsx with the columns
      'before_replacement' and 'after_replacement'.
    stopwords_list: list of stopwords read from the 'stopword' column of
      ./stopword_list.xlsx (extended by remove_stopwords / get_token).
  '''
  # Compiled once for the whole class: matches every character that is
  # neither a Hangul syllable nor a space.
  _NON_HANGUL = re.compile('[^가-힣 ]')

  def __init__(self, dataset):
    self.dataset = dataset
    self.replace_list = pd.read_excel('./replace_list.xlsx')
    self.stopwords_list = list(pd.read_excel('./stopword_list.xlsx')['stopword'])

  def ko_language(self, text):
    # Remove everything except Hangul syllables and spaces.
    return self._NON_HANGUL.sub('', text)

  def replace_word(self, text):
    # Apply every entry of the replacement dictionary, in sheet order.
    # (Previously the method returned after the first matching entry, so all
    # later replacements were silently skipped.)
    pairs = zip(self.replace_list['before_replacement'],
                self.replace_list['after_replacement'])
    for before, after in pairs:
      # Skip blank / non-text cells (empty spreadsheet cells are read as NaN).
      if not isinstance(before, str) or not isinstance(after, str) or not before:
        continue
      if before in text:
        # Literal replacement: dictionary words are plain text, not regexes.
        text = text.replace(before, after)
    return text

  def tokenize(self, text):
    # Tokenise: keep only the nouns found by the module-level Mecab tagger.
    return mecab.nouns(text)

  def _stopword_lookup(self):
    # Set view of stopwords_list for O(1) membership tests. It is rebuilt
    # only when the list has changed size since the last call.
    cached = getattr(self, '_stopword_cache', None)
    if cached is None or cached[0] != len(self.stopwords_list):
      cached = (len(self.stopwords_list), set(self.stopwords_list))
      self._stopword_cache = cached
    return cached[1]

  def remove_stopwords(self, text, add_stopwords=None):
    # Drop stopwords from a token list.
    # `add_stopwords` are registered in stopwords_list only if they are not
    # already there, so calling this once per review no longer makes the list
    # grow with every call.
    lookup = self._stopword_lookup()
    if add_stopwords:
      new_words = [w for w in dict.fromkeys(add_stopwords) if w not in lookup]
      if new_words:
        self.stopwords_list += new_words
        lookup = self._stopword_lookup()
    return [x for x in text if x not in lookup]

  def select_review(self, data, min_token_n:int, max_token_n:int):
    # Keep the reviews whose number of tokens lies in [min_token_n, max_token_n].
    # A boolean mask is used so the result is correct for any index, not only
    # for a freshly reset 0..n-1 index.
    keep = pd.Series(
      [min_token_n <= len(tokens) <= max_token_n for tokens in data['review_prep']],
      index=data.index,
      dtype=bool,
    )
    return data[keep]

  def get_token(self, add_stopwords=None, min_token_n:int=3, max_token_n:int=1000):
    # Run the whole pipeline.
    # Returns (list of token lists, processed DataFrame with 'review_prep').
    #
    # NOTE: this first assignment deliberately happens on the frame that was
    # passed to __init__, so the caller's frame also receives a 'review_prep'
    # column holding the cleaned text; the notebook reads it later through
    # df_ott.review_prep. After reset_index the work continues on a new frame.
    self.dataset['review_prep'] = self.dataset['content'].apply(self.ko_language)
    self.dataset = self.dataset.reset_index(drop=True)
    print('ko_language done..')
    self.dataset['review_prep'] = self.dataset['review_prep'].apply(self.replace_word)
    print('replace_word done..')
    self.dataset['review_prep'] = self.dataset['review_prep'].apply(self.tokenize)
    print('tokenize done..')
    if add_stopwords:
      # Register the extra stopwords once instead of once per review.
      self.remove_stopwords([], add_stopwords)
    self.dataset['review_prep'] = self.dataset['review_prep'].apply(self.remove_stopwords)
    print('remove_stopwords done..')
    self.dataset = self.select_review(self.dataset, min_token_n, max_token_n)
    return list(self.dataset['review_prep']), self.dataset



## ------------------ 모델링 ------------------ ##
class Model:
  '''
  Gensim LDA topic model built from tokenised documents.

  Args:
    inputs: list of token lists, one per document.
    num_topics: number of topics to fit.
    no_below: keep only tokens that occur in at least this many documents.
    no_above: keep only tokens that occur in no more than this fraction of
      all documents (0.5 is gensim's own default, i.e. the previous behaviour).
  '''
  def __init__(self, inputs, num_topics:int, no_below:int=2, no_above:float=0.5):
    self.dictionary = corpora.Dictionary(inputs)
    self.dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    self.corpus = [self.dictionary.doc2bow(x) for x in inputs]
    self.inputs = inputs
    self.num_topics = num_topics
    self.model = None  # set by LDA_model()

  def LDA_model(self, chunksize=2000, passes=20, iterations=400, eval_every=None):
    '''
    Train the LDA model and store it in self.model.

    chunksize: number of documents processed in one training chunk
    passes: number of passes over the whole corpus (comparable to epochs)
    iterations: maximum number of inference iterations per document
    eval_every: forwarded to LdaModel; None skips the periodic perplexity
      evaluation
    '''
    # Dictionary.id2token is filled lazily; looking up any id forces gensim
    # to build it before it is handed to LdaModel.
    self.dictionary[0]
    id2word = self.dictionary.id2token

    self.model = LdaModel(
      corpus=self.corpus,
      id2word=id2word,
      chunksize=chunksize,
      alpha='auto',
      eta='auto',
      iterations=iterations,
      num_topics=self.num_topics,
      passes=passes,
      eval_every=eval_every)


  def print_topic_prop(self, topn=10, num_words=20):
    '''
    Summarise the fitted topics and compute the model's coherence.

    If LDA_model() has not been called yet, the model is trained first with
    the default hyper-parameters; a model trained earlier (possibly with
    custom settings) is reused instead of being overwritten.

    topn: number of top words per topic used for the coherence score
    num_words: number of words listed per topic in the table

    Returns (topic_table, coherence, topics):
      topic_table: DataFrame with one 'topic_<k>' column per topic, each cell
        a 'weight*"word"' string
      coherence: score returned by CoherenceModel.get_coherence()
      topics: raw (topic id, formatted string) pairs from print_topics
    '''
    if getattr(self, 'model', None) is None:
      self.LDA_model()

    # Request every topic explicitly: print_topics only shows 20 by default.
    topics = self.model.print_topics(num_topics=self.num_topics, num_words=num_words)

    # Split each formatted topic string into its 'weight*"word"' terms and
    # label the column with the real topic id (1-based).
    topic_words = {}
    for idx, words in topics:
      topic_words[f'topic_{idx+1}'] = [term.strip() for term in words.split('+')]

    topic_table = pd.DataFrame(topic_words)

    # coherence
    coherence_model_lda = CoherenceModel(
      model=self.model, texts=self.inputs, dictionary=self.dictionary, topn=topn)
    coherence_lda = coherence_model_lda.get_coherence()
    print('LDA done..')

    return topic_table, coherence_lda, topics

  and should_run_async(code)


----

## 사용자 정의

In [None]:
# Select the OTT service to analyse; set to None to use all services together.
# e.g. OTT = 'tiving'
OTT = None

# Number of topics to fit
num_topics = 5

# Extra stopwords (passed to Data_processing.get_token in the main cell)
add_stopwords_list = ['티빙', '넷플릭스', '웨이브', '쿠팡', '쿠팡플레이', '디즈니', '나무',
                      '디즈니플러스', '애플리케이션', '개발자', '도도', '슈슈', '휴휴',
                      '옥수수', '라프텔', '라프']

  and should_run_async(code)


## __Main__()

In [None]:
# Load the review data
df = pd.read_csv('google_store_review.csv', encoding='utf-8-sig')
# Restrict to one OTT service when OTT is set; otherwise work on a copy of everything.
if OTT: 
  df_ott = df[df['ott']== OTT]
else:
  df_ott = df.copy()


### Restrict the review period and drop reviews with missing text ###
df_ott = date_extract(df_ott, printing=True)

### Review labelling with a pretrained model (placeholder: nothing implemented here yet) ###


### Topic modelling ###
token_data = Data_processing(df_ott) # start pre-processing
tokenized_data, _ = token_data.get_token(add_stopwords_list)
lda = Model(tokenized_data, num_topics=num_topics) # start topic modelling
topic_table, coherence, _ = lda.print_topic_prop()

  and should_run_async(code)


2023: 3051 rows

2022: 16623 rows

2021: 22667 rows

2020: 17116 rows

ko_language done..
replace_word done..
tokenize done..
remove_stopwords done..
LDA done..


In [None]:
topic_table

  and should_run_async(code)


Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5
0,"0.050*""돈""","0.072*""화면""","0.108*""수""","0.083*""거""","0.281*""자막"""
1,"0.043*""결제""","0.068*""영상""","0.099*""영화""","0.054*""플""","0.079*""한글"""
2,"0.038*""오류""","0.056*""재생""","0.053*""확인""","0.051*""게""","0.056*""시즌"""
3,"0.033*""로그인""","0.050*""업데이트""","0.043*""컨텐츠""","0.024*""오픈""","0.038*""심슨"""
4,"0.025*""내""","0.044*""불편""","0.042*""드라마""","0.024*""지원""","0.035*""날자"""
5,"0.022*""회원""","0.030*""시청""","0.028*""마블""","0.024*""한국인""","0.027*""개"""
6,"0.021*""번""","0.026*""개선""","0.026*""최신""","0.023*""화질""","0.016*""영어"""
7,"0.021*""문제""","0.019*""중""","0.020*""추가""","0.022*""말""","0.014*""설정"""
8,"0.021*""로딩""","0.018*""조절""","0.018*""프로그램""","0.022*""리뷰""","0.014*""한국어"""
9,"0.020*""삭제""","0.015*""설정""","0.017*""콘텐츠""","0.020*""연동""","0.011*""애"""


---

In [None]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import gluonnlp as nlp
import pandas as pd
import numpy as np
import time
import random
import re
from tqdm.notebook import tqdm
from keras.utils import pad_sequences

#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
class BERTDataset(Dataset):
    """Torch dataset that pre-encodes sentences with a BERT sentence transform.

    Each row of ``dataset`` is indexed with ``sent_idx`` to get the sentence
    and with ``label_idx`` to get its label. All sentences are encoded once
    in the constructor with ``nlp.data.BERTSentenceTransform``; the labels
    are stored as ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # The transform is given a one-element list holding the sentence.
        self.sentences = list(map(lambda row: to_bert_input([row[sent_idx]]), dataset))
        self.labels = list(map(lambda row: np.int32(row[label_idx]), dataset))

    def __getitem__(self, i):
        # Encoded sentence fields followed by the label, as one flat tuple.
        # (assumes the transform returns a tuple — TODO confirm)
        features = self.sentences[i]
        target = (self.labels[i], )
        return features + target

    def __len__(self):
        n_samples = len(self.labels)
        return n_samples

  and should_run_async(code)


In [None]:
# Only the vocabulary is kept from the (model, vocab) pair; the model object is discarded.
_, vocab = get_pytorch_kobert_model()
tokenizer = get_tokenizer()
# `tok` is the tokenizer that predict() passes to BERTDataset; lower=False is passed to BERTSPTokenizer.
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

class KoBERTClassifier(nn.Module):
  """Linear classification head on top of a BERT encoder.

  Args:
    bert: encoder called as ``bert(input_ids=..., token_type_ids=...,
      attention_mask=...)``; it must return a 2-tuple whose second element
      is the pooled sentence representation.
    hidden_size: size of the pooled representation.
    num_classes: number of output classes.
    dr_rate: dropout probability applied to the pooled output; dropout is
      skipped when this is None or 0.
    params: unused; kept so existing callers keep working.
  """

  def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=None, params=None):
    super().__init__()

    self.bert = bert
    self.dr_rate = dr_rate

    self.classifier = nn.Linear(hidden_size, num_classes)
    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)

  def gen_attention_mask(self, token_ids, valid_length):
    """Return a float mask shaped like ``token_ids``: 1 for the first
    ``valid_length[i]`` positions of row ``i``, 0 for the rest."""
    attention_mask = torch.zeros_like(token_ids)
    for i, v in enumerate(valid_length):
      attention_mask[i][:v] = 1
    return attention_mask.float()

  def forward(self, token_ids, valid_length, segment_ids):
    """Return the classifier logits for a batch, shape (batch, num_classes)."""
    attention_mask = self.gen_attention_mask(token_ids, valid_length)

    _, pooler = self.bert(
      input_ids=token_ids,
      token_type_ids=segment_ids.long(),
      attention_mask=attention_mask.float().to(token_ids.device))
    # Without dropout the pooled output goes straight to the classifier
    # (previously `out` was left undefined in that case and forward crashed).
    out = self.dropout(pooler) if self.dr_rate else pooler
    return self.classifier(out)

    
# Loaded checkpoints keyed by (path, device), so the file is read only once.
_PREDICT_MODEL_CACHE = {}


def _load_predict_model(device, path="checkpoint_best.pt"):
  """Return the fine-tuned classifier on ``device``, loading it on first use."""
  key = (path, str(device))
  model = _PREDICT_MODEL_CACHE.get(key)
  if model is None:
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    model = torch.load(path, map_location=device)
    # Keep the model on the same device the inputs are moved to.
    model.to(device)
    model.eval()
    _PREDICT_MODEL_CACHE[key] = model
  return model


def predict(predict_sentence, max_len=146, batch_size=1):
  """Classify one sentence with the checkpoint stored in ``checkpoint_best.pt``.

  Args:
    predict_sentence: the text to classify.
    max_len: maximum sequence length passed to BERTDataset.
    batch_size: batch size of the internal DataLoader.

  Returns:
    0 if the highest logit is at index 0, otherwise 1.

  The checkpoint is loaded once and cached for later calls; re-run this
  cell to pick up a changed checkpoint file.
  """
  # BERTDataset needs a label column, so a dummy label '0' is attached.
  data = [predict_sentence, '0']
  dataset_another = [data]

  another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
  test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size)

  device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
  model = _load_predict_model(device)

  test_eval = []
  with torch.no_grad():
    # The dummy label in each batch is ignored.
    for token_ids, valid_length, segment_ids, _ in test_dataloader:
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)

      out = model(token_ids, valid_length, segment_ids)

      for logits in out:
        logits = logits.detach().cpu().numpy()
        test_eval.append(0 if np.argmax(logits) == 0 else 1)

  return test_eval[0]

  and should_run_async(code)


using cached model. /content/drive/MyDrive/.cache/kobert_v1.zip
using cached model. /content/drive/MyDrive/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /content/drive/MyDrive/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
# Run the sentiment classifier on every review and collect the predicted labels.
# NOTE(review): df_ott gets its 'review_prep' column as a side effect of
# Data_processing.get_token(); it appears to hold the Hangul-only cleaned text
# (not the token lists) at this point — confirm before relying on it.
predict_list = []
for s in tqdm(df_ott.review_prep):
  pred = predict(s)
  # Show each review next to its predicted label.
  print(f'{s}:', pred)
  
  predict_list.append(pred)