# Combined Topic Models, CTM

## drive mount & path & install

In [1]:
# Mount Google Drive into the Colab VM so the project folder and dataset kept
# on Drive are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
# NOTE(review): transformers is installed without a version pin, so the
# resolved version can change between runs — consider pinning it.
!pip install transformers

In [3]:
%%capture
# Pinned to an exact release so the CTM API used in this notebook stays stable.
!pip install contextualized-topic-models==2.2.0

In [4]:
%%capture
# NOTE(review): pyLDAvis (imported by the visualization cell) is installed
# without a version pin — consider pinning it.
!pip install pyldavis

In [5]:
%%capture
# Clone and run the Mecab-ko Colab install script; presumably this provides the
# backend for the konlpy `Mecab` tagger imported below — confirm.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# NOTE: %cd changes the notebook's working directory to the cloned repo; a
# later cell switches to the project folder with an absolute path.
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

In [6]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from konlpy.tag import Mecab
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing

In [7]:
# Work from the project folder on Drive so that the local `load_dataset` module
# and the relative paths used below (korean_chat_data, model,
# topic_modeling_result) resolve.
%cd /content/drive/MyDrive/chat_summary
from load_dataset import Aihub

/content/drive/.shortcut-targets-by-id/1g3BOWNdfNSvSYVyCXZo0TvHh9AWJwFKb/chat_summary


## load data & preprocessing
- aihub dataset (chat log)

In [None]:
# Load the Training split of the AI Hub chat-log dataset from korean_chat_data/.
# load_data returns three sequences; judging by the names they hold dialogue
# texts, their summaries and topic labels (presumably aligned one-to-one —
# the length check in the next cell verifies they have equal length).
aihub = Aihub(data_dir=Path('korean_chat_data'))

docs, summaries, true_topics = aihub.load_data('Training')

In [None]:
# Sanity check: the three sequences should all have the same length.
tuple(len(seq) for seq in (docs, summaries, true_topics))

(279992, 279992, 279992)

In [None]:
# Peek at the first dialogue to see what the raw text looks like.
docs[0]

'우리 기간 언제까지나고 물어볼 수 없어 재등록 할 줄 알면 어케? 헉... 어떻게말해야되지? 그냥 자연스레.. 안나오면되비 ㄱ 아저씨가 먼저 말하려나? 연장할거냐고..하면 뭐랗9? 고민해볼게요..? 생각해보고나올게요..? 대본짜줘... 다음 달은 바빠서 어려울 것 같고 다음번에 다시 올게요~ 어떠니?? 오 역시말을잘하네 너가말해! 참내 너 같아 부끄러워...'

## 사전 tokenizer
- Mecab 형태소 분석기로 명사만 추출하고, 한 글자 명사는 제외

In [None]:
class CustomTokenizer:
    """Callable tokenizer that keeps only the nouns found by a morphological tagger.

    Instances are meant to be passed as ``tokenizer=`` to ``CountVectorizer``.

    Args:
        tagger: Any object exposing ``nouns(text)`` that returns an iterable of
            strings (e.g. a KoNLPy ``Mecab`` instance).
        max_chars: Inputs longer than this many characters are truncated before
            tagging. Defaults to 1000000, the value that used to be hard-coded.
        min_token_len: Nouns shorter than this are discarded. Defaults to 2,
            i.e. single-character nouns are dropped.
    """

    def __init__(self, tagger, max_chars=1000000, min_token_len=2):
        self.tagger = tagger
        self.max_chars = max_chars
        self.min_token_len = min_token_len

    def __call__(self, sent):
        """Return the nouns of ``sent`` that are at least ``min_token_len`` long.

        ``None`` or an empty string yields an empty list without calling the
        tagger.
        """
        if not sent:
            return []
        # Cap the input length so a pathological document cannot stall the tagger.
        nouns = self.tagger.nouns(sent[:self.max_chars])
        return [noun for noun in nouns if len(noun) >= self.min_token_len]

In [None]:
# Wrap a KoNLPy Mecab tagger; this callable is handed to CountVectorizer below.
custom_tokenizer = CustomTokenizer(Mecab())

In [None]:
## custom_tokenizer check
# docs_token = list(map(custom_tokenizer, docs))
# docs_token

## 빈도수 기반 Bag of words

In [None]:
# Bag-of-words over the tokenizer's noun output; max_features=1000 caps the
# vocabulary at the 1000 most frequent terms across the corpus.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, max_features=1000)

In [None]:
# Learn the vocabulary from the training dialogues and build their
# document-term count matrix in one pass.
train_bow_embeddings = vectorizer.fit_transform(docs)

In [None]:
# Expected shape: (number of documents, vocabulary size).
print(train_bow_embeddings.shape)

(279992, 1000)


In [None]:
# Vocabulary in the column order of the BoW matrix, plus the index -> token map
# that is passed to TopicModelDataPreparation.load below.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists and fall back otherwise.
_get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
vocab = [str(token) for token in _get_feature_names()]
id2token = dict(enumerate(vocab))



In [None]:
# Should equal the max_features cap given to the vectorizer.
len(vocab)

1000

## bert sentence embedding
- 다국어 SBERT(`sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens`)로 문장 임베딩 생성

### bert

In [None]:
# Encode every training dialogue with the multilingual sentence-transformers
# checkpoint; the result is one dense vector per document.
train_contextualized_embeddings = bert_embeddings_from_list(
    docs,
    "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
)

Batches:   0%|          | 0/1400 [00:00<?, ?it/s]

In [None]:
# Expected shape: (number of documents, embedding dimension).
train_contextualized_embeddings.shape

(279992, 768)

In [None]:
# Confirm the container type returned by bert_embeddings_from_list.
type(train_contextualized_embeddings)

numpy.ndarray

## train CTM

In [None]:
## train input: sentence embedding, bow, id2token
qt = TopicModelDataPreparation()
training_dataset = qt.load(train_contextualized_embeddings, train_bow_embeddings, id2token)

In [None]:
# Seed every RNG that may be involved so that reruns are as reproducible as
# possible (weight initialisation and batch shuffling are otherwise random).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# contextual_size is read from the embedding matrix instead of being hard-coded,
# so switching the sentence encoder cannot silently desynchronise it from the
# model definition.
ctm = CombinedTM(
    bow_size=len(vocab),
    contextual_size=train_contextualized_embeddings.shape[1],
    n_components=9,  # number of topics
    num_epochs=10,
)
ctm.fit(training_dataset, verbose=True)

Settings: 
                   N Components: 9
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.8888888888888888
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [10/10]	 Seen Samples: [2799920/2799920]	Train Loss: 41.78895504366531	Time: 0:00:28.760117: : 10it [04:49, 28.96s/it]


In [None]:
# Persist the trained model under ./model (relative to the project folder).
# Judging by the path passed to ctm.load further down, the library writes into
# a sub-directory named after the hyper-parameters.
ctm.save('model')



## CTM 예측 결과

In [None]:
# Top 5 words for each learned topic.
ctm.get_topics(5)

defaultdict(list,
            {0: ['머리', '사진', '이거', '노래', '영화'],
             1: ['엄마', '언니', '전화', '아빠', '오빠'],
             2: ['어디', '거기', '여기', '버스', '도착'],
             3: ['저녁', '운동', '라면', '점심', '커피'],
             4: ['허리', '감기', '소리', '난리', '검사'],
             5: ['시간', '이번', '공부', '하루', '오늘'],
             6: ['사람', '생각', '우리', '친구', '자기'],
             7: ['그거', '하나', '카드', '이거', '결제'],
             8: ['정확', '영업', '부동산', '공장', '보장']})

In [None]:
# get_topics returns {topic_id: [words]}; after the transpose each row is one
# topic and each column one of its top-5 words.
result_df = pd.DataFrame(ctm.get_topics(5)).T

# to_csv does not create missing directories, so make sure the target exists
# before writing (otherwise a fresh checkout fails here).
result_path = Path('./topic_modeling_result/ctm_bert_nouns.csv')
result_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(result_path)

## 검증셋(Validation) 예측

In [8]:
# Load the Validation split of the same dataset.
# NOTE(review): this rebinds `docs`, `summaries` and `true_topics`, so the
# Training data is no longer reachable under those names once this cell has run.
aihub = Aihub(data_dir=Path('korean_chat_data'))

docs, summaries, true_topics = aihub.load_data('Validation')

In [9]:
# The validation dialogues must be projected onto the vocabulary learned from
# the Training split: the model was trained on that vocabulary's columns.
# Fitting a fresh CountVectorizer here would pick a different top-1000 term set
# (and column order) and hand the model misaligned counts. The `vectorizer`,
# `vocab` and `id2token` objects from the training section are therefore reused
# as they are, and CustomTokenizer is not defined a second time.
val_bow_embeddings = vectorizer.transform(docs)
assert val_bow_embeddings.shape[1] == len(vocab), \
    "validation BoW must share the training vocabulary"

qt = TopicModelDataPreparation()



In [10]:
# Encode the validation dialogues with the same sentence-transformers checkpoint
# that was used for training, then bundle embeddings, BoW counts and the
# index -> token map into a CTM dataset.
val_contextualized_embeddings = bert_embeddings_from_list(
    docs,
    "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
)
val_dataset = qt.load(
    val_contextualized_embeddings,
    val_bow_embeddings,
    id2token,
)

Batches:   0%|          | 0/176 [00:00<?, ?it/s]

In [11]:
# Re-create a model with the same hyper-parameters as the training run so the
# saved weights can be loaded into it (presumably load() needs a matching
# architecture — confirm against the CTM docs).
# NOTE(review): this rebinds `ctm`; any model trained earlier in this session
# is replaced by the untrained instance until load() is called.
ctm = CombinedTM(bow_size=len(vocab), contextual_size=768, n_components=9, num_epochs=10)

In [12]:
# Load the checkpoint written by ctm.save('model'). The directory name carries
# the training hyper-parameter values (compare the Settings printed by fit); it
# is kept verbatim because it has to match what is on disk.
# The second argument is presumably the epoch index of the checkpoint
# (10 epochs -> last index 9) — confirm against the CTM docs.
ctm.load('./model/contextualized_topic_model_nc_9_tpm_0.0_tpv_0.8888888888888888_hs_prodLDA_ac_(100, 100)_do_softplus_lr_0.2_mo_0.002_rp_0.99', 9)



In [None]:
# Keep the per-document topic distribution instead of only echoing it, so it can
# be compared with `true_topics` in later cells.
# NOTE(review): n_samples=1 uses a single draw; more samples should give a more
# stable estimate — confirm against the CTM docs before relying on the result.
val_doc_topic_dist = ctm.get_doc_topic_distribution(val_dataset, n_samples=1)
val_doc_topic_dist



## visualize

In [None]:
import pyLDAvis as vis

# NOTE(review): this cell relies on `training_dataset` from the training section
# and on `vocab` being the vocabulary the model was trained with — verify that
# no later cell has rebound `vocab` and that the training cells ran in this
# kernel session.
lda_vis_data = ctm.get_ldavis_data_format(vocab, training_dataset, n_samples=10)

# Build the pyLDAvis payload from the CTM export and render it inline.
ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

  from collections import Iterable
  from collections import Mapping
0it [00:00, ?it/s]