In [1]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from konlpy.tag import Mecab
import pyLDAvis as vis
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [2]:
import pandas as pd

In [3]:
import re

In [4]:
# Matches every run of one or more characters that is neither a space nor inside
# the ranges ㄱ-ㅣ / 가-힣; used below to delete such characters from each review.
hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')

In [5]:
# Load the review data from a UTF-8 CSV file in the current working directory.
# NOTE(review): the preview below shows an `Unnamed: 0` column, presumably an
# index that was written out with the file — consider `index_col=0`; confirm.
df1 = pd.read_csv('reviews.csv', encoding='utf-8')

In [6]:
# Preview the first rows to check the columns that were loaded.
df1.head()

Unnamed: 0,data_id,postdate,score,content
0,61f28a4e1b20b71cb60f32c4,2022-01-27 21:04:30.000,4,어디서많이맡아본향 전남친향수냄새
1,61f133a109550d3b5462b1f9,2022-01-26 20:42:25.000,1,배송받기까지 12일 걸림. 연계된 택배사 확인 후 연락준다던 &#39;논픽션&#3...
2,61fdc70e3a781d35b9ed83cd,2022-02-05 09:38:38.000,2,향만 좋아요 근데 향도 10초 컷 선물 많이받아서 여러개 써봤는데 보습이 1도 안됨...
3,61f437773a781d35b9ed6ae2,2022-01-29 03:35:34.000,1,상탈크림 시켰는데 다른게 왔네요..? 신경 좀 쓰시지;당연히 잘 왔겠거니 뜯어버려서...
4,620b6bb01c6a4873b17058c3,2022-02-15 18:00:32.000,4,와 진짜 좋아여계속 킁카킁카 맡고 있는 중 😌


In [7]:
# Pull the review text column out of the frame as a plain Python list.
data_list = df1['content'].tolist()

In [8]:
# Spot-check the Python type of the first review entry.
type(data_list[0])

str

In [9]:
# Delete everything the `hangul` pattern matches from each review. Entries are
# passed through str() first so non-string cells can be handled by the regex too.
data_list2 = list(map(lambda review: hangul.sub('', str(review)), data_list))

In [10]:
# Keep only the cleaned reviews that still carry some text.
preprocessed_documents = []

for line in tqdm(data_list2):
  # Exclude lines that are empty, contain only spaces, or consist only of digits.
  # strip() is required here: a spaces-only string is truthy, and ''.isdecimal()
  # is False, so without it such documents would slip through the filter.
  if line.strip() and not line.replace(' ', '').isdecimal():
    preprocessed_documents.append(line)

100%|██████████| 7088/7088 [00:00<00:00, 1180199.55it/s]


In [11]:
class CustomTokenizer:
    """Callable tokenizer that returns the nouns of a sentence.

    The wrapped ``tagger`` must provide a ``nouns(text)`` method returning an
    iterable of strings. Nouns of length one are discarded.
    """

    def __init__(self, tagger):
        """Store the tagger whose ``nouns`` method performs the extraction."""
        self.tagger = tagger

    def __call__(self, sent):
        """Return the list of nouns in ``sent`` that are longer than one character."""
        nouns = self.tagger.nouns(sent)
        return list(filter(lambda noun: len(noun) > 1, nouns))

In [12]:
# Build the noun tokenizer on top of Mecab, pointing it at the local dictionary.
# The path is a raw string so its backslashes are literal: the previous plain
# literal relied on `\m` being an unrecognised escape, which newer Python
# versions warn about. The resulting path value is unchanged.
custom_tokenizer = CustomTokenizer(Mecab(r'C:\mecab\mecab-ko-dic'))

In [13]:
# Bag-of-words vectorizer that tokenizes with `custom_tokenizer` and caps the
# vocabulary at 3000 features.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, max_features=3000)

In [14]:
# Fit the vocabulary on the cleaned reviews and build their count matrix.
train_bow_embeddings = vectorizer.fit_transform(preprocessed_documents)

In [15]:
# Sanity check: shape of the bag-of-words matrix.
print(train_bow_embeddings.shape)

(7043, 1969)


In [16]:
# Vocabulary learned by the vectorizer, as a plain list of tokens.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    # The newer API returns an ndarray; convert so downstream code keeps a list.
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
# Map each position in the vocabulary list to its token.
id2token = dict(enumerate(vocab))



In [17]:
# Number of tokens in the vocabulary; passed to the model as `bow_size` below.
len(vocab)

1969

In [18]:
# Compute contextual embeddings for every cleaned review with the named
# sentence-transformers model.
train_contextualized_embeddings = bert_embeddings_from_list(
    preprocessed_documents,
    "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
)

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

In [19]:
# Bundle the precomputed contextual embeddings, the bag-of-words matrix and the
# index-to-token mapping into the dataset object that the model is fitted on.
qt = TopicModelDataPreparation()

# NOTE(review): both inputs were derived from `preprocessed_documents`, so their
# rows are presumed to line up document-by-document — confirm if either changes.
training_dataset = qt.load(train_contextualized_embeddings, train_bow_embeddings, id2token)

In [23]:
# Train a combined topic model with 10 topics for 20 epochs.
# `contextual_size` is read from the embedding matrix instead of being
# hard-coded, so the cell keeps working if the sentence-embedding model (and
# therefore the embedding width) is changed.
ctm = CombinedTM(
    bow_size=len(vocab),
    contextual_size=train_contextualized_embeddings.shape[1],
    n_components=10,
    num_epochs=20,
)
ctm.fit(training_dataset)

Epoch: [20/20]	 Seen Samples: [140860/140860]	Train Loss: 23.149858809729352	Time: 0:00:09.763064: : 20it [03:16,  9.84s/it]


In [24]:
# Show the top 3 words of every learned topic.
ctm.get_topics(3)

defaultdict(list,
            {0: ['뭡니까', '사도', '반송'],
             1: ['핸드크림', '생일', '선물'],
             2: ['향수', '남자', '나잇'],
             3: ['향기', '가요', '발림'],
             4: ['뭡니까', '사도', '향일'],
             5: ['뭡니까', '사도', '반송'],
             6: ['뭡니까', '사도', '향일'],
             7: ['냄새', '향수', '핸드크림'],
             8: ['기분', '생일', '포장'],
             9: ['기분', '배송', '포장']})

In [22]:
# Interactive topic visualisation. pyLDAvis is imported as `vis` in the imports
# cell at the top of the notebook, so a missing installation is reported before
# the slow embedding and training cells run rather than after them.

# Ask the trained model for its data in the keyword layout pyLDAvis expects
# (n_samples=10 sampling passes over the training set).
lda_vis_data = ctm.get_ldavis_data_format(vocab, training_dataset, n_samples=10)

# Prepare the visualisation and render it inline.
ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

Sampling: [10/10]: : 10it [01:26,  8.64s/it]
  by='saliency', ascending=False).head(R).drop('saliency', 1)
