In [1]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Mecab
from tqdm import tqdm

In [2]:
import pandas as pd

In [3]:
import re

In [4]:
hangul = re.compile('[^ „Ñ±-„Ö£Í∞Ä-Ìû£]+')

In [5]:
# Load the raw review export; the file is read as UTF-8 from the working directory.
df1 = pd.read_csv('reviews.csv', encoding='utf-8')

In [6]:
# Preview the first rows to check the columns (data_id, postdate, score, content).
df1.head()

Unnamed: 0,data_id,postdate,score,content
0,61f28a4e1b20b71cb60f32c4,2022-01-27 21:04:30.000,4,어디서많이맡아본향 전남친향수냄새
1,61f133a109550d3b5462b1f9,2022-01-26 20:42:25.000,1,배송받기까지 12일 걸림. 연계된 택배사 확인 후 연락준다던 &#39;논픽션&#3...
2,61fdc70e3a781d35b9ed83cd,2022-02-05 09:38:38.000,2,향만 좋아요 근데 향도 10초 컷 선물 많이받아서 여러개 써봤는데 보습이 1도 안됨...
3,61f437773a781d35b9ed6ae2,2022-01-29 03:35:34.000,1,상탈크림 시켰는데 다른게 왔네요..? 신경 좀 쓰시지;당연히 잘 왔겠거니 뜯어버려서...
4,620b6bb01c6a4873b17058c3,2022-02-15 18:00:32.000,4,와 진짜 좋아여계속 킁카킁카 맡고 있는 중 😌


In [7]:
# Pull the review text column out of the frame as a plain Python list.
data_list = df1['content'].tolist()

In [8]:
# Sanity check: the first review should be a plain string.
type(data_list[0])

str

In [9]:
# Coerce each review to str, then delete every run of characters matched by
# the `hangul` pattern.
data_list2 = list(map(lambda raw: hangul.sub('', str(raw)), data_list))

In [10]:
# Collect the cleaned reviews that still contain real text.
preprocessed_documents = []

for line in tqdm(data_list2):
  # Exclude lines that are empty, whitespace-only, or made up solely of digits.
  # A bare truthiness test on `line` would let strings such as '   ' through,
  # so the check is done on the stripped text instead.
  text = line.strip()
  if text and not text.replace(' ', '').isdecimal():
    preprocessed_documents.append(line)

100%|██████████| 7088/7088 [00:00<00:00, 1180199.55it/s]


In [11]:
class CustomTokenizer:
    """Callable tokenizer that keeps only sufficiently long nouns.

    Instances can be passed wherever a ``tokenizer(text) -> list`` callable
    is expected.
    """

    def __init__(self, tagger, min_length=2):
        """Store the tagger and the length threshold.

        Args:
            tagger: Any object exposing ``nouns(text)`` that returns an
                iterable of strings.
            min_length: Minimum number of characters a noun must have to be
                kept. The default of 2 drops single-character nouns.
        """
        self.tagger = tagger
        self.min_length = min_length

    def __call__(self, sent):
        """Return the nouns of ``sent`` that have at least ``min_length`` characters.

        Args:
            sent: The text to tokenize.

        Returns:
            A list of noun strings, in the order the tagger produced them.
        """
        word_tokens = self.tagger.nouns(sent)
        return [word for word in word_tokens if len(word) >= self.min_length]

In [12]:
# Build the noun tokenizer on a MeCab tagger that uses the local mecab-ko-dic
# dictionary directory. The Windows path is a raw string so its backslashes
# are never parsed as escape sequences ('\m' is an invalid escape in a
# normal string literal and triggers a warning).
custom_tokenizer = CustomTokenizer(Mecab(r'C:\mecab\mecab-ko-dic'))

In [13]:
# Bag-of-words vectorizer driven by the custom noun tokenizer, capped at the
# 3000 most frequent terms. token_pattern is set to None because it is unused
# once a tokenizer is supplied; leaving the default makes scikit-learn emit a
# "token_pattern will not be used" warning on fit.
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,
    max_features=3000,
)

In [14]:
# Learn the vocabulary from the cleaned reviews and build their bag-of-words matrix.
train_bow_embeddings = vectorizer.fit_transform(preprocessed_documents)

In [15]:
# Shape is (number of documents, number of vocabulary terms).
print(train_bow_embeddings.shape)

(7043, 1969)


In [16]:
# Vocabulary learned by the vectorizer, as a plain list of terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

# Map each bag-of-words column index to its term.
id2token = dict(enumerate(vocab))



In [17]:
# Vocabulary size; this is the value later passed as bow_size to the model.
len(vocab)

1969

In [18]:
# Compute one contextual embedding per cleaned review with the
# sentence-transformers model named below.
train_contextualized_embeddings = bert_embeddings_from_list(
    preprocessed_documents,
    "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
)

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

In [19]:
# Combine the contextual embeddings, the bag-of-words matrix and the
# index-to-term mapping into the dataset object the topic model trains on.
qt = TopicModelDataPreparation()

training_dataset = qt.load(
    train_contextualized_embeddings,
    train_bow_embeddings,
    id2token,
)

In [23]:
# Train a 10-topic CombinedTM for 20 epochs.
# contextual_size is taken from the embeddings that were actually computed
# rather than hard-coded, so it stays correct if the embedding model changes.
# NOTE(review): assumes train_contextualized_embeddings is a 2-D array shaped
# (documents, embedding_dim) — confirm against bert_embeddings_from_list.
ctm = CombinedTM(
    bow_size=len(vocab),
    contextual_size=train_contextualized_embeddings.shape[1],
    n_components=10,
    num_epochs=20,
)
ctm.fit(training_dataset)

Epoch: [20/20]	 Seen Samples: [140860/140860]	Train Loss: 23.149858809729352	Time: 0:00:09.763064: : 20it [03:16,  9.84s/it]


In [24]:
# Show the top 3 words of every topic.
ctm.get_topics(3)

defaultdict(list,
            {0: ['뭡니까', '사도', '반송'],
             1: ['핸드크림', '생일', '선물'],
             2: ['향수', '남자', '나잇'],
             3: ['향기', '가요', '발림'],
             4: ['뭡니까', '사도', '향일'],
             5: ['뭡니까', '사도', '반송'],
             6: ['뭡니까', '사도', '향일'],
             7: ['냄새', '향수', '핸드크림'],
             8: ['기분', '생일', '포장'],
             9: ['기분', '배송', '포장']})

In [22]:
# Interactive topic visualisation with pyLDAvis.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh "Run All" surfaces missing dependencies early.
import pyLDAvis as vis

# Ask the trained model for its data in the keyword layout that
# vis.prepare() accepts; n_samples=10 is the number of sampling rounds.
lda_vis_data = ctm.get_ldavis_data_format(vocab, training_dataset, n_samples=10)

# Build the prepared visualisation object and render it inline.
ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

Sampling: [10/10]: : 10it [01:26,  8.64s/it]
  by='saliency', ascending=False).head(R).drop('saliency', 1)
