In [1]:
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import pandas as pd
import numpy as np
from torch.cuda.amp import autocast, GradScaler
import multiprocessing

OSError: [WinError 127] 지정된 프로시저를 찾을 수 없습니다. Error loading "c:\ProgramData\anaconda3\envs\dl-dev\lib\site-packages\torch\lib\caffe2_detectron_ops.dll" or one of its dependencies.

In [2]:
# GPU 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
# KoBERT 또는 KC-BERT 모델과 토크나이저 로드
model_name = "monologg/kobert"  # KoBERT 모델 사용
# model_name = "beomi/kcbert-base"  # KC-BERT를 사용하려면 이 줄을 사용하세요.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)

In [4]:
# 데이터 로드
data = pd.read_csv('../외국음식전문점.csv')
data = data.dropna(subset=['content'])  # 결측값 처리


In [5]:
# 기본적인 텍스트 전처리
data['content'] = data['content'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")  # 한글 외 문자 제거
data['content'] = data['content'].str.replace("\s+", " ")  # 불필요한 공백 제거


In [6]:
# Dataset 클래스를 정의하여 데이터를 관리
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokens = self.tokenizer(
            text, 
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )
        tokens = {key: val.squeeze(0) for key, val in tokens.items()}
        return tokens


In [7]:

# DataLoader로 배치 처리 (병렬 처리 강화)
def create_dataloader(data, tokenizer, batch_size=64, num_workers=None):
    if num_workers is None:
        num_workers = multiprocessing.cpu_count()  # 모든 CPU 코어 사용
    dataset = TextDataset(data, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=True)
    return dataloader


In [8]:
# 텍스트 데이터를 토큰화하고 임베딩을 얻는 함수
def get_bert_embeddings(dataloader, model, device):
    model.eval()
    embeddings = []
    scaler = GradScaler()  # Mixed Precision 사용 설정
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Mixed Precision을 사용하여 속도 최적화
            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
                embeddings.extend(batch_embeddings)
    return np.array(embeddings)


In [9]:
# 데이터 로더 생성 (모든 CPU 코어를 활용해 데이터 로딩 병렬화)
dataloader = create_dataloader(data['content'].tolist(), tokenizer, batch_size=64, num_workers=multiprocessing.cpu_count())


In [10]:
# 임베딩 계산
embeddings = get_bert_embeddings(dataloader, model, device)

# LDA를 사용한 주제 모델링
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=None)
tfidf = tfidf_vectorizer.fit_transform(data['content'])

lda = LDA(n_components=3, random_state=42)  # 3개의 주제 (맛, 분위기, 서비스)
lda.fit(tfidf)

# 각 문서의 주제 분포를 확인하고 주제를 할당
topic_distributions = lda.transform(tfidf)
data['lda_topic'] = topic_distributions.argmax(axis=1)

# 결과 확인
for i, text in enumerate(data['content'].head(10)):
    print(f"Text: {text} | LDA Topic: {data['lda_topic'].iloc[i]}")

from sklearn.metrics import silhouette_score

# 각 주제별로 문서 확인
for topic in range(3):
    print(f"\nTopic {topic}:")
    print(data[data['lda_topic'] == topic]['content'].head(5))

# 클러스터링의 품질을 평가하기 위해 실루엣 점수를 계산합니다.
silhouette_avg = silhouette_score(embeddings, data['lda_topic'])
print(f"Silhouette Score: {silhouette_avg:.4f}")

  scaler = GradScaler()  # Mixed Precision 사용 설정
