In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm
from soynlp.tokenizer import LTokenizer
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# 1. Load the review data and tokenize it
file_path = './외국음식전문점.csv'
df = pd.read_csv(file_path)
# Only rows that actually contain review text are used downstream.
text_data = df['content'].dropna().tolist()

# Learn per-word statistics from the corpus with soynlp's WordExtractor
from soynlp.word import WordExtractor
word_extractor = WordExtractor()
word_extractor.train(text_data)
word_scores = word_extractor.extract()
# Reduce each score object to a single float (its forward cohesion) so the
# mapping can be handed to LTokenizer as its `scores` argument.
float_word_scores = {}
for word, score in word_scores.items():
    float_word_scores[word] = score.cohesion_forward

# Tokenize every review with the LTokenizer built from those scores
tokenizer = LTokenizer(scores=float_word_scores)
tokenized_data = [
    tokenizer.tokenize(review)
    for review in tqdm(text_data, desc="Tokenizing Reviews with Soynlp")
]

print("Tokenization completed successfully.")

# 2. Topic modeling with an LDA model
dictionary = Dictionary(tokenized_data)
# Bag-of-words representation of each tokenized review.
corpus = list(map(dictionary.doc2bow, tokenized_data))

num_topics = 10
passes = 15
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
)
print("LDA Model Training Completed.")

# Show the 4 highest-weighted words of each topic as (topic_id, description).
topics = lda_model.print_topics(num_words=4)
for topic_id, top_words in topics:
    print((topic_id, top_words))

# 3. Sentiment classification with KoBERT
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device set to: {device}")

# NOTE(review): the checkpoint declares a 'KoBertTokenizer'; loading it through
# BertTokenizer emits a class-mismatch warning and may tokenize unexpectedly.
tokenizer_bert = BertTokenizer.from_pretrained('monologg/kobert')
model_kobert = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2)
model_kobert = model_kobert.to(device)

# Encode the whole corpus at once, padded/truncated to at most 128 tokens.
inputs = tokenizer_bert(
    text_data,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=128,
)
# Placeholder labels: every review is assigned class 0 (real labels are required
# for the training below to be meaningful).
labels = torch.tensor([0] * len(text_data))
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

optimizer = torch.optim.AdamW(model_kobert.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

epochs = 3
for epoch in range(epochs):
    model_kobert.train()
    batch_losses = []
    progress = tqdm(train_loader, desc=f"KoBERT Training Epoch {epoch+1}/{epochs}")
    for batch in progress:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        logits = model_kobert(input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits, batch_labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Report the mean batch loss for the epoch.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {sum(batch_losses)/len(train_loader):.4f}")

# 4. Sentiment classification with KC-BERT (intended to cope with neologisms)
tokenizer_kcbert = BertTokenizer.from_pretrained('beomi/kcbert-base')
model_kcbert = BertForSequenceClassification.from_pretrained('beomi/kcbert-base', num_labels=2).to(device)

# Encode the whole corpus at once, padded/truncated to at most 128 tokens.
inputs_kcbert = tokenizer_kcbert(text_data, return_tensors='pt', padding=True, truncation=True, max_length=128)

# NOTE(review): these labels simply alternate 0, 1, 0, 1, ... and carry no
# sentiment signal; replace them with real annotations before trusting results.
num_samples = inputs_kcbert['input_ids'].size(0)
labels = torch.arange(num_samples) % 2
dataset_kcbert = TensorDataset(inputs_kcbert['input_ids'], inputs_kcbert['attention_mask'], labels)
train_loader_kcbert = DataLoader(dataset_kcbert, batch_size=16, shuffle=True)

# KC-BERT needs its own optimizer. The `optimizer` created for the KoBERT stage
# wraps model_kobert.parameters(), so calling zero_grad()/step() on it here
# would never clear KC-BERT's gradients nor update KC-BERT's weights.
optimizer_kcbert = torch.optim.AdamW(model_kcbert.parameters(), lr=1e-5)

for epoch in range(epochs):
    model_kcbert.train()
    epoch_loss_kcbert = 0
    for batch in tqdm(train_loader_kcbert, desc=f"KC-BERT Training Epoch {epoch+1}/{epochs}"):
        # Use a distinct name so the dataset-level `labels` tensor is not shadowed.
        input_ids, attention_mask, batch_labels = [x.to(device) for x in batch]
        optimizer_kcbert.zero_grad()
        outputs = model_kcbert(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, batch_labels)
        epoch_loss_kcbert += loss.item()
        loss.backward()
        optimizer_kcbert.step()
    # Report the mean batch loss for the epoch.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss_kcbert/len(train_loader_kcbert)}")

# 5. Per-store review summarization with BART
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)


def _join_reviews(reviews):
    """Concatenate one store's reviews into a single space-separated string."""
    return ' '.join(reviews)


def _summarize(encoded, max_length, min_length):
    """Generate a beam-search summary of `encoded` and decode it to text.

    `encoded` is the tokenizer output (already moved to `device`);
    `max_length` / `min_length` bound the generated length in tokens.
    """
    summary_ids = bart_model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Replace NaN with empty strings and coerce every value to str so that the
# per-store join below cannot fail on non-string entries.
df['content'] = df['content'].fillna('').astype(str)
grouped_reviews = (
    df.groupby('store_name')['content']
    .apply(_join_reviews)
    .reset_index()
)

# Collected per-store summaries
store_summaries = []
bart_model.eval()

with torch.no_grad():
    store_rows = zip(grouped_reviews['store_name'], grouped_reviews['content'])
    for store_name, combined_reviews in tqdm(store_rows, desc="Summarizing Reviews by Store", total=len(grouped_reviews)):
        # All reviews of the store form one input, truncated to 1024 tokens.
        encoded = bart_tokenizer(combined_reviews, return_tensors='pt', max_length=1024, truncation=True)
        encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

        # Short, sentence-sized summary
        summary_sentence = _summarize(encoded, max_length=50, min_length=20)

        # Longer, paragraph-sized summary
        summary_paragraph = _summarize(encoded, max_length=150, min_length=60)

        store_summaries.append({
            'store_name': store_name,
            'summary_sentence': summary_sentence,
            'summary_paragraph': summary_paragraph,
        })

        # Echo the summaries to the console
        print(f"Store: {store_name}\nSummary (Sentence): {summary_sentence}\nSummary (Paragraph): {summary_paragraph}\n")

# Persist the per-store summaries to a CSV file
store_summaries_df = pd.DataFrame(store_summaries)
store_summaries_df.to_csv("store_review_summaries.csv", index=False)
print("Store review summaries saved to store_review_summaries.csv")


training was done. used memory 0.737 Gb
all cohesion probabilities was computed. # words = 46096
all branching entropies was computed # words = 62150
all accessor variety was computed # words = 62150


Tokenizing Reviews with Soynlp: 100%|██████████| 51124/51124 [00:02<00:00, 18365.96it/s]


Tokenization completed successfully.
LDA Model Training Completed.
(0, '0.034*"샐러" + 0.012*"드" + 0.010*"스프" + 0.008*"케이크"')
(1, '0.013*"케밥" + 0.013*"아요" + 0.012*"괜찮" + 0.009*"싹싹"')
(2, '0.022*"분위기" + 0.019*"좋은" + 0.017*"파스타" + 0.016*"한"')
(3, '0.039*"맛나요" + 0.017*"굳" + 0.008*"디아" + 0.008*"퀘사"')
(4, '0.080*"맛있" + 0.054*"도" + 0.049*"너무" + 0.033*"친절"')
(5, '0.198*"맛있" + 0.160*"어요" + 0.015*"어요!" + 0.010*"너무"')
(6, '0.016*"방문" + 0.014*"이" + 0.013*"너무" + 0.013*"음식"')
(7, '0.142*"굿" + 0.020*"굿굿" + 0.009*"바삭" + 0.008*"쏘쏘"')
(8, '0.114*"좋아요" + 0.009*"맛잇어요" + 0.007*"+" + 0.006*"밥은"')
(9, '0.042*"맛있" + 0.022*"쌀국수" + 0.017*"가" + 0.017*"는"')
Device set to: cuda


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
KoBERT Training Epoch 1/3: 100%|██████████| 3196/3196 [1:34:15<00:00,  1.77s/it]


Epoch 1/3 - Loss: 0.0047


KoBERT Training Epoch 2/3: 100%|██████████| 3196/3196 [1:34:13<00:00,  1.77s/it]


Epoch 2/3 - Loss: 0.0000


KoBERT Training Epoch 3/3: 100%|██████████| 3196/3196 [1:34:03<00:00,  1.77s/it]


Epoch 3/3 - Loss: 0.0000


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
KC-BERT Training Epoch 1/3: 100%|██████████| 3196/3196 [5:13:03<00:00,  5.88s/it]  


Epoch 1/3 - Loss: 0.7035132192998416


KC-BERT Training Epoch 2/3:  10%|▉         | 305/3196 [28:29<4:30:08,  5.61s/it]


KeyboardInterrupt: 