In [1]:
import os
import pandas as pd
import torch
import re
from tqdm import tqdm
from soynlp.tokenizer import LTokenizer
from soynlp.normalizer import repeat_normalize
from transformers import BertTokenizer, BertForSequenceClassification, PreTrainedTokenizerFast, BartForConditionalGeneration
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


In [3]:

# CUDA 오류 발생 시 정확한 위치를 파악하기 위해 설정
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# GPU 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device set to: {device}")

# 데이터셋 로드 및 전처리
file_path = './외국음식전문점.csv'
df = pd.read_csv(file_path)
df['content'] = df['content'].fillna('')
df = df.sample(500, random_state=42)  # 샘플 데이터로 500개만 사용

text_data = df['content'].tolist()


Device set to: cuda


In [4]:

# 전처리 함수 정의
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^가-힣\s]', '', text)
    text = repeat_normalize(text, num_repeats=2)
    stopwords = ['이', '그', '저', '의', '을', '를', '은', '는', '에', '와', '과', '도', '으로', '그리고', '하지만', '그래서']
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text.strip()

tokenizer = LTokenizer()


In [5]:

# 전처리 및 토큰화 함수
def preprocess_and_tokenize(review):
    preprocessed_review = preprocess_text(review)
    tokens = tokenizer.tokenize(preprocessed_review)
    return tokens


In [6]:

# KoBERT 및 KC-BERT 모델 로드
tokenizer_kobert = BertTokenizer.from_pretrained('monologg/kobert')
model_kobert = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2).to(device)

tokenizer_kcbert = BertTokenizer.from_pretrained('beomi/kcbert-base')
model_kcbert = BertForSequenceClassification.from_pretrained('beomi/kcbert-base', num_labels=2).to(device)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:

# KoBART 모델 로드
bart_tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
bart_model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-base-v1').to(device)

# 감성 분석과 요약 결과를 저장할 리스트
results = []


tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


model.safetensors:   0%|          | 0.00/495M [00:00<?, ?B/s]

In [16]:
# 감성 분석과 요약을 동시에 수행
for review in tqdm(text_data, desc="Processing Reviews"):
    review = review.strip()
    if len(review) < 10:  # 리뷰가 너무 짧으면 건너뜁니다.
        continue

    # 감성 분석 (KoBERT, KC-BERT)
    inputs_kobert = tokenizer_kobert(review, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    inputs_kcbert = tokenizer_kcbert(review, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)

    try:
        with torch.no_grad():
            output_kobert = model_kobert(**inputs_kobert)
            output_kcbert = model_kcbert(**inputs_kcbert)

        sentiment_kobert = torch.argmax(output_kobert.logits, dim=1).item()
        sentiment_kcbert = torch.argmax(output_kcbert.logits, dim=1).item()

        # 리뷰 요약 (KoBART)
        inputs_bart = bart_tokenizer([review], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        with torch.no_grad():
            summary_ids = bart_model.generate(
                inputs_bart['input_ids'], 
                attention_mask=inputs_bart['attention_mask'], 
                max_length=100,  
                min_length=20,   
                length_penalty=1.5,  
                num_beams=4, 
                no_repeat_ngram_size=3,  # n-그램 반복 방지
                repetition_penalty=1.2,  # 반복에 대한 패널티 부여
                early_stopping=True
            )
        summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    except RuntimeError as e:
        print(f"Runtime error with review: {review}")
        print(e)
        continue  # 에러가 발생하면 해당 리뷰는 건너뜁니다.

    # 결과 저장
    results.append({
        'Original Review': review,
        'KoBERT Sentiment': sentiment_kobert,
        'KC-BERT Sentiment': sentiment_kcbert,
        'KoBART Summary': summary
    })

# 결과를 CSV 파일로 저장
results_df = pd.DataFrame(results)
results_df.to_csv('kobert_kcbert_kobart_results_fixed_final.csv', index=False)

print("Results saved to kobert_kcbert_kobart_results_fixed_final.csv")

Processing Reviews: 100%|██████████| 500/500 [15:35<00:00,  1.87s/it]

Results saved to kobert_kcbert_kobart_results_fixed_final.csv





Results saved to kobert_kcbert_kobart_results.csv
