In [6]:
import os
import pandas as pd
import torch
import re
from tqdm import tqdm
from soynlp.normalizer import repeat_normalize
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, BertTokenizer, BertForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob

# GPU 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 데이터셋 로드 및 전처리
file_path = './외국음식전문점.csv'
df = pd.read_csv(file_path)
df['content'] = df['content'].fillna('')
text_data = df['content'].tolist()

# 전처리 함수 정의
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^가-힣\s]', '', text)
    text = repeat_normalize(text, num_repeats=2)
    stopwords = ['이', '그', '저', '의', '을', '를', '은', '는', '에', '와', '과', '도', '으로', '그리고', '하지만', '그래서']
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text.strip()

preprocessed_data = [preprocess_text(review) for review in text_data]

# 카테고리 및 강화된 키워드 설정 ## 비추천 항목 추가
categories = ['맛', '가격', '서비스', '분위기', '추천', '비추천']
category_keywords = {
    '맛': ['맛', '요리', '음식', '메뉴', '향', '식감', '맛집', '고소', '달콤', '짭짤', '신선', '풍미', '스파이시', '향긋', '촉촉', '바삭', '탱탱', '육즙', '알싸', '상큼', '쫄깃'],
    '가격': ['가격', '비싸', '저렴', '가성비', '비용', '비교', '비율', '가격대', '할인', '구성', '합리적', '저렴하다', '가격 대비', '적당', '경제적', '비용 절감', '가치', '수익', '돈값', '고가', '저가'],
    '서비스': ['서비스', '직원', '응대', '친절', '매너', '대응', '접대', '환대', '서빙', '응대', '고객', '케어', '친절하다', '프로페셔널', '빠르다', '세심하다', '친절도', '고객 관리', '서비스 정신', '상냥', '대접'],
    '분위기': ['분위기', '환경', '인테리어', '공간', '장식', '조명', '음악', '장소', '배경', '아늑하다', '세련', '깔끔', '모던', '클래식', '고급스럽다', '편안하다', '우아', '로맨틱', '분위기 있는', '차분하다', '힐링'],
    '추천': ['추천', '소개', '다음에 또', '재방문', '다시 방문', '친구', '소개하고 싶다', '꼭 오고 싶다', '꼭 다시', '나만 알고 싶다'],
    '비추천': ['비추천', '다신 안', '절대 안', '다시 오지 않다', '다신 안 가다', '비추', '실망', '후회', '최악']
}

# 리뷰 카테고리 분류 함수
def categorize_review(review):
    tfidf_vectorizer = TfidfVectorizer()
    keyword_docs = [" ".join(keywords) for keywords in category_keywords.values()]
    tfidf_matrix = tfidf_vectorizer.fit_transform(keyword_docs + [review])
    cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    top_categories = [categories[i] for i in cosine_similarities.argsort()[-2:]]
    return top_categories

# 대표 메뉴 추출
menu_keywords = ["피자", "스파게티", "버거", "스테이크", "초밥"]  # 예시 메뉴 키워드
menu_counter = Counter()
for review in preprocessed_data:
    for menu in menu_keywords:
        if menu in review:
            menu_counter[menu] += 1

top_menus = menu_counter.most_common(3)

# 감정 분석 모델 로드
tokenizer_kobert = BertTokenizer.from_pretrained('monologg/kobert')
model_kobert = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=3).to(device)  # 3개의 레이블: 긍정, 부정, 중립

# 감정 분석 수행 및 카테고리별 비율 분석
sentiment_results = []
category_sentiments = {category: Counter() for category in categories}

for review in tqdm(preprocessed_data, desc="Processing Reviews"):
    inputs_kobert = tokenizer_kobert(review, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        output_kobert = model_kobert(**inputs_kobert)
        sentiment = torch.argmax(output_kobert.logits, dim=1).item()

    categories_found = categorize_review(review)
    sentiment_results.append(sentiment)
    for category in categories_found:
        category_sentiments[category][sentiment] += 1

# 리뷰 요약 모델 로드 및 요약 생성 (간결한 요약 설정)
bart_tokenizer = PreTrainedTokenizerFast.from_pretrained('EbanLee/kobart-summary-v3')
bart_model = BartForConditionalGeneration.from_pretrained('EbanLee/kobart-summary-v3').to(device)

def summarize_reviews(reviews):
    combined_reviews = " ".join(reviews)
    inputs_bart = bart_tokenizer([combined_reviews], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    summary_ids = bart_model.generate(
        inputs_bart['input_ids'], 
        attention_mask=inputs_bart['attention_mask'], 
        max_length=50,  # 더 짧은 요약
        min_length=20,  
        length_penalty=2.0,  
        num_beams=4, 
        no_repeat_ngram_size=2,  
        repetition_penalty=1.2,  
        early_stopping=True
    )
    summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    # TextBlob을 사용하여 문장 자연스러움 개선
    blob = TextBlob(summary)
    corrected_summary = str(blob.correct())
    
    # 'of' 같은 불필요한 단어 제거
    corrected_summary = re.sub(r'\bof\b', '', corrected_summary)
    
    return corrected_summary

# 가게별 리뷰 종합 요약 및 점수 계산

# 그룹화된 데이터를 가게별로 가져오기
store_grouped = df.groupby('store_name')['content']

store_summaries = []

# 순차적으로 가게별 리뷰를 요약 생성 및 점수 계산
for store_name, reviews in tqdm(store_grouped, desc="Processing Stores"):
    try:
        reviews = reviews.tolist()  # 그룹화된 데이터는 시리즈로 나오므로 리스트로 변환
        preprocessed_reviews = [preprocess_text(review) for review in reviews]
        summary = summarize_reviews(preprocessed_reviews)

        # 감성 비율 계산
        sentiments = [sentiment_results[i] for i, r in enumerate(preprocessed_data) if r in reviews]
        sentiment_counts = Counter(sentiments)
        total_reviews = len(sentiments)
        sentiment_ratios = {label: sentiment_counts.get(i, 0) / total_reviews for i, label in enumerate(['Negative', 'Neutral', 'Positive'])}

        # 카테고리별 점수 계산
        category_scores = {}
        for category in categories:
            category_count = sum(category_sentiments[category].values())
            category_scores[category] = category_sentiments[category][2] / category_count if category_count > 0 else 0  # 긍정 점수

        # 최종 점수 계산 (예: 감성 비율과 카테고리 점수의 평균)
        overall_score = (sentiment_ratios['Positive'] + sum(category_scores.values()) / len(categories)) / 2

        # 결과 저장
        store_summaries.append({
            'Store Name': store_name,
            'Summary': summary,
            'Negative Ratio': sentiment_ratios['Negative'],
            'Neutral Ratio': sentiment_ratios['Neutral'],
            'Positive Ratio': sentiment_ratios['Positive'],
            'Taste Score': category_scores['맛'],
            'Price Score': category_scores['가격'],
            'Service Score': category_scores['서비스'],
            'Atmosphere Score': category_scores['분위기'],
            'Recommendation Score': category_scores['추천'],
            'Non-recommendation Score': category_scores['비추천'],
            'Overall Score': overall_score
        })

    except Exception as e:
        print(f"Error processing store {store_name}: {e}")

# 현재 날짜와 시간을 가져와서 파일 이름에 포함
current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
file_name = f'{current_time}_리뷰요약및점수.csv'

# 결과를 CSV 파일로 저장
try:
    final_results = pd.DataFrame(store_summaries)
    final_results.to_csv(file_name, index=False)
    print(f"Results saved to {file_name}")
except Exception as e:
    print(f"Error in saving results to CSV: {e}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Reviews: 100%|██████████| 53319/53319 [42:11<00:00, 21.07it/s]  
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Processing Stores:   8%|▊         | 62/737 [03:05<36:20,  3.23s/it]

Error processing store 남미 플랜트 랩: division by zero


Processing Stores:  10%|█         | 76/737 [03:46<31:41,  2.88s/it]

Error processing store 다올: division by zero


Processing Stores:  13%|█▎        | 93/737 [04:41<33:08,  3.09s/it]

Error processing store 도만다 신용산점: division by zero


Processing Stores:  14%|█▍        | 104/737 [05:14<31:13,  2.96s/it]

Error processing store 동남옥탑: division by zero


Processing Stores:  15%|█▌        | 114/737 [05:45<31:31,  3.04s/it]

Error processing store 따올라이 상도점: division by zero


Processing Stores:  16%|█▋        | 121/737 [06:07<31:11,  3.04s/it]

Error processing store 라뜰리에 이은: division by zero


Processing Stores:  17%|█▋        | 126/737 [06:23<32:36,  3.20s/it]

Error processing store 라페리체: division by zero


Processing Stores:  23%|██▎       | 166/737 [08:25<28:32,  3.00s/it]

Error processing store 릴리스이나살: division by zero


Processing Stores:  26%|██▋       | 194/737 [09:48<27:33,  3.05s/it]

Error processing store 멕시칸라이브그릴 송파점: division by zero


Processing Stores:  27%|██▋       | 201/737 [10:09<26:19,  2.95s/it]

Error processing store 모아쏭 콤마: division by zero


Processing Stores:  27%|██▋       | 202/737 [10:11<22:02,  2.47s/it]

Error processing store 몬스택 버거: division by zero


Processing Stores:  37%|███▋      | 271/737 [13:34<20:03,  2.58s/it]

Error processing store 백스트리트독스(Backstreet Dogs): division by zero


Processing Stores:  42%|████▏     | 307/737 [15:19<22:16,  3.11s/it]

Error processing store 비스트로베름: division by zero


Processing Stores:  42%|████▏     | 310/737 [15:27<20:40,  2.90s/it]

Error processing store 비아세라(Via sera): division by zero


Processing Stores:  43%|████▎     | 316/737 [15:45<21:28,  3.06s/it]

Error processing store 사계원: division by zero


Processing Stores:  45%|████▍     | 328/737 [16:18<18:29,  2.71s/it]

Error processing store 살롱 드 이자드(Salon de Izzard): division by zero


Processing Stores:  48%|████▊     | 353/737 [17:37<20:04,  3.14s/it]

Error processing store 쉐 올리비아: division by zero


Processing Stores:  54%|█████▍    | 401/737 [19:58<16:46,  3.00s/it]

Error processing store 안녕 쿠마: division by zero


Processing Stores:  64%|██████▍   | 473/737 [23:36<14:09,  3.22s/it]

Error processing store 육도담 압구정로데오점: division by zero


Processing Stores:  72%|███████▏  | 527/737 [26:17<10:54,  3.12s/it]

Error processing store 초비국수 서울구로점: division by zero


Processing Stores:  77%|███████▋  | 568/737 [28:19<08:18,  2.95s/it]

Error processing store 클레오파트라 라운지카페(cleopatra lounge cafe): division by zero


Processing Stores:  81%|████████  | 594/737 [29:35<07:47,  3.27s/it]

Error processing store 태그누: division by zero


Processing Stores:  82%|████████▏ | 603/737 [30:01<06:33,  2.93s/it]

Error processing store 트라토리아 하이디(TRATTORIA HEIDI): division by zero


Processing Stores:  84%|████████▍ | 621/737 [30:55<05:38,  2.92s/it]

Error processing store 페르바코(PERBACCO): division by zero


Processing Stores:  92%|█████████▏| 676/737 [33:37<02:30,  2.46s/it]

Error processing store 피자스톰 강남점: division by zero


Processing Stores:  95%|█████████▍| 699/737 [34:50<01:59,  3.15s/it]

Error processing store 핸드파스타(Hand pasta): division by zero


Processing Stores:  98%|█████████▊| 723/737 [36:01<00:41,  2.98s/it]

Error processing store 후안타이: division by zero


Processing Stores: 100%|██████████| 737/737 [36:41<00:00,  2.99s/it]

Results saved to 20240901_170257_리뷰요약및점수.csv





In [9]:
import pandas as pd

# 데이터 불러오기
file_path = './20240901_170257_리뷰요약및점수.csv'  # 실제 파일 경로로 변경하세요
df = pd.read_csv(file_path)

# 가중치 설정
weights = {
    'Positive Ratio': 50,
    'Neutral Ratio': 20,
    'Negative Ratio': -30,
    'Recommendation Score': 30,
    'Non-recommendation Score': -30
}

# 각 가게의 최종 점수 계산 (100점 만점)
df['Final Score'] = (
    df['Positive Ratio'] * weights['Positive Ratio'] +
    df['Neutral Ratio'] * weights['Neutral Ratio'] +
    df['Negative Ratio'] * weights['Negative Ratio'] +
    df['Recommendation Score'] * weights['Recommendation Score'] +
    df['Non-recommendation Score'] * weights['Non-recommendation Score']
)

# 100점 만점으로 정규화
df['Final Score'] = df['Final Score'].clip(0, 100)  # 점수가 0보다 작거나 100을 넘지 않도록 조정

# 상위 10개 가게 선택
top_10_stores = df.sort_values(by='Final Score', ascending=False).head(10)

# 최종 점수 출력
print(top_10_stores[['Store Name', 'Final Score']])


               Store Name  Final Score
249            발리다포차(한남점)    50.151089
262   버팔로스팟(Buffalo Spot)    50.137258
0                 (주)애브릿지    50.137232
707                 흑석호치민    50.137232
381             아토키토 보라매점    50.137232
136      로찌마곡(LodgeMagok)    50.137232
24              강남쌀국수 마곡점    50.137232
541                퀼 QUIL    50.123388
422                   오빈스    50.109557
523  커피버넷(COFFEE BURNETT)    50.109557
