In [None]:
stopwords = list(set([
    '이', '가', '은', '는', '을', '를', '의', '에', '에서', '에게', '께', '로', '으로', '하다', '있다',
    '와', '과', '보다', '처럼', '만큼', '같이', '까지', '마저', '조차', '부터', 
    '이나', '나', '이며', '며', '등', '하다', '한다', '하고', '하니', '하면', 
    '되어', '되다', '되고', '되니', '입니다', '습니다', 'ㅂ니다', '어요', '아요', '다', '방이', '제대로',
    '고', '면', '게', '지', '죠',
    '그리고', '그러나', '하지만', '그런데', '그래서', '그러면', '그러므로', '따라서', 
    '또한', '또는', '및', '즉', '한편', '반면에', '근데',
    '나', '저', '우리', '저희', '너', '너희', '당신', '그', '그녀', '그들', '누구', '그렇다',
    '무엇', '어디', '언제', '어느', '이것', '그것', '저것', '여기', '거기', '저기', 
    '이쪽', '그쪽', '저쪽',
    '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉', '열',
    '일', '이', '삼', '사', '오', '육', '칠', '팔', '구', '십', '백', '천', '만',
    '첫째', '둘째', '셋째',
    '바로', '때', '것', '수', '문제', '경우', '부분', '이다',
    '내용', '결과', '자체', '가지', '있다',
    '않았어요', '있었어요', '했어요', '했는데요', '있는데요', '합니다', '없다', '나다','생각하다',
    '했다', '같다', '네요','아니다',
    '좀', '너무', '정말', '많이', '조금',
    '사장', '이용', '용하다', '물이', '매우',
    '뿐', '대로', '만', '따름', '나름', '김에', '터',
    '아', '아이고', '아이구', '아하', '어', '그래', '응', '네', '예', '아니', '않다', '안되다','안','그냥',
    '가다', '오다', '주다', '말다', '나다', '받다', '알다', '모르다', '싶다', '생각하다', '들다'
]))

In [46]:
import pandas as pd
df = pd.read_csv('accom_review_source.csv')

In [47]:
df['review_type'].value_counts()


review_type
 1    141108
-1     15828
 0     14826
Name: count, dtype: int64

In [None]:
print(df['review_type'].unique())
print(df['review_type'].map(type).unique())

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from konlpy.tag import Okt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.font_manager as fm
import os
from tqdm import tqdm
import re

# 형태소 분석기 설정
okt = Okt()

def tokenize(text, stopwords=[]):
    try:
        return [
            word.lower()
            for word, pos in okt.pos(text, stem=True)
            if pos in ['Noun', 'Adjective']
            and word.lower() not in stopwords
            and len(word) > 1
        ]
    except Exception as e:
        print(f"Tokenization error: {e}")
        return []


In [None]:
#import os
#print(os.getcwd())  # 현재 작업 디렉토리
fm.findSystemFonts()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from konlpy.tag import Okt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.font_manager as fm
import os
from tqdm import tqdm
import re

# ----- 한글 폰트 설정 -----
font_path = "C:\\Users\\MYCOM\\AppData\\Local\\Microsoft\\Windows\\Fonts\\NotoSansKR-Bold.ttf"
font_prop = fm.FontProperties(fname=font_path)
plt.rc('font', family=font_prop.get_name())
plt.rcParams['axes.unicode_minus'] = False


# ----- 토크나이저 -----
okt = Okt()
def tokenize(text, stopwords=[]):
    try:
        return [
            word.lower()
            for word, pos in okt.pos(text, stem=True)
            if pos in ['Noun', 'Adjective']
            and word.lower() not in stopwords
            and len(word) > 1
        ]
    except Exception as e:
        print(f"Tokenization error: {e}")
        return []

# ----- 폴더 생성 -----
os.makedirs("output/wordcloud", exist_ok=True)
os.makedirs("output/barplot", exist_ok=True)

# ----- CSV 불러오기 -----
df = pd.read_csv('accom_review_type.csv')
df['review_type'] = df['review_type'].astype(int)

if not {'accommodation_id', 'review_id', 'content', 'review_type'}.issubset(df.columns):
    raise ValueError("'accommodation_id', 'review_id', 'content', 'review_type' 컬럼이 존재해야 합니다.")

# ----- 분석 시작 -----
hotel_names = df['accommodation_id'].unique()

for hotel in tqdm(hotel_names, desc="숙소별 키워드 분석"):
    df_hotel = df[df['accommodation_id'] == hotel]

    for label in [-1, 1]:  # 부정 / 긍정
        df_sentiment = df_hotel[df_hotel['review_type'] == label]

        if len(df_sentiment) < 5:
            continue

        # 감정별 추가 불용어 설정
        local_stopwords = stopwords.copy()
        if label == -1:
            local_stopwords += ['좋다', '예쁘다', '깔끔하다', '깨끗하다', '친절하다', '편안하다', '따뜻하다', '만족하다']

        # TF-IDF
        tfidf = TfidfVectorizer(tokenizer=lambda x: tokenize(x, stopwords=local_stopwords), max_features=1000)
        X = tfidf.fit_transform(df_sentiment['content'])
        feature_names = np.array(tfidf.get_feature_names_out())
        tfidf_mean = np.asarray(X.mean(axis=0)).ravel()

        # 키워드 정렬 및 필터링
        topn = 20
        top_idx = np.argsort(tfidf_mean)[::-1][:topn]
        top_words = feature_names[top_idx]
        top_scores = tfidf_mean[top_idx]

        filtered = [(w, s) for w, s in zip(top_words, top_scores) if w not in local_stopwords]

        # 워드클라우드용 데이터 (모두 사용)
        word_freq = dict(filtered)

        # 바 그래프용 상위 20개
        bar_keywords = filtered[:20]
        bar_words, bar_scores = zip(*bar_keywords) if bar_keywords else ([], [])

        # ----- 워드클라우드 저장 -----
        suffix = 'pos' if label == 1 else 'neg'
        cmap = "rainbow" if label == 1 else "gist_stern"
        wc = WordCloud(
            font_path=font_path,
            background_color='white',
            width=800,
            height=400,
            colormap=cmap,
            max_font_size=100,
            relative_scaling=0.3
        )
        wc.generate_from_frequencies(word_freq)
        wc_path = f"output/wordcloud/{(hotel)}_{suffix}.png"
        wc.to_file(wc_path)

        # ----- 바 그래프 저장 -----
        if bar_keywords:
            df_keywords = pd.DataFrame({'word': bar_words, 'score': bar_scores})
            plt.figure(figsize=(10, 5))
            
            base_palette = sns.color_palette("Blues", n_colors=len(df_keywords)) if label == 1 else sns.color_palette("Reds", n_colors=len(df_keywords))
            sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))
            #sns.barplot(data=df_keywords, y='word', x='score', palette='Blues' if label == 1 else 'Reds')
            plt.title(f"{hotel} - {'긍정' if label == 1 else '부정'} 키워드 (상위 {len(bar_keywords)}개)")
            plt.xlabel("TF-IDF 점수")
            plt.ylabel("단어")
            plt.tight_layout()
            bar_path = f"output/barplot/{(hotel)}_{suffix}.png"
            plt.savefig(bar_path)
            plt.close()
        else:
            print(f"{hotel} ({'긍정' if label == 1 else '부정'}) 바 그래프 생략 - 키워드 부족")


In [None]:
font_prop = fm.FontProperties(fname='NanumGothic')

In [None]:
# ----- 한글 폰트 설정 -----
plt.rc('font', family='NanumGothic')
plt.rcParams['axes.unicode_minus'] = False

# ----- 토크나이저 -----
okt = Okt()
def tokenize(text, stopwords=[]):
    try:
        return [
            word.lower()
            for word, pos in okt.pos(text, stem=True)
            if pos in ['Noun', 'Adjective']
            and word.lower() not in stopwords
            and len(word) > 1
        ]
    except Exception as e:
        print(f"Tokenization error: {e}")
        return []

# ----- 분석 시작 -----
hotel_names = df['name'].unique()

for hotel in tqdm(hotel_names, desc="숙소별 키워드 분석") :
    df_hotel = df[df['name'] == hotel]
    accom_ids = df_hotel['accommodation_id'].value_counts()
    accom_id = accom_ids.idxmax() if not accom_ids.empty else None

    for label in [-1, 1]:
        local_stopwords = stopwords.copy()
        if label == -1:
            local_stopwords += ['좋다', '좋', '좋습니다', '좋은', '예쁘다', '깔끔하다', '깨끗하다', '친절하다', '편안하다', '따뜻하다', '만족하다', '마을']
        else :
            local_stopwords += ['마을']
        tfidf = TfidfVectorizer(tokenizer=lambda x: tokenize(x, stopwords=local_stopwords), max_features=1000)
        # 이하 기존 작업 동일
        df_sentiment = df_hotel[df_hotel['review_type'] == label]
        if len(df_sentiment) < 5:
            continue
        try :
            X = tfidf.fit_transform(df_sentiment['content'])
        except Exception as e :
            print("오류 : ", e)
            continue
        feature_names = np.array(tfidf.get_feature_names_out())
        tfidf_mean = np.asarray(X.mean(axis=0)).ravel()
        
        topn = 20
        top_idx = np.argsort(tfidf_mean)[::-1][:topn]
        top_words = feature_names[top_idx]
        top_scores = tfidf_mean[top_idx]
        # 감정별 추가 불용어 설정

        # TF-IDF
        filtered = [(w, s) for w, s in zip(top_words, top_scores) if w.strip() not in local_stopwords]
        bar_keywords = filtered[:20]
        bar_words, bar_scores = zip(*bar_keywords) if bar_keywords else ([], [])

        # 워드클라우드 저장
        suffix = 'pos' if label == 1 else 'neg'
        # 바 플롯 저장
        df_keywords = pd.DataFrame({'word': bar_words, 'score': bar_scores})
        plt.figure(figsize=(10, 4))
        base_palette = sns.color_palette("Blues", n_colors=len(df_keywords)) if label == 1 else sns.color_palette("Reds", n_colors=len(df_keywords))
        sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))
        plt.title(f"{hotel} {'긍정' if label == 1 else '부정'} 키워드 (상위 20개)")
        plt.xlabel("TF-IDF 점수")
        plt.ylabel("단어")
        plt.tight_layout()
        plt.savefig(f"output/barplot/{accom_id}_{suffix}.png")
        plt.close()
    else:
        print(f"{hotel} ({'긍정' if label == 1 else '부정'}) 바 그래프 생략 - 키워드 부족")
        continue




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))
숙소별 키워드 분석:   0%|          | 1/437 [00:03<23:47,  3.27s/it]

전주 중앙동 라온 호텔 (긍정) 바 그래프 생략 - 키워드 부족



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))
숙소별 키워드 분석:   0%|          | 2/437 [00:19<1:19:25, 10.95s/it]

전주 산정동 호텔 감스테이 (긍정) 바 그래프 생략 - 키워드 부족



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))
숙소별 키워드 분석:   1%|          | 3/437 [00:29<1:14:06, 10.25s/it]

전주 중화산동 호텔 인트로(HOTEL INTRO) (긍정) 바 그래프 생략 - 키워드 부족



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))
숙소별 키워드 분석:   1%|          | 4/437 [00:40<1:18:33, 10.89s/it]

전주 신시가지 호텔 팝 (긍정) 바 그래프 생략 - 키워드 부족



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_keywords, y='word', x='score', palette=list(reversed(base_palette)))
숙소별 키워드 분석:   1%|          | 5/437 [00:55<1:29:21, 12.41s/it]

전주 산정동 호텔 레이나 (긍정) 바 그래프 생략 - 키워드 부족


숙소별 키워드 분석:   1%|          | 5/437 [00:57<1:22:16, 11.43s/it]


KeyboardInterrupt: 

In [None]:
sentiment_summary = []

for hotel in tqdm(hotel_names, desc="숙소별 키워드 분석") :
    df_hotel = df[df['name'] == hotel]
    accom_ids = df_hotel['accommodation_id'].value_counts()
    accom_id = accom_ids.idxmax() if not accom_ids.empty else None

    # 감성 점수 계산
    pos_count = len(df_hotel[df_hotel['review_type'] == 1])
    neg_count = len(df_hotel[df_hotel['review_type'] == -1])
    total = pos_count + neg_count
    if total > 0:
        pos_percent = round((pos_count / total) * 100, 1)
        neg_percent = round((neg_count / total) * 100, 1)
        sentiment_score = round(pos_count / total, 3)
    else:
        pos_percent = neg_percent = sentiment_score = None

    # 점수 저장
    sentiment_summary.append({
        'accommodation_id': accom_id,
        'name': hotel,
        'positive_count': pos_count,
        'negative_count': neg_count,
        'positive_percent': f"{pos_percent}%",
        'negative_percent': f"{neg_percent}%",
        'sentiment_score': sentiment_score
    })

summary_df = pd.DataFrame(sentiment_summary)

df = df.merge(summary_df, on='accommodation_id', how='left')