In [None]:
# Stopword list applied by the tokenizers in this notebook.
# set() removes the repeated entries (e.g. '하다', '있다', '나'); the order of
# the resulting list is therefore arbitrary.
stopwords = list(set([
    # particles and common endings
    '이', '가', '은', '는', '을', '를', '의', '에', '에서', '에게', '께', '로', '으로', '하다', '있다',
    '와', '과', '보다', '처럼', '만큼', '같이', '까지', '마저', '조차', '부터', 
    '이나', '나', '이며', '며', '등', '하다', '한다', '하고', '하니', '하면', 
    '되어', '되다', '되고', '되니', '입니다', '습니다', 'ㅂ니다', '어요', '아요', '다', '방이', '제대로',
    '고', '면', '게', '지', '죠',
    # conjunctions
    '그리고', '그러나', '하지만', '그런데', '그래서', '그러면', '그러므로', '따라서', 
    '또한', '또는', '및', '즉', '한편', '반면에', '근데',
    # pronouns and demonstratives
    '나', '저', '우리', '저희', '너', '너희', '당신', '그', '그녀', '그들', '누구', '그렇다',
    '무엇', '어디', '언제', '어느', '이것', '그것', '저것', '여기', '거기', '저기', 
    '이쪽', '그쪽', '저쪽',
    # numerals (native, Sino-Korean, ordinals)
    '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉', '열',
    '일', '이', '삼', '사', '오', '육', '칠', '팔', '구', '십', '백', '천', '만',
    '첫째', '둘째', '셋째',
    # generic nouns
    '바로', '때', '것', '수', '문제', '경우', '부분', '이다',
    '내용', '결과', '자체', '가지', '있다',
    # conjugated forms and frequent predicates
    '않았어요', '있었어요', '했어요', '했는데요', '있는데요', '합니다', '없다', '나다','생각하다',
    '했다', '같다', '네요','아니다',
    # intensifiers
    '좀', '너무', '정말', '많이', '조금',
    # presumably review-domain words / analyzer artifacts added by hand — TODO confirm
    '사장', '이용', '용하다', '물이', '매우',
    # dependent nouns
    '뿐', '대로', '만', '따름', '나름', '김에', '터',
    # interjections and negations
    '아', '아이고', '아이구', '아하', '어', '그래', '응', '네', '예', '아니', '않다', '안되다','안','그냥',
    # high-frequency verbs
    '가다', '오다', '주다', '말다', '나다', '받다', '알다', '모르다', '싶다', '생각하다', '들다'
]))

In [None]:
from functools import partial
from konlpy.tag import Okt
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load the train/test CSVs and keep only rows labelled 0 or 1 (neutral rows are dropped)
train_df = pd.read_csv("../data/train_data.csv", encoding="utf-8-sig").loc[
    lambda frame: frame["label"].isin([0, 1])
]
test_df = pd.read_csv("../data/test_data.csv", encoding="utf-8-sig").loc[
    lambda frame: frame["label"].isin([0, 1])
]

# Raw review text and target label for each split
X_train_text, y_train = train_df["text"], train_df["label"]
X_test_text, y_test = test_df["text"], test_df["label"]

# 2. Morphological analyzer used by the tokenizer function defined next
okt = Okt()

def tokenize(text, stopwords=()):
    """Split Korean text into lower-cased, stemmed content words using Okt.

    Args:
        text: Raw sentence handed to ``okt.pos``.
        stopwords: Collection of tokens to discard. Defaults to an empty
            tuple (immutable, so the default can never be shared and mutated
            between calls).

    Returns:
        A list of tokens tagged Noun, Adjective or Verb that are longer than
        one character and not in ``stopwords``. If the analyzer raises, the
        error is printed and an empty list is returned so that one bad
        document does not abort vectorization.
    """
    try:
        return [
            word.lower()
            for word, pos in okt.pos(text, stem=True)
            if pos in ['Noun', 'Adjective', 'Verb']
            and word.lower() not in stopwords
            and len(word) > 1
        ]
    except Exception as e:
        print(f"Tokenization error: {e}")
        return []

# Bind the stopword list so the vectorizer can call the tokenizer with a single argument
tokenizer_with_stopwords = partial(tokenize, stopwords=stopwords)


# 3. TF-IDF vectorization (unigrams + bigrams) with the custom tokenizer.
# token_pattern=None: the default regex is ignored once a tokenizer is supplied,
# and leaving it set makes scikit-learn emit a UserWarning on every fit.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer_with_stopwords,
    token_pattern=None,
    ngram_range=(1, 2),
)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# 4. Train the classifier; class_weight='balanced' reweights the two classes by inverse frequency
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# 5. Evaluate on the held-out split
y_pred = model.predict(X_test)
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=3))


In [None]:
import joblib

# Persist the fitted classifier and its vectorizer next to the notebook.
# NOTE(review): the vectorizer holds a reference to `tokenize`; presumably that
# function must be importable wherever the pickle is loaded — confirm.
for artifact, target_path in (
    (model, 'logistic_model.pkl'),
    (vectorizer, 'logistic_tfdf_vectorizer.pkl'),
):
    joblib.dump(artifact, target_path)

print("모델과 벡터라이저 저장 완료!")

In [None]:
#!pip install -U ipywidgets
#jupyter nbextension enable --py widgetsnbextension
#!pip install --upgrade huggingface_hub
#!pip install --upgrade transformers huggingface_hub

In [None]:
import transformers
import huggingface_hub

# Show the installed library versions so the environment is recorded in the cell output
print(transformers.__version__)   # ex: 4.42.1
print(huggingface_hub.__version__)  # ex: 0.23.2

In [None]:
import pandas as pd
# Frame consumed by the KoBERT fine-tuning cell (path is relative to the working directory).
# NOTE(review): that cell reads the "text" and "label" columns — confirm this CSV provides both.
df = pd.read_csv("ratings_train.csv")

In [None]:
#!pip install datasets
# import keras
# print(keras.__version__)
# !pip install tf-keras

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import torch

# 2. Train/validation split: 80/20, stratified on the label, fixed seed.
# Note that this rebinds `train_df`, which earlier held the logistic-regression training frame.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# 3. Tokenizer
# NOTE(review): KoBERT checkpoints are usually paired with their own
# SentencePiece-based tokenizer — confirm BertTokenizer loads this vocabulary correctly.
tokenizer = BertTokenizer.from_pretrained("skt/kobert-base-v1")

def tokenize(batch):
    """Encode the "text" field of a batch with the module-level tokenizer.

    Sequences are padded to, and truncated at, 64 tokens. Returns whatever the
    tokenizer call returns.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 64}
    return tokenizer(batch["text"], **encode_options)

# 4. Wrap both splits as HuggingFace Datasets and tokenize them in batches
train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize, batched=True)

# 5. Expose only the model inputs and the label, as torch tensors
for split in (train_dataset, val_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# 6. Load the pretrained KoBERT classifier with a two-class head.
# Bound to `kobert_model` rather than `model`: the later word-cloud and
# prediction cells read `model.coef_` / `model.predict_proba`, i.e. they need
# the LogisticRegression fitted earlier, which this cell used to overwrite.
kobert_model = BertForSequenceClassification.from_pretrained("skt/kobert-base-v1", num_labels=2)

# 7. Training arguments: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the best accuracy.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./kobert_results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# 8. Evaluation metrics
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Return accuracy and (binary) F1 for a Trainer evaluation result.

    Reads `pred.label_ids` as the gold labels and takes the argmax over the
    last axis of `pred.predictions` as the predicted class.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds)
    }

# 9. Build the Trainer and fine-tune
trainer = Trainer(
    model=kobert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np

# 1. Vocabulary and the per-feature coefficients of the logistic-regression model
feature_names = np.array(vectorizer.get_feature_names_out())
coef = model.coef_[0]  # binary classifier: coef_ has shape (1, n_features)

# 2. Indices of the strongest positive / negative features
topn = 30
top_pos_idx = np.argsort(coef)[::-1][:topn]
top_neg_idx = np.argsort(coef)[:topn]

# 3. Word -> weight mapping per class.
# The negative class stores |coef|: the word cloud treats the values as
# frequencies, so the most negative words must carry the largest positive
# weights instead of raw negative numbers.
word_weights = {
    1: dict(zip(feature_names[top_pos_idx], coef[top_pos_idx])),
    0: dict(zip(feature_names[top_neg_idx], np.abs(coef[top_neg_idx]))),
}

In [None]:

from matplotlib import font_manager as fm
# Make matplotlib render Hangul by switching to the NanumGothic font.
# NOTE(review): absolute Windows path — this only works where the font exists at this location.
font_path = "C:/Windows/Fonts/NanumGothic.ttf"
font_prop = fm.FontProperties(fname=font_path)
plt.rc('font', family=font_prop.get_name())
# Use the ASCII hyphen for negative numbers instead of the Unicode minus sign
plt.rcParams['axes.unicode_minus'] = False

def draw_wordcloud(word_weight_dict, title, color='Greens'):
    """Render a word cloud from a word -> weight mapping and show it.

    Args:
        word_weight_dict: Mapping passed to ``generate_from_frequencies``.
        title: Figure title.
        color: Matplotlib colormap name used for the words.
    """
    cloud_options = dict(
        font_path='C:/Windows/Fonts/NanumGothic.ttf',  # Windows font path; other platforms need a different one
        background_color='white',
        colormap=color,
        width=800,
        height=400,
    )
    cloud = WordCloud(**cloud_options)
    cloud.generate_from_frequencies(word_weight_dict)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20)
    plt.show()

In [None]:
# One cloud per class: key 0 holds the negative words, key 1 the positive words
draw_wordcloud(word_weights[0], '부정 키워드', color='Reds')
draw_wordcloud(word_weights[1], '긍정 키워드', color='Greens')

In [None]:
# Smoke-test the fitted pipeline on two hand-written reviews
new_texts = ["잠은 잘 잤는데 냄새났어요", "서비스는 좋은데 시설이 별로였어요"]
X_new = vectorizer.transform(new_texts)
pred = model.predict(X_new)
proba = model.predict_proba(X_new)

# Print each sentence with its predicted class and class probabilities
for text, text_pred, text_proba in zip(new_texts, pred, proba):
    print(f"문장: {text}")
    print(f"예측: {text_pred}, 확률: {text_proba}")

In [None]:
# 5. 저장
# joblib.dump(model, 'Logistic_model.pkl')
# joblib.dump(vectorizer, 'Logistic_tfidf_vectorizer.pkl')
# print("모델 및 벡터라이저 저장 완료!")

In [None]:
from wordcloud import WordCloud

# Build the {label: {word: weight}} mapping for the word clouds from per-label
# frames that carry 'word' and 'weight' columns.
# NOTE(review): no cell visible here defines `weights` — confirm it exists
# before this cell runs on a fresh kernel.
word_weights = {
    label: dict(zip(df['word'], df['weight']))
    for label, df in weights.items()
}
print(word_weights.keys())

# Word-cloud drawing helper (Linux font path variant)
def draw_wordcloud(word_weight_dict, title, color):
    """Render a word cloud from a word -> weight mapping and show it.

    Args:
        word_weight_dict: Mapping passed to ``generate_from_frequencies``.
        title: Figure title.
        color: Matplotlib colormap name used for the words.
    """
    cloud_options = dict(
        font_path='/usr/share/fonts/truetype/nanum/NanumGothic.ttf',
        background_color='white',
        colormap=color,
        width=800,
        height=400,
    )
    cloud = WordCloud(**cloud_options)
    cloud.generate_from_frequencies(word_weight_dict)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20)
    plt.show()

# 클래스별 워드클라우드 출력
# draw_wordcloud(word_weights[0], '부정 감성 주요 단어', 'Reds')
# draw_wordcloud(word_weights[1], '긍정 감성 주요 단어', 'Greens')

In [None]:
# Score the scraped reviews with the fitted vectorizer and classifier
import pandas as pd
df = pd.read_csv("../data/여기어때_리뷰.csv")

sentence = df['text']

# Vectorize with the vectorizer fitted during training
X_new = vectorizer.transform(sentence)

# Hard predictions and class probabilities
predictions = model.predict(X_new)
probs = model.predict_proba(X_new)

# Map P(positive) to 1 / -1 / 0. With threshold = 0.5 the first two branches
# cover every probability, so 0 is never produced; a threshold above 0.5
# opens a band in the middle that is labelled 0.
threshold = 0.5
labels = [
    1 if prob_pos >= threshold else (-1 if prob_pos <= 1 - threshold else 0)
    for prob_pos in probs[:, 1]
]

df['label'] = labels

In [None]:
# Keep only the columns that are exported together with the predicted label
nw_df = df[['name','rating','write_date','text','label']]

In [None]:
# Write the labelled reviews to CSV (UTF-8 with BOM, no index column), then show the label distribution
nw_df.to_csv('예측리뷰.csv', encoding='utf-8-sig', index=False)
nw_df['label'].value_counts()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

from konlpy.tag import Okt
import warnings

# Korean font setup for the charts (absolute Windows path)
font_path = "C:/Windows/Fonts/NanumGothic.ttf"
font_prop = fm.FontProperties(fname=font_path)
plt.rc('font', family=font_prop.get_name())
plt.rcParams['axes.unicode_minus'] = False

# 6. Most influential words (binary classifier, so coef_[0] holds all feature weights)
feature_names = np.array(vectorizer.get_feature_names_out())
coef = model.coef_[0]

topn = 20
top_pos_idx = np.argsort(coef)[::-1][:topn]
top_neg_idx = np.argsort(coef)[:topn]

df_pos = pd.DataFrame({'word': feature_names[top_pos_idx], 'weight': coef[top_pos_idx]})
df_neg = pd.DataFrame({'word': feature_names[top_neg_idx], 'weight': coef[top_neg_idx]})

# sharey=False: the two panels list different words, so each needs its own
# categorical y-axis; a shared axis would couple the word labels of both panels.
fig, axes = plt.subplots(1, 2, figsize=(18, 10), sharey=False)

sns.barplot(ax=axes[0], data=df_neg, y='word', x='weight', color='#e74c3c')
axes[0].set_title("부정 상위 단어 (label=0)")
axes[0].set_xlabel("가중치(weight)")
axes[0].set_ylabel("단어")

sns.barplot(ax=axes[1], data=df_pos, y='word', x='weight', color='#2ecc71')
axes[1].set_title("긍정 상위 단어 (label=1)")
axes[1].set_xlabel("가중치(weight)")

plt.tight_layout()
plt.show()

In [None]:
# List the font files matplotlib can find on this machine (helps when choosing font_path)
flist = fm.findSystemFonts()
flist

In [None]:
# Reload the exported predictions and make sure the label column is integer-typed
# (the cast must be assigned back; astype returns a new Series).
yeogi_df = pd.read_csv('예측리뷰.csv', encoding='utf-8-sig')
yeogi_df['label'] = yeogi_df['label'].astype(int)

# Reset the label to 0 for reviews whose cleaned text is just one of these
# generic words or is empty. Missing values are treated as empty text because
# empty CSV fields are read back as NaN.
# NOTE(review): the export cell writes only name/rating/write_date/text/label —
# confirm 'clean_reviews' is really present in this file.
generic_reviews = ['좋다', '깔끔', '좋은', '예쁘다', '']
is_generic = yeogi_df['clean_reviews'].fillna('').isin(generic_reviews)
yeogi_df.loc[is_generic, 'label'] = 0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from konlpy.tag import Okt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.font_manager as fm
import os
from tqdm import tqdm
import re

# Morphological analyzer used by the tokenize function below
okt = Okt()

def tokenize(text, stopwords=()):
    """Split Korean text into lower-cased, stemmed nouns and adjectives using Okt.

    Args:
        text: Raw sentence handed to ``okt.pos``.
        stopwords: Collection of tokens to discard. Defaults to an empty
            tuple (immutable, so the default can never be shared and mutated
            between calls).

    Returns:
        A list of tokens tagged Noun or Adjective that are longer than one
        character and not in ``stopwords``. If the analyzer raises, the error
        is printed and an empty list is returned.
    """
    try:
        return [
            word.lower()
            for word, pos in okt.pos(text, stem=True)
            if pos in ['Noun', 'Adjective']
            and word.lower() not in stopwords
            and len(word) > 1
        ]
    except Exception as e:
        print(f"Tokenization error: {e}")
        return []

# Korean font setup for the charts (absolute Windows path; also reused as the WordCloud font below)
font_path = "C:/Windows/Fonts/NanumGothic.ttf"
font_prop = fm.FontProperties(fname=font_path)
plt.rc('font', family=font_prop.get_name())
plt.rcParams['axes.unicode_minus'] = False

# ----- Output folders (created if missing) -----
os.makedirs("output/wordcloud", exist_ok=True)
os.makedirs("output/barplot", exist_ok=True)

def safe_filename(name):
    """Replace each run of the characters \\ / : " * ? < > | with one underscore."""
    forbidden_run = re.compile(r'[\\/:"*?<>|]+')
    return forbidden_run.sub("_", name)

# Validate the frame that the loop below actually iterates over
if not {'name', 'clean_reviews', 'label'}.issubset(yeogi_df.columns):
    raise ValueError("'name', 'clean_reviews', 'label' 컬럼이 존재해야 합니다.")

# Generic positive words that are hidden from the negative charts only.
# Kept separate from the shared `stopwords` list so the positive charts and
# later cells are unaffected.
positive_words = {'좋다', '예쁘다', '깔끔하다', '깨끗하다', '친절하다', '편안하다', '따뜻하다'}
# Only string entries are usable as stopwords; this also tolerates a list that
# was polluted with non-string items by an earlier run of this notebook.
base_stopwords = {w for w in stopwords if isinstance(w, str)}
topn = 30

# ----- Loop over accommodations -----
hotel_names = yeogi_df['name'].unique()

for hotel in tqdm(hotel_names, desc="숙소별 키워드 분석"):
    df_hotel = yeogi_df[yeogi_df['name'] == hotel]

    for label in [-1, 1]:  # negative (-1), positive (1)
        df_sentiment = df_hotel[df_hotel['label'] == label]

        # Skip groups with fewer than 6 reviews
        if len(df_sentiment) < 6:
            continue

        tfidf = TfidfVectorizer(tokenizer=tokenize, max_features=1000)
        try:
            # Missing reviews become empty strings; NaN is not a valid document
            X = tfidf.fit_transform(df_sentiment['clean_reviews'].fillna(""))
        except ValueError:
            # Raised when no token survives tokenization (empty vocabulary)
            continue
        feature_names = np.array(tfidf.get_feature_names_out())
        tfidf_mean = np.asarray(X.mean(axis=0)).ravel()

        top_idx = np.argsort(tfidf_mean)[::-1][:topn]
        top_words = feature_names[top_idx]
        top_scores = tfidf_mean[top_idx]

        # Negative charts additionally drop the generic positive words
        excluded = (base_stopwords | positive_words) if label == -1 else base_stopwords
        filtered = [(w, s) for w, s in zip(top_words, top_scores) if w not in excluded]
        if not filtered:
            # Nothing left to draw; WordCloud rejects an empty frequency dict
            continue

        top_words, top_scores = zip(*filtered)
        suffix = 'pos' if label == 1 else 'neg'

        # Word cloud
        word_freq = dict(zip(top_words, top_scores))
        wc = WordCloud(font_path=font_path, background_color='white', width=800, height=400)
        wc.generate_from_frequencies(word_freq)

        wc_path = f"output/wordcloud/{safe_filename(hotel)}_{suffix}.png"
        wc.to_file(wc_path)

        # Bar chart
        df_keywords = pd.DataFrame({'word': top_words, 'score': top_scores})
        plt.figure(figsize=(8, 5))
        sns.barplot(data=df_keywords, y='word', x='score', palette='Blues' if label== 1 else 'Reds')
        plt.title(f"{hotel} - {'긍정' if label== 1 else '부정'} 키워드")
        plt.xlabel("TF-IDF 점수")
        plt.ylabel("단어")
        plt.tight_layout()

        bar_path = f"output/barplot/{safe_filename(hotel)}_{suffix}.png"
        plt.savefig(bar_path)
        plt.close()

In [None]:
import os
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizer variant that filters against the module-level `stopwords`
def tokenize(text):
    """Split Korean text into lower-cased, stemmed content words using Okt.

    Keeps tokens tagged Noun, Adjective or Verb that are longer than one
    character and not in the module-level ``stopwords``. If anything raises,
    the error is printed and an empty list is returned.
    """
    kept_tags = ['Noun', 'Adjective', 'Verb']
    try:
        tokens = []
        for word, pos in okt.pos(text, stem=True):
            if pos not in kept_tags:
                continue
            lowered = word.lower()
            if lowered in stopwords:
                continue
            if len(word) <= 1:
                continue
            tokens.append(lowered)
        return tokens
    except Exception as e:
        print(f"Tokenization error: {e}")
        return []

def safe_filename(name):
    """Return `name` with every non-alphanumeric character replaced by an underscore."""
    cleaned = []
    for ch in name:
        cleaned.append(ch if ch.isalnum() else "_")
    return "".join(cleaned)

# Positive words that are excluded only for accommodations whose dominant label is negative
bad_to_good_words = ['좋다', '예쁘다', '깔끔하다', '깨끗하다', '친절하다', '편안하다']

# Only string entries are usable as stopwords. A set gives fast lookups and
# lets each accommodation extend its own copy without touching the shared list.
base_stopwords = {w for w in stopwords if isinstance(w, str)}

# Create the output folder once, not once per accommodation
os.makedirs("output/csv", exist_ok=True)

grouped = yeogi_df.groupby('name')
# Loop over accommodations
for hotel, group in tqdm(grouped, desc="숙소별 키워드 추출 진행중"):
    # Missing reviews become empty strings; NaN is not a valid document
    texts = group['clean_reviews'].fillna("").tolist()

    # TF-IDF over this accommodation's reviews
    vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=None, max_df=0.8, min_df=2)
    try:
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError as e:
        # Too few reviews for min_df/max_df, or no term left after pruning:
        # skip this accommodation instead of aborting the whole run.
        print(f"[!] 건너뜀: {hotel} ({e})")
        continue
    tfidf_mean = tfidf_matrix.mean(axis=0).A1
    feature_names = vectorizer.get_feature_names_out()

    # Up to 200 terms with the highest mean TF-IDF
    top_idx = tfidf_mean.argsort()[::-1][:200]
    top_words = feature_names[top_idx]
    top_scores = tfidf_mean[top_idx]

    # Raw token counts across all reviews of this accommodation
    word_counts = Counter(token for text in texts for token in tokenize(text))

    # The most frequent label decides which words are excluded
    label = group['label'].mode()[0]
    excluded = set(base_stopwords)
    if label == -1:
        excluded.update(bad_to_good_words)

    # Filter and assemble (word, score, count) rows
    filtered = [
        (w, s, word_counts.get(w, 0))
        for w, s in zip(top_words, top_scores)
        if w not in excluded and len(w) > 1
    ]
    if not filtered:
        continue

    top_words, _, top_counts = zip(*filtered)
    df_keywords = pd.DataFrame({
        'word': top_words,
        'count': top_counts
    })

    # Save one CSV per accommodation
    suffix = 'neg' if label == -1 else 'pos'
    save_name = safe_filename(hotel)
    csv_path = f"output/csv/{save_name}_{suffix}_keywords.csv"
    df_keywords.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"[✓] 저장 완료: {csv_path}")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Okt
from tqdm import tqdm

# Morphological analyzer used by the tokenize function below
okt = Okt()

# Tokenizer (no stopword filtering in this variant)
def tokenize(text):
    """Split Korean text into lower-cased, stemmed content words using Okt.

    Keeps tokens tagged Noun, Adjective or Verb that are longer than one
    character. If the analyzer raises an ordinary exception, the error is
    printed and an empty list is returned; KeyboardInterrupt and SystemExit
    are deliberately not caught, so a long run can still be interrupted.
    """
    try:
        return [
            word.lower()
            for word, pos in okt.pos(text, stem=True)
            if pos in ['Noun', 'Adjective', 'Verb'] and len(word) > 1
        ]
    except Exception as e:
        print(f"Tokenization error: {e}")
        return []

# Review texts as a plain list; missing values become empty strings
texts = yeogi_df['clean_reviews'].fillna("").tolist()

# TF-IDF over all reviews. `tokenize` is passed as the analyzer, so it receives the raw document.
# Note that this rebinds `vectorizer`; the earlier fitted vectorizer is no longer reachable under that name.
vectorizer = TfidfVectorizer(analyzer=tokenize, max_df=0.8, min_df=2)
tfidf_matrix = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()

# Per-review keyword extraction
def extract_keywords_from_row(row_idx, top_n=5):
    """Return the highest-scoring terms of one row of `tfidf_matrix`.

    Args:
        row_idx: Row position in the module-level ``tfidf_matrix``.
        top_n: Maximum number of terms to return.

    Returns:
        The selected terms joined by single spaces, best first; terms whose
        score is not positive are left out, so the string may be empty.
    """
    scores = tfidf_matrix.getrow(row_idx).toarray().flatten()
    ranked = scores.argsort()[::-1][:top_n]
    picked = []
    for col in ranked:
        if scores[col] > 0:
            picked.append(feature_names[col])
    return " ".join(picked)

# Build the 'keyword' column one row at a time, with a tqdm progress bar
yeogi_df['keyword'] = [extract_keywords_from_row(i) for i in tqdm(range(len(yeogi_df)), desc="리뷰 키워드 추출 중")]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
import pandas as pd
import torch

# 1. Load the data
df = pd.read_csv("keyword_test.csv")  # must contain the 'clean_reviews' and 'label' columns
df = df.dropna(subset=['clean_reviews', 'label'])  # drop rows with missing text or label
df['label'] = df['label'].astype(int)  # cast labels to int
# NOTE(review): the classifier below is created with num_labels=2 — confirm the
# labels in this file are only 0 and 1.

# 2. Train/validation split: 80/20, stratified on the label, fixed seed
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# 3. Load the tokenizer that belongs to the checkpoint named in model_name
model_name = "beomi/kcbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4. Tokenization function
def tokenize_function(example):
    """Encode the 'clean_reviews' field with the module-level tokenizer.

    Sequences are padded to, and truncated at, 128 tokens. Returns whatever
    the tokenizer call returns.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(example['clean_reviews'], **encode_options)

# 5. Convert both splits to HuggingFace Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# 6. Apply the tokenizer in batches
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# 7. Drop the raw text column.
# NOTE(review): only 'clean_reviews' is removed here; any other column of the
# CSV stays in the dataset — confirm that is intended.
train_dataset = train_dataset.remove_columns(['clean_reviews'])
val_dataset = val_dataset.remove_columns(['clean_reviews'])

# 8. Return PyTorch tensors
train_dataset.set_format("torch")
val_dataset.set_format("torch")

# 9. Load the pretrained model with a two-class classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 10. Evaluation metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) pair.

    The predicted class is the argmax of the logits along axis 1.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=1)
    accuracy = accuracy_score(labels, predicted)
    weighted_f1 = f1_score(labels, predicted, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

# 11. Training configuration: evaluate and checkpoint once per epoch, keep a
# single checkpoint on disk, and reload the one with the best F1 at the end.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./bert_sentiment_output",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    seed=42
)

# 12. Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 13. Run fine-tuning
trainer.train()

# 14. Save the model and the tokenizer into the same directory
trainer.save_model("./saved_bert_sentiment_model")
tokenizer.save_pretrained("./saved_bert_sentiment_model")
