In [39]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load the train/test splits. "utf-8-sig" strips a leading BOM if present.
train_df = pd.read_csv("ratings_train.csv", encoding="utf-8-sig")
test_df = pd.read_csv("ratings_test.csv", encoding="utf-8-sig")

# Targets are read straight from the "label" column of each split.
y_train, y_test = train_df["label"], test_df["label"]

# 2. Bag-of-words counts using CountVectorizer's default tokenisation.
#    The vocabulary is learned from the training texts only; the test texts
#    are projected onto it.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])

# 3. Fit a multinomial naive Bayes classifier (fit() returns the estimator).
model = MultinomialNB().fit(X_train, y_train)

# 4. Predict on the test split and report the metrics.
y_pred = model.predict(X_test)

print("=== 📊 Confusion Matrix ===", confusion_matrix(y_test, y_pred), sep="\n")

print(
    "\n=== 🧾 Classification Report ===",
    classification_report(y_test, y_pred, digits=3),
    sep="\n",
)


=== 📊 Confusion Matrix ===
[[196  31]
 [105 167]]

=== 🧾 Classification Report ===
              precision    recall  f1-score   support

          -1      0.651     0.863     0.742       227
           1      0.843     0.614     0.711       272

    accuracy                          0.727       499
   macro avg      0.747     0.739     0.727       499
weighted avg      0.756     0.727     0.725       499



In [33]:
# Hand-written Korean stop-word list: particles, verb endings, conjunctions,
# pronouns, numerals, generic nouns and interjections.
# NOTE(review): none of the vectorizers in the cells visible here receives this
# list (no `stop_words=` argument) — confirm whether it is meant to be used.
# Some entries are repeated (e.g. '나', '이', '일'); that is harmless for
# membership tests and is left as-is.
stopwords = [
    '이', '가', '은', '는', '을', '를', '의', '에', '에서', '에게', '께', '로', '으로',
    '와', '과', '보다', '처럼', '만큼', '같이', '까지', '마저', '조차', '부터',
    '이나', '나', '이며', '며', '등', '하다', '한다', '하고', '하니', '하면',
    '되어', '되다', '되고', '되니', '입니다', '습니다', 'ㅂ니다', '어요', '아요', '다',
    '고', '면', '며', '게', '지', '죠',
    '그리고', '그러나', '하지만', '그런데', '그래서', '그러면', '그러므로', '따라서',
    '또한', '또는', '및', '즉', '한편', '반면에', '근데',
    '나', '저', '우리', '저희', '너', '너희', '당신', '그', '그녀', '그들', '누구',
    '무엇', '어디', '언제', '어느', '이것', '그것', '저것', '여기', '거기', '저기',
    '이쪽', '그쪽', '저쪽',
    '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉', '열',
    '일', '이', '삼', '사', '오', '육', '칠', '팔', '구', '십', '백', '천', '만',
    '첫째', '둘째', '셋째',
    '바로', '때', '것', '수', '일', '문제', '경우', '부분',
    '내용', '결과', '자체', '가지',
    # The comma after the last entry on the next line was missing, which made
    # Python concatenate it with the following literal into one bogus token.
    '뿐', '대로', '만큼', '만', '지', '따름', '나름', '김에', '터', '너무', '어요',
    '아', '아이고', '아이구', '아하', '어', '그래', '응', '네', '예', '아니',
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# TF-IDF features: IDF statistics are learned from the training texts only,
# then applied unchanged to the test texts.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_df["text"])
X_test = tfidf.transform(test_df["text"])

# Train multinomial naive Bayes on the TF-IDF matrix (fit() returns the
# estimator, so construction and training can be chained).
model = MultinomialNB().fit(X_train, y_train)

# Predict on the test split and report the metrics.
y_pred = model.predict(X_test)

print("=== TF-IDF Confusion Matrix ===", confusion_matrix(y_test, y_pred), sep="\n")
print(
    "\n=== TF-IDF Classification Report ===",
    classification_report(y_test, y_pred, digits=3),
    sep="\n",
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count features over unigrams and bigrams (single tokens plus adjacent
# token pairs); the vocabulary is learned from the training texts only.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_ng = ngram_vectorizer.fit_transform(train_df["text"])
X_test_ng = ngram_vectorizer.transform(test_df["text"])

# Train multinomial naive Bayes on the n-gram counts.
model = MultinomialNB().fit(X_train_ng, y_train)

# Predict on the test split and report the metrics.
y_pred_ng = model.predict(X_test_ng)

print("=== N-gram Confusion Matrix ===", confusion_matrix(y_test, y_pred_ng), sep="\n")
print(
    "\n=== N-gram Classification Report ===",
    classification_report(y_test, y_pred_ng, digits=3),
    sep="\n",
)


In [None]:
# TF-IDF weighting combined with unigram + bigram features.
# This rebinds `vectorizer`, `X_train` and `X_test`; cells that run after this
# one see these matrices rather than the earlier ones.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train, X_test = (
    vectorizer.fit_transform(train_df["text"]),  # evaluated first: learns vocabulary + IDF
    vectorizer.transform(test_df["text"]),       # evaluated second: reuses the fitted state
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on whatever X_train / y_train currently hold (the
# TF-IDF 1-2-gram matrix if the preceding cell was the last to assign them).
# The iteration cap is set explicitly to 500.
model = LogisticRegression(max_iter=500).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix first, then the per-class report, each on its own line(s).
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred, digits=3),
    sep="\n",
)

In [None]:
!pip uninstall sns

In [None]:
!pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load the extended train/test splits ("utf-8-sig" strips a leading BOM).
train_df = pd.read_csv("ratings_train_extended.csv", encoding="utf-8-sig")
test_df = pd.read_csv("ratings_test_extended.csv", encoding="utf-8-sig")

# 2. TF-IDF over unigrams + bigrams; vocabulary and IDF come from train only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])
y_train = train_df["label"]
y_test = test_df["label"]

# 3. Train the logistic regression model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 4. Predict on the test split and report the metrics.
y_pred = model.predict(X_test)
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=3))

# 5. Extract the most influential features per class.
feature_names = np.array(vectorizer.get_feature_names_out())
coef = model.coef_  # one row per class when there are 3+ classes
if coef.shape[0] == 1 and len(model.classes_) == 2:
    # With exactly two classes scikit-learn stores a single row that scores
    # classes_[1]; the evidence for classes_[0] is its negation. Expanding it
    # keeps the per-class loop below from indexing past the end of `coef`.
    coef = np.vstack([-coef, coef])

# Top-N features per class, ordered by descending coefficient.
topn = 10
weights = {}
for idx, class_label in enumerate(model.classes_):
    top_indices = np.argsort(coef[idx])[::-1][:topn]
    weights[class_label] = pd.DataFrame({
        'word': feature_names[top_indices],
        'weight': coef[idx][top_indices]
    })

# 6. Long-format frame for plotting; the class label is stored as a string.
df_plot = pd.concat(
    [df.assign(label=str(label)) for label, df in weights.items()],
    axis=0
)

# 7. Plot: per-class feature weights.
# Use the first installed Korean-capable font instead of hard-coding one that
# only exists on Windows; if none is installed the current font is kept.
from matplotlib import font_manager

installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
korean_font = next(
    (name for name in ('Malgun Gothic', 'AppleGothic', 'NanumGothic')
     if name in installed_fonts),
    None,
)
if korean_font is not None:
    plt.rcParams['font.family'] = korean_font
plt.rcParams['axes.unicode_minus'] = False

labels = ['-1 (부정)', '0 (중립)', '1 (긍정)']
color_dict = {'-1': '#e74c3c', '0': '#f1c40f', '1': '#2ecc71'}
title_by_label = dict(zip(['-1', '0', '1'], labels))

# One panel per class the model actually learned (same order as classes_).
plot_labels = [str(class_label) for class_label in model.classes_]

# sharey must stay off: every panel shows a different set of words, and a
# shared categorical y axis would mislabel or hide them.
fig, axes = plt.subplots(
    1, len(plot_labels),
    figsize=(6 * len(plot_labels), 6),
    sharey=False,
    squeeze=False,  # always 2-D, so indexing works for any number of panels
)
axes = axes[0]

for i, label in enumerate(plot_labels):
    subset = df_plot[df_plot['label'] == label]
    sns.barplot(
        ax=axes[i],
        data=subset,
        y='word',
        x='weight',
        color=color_dict.get(label)  # None -> seaborn's default colour
    )
    axes[i].set_title(f"감성 {title_by_label.get(label, label)} 상위 단어")
    axes[i].set_xlabel("가중치(weight)")
    axes[i].set_ylabel("단어")

plt.tight_layout()
plt.show()


In [None]:
from wordcloud import WordCloud

# Per-class {word: weight} mappings for the word clouds, built from the
# per-class frames in `weights` (keys are the same class labels).
word_weights = {
    label: {
        word: weight
        for word, weight in zip(frame['word'], frame['weight'])
    }
    for label, frame in weights.items()
}

# Word-cloud drawing helper
def draw_wordcloud(word_weight_dict, title, color,
                   font_path='/usr/share/fonts/truetype/nanum/NanumGothic.ttf'):
    """Render a word cloud from a {word: weight} mapping and show it.

    Args:
        word_weight_dict: Mapping of word -> numeric weight. Only entries with
            a strictly positive weight are drawn.
        title: Figure title.
        color: Name of the matplotlib colormap used to colour the words.
        font_path: Path to a TrueType font that contains Hangul glyphs. The
            default is a common Linux location for NanumGothic; pass another
            path on systems where that file does not exist.
    """
    # Weights are used as frequencies (relative sizes), so zero or negative
    # model coefficients have no meaningful size and are dropped.
    # NOTE(review): if nothing positive remains, WordCloud receives an empty
    # mapping and is expected to fail — confirm that is acceptable.
    positive_weights = {
        word: weight
        for word, weight in word_weight_dict.items()
        if weight > 0
    }
    wc = WordCloud(
        font_path=font_path,
        background_color='white',
        colormap=color,
        width=800,
        height=400
    )
    wc.generate_from_frequencies(positive_weights)
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20)
    plt.show()

# One word cloud per sentiment class: (class label, figure title, colormap),
# drawn in the order negative, neutral, positive.
for sentiment, cloud_title, cmap_name in (
    (-1, '부정 감성 주요 단어', 'Reds'),
    (0, '중립 감성 주요 단어', 'Oranges'),
    (1, '긍정 감성 주요 단어', 'Greens'),
):
    draw_wordcloud(word_weights[sentiment], cloud_title, cmap_name)


In [None]:
# Unseen sentences to classify.
new_texts = [
    "침대가 너무 불편해서 허리가 아팠어요.",
    "친절하고 조용해서 편하게 쉬었어요.",
    "그냥 무난한 숙소였어요. 특별한 건 없었어요."
]

# Reuse the already-fitted vectorizer so the features line up with the model.
X_new = vectorizer.transform(new_texts)

# Hard predictions plus the per-class probability row for each sentence.
predictions = model.predict(X_new)
probs = model.predict_proba(X_new)

# Print each sentence with its predicted label and probability row.
for text, predicted, class_probs in zip(new_texts, predictions, probs):
    print(f"문장: {text}")
    print(f"예측 감성: {predicted} (확률: {class_probs})\n")