In [None]:

# 📦 1. Load the data and split it into train / test sets
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the labelled reviews; the "utf-8-sig" codec drops a leading BOM if present.
df = pd.read_csv("36000_reviews_label.csv", encoding="utf-8-sig")
# Keep only the two columns used below and drop rows missing either value.
df = df[['sentence', 'label']].dropna()
# Keep only rows whose label is one of the three expected classes
# (positive / negative / neutral).
df = df[df['label'].isin(['긍정', '부정', '중립'])]

# No fillna('') here: dropna() above already removed every missing sentence,
# so filling would be a no-op.
X = df['sentence']
y = df['label']

# Stratified 80/20 split with a fixed seed so the class ratios are preserved
# in both splits and the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:

# 🛠️ 2. Morphological analyzer and stopword definitions
from konlpy.tag import Okt

# Single module-level Okt tagger; custom_tokenizer calls okt.pos() on it.
okt = Okt()
# Tokens that custom_tokenizer discards even when their POS tag is accepted.
# Kept as a set because it is only used for membership tests.
stopwords = {
    # particles / postpositions
    '이', '가', '은', '는', '을', '를', '의', '에', '에서', '에게', '께', '로', '으로',
    '와', '과', '보다', '처럼', '만큼', '같이', '까지', '마저', '조차', '부터',
    '이나', '나', '이며', '며', '등',
    # frequent verb forms and sentence endings
    '하다', '한다', '하고', '하니', '하면',
    '되어', '되다', '되고', '되니',
    '입니다', '습니다', 'ㅂ니다', '어요', '아요', '다',
    # ad-hoc additions
    '방이', '제대로',
    # short endings
    '고', '면', '게', '지', '죠',
    # connective words
    '그리고', '그러나', '하지만', '그런데', '그래서',
    '또한', '또는', '및', '즉', '한편', '반면에', '근데',
    # intensifiers / degree adverbs
    '좀', '너무', '정말', '많이',
    # interjections, negation and very common verbs
    '아', '어', '예', '응', '네', '안',
    '않다', '가다', '오다', '이다',
}


In [None]:

# 🧼 3. tokenizer + 벡터화
from sklearn.feature_extraction.text import TfidfVectorizer

def custom_tokenizer(text):
    """Tokenize ``text`` with Okt and keep only the tokens used as features.

    The text is POS-tagged via ``okt.pos(text, stem=True)``. A token is kept
    only when all of the following hold:

    * its tag is ``'Noun'`` or ``'Adjective'``,
    * it is not listed in ``stopwords``,
    * it is longer than one character.

    Args:
        text: The string to tokenize.

    Returns:
        list: The surviving tokens, in the order the tagger produced them.
    """
    kept = []
    for token, tag in okt.pos(text, stem=True):
        if tag not in ('Noun', 'Adjective'):
            continue
        if token in stopwords:
            continue
        if len(token) <= 1:
            continue
        kept.append(token)
    return kept

# TF-IDF features over unigrams and bigrams of the custom tokens.
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    # A custom tokenizer replaces the regex-based one, so token_pattern is
    # never used. Passing None avoids scikit-learn's "token_pattern will not
    # be used since tokenizer is not None" UserWarning.
    token_pattern=None,
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5
)

# Fit the vocabulary and IDF weights on the training split only, then apply
# the same fitted transformation to the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:

# 🔎 4. Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# multi_class is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases (it triggers a FutureWarning), and with the
# lbfgs solver on a multiclass target the default already fits a multinomial
# model, so the fitted model is the same.
# class_weight='balanced' reweights classes inversely to their frequency.
logi = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')
logi.fit(X_train_vec, y_train)
logi_pred = logi.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out test split.
print("📌 Logistic Regression 결과:")
print(classification_report(y_test, logi_pred, digits=3))


In [None]:

# 🔎 5. SVM
from sklearn.svm import LinearSVC

# random_state is fixed (same seed as the train/test split) so repeated runs
# of the notebook give the same model; LinearSVC's solver uses a pseudo-random
# generator when shuffling data.
svm = LinearSVC(class_weight='balanced', C=1.0, max_iter=1000, random_state=42)
svm.fit(X_train_vec, y_train)
svm_pred = svm.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out test split.
print("📌 SVM 결과:")
print(classification_report(y_test, svm_pred, digits=3))


In [None]:

# 🔎 6. Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# random_state is fixed (same seed as the train/test split) so the fitted
# ensemble is repeatable across runs.
gbc = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
)
# The sparse TF-IDF matrices are passed as-is: GradientBoostingClassifier
# accepts sparse input for both fit and predict, so there is no need to
# materialise a dense (n_samples x n_features) array with .toarray().
gbc.fit(X_train_vec, y_train)
gbc_pred = gbc.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out test split.
print("📌 Gradient Boosting 결과:")
print(classification_report(y_test, gbc_pred, digits=3))


In [None]:

# 🎨 7. Confusion matrix visualisation (for GBC)
import os
import warnings

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib import font_manager as fm
import numpy as np

# Font used to render the Korean labels and title. The path is
# Windows-specific, so only apply it when the file actually exists; otherwise
# keep matplotlib's current font and warn instead of failing the whole cell.
font_path = "C:/Windows/Fonts/NanumGothic.ttf"
font_name = None
if os.path.isfile(font_path):
    font_name = fm.FontProperties(fname=font_path).get_name()
    plt.rc('font', family=font_name)
else:
    warnings.warn(
        f"Font file not found: {font_path}; "
        "Korean text in plots may not render correctly."
    )
# Use the ASCII hyphen for minus signs instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Fix the class order explicitly so matrix rows/columns and tick labels agree.
labels = sorted(df['label'].unique())
cm = confusion_matrix(y_test, gbc_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Oranges)
# Title the matrix axes directly rather than relying on pyplot's current axes.
disp.ax_.set_title("Gradient Boosting 혼동 행렬")
plt.show()


In [None]:

# 📊 8. Metric comparison bar chart (GBC)
import pandas as pd

report = classification_report(y_test, gbc_pred, output_dict=True)

# After the transpose each row is one report entry; the last three rows are
# dropped so only the leading rows are plotted (presumably the
# accuracy / macro avg / weighted avg summaries — confirm against the
# installed scikit-learn version).
df_report = pd.DataFrame(report).T.iloc[:-3]

metric_columns = ['precision', 'recall', 'f1-score']
ax = df_report[metric_columns].plot(kind='bar', figsize=(10, 6), rot=0)
ax.set_title("Gradient Boosting 성능 지표 비교")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
ax.grid(axis='y')
plt.show()
