In [32]:
stopwords = list(set([
    '이', '가', '은', '는', '을', '를', '의', '에', '에서', '에게', '께', '로', '으로', 
    '와', '과', '보다', '처럼', '만큼', '같이', '까지', '마저', '조차', '부터', 
    '이나', '나', '이며', '며', '등', '하다', '한다', '하고', '하니', '하면', 
    '되어', '되다', '되고', '되니', '입니다', '습니다', 'ㅂ니다', '어요', '아요', '다', '방이', '제대로',
    '고', '면', '게', '지', '죠',
    '그리고', '그러나', '하지만', '그런데', '그래서', '그러면', '그러므로', '따라서', 
    '또한', '또는', '및', '즉', '한편', '반면에', '근데',
    '나', '저', '우리', '저희', '너', '너희', '당신', '그', '그녀', '그들', '누구', '그렇다',
    '무엇', '어디', '언제', '어느', '이것', '그것', '저것', '여기', '거기', '저기', 
    '이쪽', '그쪽', '저쪽',
    '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉', '열',
    '일', '이', '삼', '사', '오', '육', '칠', '팔', '구', '십', '백', '천', '만',
    '첫째', '둘째', '셋째',
    '바로', '때', '것', '수', '문제', '경우', '부분', '이다',
    '내용', '결과', '자체', '가지',
    '않았어요', '있었어요', '했어요', '했는데요', '있는데요', '합니다', '없다', '나다','생각하다',
    '했다', '같다', '네요','아니다',
    '좀', '너무', '정말', '많이', '조금',
    '사장', '이용', '용하다', '물이',
    '뿐', '대로', '만', '따름', '나름', '김에', '터',
    '아', '아이고', '아이구', '아하', '어', '그래', '응', '네', '예', '아니', '않다', '안되다','안','그냥',
    '가다', '오다', '주다', '말다', '나다', '받다', '알다', '모르다', '싶다', '생각하다', '들다'
]))

stopwords = set(w.lower() for w in stopwords)

In [27]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# 1. CSV 파일 불러오기
train_df = pd.read_csv("updated_ratings_train.csv", encoding="utf-8-sig")
test_df = pd.read_csv("updated_ratings_test.csv", encoding="utf-8-sig")

train_df = train_df[train_df['label'].isin([-1, 0, 1])]
test_df = test_df[test_df['label'].isin([-1, 0, 1])]

vectorizer = CountVectorizer(stop_words=stopwords)

# 2. 벡터화 (BoW 방식, 단순 단어 단위 토크나이징)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])

y_train = train_df["label"]
y_test = test_df["label"]

# 3. 모델 학습
model = MultinomialNB()
model.fit(X_train, y_train)

# 4. 예측 및 평가
y_pred = model.predict(X_test)

print("=== 📊 Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== 🧾 Classification Report ===")
print(classification_report(y_test, y_pred, digits=3))


=== 📊 Confusion Matrix ===
[[ 49  42  30]
 [ 26  53  64]
 [  5  18 214]]

=== 🧾 Classification Report ===
              precision    recall  f1-score   support

          -1      0.613     0.405     0.488       121
           0      0.469     0.371     0.414       143
           1      0.695     0.903     0.785       237

    accuracy                          0.631       501
   macro avg      0.592     0.560     0.562       501
weighted avg      0.610     0.631     0.607       501



In [None]:
#!pip install joblib

In [None]:
import joblib

# 모델과 벡터라이저 저장
joblib.dump(model, 'models/sentiment_model.pkl')
joblib.dump(vectorizer, 'models/tfidf_vectorizer.pkl')

print("✅ 모델과 벡터라이저 저장 완료!")

In [28]:
new_sentences = ["그냥 그럼", "진짜 별로에요"]
X_new = vectorizer.transform(new_sentences)
predictions = model.predict(X_new)
probs = model.predict_proba(X_new)

for i, text in enumerate(new_sentences):
    print(f"문장: {text}")
    print(f"예측 감성: {predictions[i]} (긍정 확률: {probs[i][1]:.3f})\n")

문장: 그냥 그럼
예측 감성: 0 (긍정 확률: 0.872)

문장: 진짜 별로에요
예측 감성: 1 (긍정 확률: 0.294)



In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# TF-IDF 벡터화
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_df["text"])
X_test = tfidf.transform(test_df["text"])

# 모델 학습
model = MultinomialNB()
model.fit(X_train, y_train)

# 예측 및 평가
y_pred = model.predict(X_test)

print("=== TF-IDF Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print("\n=== TF-IDF Classification Report ===")
print(classification_report(y_test, y_pred, digits=3))

=== TF-IDF Confusion Matrix ===
[[ 38  13  70]
 [ 15  24 104]
 [  3   3 231]]

=== TF-IDF Classification Report ===
              precision    recall  f1-score   support

          -1      0.679     0.314     0.429       121
           0      0.600     0.168     0.262       143
           1      0.570     0.975     0.720       237

    accuracy                          0.585       501
   macro avg      0.616     0.486     0.470       501
weighted avg      0.605     0.585     0.519       501



In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# 1~2그램 적용 (ex: "맛있어요", "정말 맛있어요")
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_ng = ngram_vectorizer.fit_transform(train_df["text"])
X_test_ng = ngram_vectorizer.transform(test_df["text"])

# 모델 학습
model = MultinomialNB()
model.fit(X_train_ng, y_train)

# 예측 및 평가
y_pred_ng = model.predict(X_test_ng)

print("=== N-gram Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred_ng))
print("\n=== N-gram Classification Report ===")
print(classification_report(y_test, y_pred_ng, digits=3))


=== N-gram Confusion Matrix ===
[[ 47  43  31]
 [ 28  52  63]
 [  5  19 213]]

=== N-gram Classification Report ===
              precision    recall  f1-score   support

          -1      0.588     0.388     0.468       121
           0      0.456     0.364     0.405       143
           1      0.694     0.899     0.783       237

    accuracy                          0.623       501
   macro avg      0.579     0.550     0.552       501
weighted avg      0.600     0.623     0.599       501



In [None]:
# TF-IDF + N-gram (1~2그램)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])

In [31]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[ 40  23  58]
 [ 18  39  86]
 [  6   6 225]]
              precision    recall  f1-score   support

          -1      0.625     0.331     0.432       121
           0      0.574     0.273     0.370       143
           1      0.610     0.949     0.743       237

    accuracy                          0.607       501
   macro avg      0.603     0.518     0.515       501
weighted avg      0.603     0.607     0.561       501



In [None]:
# !pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

from konlpy.tag import Okt
import warnings

# 1. 데이터 불러오기
train_df = pd.read_csv("ratings_train.csv", encoding="utf-8-sig")
test_df = pd.read_csv("ratings_test.csv", encoding="utf-8-sig")

# ✅ 2. 긍정(1)과 부정(0)만 남기고, 그 외 중립 제거
train_df = train_df[train_df["label"].isin([0, 1])].copy()
test_df = test_df[test_df["label"].isin([0, 1])].copy()
okt = Okt()
def tokenize(text):
    try:
        return [word for word, pos in okt.pos(text, stem=True) 
                if pos in ['Noun', 'Adjective'] 
                and word not in stopwords
                and len(word) > 1
                ]
    except:
        return []

# 3. 벡터화
vectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])
y_train = train_df["label"]
y_test = test_df["label"]

# 4. 모델 학습 (클래스 가중치 balanced)
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# 5. 예측 및 평가
y_pred = model.predict(X_test)
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=2))

# 6. 중요 단어 추출 (이진 분류는 coef_[0] 사용)
feature_names = np.array(vectorizer.get_feature_names_out())
coef = model.coef_[0]

topn = 30
top_pos_idx = np.argsort(coef)[::-1][:topn]
top_neg_idx = np.argsort(coef)[:topn]

df_pos = pd.DataFrame({'word': feature_names[top_pos_idx], 'weight': coef[top_pos_idx]})
df_neg = pd.DataFrame({'word': feature_names[top_neg_idx], 'weight': coef[top_neg_idx]})

# 7. 시각화
plt.rcParams['font.family'] = 'Malgun Gothic'  # Mac이면 AppleGothic
plt.rcParams['axes.unicode_minus'] = False

fig, axes = plt.subplots(1, 2, figsize=(18, 10), sharey=True)

sns.barplot(ax=axes[0], data=df_neg, y='word', x='weight', color='#e74c3c')
axes[0].set_title("부정 상위 단어 (label=0)")
axes[0].set_xlabel("가중치(weight)")
axes[0].set_ylabel("단어")

sns.barplot(ax=axes[1], data=df_pos, y='word', x='weight', color='#2ecc71')
axes[1].set_title("긍정 상위 단어 (label=1)")
axes[1].set_xlabel("가중치(weight)")

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
# 새 문장 리스트
df = pd.read_csv('36000_reviews.csv', encoding='utf-8-sig')

sentence = df['sentence']

# 벡터화 (학습한 vectorizer 사용)
X_new = vectorizer.transform(sentence)

# 예측 수행
predictions = model.predict(X_new)
probs = model.predict_proba(X_new)
labels = []
threshold_low = 0.3
threshold_high = 0.5

labels = []
for i, text in enumerate(sentence):
    prob_pos = probs[i][1]
    if prob_pos >= threshold_high:
        label = "긍정"
    elif prob_pos <= threshold_low:
        label = "부정"
    else:
        label = "중립"
    labels.append(label)

df['미세조정하면'] = labels

df.to_csv('리뷰_라벨.csv', encoding='utf-8-sig', index=False)


In [None]:
import numpy as np
feature_names = np.array(vectorizer.get_feature_names_out())

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

coef = model.coef_[0]

topn = 30
top_pos_idx = np.argsort(coef)[::-1][:topn]
top_neg_idx = np.argsort(coef)[:topn]
top_pos_idx = [ item for item in top_pos_idx if item not in stopwords ]
top_neg_idx = [ item for item in top_neg_idx if item not in stopwords ]
df_pos = pd.DataFrame({'word': feature_names[top_pos_idx], 'weight': coef[top_pos_idx]})
df_neg = pd.DataFrame({'word': feature_names[top_neg_idx], 'weight': coef[top_neg_idx]})

# 7. 시각화
plt.rcParams['font.family'] = 'Malgun Gothic'  # Mac이면 AppleGothic
plt.rcParams['axes.unicode_minus'] = False

fig, axes = plt.subplots(1, 2, figsize=(18, 10), sharey=True)

sns.barplot(ax=axes[0], data=df_neg, y='word', x='weight', color='#e74c3c')
axes[0].set_title("부정 상위 단어 (label=0)")
axes[0].set_xlabel("가중치(weight)")
axes[0].set_ylabel("단어")

sns.barplot(ax=axes[1], data=df_pos, y='word', x='weight', color='#2ecc71')
axes[1].set_title("긍정 상위 단어 (label=1)")
axes[1].set_xlabel("가중치(weight)")

plt.tight_layout()
plt.show()

In [None]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# tokenizer = T5Tokenizer.from_pretrained("KETI-AIR/ke-t5-base-ko-sentence-correction")
# model = T5ForConditionalGeneration.from_pretrained("KETI-AIR/ke-t5-base-ko-sentence-correction")

# input_sentence = "아버지가방에들어가신다"
# input_ids = tokenizer.encode(input_sentence, return_tensors="pt")

# outputs = model.generate(input_ids, max_length=128)
# corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print("수정 전:", input_sentence)
# print("수정 후:", corrected)

In [None]:
# import os
# from huggingface_hub import InferenceClient

# client = InferenceClient(
#     provider="featherless-ai",
#     api_key=os.environ["HF_TOKEN"],
# )

# result = client.text_generation(
#     "Can you please let us know more details about your ",
#     model="upstage/SOLAR-10.7B-v1.0",
# )