In [8]:
# Korean stopword list handed to the tokenizer and later written to
# stopwords.json.  Duplicates in the literal (e.g. '나', '이', '만', '나다',
# '생각하다') are collapsed by the set; sorting makes the resulting list
# order deterministic, so the saved JSON does not depend on hash ordering.
# It stays a list because json.dump cannot serialise a set.
stopwords = sorted({
    '이', '가', '은', '는', '을', '를', '의', '에', '에서', '에게', '께', '로', '으로',
    '와', '과', '보다', '처럼', '만큼', '같이', '까지', '마저', '조차', '부터',
    '이나', '나', '이며', '며', '등', '하다', '한다', '하고', '하니', '하면',
    '되어', '되다', '되고', '되니', '입니다', '습니다', 'ㅂ니다', '어요', '아요', '다', '방이', '제대로',
    '고', '면', '게', '지', '죠',
    '그리고', '그러나', '하지만', '그런데', '그래서', '그러면', '그러므로', '따라서',
    '또한', '또는', '및', '즉', '한편', '반면에', '근데',
    '나', '저', '우리', '저희', '너', '너희', '당신', '그', '그녀', '그들', '누구', '그렇다',
    '무엇', '어디', '언제', '어느', '이것', '그것', '저것', '여기', '거기', '저기',
    '이쪽', '그쪽', '저쪽',
    '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉', '열',
    '일', '이', '삼', '사', '오', '육', '칠', '팔', '구', '십', '백', '천', '만',
    '첫째', '둘째', '셋째',
    '바로', '때', '것', '수', '문제', '경우', '부분', '이다',
    '내용', '결과', '자체', '가지', '있다',
    '않았어요', '있었어요', '했어요', '했는데요', '있는데요', '합니다', '없다', '나다', '생각하다',
    '했다', '같다', '네요', '아니다',
    '좀', '너무', '정말', '많이', '조금',
    '사장', '이용', '용하다', '물이',
    '뿐', '대로', '만', '따름', '나름', '김에', '터',
    '아', '아이고', '아이구', '아하', '어', '그래', '응', '네', '예', '아니', '않다', '안되다', '안', '그냥',
    '가다', '오다', '주다', '말다', '나다', '받다', '알다', '모르다', '싶다', '생각하다', '들다',
})

#stopwords = set(w.lower() for w in stopwords)


In [7]:
import os
import json
import joblib
from datetime import datetime
from functools import partial
import pandas as pd
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# ====== Settings ======
MODEL_NAME = "logistic_regression"
VERSION = "v1"  # free-form version tag, e.g. v1, v2, v20250712
BASE_DIR = "models/" + MODEL_NAME + "/" + VERSION
os.makedirs(BASE_DIR, exist_ok=True)
# NOTE: `stopwords` is not assigned in this cell; it must already exist
# (set it to [] here if no stopword filtering is wanted).

# ====== Load data ======
train_df = pd.read_csv("ratings_train.csv", encoding="utf-8-sig")
test_df = pd.read_csv("ratings_test.csv", encoding="utf-8-sig")

# Drop neutral rows: only rows labelled 0 or 1 are kept.
train_df = train_df.loc[train_df["label"].isin([0, 1])]
test_df = test_df.loc[test_df["label"].isin([0, 1])]

X_train_text, y_train = train_df["text"], train_df["label"]
X_test_text, y_test = test_df["text"], test_df["label"]

# ====== Tokenizer setup ======
# Single shared Okt instance, used by tokenize() below.
okt = Okt()
def tokenize(text, stopwords=()):
    """Return the lower-cased noun/adjective tokens of *text*.

    The text is tagged with the module-level ``okt`` instance
    (``okt.pos(text, stem=True)``).  A token is kept only when its tag is
    ``Noun`` or ``Adjective``, its lower-cased form is not a stopword, and it
    is longer than one character.

    Args:
        text: Document to tokenize.
        stopwords: Iterable of tokens to drop.  Defaults to an empty,
            immutable tuple so no mutable object is shared between calls.

    Returns:
        List of lower-cased tokens.  Any exception raised while tagging is
        reported with ``print`` and an empty list is returned instead
        (best-effort behaviour, so one bad document does not abort a fit).
    """
    try:
        # Hash-based membership test instead of scanning a list per token.
        if isinstance(stopwords, (set, frozenset)):
            blocked = stopwords
        else:
            blocked = frozenset(stopwords or ())

        tokens = []
        for word, pos in okt.pos(text, stem=True):
            if pos not in ('Noun', 'Adjective') or len(word) <= 1:
                continue
            lowered = word.lower()
            if lowered not in blocked:
                tokens.append(lowered)
        return tokens
    except Exception as e:
        print(f"Tokenization error: {e}")
        return []

# Bind the stopword list so the vectorizer can call the tokenizer with a
# single argument.
tokenizer_with_stopwords = partial(tokenize, stopwords=stopwords)

# ====== Vectorization ======
# token_pattern is unused once a custom tokenizer is supplied; passing None
# makes that explicit and avoids scikit-learn's "token_pattern will not be
# used" warning.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer_with_stopwords,
    token_pattern=None,
    ngram_range=(1, 2),
)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# ====== Model training ======
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# ====== Evaluation ======
y_pred = model.predict(X_test)
print("\n✅ [모델 평가 결과]")
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=3))

# ====== Persistence ======
# NOTE(review): the pickled vectorizer references `tokenize` through the
# partial; loading it elsewhere presumably needs that function importable
# under the same name - confirm before deploying.
joblib.dump(model, os.path.join(BASE_DIR, "model.pkl"))
joblib.dump(vectorizer, os.path.join(BASE_DIR, "vectorizer.pkl"))
with open(os.path.join(BASE_DIR, "stopwords.json"), "w", encoding="utf-8") as f:
    json.dump(stopwords, f, ensure_ascii=False)

# One timestamp for both metadata.json and saved_at.txt, so the two files
# always agree (previously each called datetime.now() separately).
saved_at = str(datetime.now())

# Metadata describing this run
metadata = {
    "model": "LogisticRegression",
    "description": "TF-IDF + Okt 기반 이진 감성 분석",
    "created_at": saved_at,
    "vectorizer": "TfidfVectorizer(ngram_range=(1,2))",
    "stopwords_count": len(stopwords),
    "train_size": len(train_df),
    "test_size": len(test_df)
}
with open(os.path.join(BASE_DIR, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)

with open(os.path.join(BASE_DIR, "saved_at.txt"), "w", encoding="utf-8") as f:
    f.write(saved_at)

print(f"\n✅ 모델과 벡터라이저가 [{BASE_DIR}]에 저장되었습니다.")





✅ [모델 평가 결과]
=== Confusion Matrix ===
[[171  43]
 [ 57 230]]

=== Classification Report ===
              precision    recall  f1-score   support

           0      0.750     0.799     0.774       214
           1      0.842     0.801     0.821       287

    accuracy                          0.800       501
   macro avg      0.796     0.800     0.798       501
weighted avg      0.803     0.800     0.801       501


✅ 모델과 벡터라이저가 [models/logistic_regression/v1]에 저장되었습니다.


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from konlpy.tag import Okt
from functools import partial
import joblib
import os

# ===== Settings =====
MODEL_DIR = "saved_model"
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_PATH, VECTORIZER_PATH = (
    os.path.join(MODEL_DIR, file_name)
    for file_name in ("logistic_model.pkl", "tfidf_vectorizer.pkl")
)

# ===== Load data =====
train_df = pd.read_csv("updated_ratings_train.csv", encoding="utf-8-sig")
test_df = pd.read_csv("updated_ratings_test.csv", encoding="utf-8-sig")

# Keep only rows whose label is one of the three classes -1, 0, 1.
train_df = train_df.loc[train_df["label"].isin([-1, 0, 1])]
test_df = test_df.loc[test_df["label"].isin([-1, 0, 1])]

X_train_text, y_train = train_df["text"], train_df["label"]
X_test_text, y_test = test_df["text"], test_df["label"]

# ===== Morphological-analyzer based tokenizer =====
okt = Okt()
# NOTE: `stopwords` is not assigned in this cell; the value already defined
# in the session is used.  A smaller alternative list was:
# stopwords = ["그냥", "좀", "정도", "이런", "저런"]

def tokenize(text, stopwords=()):
    """Return the noun/adjective tokens of *text*, minus stopwords.

    The text is tagged with the module-level ``okt`` instance
    (``okt.pos(text, stem=True)``).  A token is kept only when its tag is
    ``Noun`` or ``Adjective``, it is not a stopword, and it is longer than
    one character.  Tokens are returned as produced by the tagger (no case
    folding is applied here).

    Args:
        text: Document to tokenize.
        stopwords: Iterable of tokens to drop.  Defaults to an empty,
            immutable tuple so no mutable object is shared between calls.

    Returns:
        List of tokens.  Any exception raised while tagging is reported with
        ``print`` and an empty list is returned instead (best-effort).
    """
    try:
        # Hash-based membership test instead of scanning a list per token.
        if isinstance(stopwords, (set, frozenset)):
            blocked = stopwords
        else:
            blocked = frozenset(stopwords or ())

        return [
            word
            for word, pos in okt.pos(text, stem=True)
            if pos in ("Noun", "Adjective")
            and word not in blocked
            and len(word) > 1
        ]
    except Exception as e:
        print(f"Tokenize Error: {e}")
        return []

# Bind the stopword list so the vectorizer can call the tokenizer with a
# single argument.
tokenizer = partial(tokenize, stopwords=stopwords)

# ===== TF-IDF vectorization =====
# token_pattern is unused once a custom tokenizer is supplied; passing None
# makes that explicit and avoids scikit-learn's "token_pattern will not be
# used" warning.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    ngram_range=(1, 2),
)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# ===== Train the logistic-regression model =====
model = LogisticRegression(max_iter=1000, class_weight="balanced")
model.fit(X_train, y_train)

# ===== Print evaluation =====
y_pred = model.predict(X_test)
print("=== 📊 Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== 🧾 Classification Report ===")
print(classification_report(y_test, y_pred, digits=3))

# ===== Save model and vectorizer =====
joblib.dump(model, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)
print(f"\n✅ 모델이 저장되었습니다: {MODEL_PATH}")
print(f"✅ 벡터라이저 저장됨: {VECTORIZER_PATH}")

# ===== Prediction function =====
from functools import lru_cache


@lru_cache(maxsize=4)
def _load_artifacts(model_path, vectorizer_path):
    """Load the saved model and vectorizer once per (model, vectorizer) path pair.

    The in-memory ``tokenizer`` is re-attached to the loaded vectorizer, as
    the original code did on every call.

    NOTE: ``joblib.load`` unpickles the files, so they must come from a
    trusted source.  If the files are overwritten, call
    ``_load_artifacts.cache_clear()`` (or re-run this cell) to pick up the
    new versions.
    """
    loaded_model = joblib.load(model_path)
    loaded_vectorizer = joblib.load(vectorizer_path)
    loaded_vectorizer.tokenizer = tokenizer  # re-assign the tokenizer
    return loaded_model, loaded_vectorizer


def predict_sentiment(text):
    """Predict the sentiment label of a single piece of text.

    Args:
        text: The sentence/document to classify.

    Returns:
        "부정" for class -1, "중립" for class 0, "긍정" for class 1, or
        "알 수 없음" if the model returns any other label.
    """
    # Cached: avoids unpickling both artifacts from disk on every prediction.
    loaded_model, loaded_vectorizer = _load_artifacts(MODEL_PATH, VECTORIZER_PATH)

    X_input = loaded_vectorizer.transform([text])
    pred = loaded_model.predict(X_input)[0]

    label_map = {-1: "부정", 0: "중립", 1: "긍정"}
    return label_map.get(pred, "알 수 없음")

# ===== Example prediction =====
# Smoke test: classify one sample sentence and show the predicted label.
print("\n🧪 예시 문장 테스트:")
example = "위치는 괜찮았지만 다시 가고 싶진 않네요."
print("문장: " + example)
print("예측 감성: " + predict_sentiment(example))




=== 📊 Confusion Matrix ===
[[110  24  32]
 [ 29  23  20]
 [ 34  17 212]]

=== 🧾 Classification Report ===
              precision    recall  f1-score   support

          -1      0.636     0.663     0.649       166
           0      0.359     0.319     0.338        72
           1      0.803     0.806     0.805       263

    accuracy                          0.689       501
   macro avg      0.599     0.596     0.597       501
weighted avg      0.684     0.689     0.686       501


✅ 모델이 저장되었습니다: saved_model\logistic_model.pkl
✅ 벡터라이저 저장됨: saved_model\tfidf_vectorizer.pkl

🧪 예시 문장 테스트:
문장: 위치는 괜찮았지만 다시 가고 싶진 않네요.
예측 감성: 중립
