In [None]:
import re

def clean_text(text):
    """Strip everything except Hangul syllables, ASCII digits and whitespace.

    Args:
        text: Any object; it is converted with ``str()`` before filtering.

    Returns:
        The filtered text with leading/trailing whitespace removed and every
        run of whitespace collapsed to a single space.
    """
    kept = re.sub(r"[^가-힣0-9\s]", "", str(text))
    # split() with no argument drops empty tokens, so joining normalises spacing
    return ' '.join(kept.split())

In [None]:
import json
# Read the word -> integer-index mapping that was stored as JSON.
with open("word_index.json", mode="r", encoding='utf-8') as f:
    word_index = json.loads(f.read())
num_indexed_words = len(word_index)
print(f"워드 인덱스 단어 개수 : {num_indexed_words}")
# One more than the number of entries (presumably to leave room for index 0 — confirm)
vocab_size = num_indexed_words + 1

In [None]:
# Cap on the number of elements kept per sample
max_length = 100
# Model hyperparameters: embedding vector dimension and hidden unit count
embedding_dim, hidden_units = 100, 128

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense

# Sequential model holding only the input (embedding) layer.
model = Sequential([
    Embedding(
        input_dim=vocab_size,     # number of distinct token ids (size of the word dictionary + 1)
        output_dim=embedding_dim,  # how many features represent each word
        input_length=max_length,   # number of elements per training sample
    ),
])

In [None]:
import pickle
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only open files you trust.
    with open(path, "rb") as fr:
        return pickle.load(fr)


# Feature sequences for the train / test splits
X_train = _load_pickle("X_train_sequences.pickle")
X_test = _load_pickle("X_test_sequences.pickle")
# Labels for the train / test splits
y_train = _load_pickle("y_train_filterd.pickle")
y_test = _load_pickle("y_test_filterd.pickle")

In [None]:
# Report how many samples and labels were loaded for each split.
print(
    "로드된 데이터 개수 :",
    f"X_train : {len(X_train)} / y_train : {len(y_train)}",
    f"X_test : {len(X_test)} / y_test : {len(y_test)}",
    sep="\n",
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# 1. Raw training sentences as cleaned strings
train_df = pd.read_csv("train_data.csv")
X_train_raw = [clean_text(sentence) for sentence in train_df['sentence'].astype(str)]

# 2. Fit the tokenizer on the training text
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_raw)

# 3. Text -> integer sequences, 4. pad/truncate at the end to a fixed length
X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_train_pad = pad_sequences(
    X_train_seq,
    maxlen=100,
    padding='post',
    truncating='post',
    dtype='int32',
)

# 5. Labels as an int32 array
y_train = np.asarray(train_df['label'].astype(np.int32))

In [None]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def clean_text(text):
    """Remove every character that is not a Hangul syllable, digit or whitespace.

    The input is converted with ``str()`` first; the result has its whitespace
    trimmed at both ends and collapsed to single spaces.
    """
    disallowed = re.compile(r"[^가-힣0-9\s]")
    tokens = disallowed.sub("", str(text)).split()
    return ' '.join(tokens)

# Load both splits
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# Clean the sentence column of each frame in place
for frame in (train_df, test_df):
    frame['sentence'] = frame['sentence'].astype(str).apply(clean_text)

# Plain Python lists of text, and int32 label arrays
X_train, X_test = train_df['sentence'].tolist(), test_df['sentence'].tolist()
y_train = train_df['label'].astype(np.int32).values
y_test = test_df['label'].astype(np.int32).values

# Fit the tokenizer on the training text only
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Text -> integer sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad / truncate at the end so every sample has max_len entries
max_len = 100
pad_options = dict(maxlen=max_len, padding='post', truncating='post', dtype='int32')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)


In [None]:
import numpy as np
# Count how many training samples carry each label value.
unique, counts = np.unique(y_train, return_counts=True)
for position in range(len(unique)):
    print(f"라벨 {unique[position]} 개수: {counts[position]}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.utils import class_weight

# Balanced class weights: labels that occur less often get a larger weight in the loss.
label_classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=label_classes,
    y=y_train
)
# Key each weight by its actual label value (not by its position in the array),
# using plain Python int/float so the dict does not carry numpy scalar types.
class_weights = {int(label): float(weight)
                 for label, weight in zip(label_classes, balanced_weights)}


# Binary classifier: embedding -> LSTM -> small dense head with a sigmoid output.
# input_dim=10000 is the same number passed as num_words to the Tokenizer.
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=max_len),
    LSTM(64),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=1e-4),
              metrics=['accuracy'])

# Stop when val_loss has not improved for 3 epochs; separately keep the checkpoint
# with the best val_accuracy in best_model.h5.
es = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)

history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[es, mc],
    class_weight=class_weights
)

# Save only AFTER training. Saving before fit() (as this cell used to do) writes the
# randomly initialised weights, and that untrained file is what load_model() reads later.
# This stores the weights as of the last epoch run; the best-val_accuracy checkpoint
# is the separate best_model.h5 written by ModelCheckpoint.
model.save("best_model_keras.keras")


In [None]:
#!pip install --upgrade transformers datasets accelerate

In [2]:
from transformers import Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch

# Every object created in this cell carries an `hf_` prefix so that it does not overwrite
# the Keras `model` / `tokenizer` / `train_df` / `test_df` / `df` globals that other cells
# in this notebook still read (pickling the tokenizer, model.evaluate, model.predict).

# 1. Load the data
hf_df = pd.read_csv("train_data.csv")
# The Keras cells read this same CSV through a 'sentence' column; accept either name.
if "text" not in hf_df.columns and "sentence" in hf_df.columns:
    hf_df = hf_df.rename(columns={"sentence": "text"})
hf_df = hf_df[['text', 'label']].copy()  # label: 0 = negative, 1 = positive
# The tokenizer expects strings; missing values would otherwise arrive as float NaN.
hf_df['text'] = hf_df['text'].astype(str)

# 2. Stratified train / evaluation split (fixed seed for reproducibility)
hf_train_df, hf_eval_df = train_test_split(
    hf_df, test_size=0.2, random_state=42, stratify=hf_df['label']
)

# 3. Convert to Hugging Face Datasets (drop the shuffled pandas index instead of
#    carrying it along as an extra column)
hf_train_dataset = Dataset.from_pandas(hf_train_df, preserve_index=False)
hf_eval_dataset = Dataset.from_pandas(hf_eval_df, preserve_index=False)

# 4. Tokenizer and preprocessing function
model_name = "klue/roberta-base"  # alternative: "monologg/kobert"
hf_tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(examples):
    """Tokenize a batch's "text" field, padded/truncated to 128 tokens."""
    return hf_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

hf_train_dataset = hf_train_dataset.map(tokenize_fn, batched=True)
hf_eval_dataset = hf_eval_dataset.map(tokenize_fn, batched=True)

# 5. Model with a 2-class classification head
hf_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 6. Training configuration
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# 7. Evaluation metric
def compute_metrics(eval_pred):
    """Return {"accuracy": ...} computed from a (logits, labels) pair."""
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predicted == labels))}

# 8. Trainer
trainer = Trainer(
    model=hf_model,
    args=training_args,
    train_dataset=hf_train_dataset,
    eval_dataset=hf_eval_dataset,
    tokenizer=hf_tokenizer,
    compute_metrics=compute_metrics,
)

# 9. Fine-tune
trainer.train()

# 10. Predict on the evaluation split and print a per-class report
hf_preds = trainer.predict(hf_eval_dataset)
pred_labels = np.argmax(hf_preds.predictions, axis=1)
true_labels = hf_preds.label_ids

print("\n📊 평가 결과:")
print(classification_report(true_labels, pred_labels, target_names=["부정", "긍정"]))


ModuleNotFoundError: Could not import module 'Trainer'. Are this object's requirements defined correctly?

In [None]:
import pickle

# Persist the tokenizer currently bound to `tokenizer` so another session can reload it.
with open("tokenizer.pickle", "wb") as handle:
    handle.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))

In [None]:
import matplotlib.pyplot as plt

# Score the model on the padded test set
loss, acc = model.evaluate(X_test_pad, y_test)
print(f"✅ Test Accuracy: {acc:.4f}")

# Draw all four training curves on one set of axes
curve_specs = [
    ('accuracy', 'Train Acc'),
    ('val_accuracy', 'Val Acc'),
    ('loss', 'Train Loss'),
    ('val_loss', 'Val Loss'),
]
for history_key, curve_label in curve_specs:
    plt.plot(history.history[history_key], label=curve_label)
plt.title("Training History")
plt.xlabel("Epoch")
plt.ylabel("Score")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Predicted probabilities, thresholded at 0.5 into 0/1 labels
y_pred_probs = model.predict(X_test_pad)
y_pred = (y_pred_probs > 0.5).astype(int)

# Print the first five test sentences with their true and predicted labels
for i in range(5):
    print(
        f"📝 {X_test[i]}",
        f"✅ 실제: {y_test[i]} / 예측: {y_pred[i][0]}",
        "-" * 40,
        sep="\n",
    )

In [None]:
from tensorflow.keras.models import load_model

# Rebind `model` to the network stored in the .keras file.
model = load_model(filepath="best_model_keras.keras")

In [None]:
import pandas as pd
import numpy as np

# Predicted probabilities; anything above 0.5 counts as class 1
y_pred_probs = model.predict(X_test_pad)
flat_probs = y_pred_probs.flatten()
y_pred = (y_pred_probs > 0.5).astype(int).flatten()

# One row per test sentence: original text, true label, predicted label, probability
results_df = pd.DataFrame(dict(
    text=X_test,
    true_label=y_test,
    pred_label=y_pred,
    pred_prob=flat_probs,
))

# Write to CSV (utf-8-sig adds a BOM)
results_df.to_csv("lstm_predictions.csv", encoding="utf-8-sig", index=False)
print("✅ 예측 결과가 lstm_predictions.csv에 저장되었습니다.")

In [None]:
!pip install wordcloud

In [None]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

def extract_nouns(texts):
    """Collect the nouns found by KoNLPy's Okt tagger across all ``texts``.

    Args:
        texts: Iterable of items; each is converted with ``str()`` before tagging.

    Returns:
        A flat list of nouns longer than one character, in the order produced.
    """
    from konlpy.tag import Okt
    tagger = Okt()
    return [
        noun
        for text in texts
        for noun in tagger.nouns(str(text))
        if len(noun) > 1  # drop single-character nouns
    ]

# Split the sentences by predicted label
is_positive = results_df['pred_label'] == 1
is_negative = results_df['pred_label'] == 0
positive_texts = results_df.loc[is_positive, 'text'].tolist()
negative_texts = results_df.loc[is_negative, 'text'].tolist()

# Extract nouns from each group
positive_nouns = extract_nouns(positive_texts)
negative_nouns = extract_nouns(negative_texts)

# Noun -> occurrence count
pos_freq, neg_freq = Counter(positive_nouns), Counter(negative_nouns)

def generate_wordcloud(freq_dict, title, font_path='NanumGothic.ttf'):
    """Display a word cloud built from a word -> frequency mapping.

    Args:
        freq_dict: Mapping of word to frequency (e.g. a ``Counter``).
        title: Title drawn above the figure.
        font_path: Font file passed to ``WordCloud``.

    Returns:
        None. When ``freq_dict`` is empty nothing is drawn and a short message
        is printed instead.
    """
    if not freq_dict:
        # WordCloud cannot build a cloud from zero words (e.g. when every sample was
        # predicted as the other class), so skip instead of raising mid-notebook.
        print(f"[generate_wordcloud] no words to plot for '{title}'; skipping.")
        return
    wc = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color='white'
    )
    wc_img = wc.generate_from_frequencies(freq_dict)
    plt.figure(figsize=(10, 5))
    plt.imshow(wc_img, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()

# Show each cloud inline and save it as a PNG; one WordCloud instance is reused for saving.
wc = WordCloud(font_path='NanumGothic.ttf', width=800, height=400, background_color='white')
cloud_jobs = (
    (pos_freq, "pos", "positive_wordcloud.png"),
    (neg_freq, "neg", "negative_wordcloud.png"),
)
for freq, title, out_path in cloud_jobs:
    if not freq:
        # A cloud cannot be generated from zero words; skip this group rather than crash.
        print(f"Skipping {out_path}: no words to plot.")
        continue
    generate_wordcloud(freq, title)
    wc.generate_from_frequencies(freq).to_file(out_path)

In [None]:
# Shell command: empty pip's local cache.
!pip cache purge

In [None]:
# Write the model currently bound to `model` to a .keras file.
model.save(filepath="best_model.keras")

In [None]:
!pip uninstall tensorflow

In [None]:
!pip install tensorflow==2.9.0

In [None]:
# Print the versions of tensorflow and keras that this kernel imports.
import tensorflow
print(tensorflow.__version__)
import keras
print(keras.__version__)

In [1]:
from tensorflow.keras.models import load_model
# Bind `model` to the network stored in the .keras file.
model = load_model(filepath="best_model_keras.keras")

In [None]:
import pandas as pd
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle

# Model loading: `model` is expected to be in scope already (bound by a load_model cell).
# model = load_model("best_model_keras.keras")
# model = load_model("best_model.h5", compile=False)

# Load the fitted tokenizer.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
import pickle
with open("tokenizer.pickle", "rb") as handle:
    tokenizer = pickle.load(handle)

# Load the data to score (e.g. real reviews)
df = pd.read_csv("y_reviews.csv")
# Apply the same cleaning the training sentences went through before tokenizing.
# Requires the cell defining clean_text to have been run in this kernel.
texts = df['sentence'].astype(str).apply(clean_text).tolist()

# Convert to sequences and pad exactly as during training: length 100, padding and
# truncating at the end. (maxlen=30 with the default pre-padding fed the model inputs
# laid out differently from what it was trained on.)
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=100, padding='post', truncating='post', dtype='int32')

# Predict: probability of class 1, thresholded at 0.5
probs = model.predict(padded)
preds = (probs > 0.5).astype(int).flatten()

# Attach the results
df['predicted_label'] = preds
df['sentiment_score'] = probs.flatten()  # sentiment score in the 0-1 range

# Save
df.to_csv("예측된_감성_리뷰.csv", index=False, encoding='utf-8-sig')


In [None]:
import matplotlib.pyplot as plt

# Histogram of the sentiment scores with a dashed marker at the 0.5 decision threshold
fig, ax = plt.subplots()
ax.hist(df['sentiment_score'], bins=50)
ax.axvline(0.5, color='gray', linestyle='--')
ax.set_title("감성 점수 분포")
ax.set_xlabel("sentiment_score")
ax.set_ylabel("리뷰 수")
plt.show()

In [None]:
def classify_3way(score, center=0.5, neutral_margin=0.01):
    """Map a score to one of three labels: "중립", "긍정" or "부정".

    Args:
        score: Numeric score to classify.
        center: Midpoint of the neutral band.
        neutral_margin: Half-width of the neutral band around ``center``.

    Returns:
        "중립" when the score lies strictly within ``neutral_margin`` of ``center``,
        "긍정" when it is at or above ``center + neutral_margin``, otherwise "부정".
    """
    distance = abs(score - center)
    if distance < neutral_margin:
        return "중립"
    if score >= center + neutral_margin:
        return "긍정"
    return "부정"

In [None]:
# Label every review with its three-way class, then overwrite the CSV with the extra column
three_way_labels = df['sentiment_score'].apply(classify_3way)
df['predicted_label_3way'] = three_way_labels
df.to_csv("예측된_감성_리뷰.csv", encoding='utf-8-sig', index=False)