In [10]:
# 불용어 목록 선언
stopwords = list(set([
    '이', '가', '은', '는', '을', '를', '의', '에', '에서', '에게', '께', '로', '으로', 
    '와', '과', '보다', '처럼', '만큼', '같이', '까지', '마저', '조차', '부터', 
    '이나', '나', '이며', '며', '등', '하다', '한다', '하고', '하니', '하면', 
    '되어', '되다', '되고', '되니', '입니다', '습니다', 'ㅂ니다', '어요', '아요', '다',
    '고', '면', '게', '지', '죠',
    '그리고', '그러나', '하지만', '그런데', '그래서', '그러면', '그러므로', '따라서', 
    '또한', '또는', '및', '즉', '한편', '반면에', '근데',
    '나', '저', '우리', '저희', '너', '너희', '당신', '그', '그녀', '그들', '누구',
    '무엇', '어디', '언제', '어느', '이것', '그것', '저것', '여기', '거기', '저기', 
    '이쪽', '그쪽', '저쪽',
    '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉', '열',
    '일', '이', '삼', '사', '오', '육', '칠', '팔', '구', '십', '백', '천', '만',
    '첫째', '둘째', '셋째',
    '바로', '때', '것', '수', '문제', '경우', '부분', '이다',
    '내용', '결과', '자체', '가지', '있다',
    '않았어요', '있었어요', '했어요', '했는데요', '있는데요', '합니다', '없다', '나다','생각하다',
    '했다', '같다', '네요','아니다', '용하다', '물이',
    '뿐', '대로', '만', '따름', '김에', '터',
    '아', '아이고', '아이구', '아하', '어', '그래', '응', '네', '예', '아니', '않다', '안되다','안',
    '가다', '오다', '주다', '말다', '나다', '받다', '알다', '모르다', '싶다', '생각하다', '들다'
]))
stopwords = set(stopwords)

In [None]:
# sentiment_pipeline_template.py

import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from tqdm import tqdm
from collections import Counter
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from sklearn.utils import class_weight
from matplotlib import font_manager as fm
from konlpy.tag import Okt
import time
import itertools

okt = Okt()
font_path = "C:/Windows/Fonts/NanumGothic.ttf"
font_prop = fm.FontProperties(fname=font_path)
plt.rc('font', family=font_prop.get_name())
plt.rcParams['axes.unicode_minus'] = False

# 1. CSV 파일 로딩
df = pd.read_csv("new_train_datas.csv")  # 학습용 CSV 경로로 수정

# 2. 텍스트와 라벨 분리
texts = df["text"].astype(str).tolist()
labels = df["labels"].tolist()

# 1. train-test split 먼저 수행 (텍스트 기준)
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)

def preprocess_for_training(text, stopwords):
    tokens = okt.morphs(text)
    return [t for t in tokens if t not in stopwords]

# 2. 전처리 함수 적용
X_train_tokens = [' '.join(preprocess_for_training(t, stopwords)) for t in X_train_texts]
X_test_tokens = [' '.join(preprocess_for_training(t, stopwords)) for t in X_test_texts]

# 3. Tokenizer fit (훈련셋 기준)
tokenizer = Tokenizer(num_words=12000, oov_token='OOV')
tokenizer.fit_on_texts(X_train_tokens)

# 4. 시퀀스 변환
X_train_seq = tokenizer.texts_to_sequences(X_train_tokens)
X_test_seq = tokenizer.texts_to_sequences(X_test_tokens)

# 5. 패딩 적용
max_len = 50
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# 4. 텍스트 전처리 함수 (okt + 불용어 제거) 이용

In [None]:
# 예측 함수
def predict_sentiment(text, tokenizer, model, stopwords, max_len=50):
    filtered = preprocess_for_training(text, stopwords)
    sequence = tokenizer.texts_to_sequences([' '.join(filtered)])
    padded = pad_sequences(sequence, maxlen=max_len)
    prob = model.predict(padded)[0][0]
    label = '긍정' if prob > 0.5 else '부정'
    return label, prob

# 자동 하이퍼파라미터 튜닝 및 학습
def run_experiments(X_train_pad, y_train, X_texts, stopwords, tokenizer):
    y_train = np.array(y_train)
    embedding_dims = [32, 64]
    lstm_units_list = [32, 64]
    learning_rates = [1e-3, 1e-4]
    batch_sizes = [64, 128]
    max_len = 50

    all_combinations = list(itertools.product(embedding_dims, lstm_units_list, learning_rates, batch_sizes))

    for emb_dim, lstm_units, lr, batch_size in all_combinations:
        timestamp = time.strftime('%Y%m%d-%H%M%S')
        experiment_name = f"exp_{timestamp}_lr{lr}_bs{batch_size}_lstm{lstm_units}"
        log_dir = f"./logs/{experiment_name}"
        os.makedirs(log_dir, exist_ok=True)

        csv_logger = CSVLogger(os.path.join(log_dir, 'training_log.csv'))
        es = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
        mc = ModelCheckpoint(os.path.join(log_dir, 'best_model.h5'),
                             monitor='val_accuracy', save_best_only=True, verbose=1)

        model = Sequential([
            #Embedding(input_dim=10000, output_dim=emb_dim, input_length=max_len),
            Embedding(input_dim=120000, output_dim=emb_dim),
            LSTM(lstm_units),
            Dropout(0.5),
            Dense(32, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='sigmoid')
        ])

        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(learning_rate=lr),
                      metrics=['accuracy'])

        class_weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(y_train),
            y=y_train
        )
        class_weights = dict(enumerate(class_weights))

        history = model.fit(
            X_train_pad, y_train,
            epochs=10,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[es, mc, csv_logger],
            class_weight=class_weights,
            verbose=0
        )

        # 성능 저장
        final_log_path = os.path.join(log_dir, 'summary.csv')
        with open(final_log_path, 'w', newline='', encoding='utf-8') as f:
            import csv
            writer = csv.writer(f)
            writer.writerow(['param', 'value'])
            writer.writerow(['embedding_dim', emb_dim])
            writer.writerow(['lstm_units', lstm_units])
            writer.writerow(['learning_rate', lr])
            writer.writerow(['batch_size', batch_size])
            writer.writerow(['val_accuracy_last', history.history['val_accuracy'][-1]])

        # 모델 저장
        model.save(os.path.join(log_dir, 'final_model.keras'))

        # 토크나이저 저장
        with open(os.path.join(log_dir, 'tokenizer.pkl'), 'wb') as f:
            pickle.dump(tokenizer, f)
        
        # 학습 데이터 예측 저장
        predictions = []
        for text in tqdm(X_texts, desc=f"Predicting for {experiment_name}"):
            label, prob = predict_sentiment(text, tokenizer, model, stopwords, max_len=max_len)
            predictions.append({'text': text, 'label': label, 'prob': prob})
        pd.DataFrame(predictions).to_csv(os.path.join(log_dir, 'train_predictions.csv'), index=False, encoding='utf-8-sig')

print(" 전체 감성 분석 유틸리티 + 자동화 실험 템플릿 완료")


 전체 감성 분석 유틸리티 + 자동화 실험 템플릿 완료


In [None]:
import os
import pandas as pd

def load_experiment_logs(logs_dir="./logs"):
    results = []
    for exp in os.listdir(logs_dir):
        log_path = os.path.join(logs_dir, exp, 'training_log.csv')
        if os.path.exists(log_path):
            df = pd.read_csv(log_path)
            best_epoch = df['val_accuracy'].idxmax()
            best_val_acc = df.loc[best_epoch, 'val_accuracy']
            best_train_acc = df.loc[best_epoch, 'accuracy']
            best_val_loss = df.loc[best_epoch, 'val_loss']
            best_train_loss = df.loc[best_epoch, 'loss']
            
            results.append({
                'experiment': exp,
                'best_val_accuracy': best_val_acc,
                'best_epoch': best_epoch,
                'train_accuracy': best_train_acc,
                'val_loss': best_val_loss,
                'train_loss': best_train_loss,
                'log_path': log_path
            })
    return pd.DataFrame(results).sort_values(by="best_val_accuracy", ascending=False)

def select_best_model(logs_df):
    return logs_df.iloc[0]  # val_accuracy 가장 높은 실험 선택

In [None]:
import os
import pandas as pd

def print_best_model_report(logs_df):
    best = logs_df.iloc[0]
    print("[Best Model Report]")
    print(f"실험명:         {best['experiment']}")
    print(f"최고 Val Acc:  {best['best_val_accuracy']:.4f}")
    print(f"최고 Epoch:     {best['best_epoch']}")
    print(f"Train Acc:      {best['train_accuracy']:.4f}")
    print(f"Val Loss:       {best['val_loss']:.4f}")
    print(f"Train Loss:     {best['train_loss']:.4f}")
    print(f"로그 경로:      {best['log_path']}")


In [None]:
print(len(y_train), type(y_train))
print(y_train[:10])

print(len(X_train_pad))
print(len(y_train))  

33712 <class 'list'>
[1, 0, 0, 0, 1, 1, 0, 0, 0, 1]
33712
33712


In [None]:
X_train_preprocessed = [' '.join(preprocess_for_training(t, stopwords)) for t in X_train_texts]
train_sequences = tokenizer.texts_to_sequences(X_train_preprocessed)
X_train_pad = pad_sequences(train_sequences, maxlen=50)

KeyboardInterrupt: 

In [None]:
#run_experiments(X_train_pad, y_train, X_texts, stopwords, tokenizer)
run_experiments(X_train_pad, y_train, X_train_texts, stopwords, tokenizer)

InvalidArgumentError: Graph execution error:

Detected at node 'sequential/embedding/embedding_lookup' defined at (most recent call last):
    File "c:\Users\MYCOM\.conda\envs\azen\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\MYCOM\.conda\envs\azen\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\tornado\platform\asyncio.py", line 211, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\MYCOM\.conda\envs\azen\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "c:\Users\MYCOM\.conda\envs\azen\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "c:\Users\MYCOM\.conda\envs\azen\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\IPython\core\interactiveshell.py", line 3077, in run_cell
      result = self._run_cell(
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\IPython\core\interactiveshell.py", line 3132, in _run_cell
      result = runner(coro)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\IPython\core\interactiveshell.py", line 3336, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\IPython\core\interactiveshell.py", line 3519, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\IPython\core\interactiveshell.py", line 3579, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\MYCOM\AppData\Local\Temp\ipykernel_12596\1003939383.py", line 2, in <module>
      run_experiments(X_train_pad, y_train, X_train_texts, stopwords, tokenizer)
    File "C:\Users\MYCOM\AppData\Local\Temp\ipykernel_12596\700203812.py", line 53, in run_experiments
      history = model.fit(
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\training.py", line 889, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\sequential.py", line 374, in call
      return super(Sequential, self).call(inputs, training=training, mask=mask)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\functional.py", line 458, in call
      return self._run_internal_graph(
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\functional.py", line 596, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\MYCOM\.conda\envs\azen\lib\site-packages\keras\layers\core\embedding.py", line 199, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'sequential/embedding/embedding_lookup'
indices[56,36] = 11915 is not in [0, 10000)
	 [[{{node sequential/embedding/embedding_lookup}}]] [Op:__inference_train_function_3345]