In [3]:
#!pip install tensorflow
import pickle
import numpy as np
import json
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# 불용어 목록 선언
stopwords = list(set([
    '이', '가', '은', '는', '을', '를', '의', '에', '에서', '에게', '께', '로', '으로', 
    '와', '과', '보다', '처럼', '만큼', '같이', '까지', '마저', '조차', '부터', 
    '이나', '나', '이며', '며', '등', '하다', '한다', '하고', '하니', '하면', 
    '되어', '되다', '되고', '되니', '입니다', '습니다', 'ㅂ니다', '어요', '아요', '다',
    '고', '면', '게', '지', '죠',
    '그리고', '그러나', '하지만', '그런데', '그래서', '그러면', '그러므로', '따라서', 
    '또한', '또는', '및', '즉', '한편', '반면에', '근데',
    '나', '저', '우리', '저희', '너', '너희', '당신', '그', '그녀', '그들', '누구',
    '무엇', '어디', '언제', '어느', '이것', '그것', '저것', '여기', '거기', '저기', 
    '이쪽', '그쪽', '저쪽',
    '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉', '열',
    '일', '이', '삼', '사', '오', '육', '칠', '팔', '구', '십', '백', '천', '만',
    '첫째', '둘째', '셋째',
    '바로', '때', '것', '수', '문제', '경우', '부분', '이다',
    '내용', '결과', '자체', '가지', '있다',
    '않았어요', '있었어요', '했어요', '했는데요', '있는데요', '합니다', '없다', '나다','생각하다',
    '했다', '같다', '네요','아니다', '용하다', '물이',
    '뿐', '대로', '만', '따름', '김에', '터',
    '아', '아이고', '아이구', '아하', '어', '그래', '응', '네', '예', '아니', '않다', '안되다','안',
    '가다', '오다', '주다', '말다', '나다', '받다', '알다', '모르다', '싶다', '생각하다', '들다'
]))
stopwords = set(stopwords)

In [4]:
with open('LSTM/X_train.pickle', 'rb') as f:
    X_train = pickle.load(f)
with open('LSTM/X_test.pickle', 'rb') as f:
    X_test = pickle.load(f)
with open('LSTM/y_train.pickle', 'rb') as f:
    y_train = pickle.load(f)
with open('LSTM/y_test.pickle', 'rb') as f:
    y_test = pickle.load(f)

In [5]:
def remove_stopwords(data, stopwords):
    return [[word for word in sentence if word not in stopwords] for sentence in data]

X_train = remove_stopwords(X_train, stopwords)
X_test = remove_stopwords(X_test, stopwords)

In [6]:
def remove_empty(X, y):
    return zip(*[(x, label) for x, label in zip(X, y) if len(x) > 0])

X_train, y_train = remove_empty(X_train, y_train)
X_test, y_test = remove_empty(X_test, y_test)

X_train, y_train = list(X_train), list(y_train)
X_test, y_test = list(X_test), list(y_test)

In [7]:
max_len = 70

def trim_samples(X, y, max_len):
    return zip(*[(x, label) for x, label in zip(X, y) if len(x) <= max_len])

X_train, y_train = trim_samples(X_train, y_train, max_len)
X_test, y_test = trim_samples(X_test, y_test, max_len)

X_train, y_train = list(X_train), list(y_train)
X_test, y_test = list(X_test), list(y_test)

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

total_count = len(tokenizer.word_index)
threshold = 3
rare_count = sum(1 for word, freq in tokenizer.word_counts.items() if freq < threshold)
rare_freq = sum(freq for word, freq in tokenizer.word_counts.items() if freq < threshold)
total_freq = sum(tokenizer.word_counts.values())

vocab_size = total_count - rare_count + 1

In [9]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# 정수 인코딩
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# 빈 샘플 제거
X_train, y_train = zip(*[(x, y) for x, y in zip(X_train, y_train) if len(x) > 0])
X_test, y_test = zip(*[(x, y) for x, y in zip(X_test, y_test) if len(x) > 0])

In [10]:
pad_X_train = pad_sequences(X_train, maxlen=max_len)
pad_X_test = pad_sequences(X_test, maxlen=max_len)

print(f"\n 최종 학습 데이터: {len(pad_X_train)}개")
print(f" 최종 테스트 데이터: {len(pad_X_test)}개")


 최종 학습 데이터: 19689개
 최종 테스트 데이터: 4928개


In [11]:
with open('LSTM/X_train_sequences.pickle', 'wb') as fw:
    pickle.dump(pad_X_train, fw)
with open('LSTM/X_test_sequences.pickle', 'wb') as fw:
    pickle.dump(pad_X_test, fw)
with open('LSTM/y_train_filterd.pickle', 'wb') as fw:
    pickle.dump(list(y_train), fw)
with open('LSTM/y_test_filterd.pickle', 'wb') as fw:
    pickle.dump(list(y_test), fw)

In [12]:
limited_word_index = {word: idx for word, idx in tokenizer.word_index.items() if idx < vocab_size}

with open('data/max_word_index.json', 'w', encoding='utf-8') as f:
    json.dump(limited_word_index, f, ensure_ascii=False)

print(f"\n 단어 사전 저장 완료 ({len(limited_word_index)}개)")


 단어 사전 저장 완료 (7609개)


145393 / 145162