In [58]:
# Optional dependency installation -- uncomment the lines below to install the
# packages this notebook relies on (parquet reading and scikit-learn metrics).
# NOTE(review): inside Jupyter, `%pip install ...` targets the running kernel's
# environment, which `!pip install ...` does not guarantee -- consider switching.
# !pip install pyarrow
# !pip install fastparquet
# !pip install scikit-learn

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [60]:
# Load the training split from its parquet file and separate the input
# sentences from their labels.
df = pd.read_parquet("data/train-00000-of-00001.parquet")
X_train, Y_train = df['sentence'], df['label']

In [61]:
# Load the validation split, which this notebook uses as its test set.
# NOTE: `df` is rebound here, so from this point on it refers to the
# validation frame rather than the training frame.
df = pd.read_parquet("data/validation-00000-of-00001.parquet")
X_test, Y_test = df['sentence'], df['label']

In [62]:
# Build the vocabulary from the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Word -> integer-index mapping learned by the tokenizer.
word_to_index = tokenizer.word_index

# Vocabulary size is one more than the number of known words
# (presumably to leave room for index 0, the default padding value -- TODO confirm).
vocab_size = 1 + len(word_to_index)

In [63]:
# Convert every training sentence into its sequence of integer word indices.
X_train_encoded = tokenizer.texts_to_sequences(X_train)

# Pad every sequence up to the length of the longest training sentence.
max_len = max(map(len, X_train_encoded))
X_train_padded = pad_sequences(X_train_encoded, maxlen=max_len)

In [82]:
# NOTE(review): GRU, Precision and Recall are not used in the cells visible
# here; they are kept in case other cells rely on them.
from tensorflow.keras.layers import GRU, LSTM, Bidirectional, Embedding, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling are repeatable when the notebook is re-run
# from a fresh kernel. (Some GPU kernels may still cause small run-to-run
# differences.)
SEED = 42
set_random_seed(SEED)

# Embedding vector size: 64 / LSTM hidden-state size: 64
embedding_dim = 64
hidden_units = 64

# Architecture: Embedding -> Bidirectional LSTM -> single sigmoid unit
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))  # word index -> dense vector
model.add(Bidirectional(LSTM(hidden_units)))  # Bi-LSTM layer
model.add(Dense(1, activation='sigmoid'))  # one sigmoid output in (0, 1)

# Optimizer: Adam / loss: binary cross-entropy; accuracy is tracked while training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train_padded, Y_train, epochs=2, batch_size=64)

Epoch 1/2
Epoch 2/2


In [83]:
# Encode and pad the test sentences the same way as the training data:
# same fitted tokenizer, same max_len.
X_test_encoded = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_encoded, maxlen=max_len)

# The model outputs real numbers between 0 and 1; round them to get 0/1 labels.
predictions = np.round(model.predict(X_test_padded)).astype(int)



In [84]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the rounded test predictions against the true labels with each metric,
# in the order: accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    score_fn(Y_test, predictions)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line.
print(f'정확도: {accuracy}\n정밀도: {precision}\n재현율: {recall}\nF1 점수: {f1}')

정확도: 0.8371559633027523
정밀도: 0.8431818181818181
재현율: 0.8355855855855856
F1 점수: 0.8393665158371041
