In [1]:
import tensorflow as tf

Xây dựng model LSTM From Scratch

In [2]:
class LSTM_From_Scratch(tf.keras.layers.Layer):
  def __init__(self, units, inp_shape):
    super(LSTM_From_Scratch, self).__init__()
    # Kích thước hidden state và cell state
    self.units = units
    # Kích thước embedding
    self.inp_shape = inp_shape
    #Tạo 4 ma trận W và U cho các cổng Input, Forget, Output và nơi học thông tin mới
    self.W = self.add_weight(name = "W", shape = (4, self.units, self.inp_shape))
    self.U = self.add_weight(name = "U", shape = (4, self.units, self.units))
  def call(self, pre_layer, x):
    # pre_h (ở thời điểm t-1): hidden state có kích thước (batch_size, units) với mỗi dòng là hidden state của 1 câu
    # pre_c (ở thời điểm t-1): cell state cũng giống hidden state
    pre_h, pre_c = tf.unstack(pre_layer)

    #Input Gate
    # Mô hình sẽ học lấy bao nhiêu % thông tin mới đưa vào cell state
    i_t = tf.nn.sigmoid(
        tf.matmul(x, tf.transpose(self.W[0])) +
        tf.matmul(pre_h, tf.transpose(self.U[0]))
    )
    #Forget Gate
    # Mô hình sẽ học bao nhiêu % quên từ cell state trước
    f_t = tf.nn.sigmoid(
        tf.matmul(x, tf.transpose(self.W[1])) +
        tf.matmul(pre_h, tf.transpose(self.U[1]))
    )
    #Output Gate
    # Mô hình học lấy bao nhiêu % từ cell state để làm hidden state tiếp theo
    o_t = tf.nn.sigmoid(
        tf.matmul(x, tf.transpose(self.W[2]))+
        tf.matmul(pre_h, tf.transpose(self.U[2]))
    )
    # Học thông tin mới bằng cách lấy hidden state thời điểm t-1 kết hợp với embedding token thời điểm t
    n_c_t = tf.nn.tanh(
        tf.matmul(x, tf.transpose(self.W[3])) +
        tf.matmul(pre_h, tf.transpose(self.U[3]))
    )
    # Cell state ở thời điểm t
    c = tf.multiply(f_t, pre_c) + tf.multiply(i_t, n_c_t)
    # Hidden state ở thời điểm t
    h = tf.multiply(o_t, tf.nn.tanh(c))

    return tf.stack([h, c])

Kết hợp lớp Embedding, lớp LSTM tự cài đặt, và một mạng Fully Connected để phân loại

In [3]:
class LSTM_From_Scratch_Model(tf.keras.Model):
  def __init__(self, units, embedding_size, vocab_size, input_length):
    super(LSTM_From_Scratch_Model, self).__init__()
    self.input_length = input_length
    self.units = units

    self.embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_size,
        input_length = input_length
    )
    self.lstm = LSTM_From_Scratch(units, embedding_size)
    self.classfication_model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, input_shape=(units,), activation = "relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid")
    ])
  def call(self, sentence):
    batch_size = tf.shape(sentence)[0]
    pre_layer = tf.stack([
        tf.zeros([batch_size, self.units]),
        tf.zeros([batch_size, self.units])
    ])

    embedding_sentence = self.embedding(sentence)

    for i in range(self.input_length):
      word = embedding_sentence[:, i, :]
      pre_layer = self.lstm(pre_layer, word)

    h, _ = tf.unstack(pre_layer)

    return self.classfication_model(h)

Import thư viện xử lý ngôn ngữ

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Đọc file data

In [None]:
df = pd.read_csv("./IMDB Dataset.csv")

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Tạo một cột nội dung review mới đã được loại bỏ stopwords và đưa về dạng nghĩa gốc

In [8]:
stop_words = set(stopwords.words("english"))
lemma = WordNetLemmatizer()

def preprocessing_text_with_lemma_stop_words(text):
  tokens = nltk.word_tokenize(text)
  lemma_stopwords = [lemma.lemmatize(token) for token in tokens if token not in stop_words]
  return " ".join(lemma_stopwords)
df["preprocessing_review"] = df["review"].apply(preprocessing_text_with_lemma_stop_words)

In [9]:
df.head()

Unnamed: 0,review,sentiment,preprocessing_review
0,One of the other reviewers has mentioned that ...,positive,One reviewer mentioned watching 1 Oz episode '...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production . < br / > < br ...
2,I thought this was a wonderful way to spend ti...,positive,I thought wonderful way spend time hot summer ...
3,Basically there's a family where a little boy ...,negative,Basically 's family little boy ( Jake ) think ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Mattei 's `` Love Time Money '' visuall...


Xây dựng Pipeline để huấn luyện mô hình

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

X = df["preprocessing_review"]
y = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size = 0.2, random_state=42)
tokenizer = Tokenizer(num_words = 5000, oov_token = "<OOV>")
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

pad_train = pad_sequences(sequences_train, maxlen=200, truncating="post", padding = "post")
pad_test = pad_sequences(sequences_test, maxlen=200, truncating="post", padding = "post")

model = LSTM_From_Scratch_Model(units = 128, embedding_size = 300, vocab_size = vocab_size, input_length = 200)
model.compile(tf.keras.optimizers.Adam(0.0005), loss = "binary_crossentropy", metrics=["accuracy"])
model.fit(pad_train, y_train, epochs = 10, validation_data=[pad_test, y_test])




Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m331s[0m 81ms/step - accuracy: 0.5096 - loss: 0.6918 - val_accuracy: 0.6746 - val_loss: 0.6298
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 42ms/step - accuracy: 0.6239 - loss: 0.6571 - val_accuracy: 0.6276 - val_loss: 0.6575
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 43ms/step - accuracy: 0.6525 - loss: 0.6267 - val_accuracy: 0.5694 - val_loss: 0.6605
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 41ms/step - accuracy: 0.7732 - loss: 0.4743 - val_accuracy: 0.8662 - val_loss: 0.3173
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 41ms/step - accuracy: 0.8995 - loss: 0.2583 - val_accuracy: 0.8780 - val_loss: 0.2953
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 42ms/step - accuracy: 0.9195 - loss: 0.2081 - val_accuracy: 0.8805 - val_loss: 0.2993
Epoch 7/10
[1

<keras.src.callbacks.history.History at 0x79f55eaa3140>

Đánh giá mô hình

In [12]:
loss, accuracy = model.evaluate(pad_test, y_test)
print(f"Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8642 - loss: 0.4300
Accuracy: 0.8661999702453613


Dự đoán

In [11]:
def inference(text, model):
  lemma_stopwords = preprocessing_text_with_lemma_stop_words(text)
  sequences_text = tokenizer.texts_to_sequences([lemma_stopwords])
  pad_text = pad_sequences(sequences_text, maxlen=200, truncating="post", padding="post")
  predict = model.predict(pad_text)

  if predict >= 0.5:
    return "positive"
  else:
    return "negative"

inference("I'm very hate movie. Because it's very bad", model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33s/step


'negative'