## Import Library


In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model, save_model, Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Bidirectional, GRU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn import metrics
from keras.optimizers import Adam
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import keras

In [2]:
def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''
    
    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

## Data Preparation


In [3]:
# Đây là bộ dữ liệu về đánh giá những bình luận về phim với các nhãn lần lượt là 0 - negative, 1 - positive
train_data = pd.read_csv('D:\Embedding2\Data\Train.csv')
test_data = pd.read_csv('D:\Embedding2\Data\Test.csv')
valid_data=pd.read_csv('D:\Embedding2\Data\Valid.csv')

In [4]:
train_data.head(5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [5]:
print(train_data['label'].value_counts())
print(test_data['label'].value_counts())
print(valid_data['label'].value_counts())

label
0    20019
1    19981
Name: count, dtype: int64
label
1    2505
0    2495
Name: count, dtype: int64
label
1    2514
0    2486
Name: count, dtype: int64


## Data processing

In [6]:
# Loại bỏ những kí tự đặc biệt trong câu
def cleantext(txt):
    cleanText = re.sub('http\S+\s', ' ', txt)
    cleanText = re.sub('RT|cc', ' ', cleanText)
    cleanText = re.sub('#\S+\s', ' ', cleanText)
    cleanText = re.sub('@\S+', '  ', cleanText)  
    cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText) 
    cleanText = re.sub('\s+', ' ', cleanText)
    return cleanText

In [7]:
train_data['text'] = train_data['text'].apply(lambda x: cleantext(x))
test_data['text'] = test_data['text'].apply(lambda x: cleantext(x))
valid_data['text'] = valid_data['text'].apply(lambda x: cleantext(x))


In [8]:
train_data['text'][0]

'I grew up b 1965 watching and loving the Thunderbirds All my mates at school watched We played Thunderbirds before school during lunch and after school We all wanted to be Virgil or Scott No one wanted to be Alan Counting down from 5 became an art form I took my children to see the movie hoping they would get a glimpse of what I loved as a child How bitterly disappointing The only high point was the snappy theme tune Not that it could compare with the original score of the Thunderbirds Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created Jonatha Frakes should hand in his directors chair his version was completely hopeless A waste of film Utter rubbish A CGI remake may be a eptable but replacing marionettes with Homo sapiens subsp sapiens was a huge error of judgment '

In [9]:
x_train = train_data['text']
y_train = train_data['label']
x_test = test_data['text']
y_test = test_data['label']
x_valid = valid_data['text']
y_valid = valid_data['label']

In [10]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [11]:
vocab_size = 90000 #Số lượng từ vựng trong mô hình
ed = 64 #Số chiều của không gian nhúng
ml = 140 #Độ dài tối đa của chuỗi đầu vào

## Word Embedding

In [12]:
tokenizers = Tokenizer(num_words = vocab_size, oov_token="<OOV>")
tokenizers.fit_on_texts(x_train)

In [13]:
tokenizers.word_index

{'<OOV>': 1,
 'the': 2,
 'a': 3,
 'and': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'br': 8,
 'it': 9,
 'in': 10,
 'i': 11,
 'this': 12,
 'that': 13,
 's': 14,
 'was': 15,
 'as': 16,
 'movie': 17,
 'for': 18,
 'with': 19,
 'but': 20,
 'film': 21,
 'you': 22,
 't': 23,
 'on': 24,
 'not': 25,
 'he': 26,
 'are': 27,
 'his': 28,
 'have': 29,
 'one': 30,
 'be': 31,
 'all': 32,
 'at': 33,
 'they': 34,
 'by': 35,
 'an': 36,
 'who': 37,
 'so': 38,
 'from': 39,
 'like': 40,
 'there': 41,
 'or': 42,
 'just': 43,
 'her': 44,
 'about': 45,
 'out': 46,
 'if': 47,
 'has': 48,
 'what': 49,
 'some': 50,
 'good': 51,
 'can': 52,
 'more': 53,
 'when': 54,
 'very': 55,
 'she': 56,
 'up': 57,
 'no': 58,
 'time': 59,
 'even': 60,
 'my': 61,
 'would': 62,
 'which': 63,
 'only': 64,
 'really': 65,
 'story': 66,
 'see': 67,
 'their': 68,
 'had': 69,
 'me': 70,
 'well': 71,
 'were': 72,
 'we': 73,
 'much': 74,
 'than': 75,
 'get': 76,
 'bad': 77,
 'been': 78,
 'other': 79,
 'will': 80,
 'people': 81,
 'do': 82,
 'also'

In [14]:
x_train = tokenizers.texts_to_sequences(x_train)

In [15]:
x_train

[[11,
  2177,
  57,
  486,
  7752,
  151,
  4,
  1668,
  2,
  8868,
  32,
  61,
  5440,
  33,
  369,
  293,
  73,
  254,
  8868,
  162,
  369,
  302,
  5964,
  4,
  102,
  369,
  73,
  32,
  467,
  6,
  31,
  16043,
  42,
  1049,
  58,
  30,
  467,
  6,
  31,
  1784,
  8009,
  179,
  39,
  447,
  890,
  36,
  505,
  821,
  11,
  550,
  61,
  418,
  6,
  67,
  2,
  17,
  1398,
  34,
  62,
  76,
  3,
  3146,
  5,
  49,
  11,
  425,
  16,
  3,
  506,
  89,
  12195,
  1351,
  2,
  64,
  303,
  223,
  15,
  2,
  8010,
  770,
  3018,
  25,
  13,
  9,
  98,
  1617,
  19,
  2,
  214,
  564,
  5,
  2,
  8868,
  2455,
  402,
  2318,
  22156,
  30,
  694,
  1190,
  133,
  304,
  7681,
  5,
  2,
  200,
  11754,
  2259,
  4,
  28,
  323,
  1054,
  57651,
  25367,
  144,
  525,
  10,
  28,
  976,
  2983,
  28,
  328,
  15,
  342,
  4572,
  3,
  456,
  5,
  21,
  2042,
  1862,
  3,
  1570,
  1061,
  201,
  31,
  3,
  3001,
  20,
  9431,
  32961,
  19,
  11383,
  28467,
  57652,
  28467,
  15,
  3,
  

In [16]:
# Chuẩn hóa độ dài của các chuỗi trong x_train thành cùng một độ dài.
ptrs = pad_sequences(x_train, maxlen=ml, padding='post', truncating='post')

In [17]:
ptrs[0].shape

(140,)

In [18]:
x_valid = tokenizers.texts_to_sequences(x_valid)
ptes = pad_sequences(x_valid, maxlen= ml, truncating= 'post', padding = "post")
x_test = tokenizers.texts_to_sequences(x_test)
ptst = pad_sequences(x_test, maxlen = ml, truncating= 'post', padding = 'post')

In [34]:
scores_model = []

# LSTM


In [37]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, ed, input_length=ml, trainable=False),
    tf.keras.layers.LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
earlystoping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience = 2, min_lr = 1e-6)

In [38]:
model.fit(ptrs, y_train, epochs=10, validation_data=(ptes, y_valid), shuffle=True, callbacks=[earlystoping, reduce_lr])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1eddccc7650>

In [39]:
scores = model.predict(ptst)
print("Auc: %.2f%%" % (roc_auc(scores,y_test)))

Auc: 0.64%


In [40]:
scores_model.append({'Model': 'LSTM','AUC_Score': (roc_auc(scores,y_test))})

In [41]:
model.save('lstm.h5py')

INFO:tensorflow:Assets written to: lstm.h5py\assets


INFO:tensorflow:Assets written to: lstm.h5py\assets


# BIDIRECTIONAL

In [29]:
model1 = Sequential()
embedding = Embedding(vocab_size, ed, input_length=ml, trainable=False)
model1.add(embedding)
model1.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3))) 
model1.add(Dense(1, activation='sigmoid'))


model1.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
earlystoping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience = 2, min_lr = 1e-6)






In [30]:
model1.fit(ptrs, y_train, epochs = 10, validation_data = (ptes, y_valid), callbacks = [earlystoping, reduce_lr], shuffle=True)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1ed935f0890>

In [31]:
scores1 = model1.predict(ptst)
print("Auc: %.2f%%" % (roc_auc(scores1,y_test)))

Auc: 0.74%


In [35]:
scores_model.append({'Model': 'Bidirectional','AUC_Score': (roc_auc(scores1,y_test))})

In [36]:
model1.save("bidirectional.h5py")

INFO:tensorflow:Assets written to: bidirectional.h5py\assets


INFO:tensorflow:Assets written to: bidirectional.h5py\assets


# GRU

In [44]:
model2 = Sequential()
embedding = Embedding(vocab_size, ed, input_length=ml, trainable=False)
model2.add(embedding)
model2.add(GRU(100, dropout=0.3, recurrent_dropout=0.3))
model2.add(Dense(1, activation = 'sigmoid'))

model2.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics =['accuracy']   )
earlystoping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience = 2, min_lr = 1e-6)


In [45]:
model2.fit(ptrs, y_train, epochs = 10, validation_data = (ptes, y_valid), callbacks = [earlystoping, reduce_lr], shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1edd7c6a0d0>

In [46]:
scores2 = model.predict(ptst)
print("Auc: %.2f%%" % (roc_auc(scores2,y_test)))

Auc: 0.64%


In [47]:
scores_model.append({'Model': 'Gru','AUC_Score': (roc_auc(scores2,y_test))})

In [48]:
model2.save("Gru.h5py")

INFO:tensorflow:Assets written to: Gru.h5py\assets


INFO:tensorflow:Assets written to: Gru.h5py\assets


# Model evaluation

In [50]:
results = pd.DataFrame(scores_model).sort_values(by='AUC_Score',ascending=False)
results.style.background_gradient(cmap='Blues')

Unnamed: 0,Model,AUC_Score
0,Bidirectional,0.740875
1,LSTM,0.636178
2,Gru,0.636178


# Model predict

In [49]:
Gru = load_model("Gru.h5py")
LSTM = load_model("lstm.h5py")
BIDR = load_model("bidirectional.h5py")


In [56]:
test_sen = [""""Your Lie in April" is a deceptively masterful series. It's not just a first love adventure, but delves into the reality of PTSD, childhood abuse, and chronic diseases. This is a series about learning to heal, move on, accept parts of ourselves we'd rather pretend are missing. The animation is beautiful with bright, vibrant colors that bring the characters to life. The sound is perfectly timed and the soundtrack cleverly combines the emotions meant to be portrayed in each scene. Structurally, it's a solid early life story that dramatizes certain aspects but never detracts from intended message.
            """]

test_seq = tokenizers.texts_to_sequences(test_sen)

padded_test_seq = pad_sequences(test_seq, maxlen=ml, truncating= "post", padding= "post")
y_pred = BIDR.predict(padded_test_seq)
print(y_pred)
y_pred = y_pred.astype(float)
if y_pred < 0.5:
    print('Predict: Negative')
else:
    print('Predict: Positive')

[[0.6471304]]
Predict: Positive
