In [None]:
import os
import re
import pickle
import pandas as pd
import numpy as np
import seaborn as sn
import keras
from keras.preprocessing import text, sequence
from keras.layers import Input, Dense, Embedding, Flatten, Conv1D, MaxPooling1D, Bidirectional, LSTM, GRU, concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D, SpatialDropout1D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.models import Model, Sequential
from keras import backend as K
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from tensorflow.keras.callbacks import EarlyStopping
# Early stopping watches the validation loss (model.fit is given validation_data),
# waits 10 epochs without improvement, then restores the best weights seen so far.
# Monitoring the training loss instead would not react to overfitting.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Shared encoder that maps the values of the 'Label' column to integer class ids.
lb_enc = LabelEncoder()
import nltk
import string 

In [None]:
# Load the three pre-split spreadsheets (train / dev / test) in one pass.
data_train, data_dev, data_test = (
    pd.read_excel(xlsx_path)
    for xlsx_path in (
        '/kaggle/input/spam-review-detection/data_train.xlsx',
        '/kaggle/input/spam-review-detection/data_dev.xlsx',
        '/kaggle/input/spam-review-detection/data_test.xlsx',
    )
)

In [None]:
# Learn the label -> integer mapping from the training labels only, then reuse
# that exact mapping for dev and test. Re-fitting the encoder on every split
# could assign different integers to the same label whenever a split does not
# contain the same set of classes as the training data.
y_train = lb_enc.fit_transform(data_train['Label'])

# transform() raises ValueError if a split contains a label unseen in training,
# which surfaces data problems instead of silently re-numbering the classes.
y_dev = lb_enc.transform(data_dev['Label'])

y_test = lb_enc.transform(data_test['Label'])

In [None]:
# Build the vocabulary from the training split only: fitting on dev/test text
# leaks information about the held-out data into the preprocessing step.
# Words that were never seen during fitting are mapped to the '<OOV>' index
# instead of being silently dropped from the sequence.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(data_train['transformed_text'])

# Convert every split to integer sequences with the train-fitted vocabulary.
text_to_sequence_train = tokenizer.texts_to_sequences(data_train['transformed_text'])
text_to_sequence_dev = tokenizer.texts_to_sequences(data_dev['transformed_text'])
text_to_sequence_test = tokenizer.texts_to_sequences(data_test['transformed_text'])

In [None]:
# Length of the longest tokenised training text; the other cells pad the dev and
# test sequences to this same length so all model inputs share one shape.
max_length_sequence_train = max(len(seq) for seq in text_to_sequence_train)

# Left-pad ("pre") every training sequence up to that length.
padded_train = pad_sequences(text_to_sequence_train, maxlen=max_length_sequence_train,
                             padding="pre")

# Only the last expression of a cell is displayed, so the length has to be
# printed explicitly; a bare name in the middle of the cell shows nothing.
print("max_length_sequence_train:", max_length_sequence_train)
padded_train

In [None]:
# Left-pad the dev sequences to the training maximum length, then show how many rows there are.
padded_dev = pad_sequences(
    text_to_sequence_dev,
    padding="pre",
    maxlen=max_length_sequence_train,
)
len(padded_dev)

In [None]:
# Left-pad the test sequences to the training maximum length, then show how many rows there are.
padded_test = pad_sequences(
    text_to_sequence_test,
    padding="pre",
    maxlen=max_length_sequence_train,
)
len(padded_test)

In [None]:
# Embedding hyper-parameters.
embed_size = 400       # dimensionality of each word vector
max_features = 10_000  # NOTE(review): not referenced in the visible cells - confirm it is still needed

In [None]:
# One row per tokenizer index plus one extra row for index 0, which the
# Keras tokenizer never assigns to a word (it is the padding value).
VOC_SIZE = 1 + len(tokenizer.word_index)
# Initial Embedding weights: an all-zero (VOC_SIZE, embed_size) matrix.
embedding_matrix = np.zeros(shape=(VOC_SIZE, embed_size))

def get_model(embedding_weights=None):
    """Build and compile the CNN + bidirectional-GRU binary classifier.

    Architecture: Embedding -> SpatialDropout1D -> three parallel Conv1D/MaxPooling1D
    branches (kernel sizes 3, 5, 6) concatenated along the time axis ->
    Bidirectional GRU -> global average + global max pooling -> sigmoid unit.

    Reads the module-level names ``max_length_sequence_train``, ``VOC_SIZE``,
    ``embed_size`` and (by default) ``embedding_matrix``.

    Args:
        embedding_weights: optional (VOC_SIZE, embed_size) array used to
            initialise the Embedding layer. Defaults to the module-level
            ``embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    if embedding_weights is None:
        embedding_weights = embedding_matrix

    # An all-zero matrix is only a placeholder: starting every token from the
    # same zero vector makes the first forward pass constant and is a poor
    # initialisation. Pass explicit weights only when they carry information;
    # otherwise let Keras use its default random embedding initialiser.
    embedding_kwargs = {}
    if np.any(embedding_weights):
        embedding_kwargs['weights'] = [embedding_weights]

    inp = Input(shape=(max_length_sequence_train,), dtype='int32')
    x = Embedding(VOC_SIZE, embed_size, **embedding_kwargs)(inp)
    x = SpatialDropout1D(0.3)(x)

    # Parallel convolution branches with different receptive fields, each
    # followed by an overlapping (stride 1) max-pool.
    pooled_branches = []
    for kernel_size in (3, 5, 6):
        conv = Conv1D(128, kernel_size=kernel_size, kernel_initializer='normal',
                      padding='valid', activation='elu')(x)
        pooled_branches.append(MaxPooling1D(3, strides=1, padding='valid')(conv))

    # All branches have 128 channels, so they can be stacked along the time axis.
    z = Concatenate(axis=1)(pooled_branches)
    z = Bidirectional(GRU(40, return_sequences=True))(z)

    # Summarise the GRU output sequence with both its mean and its max over time.
    avg_pool = GlobalAveragePooling1D()(z)
    max_pool = GlobalMaxPooling1D()(z)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# Instantiate the compiled network and print its layer-by-layer summary.
model = get_model()
model.summary()

In [None]:
# Train for at most 40 epochs in batches of 64, validating on the dev split after
# each epoch; the early-stopping callback may end training sooner. The returned
# History object is kept in `hist`.
hist = model.fit(
    x=padded_train,
    y=y_train,
    validation_data=(padded_dev, y_dev),
    epochs=40,
    batch_size=64,
    callbacks=[early_stop],
    verbose=2,
)

In [None]:
# Smoke-test the trained model on a single hand-written review.
# NOTE(review): the raw string goes straight into the tokenizer - confirm whether
# it should first receive the same preprocessing that produced 'transformed_text'.
sms_test = ['quyểnnnnnnnnnnnnnnnnnnnn vở này đẹp nhaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa']
sms_seq = tokenizer.texts_to_sequences(sms_test)

# Same left-padding and length as the training inputs.
sms_pad = pad_sequences(
    sms_seq,
    padding='pre',
    maxlen=max_length_sequence_train,
)
model.predict(sms_pad)

In [None]:
# Evaluate on the TRAINING split: sigmoid outputs are turned into 0/1 labels by rounding.
y_pred = model.predict(padded_train)
y_pred_train = np.round(y_pred)
# Rounding already-rounded values changes nothing, so the class labels are a plain copy.
y_pred_classes_train = y_pred_train.copy()

acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)
prec = precision_score(y_true=y_train, y_pred=y_pred_train)
f1_macro = f1_score(y_true=y_train, y_pred=y_pred_classes_train, average='macro')
cm = confusion_matrix(y_true=y_train, y_pred=y_pred_train)

print("Accuracy: ", acc)
print("Precision: ", prec)
print("F1 Macro: ", f1_macro)
print("Confusion Matrix: \n", cm)

In [None]:
# Evaluate on the DEV (validation) split: sigmoid outputs are turned into 0/1 labels by rounding.
y_pred = model.predict(padded_dev)
y_pred_dev = np.round(y_pred)
# Rounding already-rounded values changes nothing, so the class labels are a plain copy.
y_pred_classes_dev = y_pred_dev.copy()

acc = accuracy_score(y_true=y_dev, y_pred=y_pred_dev)
prec = precision_score(y_true=y_dev, y_pred=y_pred_dev)
f1_macro = f1_score(y_true=y_dev, y_pred=y_pred_classes_dev, average='macro')
cm = confusion_matrix(y_true=y_dev, y_pred=y_pred_dev)

print("Accuracy: ", acc)
print("Precision: ", prec)
print("F1 Macro: ", f1_macro)
print("Confusion Matrix: \n", cm)

In [None]:
# Evaluate on the held-out TEST split: sigmoid outputs are turned into 0/1 labels by rounding.
y_pred = model.predict(padded_test)
y_pred_test = np.round(y_pred)
# Rounding already-rounded values changes nothing, so the class labels are a plain copy.
y_pred_classes_test = y_pred_test.copy()

acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
prec = precision_score(y_true=y_test, y_pred=y_pred_test)
f1_macro = f1_score(y_true=y_test, y_pred=y_pred_classes_test, average='macro')
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_test)

print("Accuracy: ", acc)
print("Precision: ", prec)
print("F1 Macro: ", f1_macro)
print("Confusion Matrix: \n", cm)