In [123]:
!pip install pyvi



In [124]:
import pandas as pd 
import numpy as np
from string import digits
from pyvi import ViTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
%matplotlib inline

In [151]:
# Read both sentiment splits (tab-separated files) and pull out label/text arrays.
def _read_sentiment_csv(path):
    """Load one tab-separated split and name its two columns 'Class' and 'Data'."""
    frame = pd.read_csv(path, sep='\t')
    frame.columns = ['Class', 'Data']
    return frame


data_train = _read_sentiment_csv("vlsp_sentiment_train.csv")
train_labels, train_texts = data_train['Class'].values, data_train['Data'].values
print("Training data\n", data_train.head(5))

data_test = _read_sentiment_csv("vlsp_sentiment_test.csv")
test_labels, test_texts = data_test['Class'].values, data_test['Data'].values
print("Testing data\n", data_test.head(5))

Training data
    Class                                               Data
0     -1  Mình đã dùng anywhere thế hệ đầu, quả là đầy t...
1     -1  Quan tâm nhất là độ trễ có cao không, dùng thi...
2     -1  dag xài con cùi bắp 98k....pin trâu, mỗi tội đ...
3     -1  logitech chắc hàng phải tiền triệu trở lên dùn...
4     -1  Đang xài con m175 cùi mía , nhà xài nhiều chuộ...
Testing data
    Class                                               Data
0     -1  Nói thiệt là mình thì thì chuột nào mình cũng ...
1     -1  Đang dùng mx1. Cũng ngon nhưng chưa đầy năm mà...
2     -1  Chưa thấy đc điểm thuyết phục để mua, nhất là ...
3     -1  Những phần xem báo tra cứu bản đồ, dịch vụ.. d...
4     -1  ĐÚNG LÀ MUA Ở VIỆT NAM KHÔNG ỨNG DỤNG ĐƯỢC GÌ ...


In [126]:
# Sanity check: (rows, columns) of the train split, then of the test split.
print(data_train.shape, data_test.shape, sep="\n")

(5100, 2)
(1050, 2)


In [127]:
# First column -> labels, second column -> review texts (training split).
labels, reviews = (data_train.iloc[:, position].values for position in (0, 1))

In [131]:
# Encoding labels
# -1 = Negative = [1,0,0]
#  0 = Neutral   = [0,1,0]
#  1 = Positive = [0,0,1]
def encoding_labels(labels):
    """One-hot encode sentiment labels into a 3-column integer array.

    Args:
        labels: iterable of class labels. -1 maps to [1, 0, 0] and 0 maps to
            [0, 1, 0]. Every other value (including 1) maps to [0, 0, 1];
            unexpected labels are therefore NOT rejected, they are treated
            as positive.

    Returns:
        np.ndarray of shape (len(labels), 3). An empty input yields an array
        of shape (0, 3) so downstream code can always rely on two dimensions.
    """
    non_positive = {-1: [1, 0, 0], 0: [0, 1, 0]}
    positive = [0, 0, 1]
    encoded_labels = [non_positive.get(label, positive) for label in labels]
    # reshape(-1, 3) keeps the column axis even when there are no rows.
    return np.array(encoded_labels, dtype=int).reshape(-1, 3)

In [133]:
# Text-vectorisation settings
EMBEDDING_DIM = 400          # size of each word vector
MAX_VOCAB_SIZE = 10000       # number of unique words kept (rows of the embedding matrix)
MAX_SEQUENCE_LENGTH = 300    # number of words of a comment that are used

# Step 1: drop every ASCII digit character from the raw training reviews.
texts_processed = [
    ''.join(ch for ch in review if ch not in digits)
    for review in train_texts
]

# Step 2: lower-case, run the PyVi tokenizer, then split on whitespace.
texts_tokenized = [
    ViTokenizer.tokenize(cleaned.lower()).split()
    for cleaned in texts_processed
]

# Step 3: fit the vocabulary on the training tokens and map tokens to integer ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(texts_tokenized)
word_index = tokenizer.word_index
sequences_train = tokenizer.texts_to_sequences(texts_tokenized)

# Step 4: fixed-length model inputs and one-hot targets.
X_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
y_train = encoding_labels(train_labels)

In [134]:
# Report the shapes of the padded inputs and of the one-hot targets.
shape_report = {
    'Shape of X train and X validation tensor:': X_train.shape,
    'Shape of label train and validation tensor:': y_train.shape,
}
for caption, shape in shape_report.items():
    print(caption, shape)

Shape of X train and X validation tensor: (5100, 300)
Shape of label train and validation tensor: (5100, 3)


In [135]:
from gensim.models.keyedvectors import KeyedVectors

# Load the word2vec vectors stored in binary format.
word_vectors = KeyedVectors.load_word2vec_format(fname='./vi-model-CBOW.bin', binary=True)

# The matrix has one row per token id, capped at MAX_VOCAB_SIZE rows.
# Rows that are never assigned in the loop keep their initial zeros.
vocabulary_size = min(len(word_index) + 1, MAX_VOCAB_SIZE)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < MAX_VOCAB_SIZE:
        try:
            embedding_matrix[i] = word_vectors[word]
        except KeyError:
            # Word has no pre-trained vector: draw one from a normal
            # distribution with mean 0 and standard deviation sqrt(0.25).
            # NOTE(review): no random seed is set here, so these rows differ between runs.
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

# The full vector table is no longer needed once the matrix is filled.
del word_vectors

from keras.layers import Embedding
# NOTE(review): the model-definition cell rebinds `embedding_layer` to its own
# Embedding output, so this layer object looks unused — confirm before removing.
embedding_layer = Embedding(
    input_dim=vocabulary_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=True,
)

In [136]:
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, concatenate, Flatten, Reshape, Dropout, Dense
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, CSVLogger, ModelCheckpoint

In [1]:
# CNN hyperparameters
# The model input length is tied to the padding length used for X_train/X_test
# so the two cannot drift apart.
SEQUENCE_LENGTH = MAX_SEQUENCE_LENGTH
FILTER_SIZES = [3, 4, 5]  # one convolution branch is built per entry (needs >= 2 entries for concatenate)
NUM_FILTERS = 100
DROP_RATE = 0.5
REGULIZERS_LAMBDA = 0.01

# Optimizer hyperparameters
LEARNING_RATE = 0.001
BETA_1 = 0.9
BETA_2 = 0.999
EPSILON = 1e-08

# Define the CNN model
input_layer = Input(shape=(SEQUENCE_LENGTH,))

# Embedding initialised from the pre-built embedding_matrix and fine-tuned during training.
embedding_layer = Embedding(input_dim=vocabulary_size,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)(input_layer)

# One Conv1D branch per filter size, all reading the same embedding output.
conv_layers = [
    Conv1D(NUM_FILTERS,
           filter_size,
           activation='relu',
           kernel_regularizer=regularizers.l2(REGULIZERS_LAMBDA))(embedding_layer)
    for filter_size in FILTER_SIZES
]

# The pool window is SEQUENCE_LENGTH - filter_size + 1, i.e. the length of the
# convolution output under Conv1D's default 'valid' padding, so each branch is
# reduced to a single time step (max over the whole sequence).
maxpool_layers = [
    MaxPooling1D(SEQUENCE_LENGTH - filter_size + 1, strides=1)(conv_layer)
    for filter_size, conv_layer in zip(FILTER_SIZES, conv_layers)
]

merged_tensor = concatenate(maxpool_layers, axis=1)
# Flatten already yields one vector of len(FILTER_SIZES) * NUM_FILTERS features;
# the former Reshape layer was never connected to the output and has been dropped.
flatten = Flatten()(merged_tensor)
dropout = Dropout(DROP_RATE)(flatten)
output_layer = Dense(units=3, activation='softmax', kernel_regularizer=regularizers.l2(REGULIZERS_LAMBDA))(dropout)


model = Model(input_layer, output_layer)

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=LEARNING_RATE, beta_1=BETA_1, beta_2=BETA_2, epsilon=EPSILON),
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

# Early stopping hyperparameters
MIN_DELTA = 0.01
PATIENCE = 4

# Define callbacks
# - stop when val_loss has not improved by MIN_DELTA for PATIENCE epochs
# - log per-epoch metrics to CNN_log.csv (overwritten on each run, ';' separated)
# - write the model to CNN_checkpoint.keras (save_best_only=False, so not restricted to the best epoch)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=MIN_DELTA, patience=PATIENCE, verbose=1)
csv_logger = CSVLogger('CNN_log.csv', append=False, separator=';')
checkpoint = ModelCheckpoint('CNN_checkpoint.keras',
                              save_best_only=False,
                              verbose=0)
callbacks_list = [early_stopping, csv_logger, checkpoint]

NameError: name 'Input' is not defined

In [141]:
# Keras carves the validation_split fraction off the END of the arrays *before*
# any shuffling. The data preview shows only class -1 in the first rows, so the
# file is presumably ordered by class (confirm) — without a prior shuffle the
# validation set would then contain a single class. Shuffle rows once, with a
# fixed seed so the train/validation split is reproducible.
SPLIT_SEED = 42
train_order = np.random.default_rng(SPLIT_SEED).permutation(len(X_train))

model.fit(X_train[train_order], y_train[train_order], validation_split=0.2,
          epochs=5, batch_size=256, callbacks=callbacks_list, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x154f408e530>

In [168]:
# Raw review texts of the test split (second column).
reviews_test = data_test.iloc[:, 1].values

# Drop every ASCII digit character, mirroring the training preprocessing.
texts_processed = [
    ''.join(ch for ch in review_test if ch not in digits)
    for review_test in reviews_test
]

#Use PyVi for Vietnamese word tokenizer
all_words = []  # NOTE(review): never filled or read in this cell
word_reviews_test = [
    ViTokenizer.tokenize(cleaned.lower()).split()
    for cleaned in texts_processed
]

# Map tokens to ids with the tokenizer that was fitted on the training texts.
sequences_test = tokenizer.texts_to_sequences(word_reviews_test)

X_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
y_test = encoding_labels(test_labels)

In [170]:
# Evaluate on the held-out test split; score[0] is the loss, score[1] the accuracy metric.
score = model.evaluate(X_test, y_test)

# The loss is a raw value, not a percentage, so it is printed without a '%' sign.
print("%s: %.4f" % (model.metrics_names[0], score[0]))
# Accuracy is a fraction in [0, 1]; scale it to a percentage for display.
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))


loss: 3.93%
accuracy: 57.71%
