In [1]:
!pip install sentencepiece



In [2]:
!pip install tf-keras



In [3]:
!pip install vncorenlp



In [4]:
import sys
import numpy as np
import pandas as pd
import tensorflow as tf

In [5]:
# --- Model / training settings ---------------------------------------------
# Hugging Face checkpoint id handed to the tokenizer loader further down.
PRETRAINED_MODEL = 'vinai/phobert-base'
MODEL_NAME = 'HotelReviewPhoBert'
# Also passed to the text preprocessor as its max correction length and to the
# tokenization step.
MAX_LENGTH = 256
BATCH_SIZE = 25
EPOCHS = 20

# --- Dataset locations (VLSP 2018 hotel CSV splits) -------------------------
TRAIN_PATH = 'datasets/vlsp2018_hotel/1-VLSP2018-SA-Hotel-train.csv'
VAL_PATH = 'datasets/vlsp2018_hotel/2-VLSP2018-SA-Hotel-dev.csv'
TEST_PATH = 'datasets/vlsp2018_hotel/3-VLSP2018-SA-Hotel-test.csv'

In [6]:
# NOTE(review): the module name 'vlsp2018_preprocessping' looks misspelled —
# confirm it matches the file name on disk before renaming anything.
from preprocessors.vlsp2018_preprocessping import VLSP2018Loader

# Load the train / dev / test CSV splits into a single DatasetDict. In the
# recorded output each split has a 'Review' text column plus one column per
# ENTITY#ATTRIBUTE aspect, and the train split has 3000 rows.
raw_datasets = VLSP2018Loader.load(TRAIN_PATH, VAL_PATH, TEST_PATH)
# Bare last expression so the notebook renders the DatasetDict summary.
raw_datasets

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['Review', 'FACILITIES#CLEANLINESS', 'FACILITIES#COMFORT', 'FACILITIES#DESIGN&FEATURES', 'FACILITIES#GENERAL', 'FACILITIES#MISCELLANEOUS', 'FACILITIES#PRICES', 'FACILITIES#QUALITY', 'FOOD&DRINKS#MISCELLANEOUS', 'FOOD&DRINKS#PRICES', 'FOOD&DRINKS#QUALITY', 'FOOD&DRINKS#STYLE&OPTIONS', 'HOTEL#CLEANLINESS', 'HOTEL#COMFORT', 'HOTEL#DESIGN&FEATURES', 'HOTEL#GENERAL', 'HOTEL#MISCELLANEOUS', 'HOTEL#PRICES', 'HOTEL#QUALITY', 'LOCATION#GENERAL', 'ROOMS#CLEANLINESS', 'ROOMS#COMFORT', 'ROOMS#DESIGN&FEATURES', 'ROOMS#GENERAL', 'ROOMS#MISCELLANEOUS', 'ROOMS#PRICES', 'ROOMS#QUALITY', 'ROOM_AMENITIES#CLEANLINESS', 'ROOM_AMENITIES#COMFORT', 'ROOM_AMENITIES#DESIGN&FEATURES', 'ROOM_AMENITIES#GENERAL', 'ROOM_AMENITIES#MISCELLANEOUS', 'ROOM_AMENITIES#PRICES', 'ROOM_AMENITIES#QUALITY', 'SERVICE#GENERAL'],
        num_rows: 3000
    })
    val: Dataset({
        features: ['Review', 'FACILITIES#CLEANLINESS', 'FACILITIES#COMFORT', 'FACILITIES#DESIGN&FEATUR

In [7]:
from preprocessors.vietnamese_preprocessing import VietnameseTextPreprocessor

# Build the Vietnamese text preprocessor with a project-specific table of extra
# teencodes / abbreviations: each key is the canonical phrase and each value
# lists the informal spellings that map to it.
#
# Be careful with single-word replacements for acronyms, because they can cause
# misinterpretation. For example, 'giá': ['price', 'gia'] would also replace the
# word 'gia' in 'gia đình', turning it into 'giá đình'.
#
# NOTE(review): vncorenlp_dir is relative to the working directory and points
# one level up ('../processors'), whereas the imports use a local
# 'preprocessors' package — confirm the path.
vn_preprocessor = VietnameseTextPreprocessor(
    vncorenlp_dir='../processors/VnCoreNLP',
    extra_teencodes={
        'khách sạn': ['ks'],
        'nhà hàng': ['nhahang'],
        'nhân viên': ['nv'],
        'cửa hàng': ['store', 'sop', 'shopE', 'shop'],
        'sản phẩm': ['sp', 'product'],
        'hàng': ['hàg'],
        'giao hàng': ['ship', 'delivery', 'síp'],
        'đặt hàng': ['order'],
        'chuẩn chính hãng': ['authentic', 'aut', 'auth'],
        'hạn sử dụng': ['date', 'hsd'],
        'điện thoại': ['dt'],
        'facebook': ['fb', 'face'],
        'nhắn tin': ['nt', 'ib'],
        'trả lời': ['tl', 'trl', 'rep'],
        'feedback': ['fback', 'fedback'],
        'sử dụng': ['sd'],
        'xài': ['sài'],
    },
    max_correction_length=MAX_LENGTH,
)

2024-10-10 09.04.41 INFO VnCoreNLPServer - Using annotators: wseg
2024-10-10 09:04:41 INFO  WordSegmenter:24 - Loading Word Segmentation model
2024-10-10 09.04.41 INFO VnCoreNLPServer - VnCoreNLPServer is listening on http://127.0.0.1:62029
2024-10-10 09.04.41 INFO log - Logging initialized @339ms to org.eclipse.jetty.util.log.Slf4jLog
2024-10-10 09.04.41 INFO EmbeddedJettyServer - == Spark has ignited ...
2024-10-10 09.04.41 INFO EmbeddedJettyServer - >> Listening on 127.0.0.1:62029
2024-10-10 09.04.41 INFO Server - jetty-9.4.z-SNAPSHOT, build timestamp: 2017-11-22T04:27:37+07:00, git hash: 82b8fb23f757335bb3329d540ce37a2a2615f0a8
2024-10-10 09.04.41 INFO session - DefaultSessionIdManager workerName=node0
2024-10-10 09.04.41 INFO session - No SessionScavenger set, using defaults
2024-10-10 09.04.41 INFO session - Scavenging every 600000ms
2024-10-10 09.04.41 INFO AbstractConnector - Started ServerConnector@1064b16b{HTTP/1.1,[http/1.1]}{127.0.0.1:62029}
2024-10-10 09.04.41 INFO Server 

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

# Round-trip check on a sentence that is already word-segmented (multi-syllable
# words joined with '_'): encode to ids, then decode back to text.
sample_sentence = 'Tôi là sinh_viên trường đại_học Công_nghệ thông_tin .'
tokens = tokenizer.encode(sample_sentence)
decoded_sentence = tokenizer.decode(tokens)
print('Encode:', tokens, '\nDecode:', decoded_sentence)

# Show which input fields this tokenizer declares for the model.
tokenizer.model_input_names

Encode: [0, 218, 8, 649, 212, 956, 2413, 195, 5, 2] 
Decode: <s> Tôi là sinh_viên trường đại_học Công_nghệ thông_tin . </s>




['input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# Run the project's preprocess-and-tokenize step over every split of the raw
# datasets, using the Vietnamese preprocessor and the tokenizer built above.
preprocessed_datasets = VLSP2018Loader.preprocess_and_tokenize(raw_datasets, vn_preprocessor, tokenizer, BATCH_SIZE * 2, MAX_LENGTH)

# Persist the result so it can be reloaded from disk later.
# NOTE(review): this saves under '../datasets/', while the raw CSVs are read from
# 'datasets/' (no '..') — confirm which directory is intended.
preprocessed_datasets.save_to_disk('../datasets/preprocessed_hotel')
display(preprocessed_datasets)

# Side-by-side sanity check on a handful of training rows: raw text, encoded ids,
# and the ids decoded back to text.
# Slice the rows first and then pick the column, so only these rows are
# materialised instead of the whole column.
PREVIEW_ROWS = slice(1480, 1490)
raw_reviews_preview = raw_datasets['train'][PREVIEW_ROWS]['Review']
input_ids_preview = preprocessed_datasets['train'][PREVIEW_ROWS]['input_ids']

pd.DataFrame({
    'raw_datasets': raw_reviews_preview,
    'encoded_input_ids': input_ids_preview,
    # Decode the ids already fetched above rather than re-indexing the dataset.
    'decoded_input_ids': [tokenizer.decode(ids) for ids in input_ids_preview],
})