In [None]:
%cd /content/drive/MyDrive/Đồ Án Thu Thập Tiền Xử Lý/code

/content/drive/MyDrive/Đồ Án Thu Thập Tiền Xử Lý/code


In [None]:
!pip install underthesea
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
TRAIN_DIR = 'data/train.csv'
VAL_DIR   = 'data/dev.csv'
TEST_DIR  = 'data/test.csv'

# 1. Data Preparation

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [None]:
train = pd.read_csv(TRAIN_DIR)
test = pd.read_csv(TEST_DIR)
dev = pd.read_csv(VAL_DIR)

In [None]:
def read_file(df):
    """Split a labelled review table into texts and an integer label matrix.

    The input frame is left untouched, so the cell that calls this function
    can be re-run without reloading the CSV (the previous ``df.pop`` removed
    the ``review`` column in place and a second call raised ``KeyError``).

    :param df: DataFrame with a ``review`` text column; every other column is
        treated as a label column holding NaN, 'Positive', 'Negative' or
        'Neutral'.
    :return: ``(X, y)`` where ``X`` is the ``review`` Series and ``y`` is a
        uint8 DataFrame with NaN -> 0, Positive -> 1, Negative -> 2,
        Neutral -> 3.
    """
    X = df['review']
    label_codes = {np.nan: 0,
                   'Positive': 1,
                   'Negative': 2,
                   'Neutral': 3}
    # drop() returns a new frame, so the caller's DataFrame keeps its columns.
    y = df.drop(columns='review').replace(label_codes).astype(np.uint8)

    print('X.shape:', X.shape, 'y.shape:', y.shape)
    return X, y

In [None]:
# Aspect category names in "Entity#Attribute" form.
# The report helpers look names up by label-column position (aspects[index]).
# NOTE(review): the order presumably matches the label columns of the CSV files - confirm.
aspects = ['Hotel#General',
          'Hotel#Price',
          'Hotel#Designs_Features',
          'Hotel#Cleanliness',
          'Hotel#Comfort',
          'Hotel#Quality',
          'Hotel#Miscellaneous',
          'Rooms#General',
          'Rooms#Price',
          'Rooms#Designs_Features',
          'Rooms#Cleanliness',
          'Rooms#Comfort',
          'Rooms#Quality',
          'Rooms#Miscellaneous',
          'Room_amenities#General',
          'Room_amenities#Price',
          'Room_amenities#Designs_Features',
          'Room_amenities#Cleanliness',
          'Room_amenities#Comfort',
          'Room_amenities#Quality',
          'Room_amenities#Miscellaneous',
          'Facilities#General',
          'Facilities#Price',
          'Facilities#Designs_Features',
          'Facilities#Cleanliness',
          'Facilities#Comfort',
          'Facilities#Quality',
          'Facilities#Miscellaneous',
          'Service#General',
          'Location#General',
          'Food_Drinks#Price',
          'Food_Drinks#Quality',
          'Food_Drinks#Style_Options',
          'Food_Drinks#Miscellaneous'
          ]

# Integer label code -> sentiment name; the codes mirror the mapping applied in read_file
# (0 = aspect not mentioned, 1 = positive, 2 = negative, 3 = neutral).
replacements = {0: None, 1: 'positive', 2: 'negative', 3: 'neutral'}

In [None]:
X_train, y_train = read_file(train)
X_dev,   y_dev   = read_file(dev)
X_test,  y_test  = read_file(test)

X.shape: (4805,) y.shape: (4805, 34)
X.shape: (1602,) y.shape: (1602, 34)
X.shape: (1602,) y.shape: (1602, 34)


# 2. Text Preprocess


## 2.1. Chuẩn hóa Unicode tiếng Việt

In [None]:
import regex as re

uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"


def loaddicchar():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic


dicchar = loaddicchar()

# Đưa toàn bộ dữ liệu qua hàm này để chuẩn hóa lại
def covert_unicode(txt):
    """Normalize text to precomposed (NFC) Unicode.

    Every piece of text should pass through this function before any other
    processing, so that a Vietnamese letter written as a base letter followed
    by combining marks becomes the single precomposed code point that the
    later per-character lookups expect.

    NFC composition is used instead of a fixed substitution table because it
    also handles sequences such a table can miss, e.g. ``a`` + U+0302 -> ``â``,
    and it does not depend on any module-level dictionary.

    :param txt: input string.
    :return: the NFC-normalized string.
    """
    import unicodedata  # local import keeps this cell self-contained

    return unicodedata.normalize('NFC', txt)

## 2.2. Chuẩn hóa kiểu gõ dấu

In [None]:
import re
import os
import sys
import pandas as pd
# from Logger import LogEventSourcing
from datetime import datetime
import dateutil.parser
import traceback
import time
import requests


# logger = LogEventSourcing()

def call_api(data, url, method, timeout=3):
    """Send a form-encoded HTTP request and return the raw response.

    :param data: request body handed to ``requests.request`` as ``data``.
    :param url: target URL.
    :param method: HTTP verb, e.g. ``'POST'``.
    :param timeout: timeout passed through to ``requests.request`` (default 3).
    :return: the response object returned by ``requests.request``.
    """
    # NOTE(review): the 'postman-token' value looks like a leftover from a
    # Postman export - confirm whether the server actually needs it.
    request_headers = {
        'content-type': "application/x-www-form-urlencoded",
        'cache-control': "no-cache",
        'postman-token': "6a410524-a8e2-79c7-bd9d-53e4b68c84c7"
    }
    return requests.request(method, url, data=data,
                            headers=request_headers, timeout=timeout)


uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"


def loaddicchar():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic


dicchar = loaddicchar()


def convertwindown1525toutf8(txt):
    """Return ``txt`` with Vietnamese letters in precomposed (NFC) form.

    Uses the standard library's NFC composition rather than a fixed
    substitution table, so base-letter + combining-mark sequences that a
    table does not list (for example ``a`` + U+0302 -> ``â``) are composed
    as well.

    NOTE(review): the function name suggests input from a Windows code page;
    only Unicode composition is performed here, no byte-level decoding.

    :param txt: input string.
    :return: the NFC-normalized string.
    """
    import unicodedata  # local import keeps this cell self-contained

    return unicodedata.normalize('NFC', txt)

"""
    Start section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey
    Ví dụ: thủy = thuyr, tượng = tuwowngj
"""
# Vowel table: one row per base vowel.
#   column 0     - the vowel without a tone mark
#   columns 1..5 - the same vowel carrying each of the five tone marks
#                  (grave, acute, hook above, tilde, dot below in the 'a' row)
#   last column  - the Telex spelling of the base vowel (e.g. 'aw' for 'ă')
bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
# Telex key appended for each tone column of the table above (index 0 = no tone).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse lookup: vowel character -> (row, column) inside bang_nguyen_am.
# The last entry of every row is a Telex spelling rather than a character,
# so it is left out of the lookup.
nguyen_am_to_ids = {}

for i, vowel_row in enumerate(bang_nguyen_am):
    for j, vowel in enumerate(vowel_row[:-1]):
        nguyen_am_to_ids[vowel] = (i, j)


def vn_word_to_telex_type(word):
    """Spell one word the way it is typed with Telex.

    Each vowel found in ``nguyen_am_to_ids`` is replaced by the Telex spelling
    stored in the last column of its ``bang_nguyen_am`` row; the key for the
    last tone mark seen is appended once at the end of the word. Characters
    that are not in the vowel table are copied unchanged.

    :param word: a single word.
    :return: the Telex spelling of the word.
    """
    tone_column = 0
    pieces = []
    for ch in word:
        row, column = nguyen_am_to_ids.get(ch, (-1, -1))
        if row == -1:
            pieces.append(ch)
        else:
            if column != 0:
                tone_column = column
            pieces.append(bang_nguyen_am[row][-1])
    pieces.append(bang_ky_tu_dau[tone_column])
    return ''.join(pieces)


def vn_sentence_to_telex_type(sentence):
    """Convert an accented Vietnamese sentence to its Telex spelling.

    The sentence is split on whitespace, every word goes through
    ``vn_word_to_telex_type`` and the results are joined with single spaces.

    :param sentence: input sentence.
    :return: the sentence with each word spelled in Telex.
    """
    return ' '.join(vn_word_to_telex_type(token) for token in sentence.split())


"""
    End section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey
"""

"""
    Start section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý
    Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
"""


def chuan_hoa_dau_tu_tieng_viet(word):
    """Re-place the tone mark of a single word on the vowel chosen by the rules below.

    The word is returned unchanged when ``is_valid_vietnam_word`` rejects it.
    Otherwise the tone found in the word is removed from wherever it sits and
    written back on one vowel:

    * fewer than two candidate vowels: only "qu"/"gi" words are rewritten,
      every other word is returned as given;
    * a vowel from row 4 (ê) or row 8 (ơ) always takes the tone;
    * exactly two vowels: the first one takes the tone when the second vowel
      is the last character of the word, otherwise the second one does;
    * three or more vowels: the second one takes the tone.

    Because the "last character" test uses the raw word, a word with trailing
    punctuation (e.g. ``'rãi,'``) is treated as ending in a non-vowel.

    :param word: a single word (callers pass lower-cased text).
    :return: the word with its tone mark repositioned.
    """
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0  # column of the tone found in the word (0 = no tone)
    nguyen_am_index = []  # positions of the vowels that may carry the tone
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            continue
        elif x == 9:  # check qu
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # check gi
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            # remember the tone and strip it from this vowel
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        # once the qu/gi flag is set, the vowel at position 1 is not a candidate
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)
    if len(nguyen_am_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                # the word is just "qu"/"gi": the tone goes on the second letter
                x, y = nguyen_am_to_ids.get(chars[1])
                chars[1] = bang_nguyen_am[x][dau_cau]
            else:
                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    chars[2] = bang_nguyen_am[x][dau_cau]
                else:
                    # third character is not a vowel: tone goes back on the i/u
                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
            return ''.join(chars)
        return word

    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            # for index2 in nguyen_am_index:
            #     if index2 != index:
            #         x, y = nguyen_am_to_ids[chars[index]]
            #         chars[index2] = bang_nguyen_am[x][0]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        if nguyen_am_index[-1] == len(chars) - 1:
            # the vowel pair ends the word: tone on the first vowel
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            # chars[nguyen_am_index[1]] = bang_nguyen_am[x][0]
        else:
            # something follows the vowel pair: tone on the second vowel
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    else:
        # three or more vowels: tone on the second one
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
        # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[2]]]
        # chars[nguyen_am_index[2]] = bang_nguyen_am[x][0]
    return ''.join(chars)


def is_valid_vietnam_word(word):
    """Tell whether all vowels of ``word`` form one contiguous run.

    Only characters present in ``nguyen_am_to_ids`` count as vowels. A word
    with no vowel, or with a single vowel, is accepted.

    :param word: a single word.
    :return: ``False`` if two vowels are separated by another character,
        ``True`` otherwise.
    """
    vowel_positions = [pos for pos, ch in enumerate(word) if ch in nguyen_am_to_ids]
    return all(later - earlier == 1
               for earlier, later in zip(vowel_positions, vowel_positions[1:]))


def chuan_hoa_dau_cau_tieng_viet(sentence):
    """Normalize tone-mark placement for every word of a sentence (old style).

    The sentence is lower-cased and split on whitespace. Leading and trailing
    non-alphanumeric characters of each word are set aside before the word is
    normalized and re-attached afterwards. Without this, attached punctuation
    made the final vowel look non-final and the tone landed on the wrong
    vowel (``'rãi,'`` became ``'raĩ,'``).

    :param sentence: input sentence.
    :return: the lower-cased sentence, words joined by single spaces, with
        tone marks repositioned.
    """
    normalized_words = []
    for word in sentence.lower().split():
        start, end = 0, len(word)
        while start < end and not word[start].isalnum():
            start += 1
        while end > start and not word[end - 1].isalnum():
            end -= 1
        core = chuan_hoa_dau_tu_tieng_viet(word[start:end])
        normalized_words.append(word[:start] + core + word[end:])
    return ' '.join(normalized_words)


"""
    End section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý
    Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
"""
if __name__ == '__main__':
    # Smoke check of the tone-placement normalizer on two new-style spellings.
    txt = chuan_hoa_dau_cau_tieng_viet('oà uý')
    print(txt)

òa úy


## 2.3. Chuẩn hoá teencode (Read teencode.xlsx)

In [None]:
teencode = pd.read_excel("teencode.xlsx")

def chuan_hoa_teen_code(sentence):
    """Replace teen-code tokens with their normalized form.

    The sentence is split on whitespace and each token equal to an entry of
    the ``code`` column of ``teencode`` is replaced by the matching
    ``chuanhoa`` value. Rows are applied in table order, so a replacement can
    itself be rewritten by a later row, exactly as before.

    Changes from the previous version: the result is now really stripped (the
    old ``x.strip()`` discarded its return value), and tokens that match no
    code skip the row scan instead of doing one ``DataFrame.at`` lookup per
    row.

    :param sentence: input sentence.
    :return: the sentence with teen-code tokens expanded, words joined by
        single spaces.
    """
    codes = teencode["code"].tolist()
    expansions = teencode["chuanhoa"].tolist()
    known_codes = set(codes)

    tokens = sentence.split()
    for position, token in enumerate(tokens):
        if token not in known_codes:
            continue
        for code, expansion in zip(codes, expansions):
            if token == code:
                token = expansion
        tokens[position] = token
    return " ".join(tokens).strip()

## 2.4. Tách từ tiếng việt sử dụng Underthesea

In [None]:
from underthesea import word_tokenize

## 2.5. Loại bỏ Stop Word (Read Stopword.txt)

In [None]:
# Stopword.txt is read as bytes, decoded as UTF-16 and split on CRLF line ends;
# the resulting entries form the stop-word set.
with open("Stopword.txt", 'rb') as f:
    raw_bytes = f.read()
contents = raw_bytes.decode("utf-16").split("\r\n")
stopword = set(contents)

def remove_stopwords(line):
    """Drop every whitespace-separated token of ``line`` found in ``stopword``.

    :param line: input text.
    :return: the remaining tokens joined by single spaces.
    """
    return ' '.join(token for token in line.strip().split() if token not in stopword)

## 2.6. Xử lý tổng hợp

In [None]:
def text_preprocess(document):
    """Run the full text-cleaning pipeline on one review.

    Steps, in order: lower-case, Unicode normalization, removal of unwanted
    characters, whitespace collapsing, tone-mark placement normalization,
    teen-code expansion, word segmentation.

    Punctuation is removed *before* tone placement is normalized. The tone
    rules look at whether a vowel is the last character of a word, so a
    trailing comma used to push the tone onto the wrong vowel
    (``'rãi,'`` -> ``'raĩ'``).

    :param document: raw review text.
    :return: cleaned text; multi-syllable words are joined with ``_`` by the
        segmenter.
    """
    # Bound locally so the call cannot be hijacked by another `word_tokenize`
    # imported elsewhere in the notebook (e.g. nltk's, which has no `format`
    # argument).
    from underthesea import word_tokenize as vi_word_tokenize

    # lower-case
    document = document.lower()
    # normalize unicode
    document = covert_unicode(document)
    # replace unwanted characters with a space
    document = re.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]',' ',document)
    # collapse repeated whitespace
    document = re.sub(r'\s+', ' ', document).strip()
    # normalize Vietnamese tone-mark placement (words are punctuation-free here)
    document = chuan_hoa_dau_cau_tieng_viet(document)
    # expand teen code
    document = chuan_hoa_teen_code(document)
    # word segmentation
    return vi_word_tokenize(document, format="text")

In [None]:
text_preprocess("ksan 4* nằm ở vị trí gần trung tâm, thuận tiện đi lại, tham quan các địa điểm nổi tiếng. Phòng rộng rãi, ấm cúng, sạch sẽ, view đẹp nhìn ra sân vườn.")

'khách_sạn 4 nằm ở vị_trí gần trung_tâm thuận_tiện đi_lại tham_quan các địa_điểm nổi_tiếng phòng rộng raĩ ấm_cúng sạch_sẽ view đẹp nhìn ra sân vườn'

# 3. Train Model

##Text Preprocess

In [None]:
X_train_pre = X_train.apply(text_preprocess)
X_test_pre = X_test.apply(text_preprocess)
X_dev_pre = X_dev.apply(text_preprocess)

In [None]:
# Count the number of unique tokens in the raw (not preprocessed) training reviews.
import nltk
nltk.download('punkt')
# Imported under an alias: a plain `word_tokenize` would overwrite underthesea's
# function of the same name, and text_preprocess would then fail on its
# `format="text"` argument if it were called again after this cell.
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk import FreqDist

all_words = ' '.join(X_train)
all_words = nltk_word_tokenize(all_words)
dist = FreqDist(all_words)
num_unique_word = len(dist)
print ('number unique word:',num_unique_word)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


number unique word: 5809


##Feature extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range = (1, 2))

X_train_tf = vectorizer.fit_transform(X_train_pre)
X_dev_tf   = vectorizer.transform(X_dev_pre)
X_test_tf  = vectorizer.transform(X_test_pre)

In [None]:
X_train_tf.shape

(4805, 58166)

##Report metrics

###Aspect Detection

In [None]:
from sklearn.metrics import classification_report

def Aspect_Detection_Report(y_true, y_pred):
    """Build a text classification report for aspect detection only.

    Every (review, aspect) cell becomes one sample. Its label is the aspect
    name when the cell is non-zero (aspect mentioned, whatever the polarity)
    and the empty string otherwise.

    The report is now computed once; the previous version also built an
    ``output_dict=True`` copy that was never used.

    :param y_true: DataFrame of integer label codes, one column per aspect.
    :param y_pred: iterable of rows of predicted codes, same layout.
    :return: the report string from ``classification_report`` (4 digits,
        ``zero_division=1``).
    """
    aspect_test = []
    aspect_pred = []

    for row_test, row_pred in zip(y_true.values.tolist(), y_pred):
        for index, (col_test, col_pred) in enumerate(zip(row_test, row_pred)):
            name = aspects[index]
            aspect_test.append(name if col_test else '')
            aspect_pred.append(name if col_pred else '')
    return classification_report(aspect_test, aspect_pred, digits=4, zero_division=1)

###Polarity Detection

In [None]:
def Polarity_Detection_Report(y_true, y_pred):
    """Build a text classification report over the flattened polarity codes.

    All cells of the label matrix are pooled, so the classes are the codes in
    ``replacements`` (0 = no aspect, 1/2/3 = the three polarities).

    ``labels`` is passed together with ``target_names`` so the display names
    stay aligned with the codes even when one code is absent from the data;
    with ``target_names`` alone the report fails in that case. The unused
    ``output_dict=True`` report of the previous version is gone.

    :param y_true: DataFrame of integer label codes.
    :param y_pred: NumPy array of predicted codes with the same shape.
    :return: the report string from ``classification_report`` (4 digits).
    """
    y_test_flat = y_true.values.flatten()
    y_pred_flat = y_pred.flatten()
    labels = list(replacements.keys())
    target_names = [str(name) for name in replacements.values()]
    return classification_report(y_test_flat, y_pred_flat, labels=labels,
                                 target_names=target_names, digits=4)

###Aspect + Polarity

In [None]:
def Aspect_Polarity_Report(y_true, y_pred):
    """Build the joint aspect + polarity classification report as a dict.

    Every (review, aspect) cell becomes one sample labelled
    ``"<aspect>,<polarity>"`` (polarity is ``None`` when the aspect is not
    mentioned), so a prediction is correct only when both parts match.

    The second ``return`` of the previous version could never run and has
    been removed; the return value is unchanged.

    :param y_true: DataFrame of integer label codes, one column per aspect.
    :param y_pred: iterable of rows of predicted codes, same layout.
    :return: dict from ``classification_report(..., output_dict=True)``
        (``zero_division=1``).
    """
    aspect_polarity_test = []
    aspect_polarity_pred = []

    replacements = {0: None, 1: 'positive', 2: 'negative', 3: 'neutral'}

    for row_test, row_pred in zip(y_true.values.tolist(), y_pred):
        for index, (col_test, col_pred) in enumerate(zip(row_test, row_pred)):
            aspect_polarity_test.append(f'{aspects[index]},{replacements[col_test]}')
            aspect_polarity_pred.append(f'{aspects[index]},{replacements[col_pred]}')
    return classification_report(aspect_polarity_test, aspect_polarity_pred,
                                 digits=4, zero_division=1, output_dict=True)

In [None]:
def quickf1(y_true, y_pred):
    """Return the macro-averaged F1 of the joint aspect + polarity report.

    :param y_true: true label codes, as accepted by ``Aspect_Polarity_Report``.
    :param y_pred: predicted label codes.
    :return: the ``'f1-score'`` entry of the report's ``'macro avg'`` row.
    """
    return Aspect_Polarity_Report(y_true, y_pred)['macro avg']['f1-score']

In [None]:
def quicksummary(y_true, y_pred):
    """Return the macro-average row of the joint report plus overall accuracy.

    :param y_true: true label codes, as accepted by ``Aspect_Polarity_Report``.
    :param y_pred: predicted label codes.
    :return: dict with the ``'macro avg'`` entries followed by ``'accuracy'``.
    """
    report = Aspect_Polarity_Report(y_true, y_pred)
    return {**report['macro avg'], 'accuracy': report['accuracy']}

In [None]:
# optuna là một thư viện dùng để tối ưu tham số
import optuna
from optuna.samplers import TPESampler
from sklearn.multioutput import MultiOutputClassifier as MOC

In [None]:
def callback(study, trial):
    """Keep the model of the best trial on the study.

    When the trial that just finished is the study's current best trial, the
    model stored in its ``user_attrs['model']`` is copied to the study's user
    attributes under ``'best_model'``.

    :param study: the running study.
    :param trial: the trial that just finished.
    """
    is_new_best = trial.number == study.best_trial.number
    if not is_new_best:
        return
    study.set_user_attr(key='best_model', value=trial.user_attrs['model'])

##Baseline model


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

def logistic_objective(trial):
    """Optuna objective: fit one logistic regression per aspect, score on dev.

    Samples ``class_weight`` and ``C``, fits a multi-output logistic
    regression on the training matrix, stores the fitted model on the trial
    under ``"model"`` and returns the macro F1 on the dev set.

    :param trial: the Optuna trial supplying the hyper-parameters.
    :return: value of ``quickf1`` on the dev set.
    """
    class_weight = trial.suggest_categorical('class_weight', ['balanced', None])
    C = trial.suggest_categorical('C', [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100])

    model = MOC(LogisticRegression(class_weight=class_weight, C=C,
                                   random_state=5, max_iter=1000))
    model.fit(X_train_tf, y_train)
    trial.set_user_attr(key="model", value=model)

    return quickf1(y_dev, model.predict(X_dev_tf))

# TPE sampler with a fixed seed (22); the study maximizes the value returned
# by logistic_objective (macro F1 on the dev set) over 10 trials.
sampler = TPESampler(seed=22)
logistic_study = optuna.create_study(sampler=sampler, direction='maximize')
logistic_study.optimize(logistic_objective, n_trials=10, callbacks=[callback])


# Fitted model of the best trial, stored on the study by `callback`.
clf0 = logistic_study.user_attrs['best_model']

[I 2023-06-13 05:12:50,318] A new study created in memory with name: no-name-d0331b9f-b13a-4fd7-b5c8-2d5426bfbee9
[I 2023-06-13 05:14:06,585] Trial 0 finished with value: 0.2965142157165898 and parameters: {'class_weight': None, 'C': 0.001}. Best is trial 0 with value: 0.2965142157165898.
[I 2023-06-13 05:18:49,810] Trial 1 finished with value: 0.49758923750935247 and parameters: {'class_weight': 'balanced', 'C': 100}. Best is trial 1 with value: 0.49758923750935247.
[I 2023-06-13 05:19:47,323] Trial 2 finished with value: 0.37432516252185294 and parameters: {'class_weight': 'balanced', 'C': 0.0001}. Best is trial 1 with value: 0.49758923750935247.
[I 2023-06-13 05:21:02,524] Trial 3 finished with value: 0.2769718081129762 and parameters: {'class_weight': None, 'C': 0.0001}. Best is trial 1 with value: 0.49758923750935247.
[I 2023-06-13 05:22:15,942] Trial 4 finished with value: 0.440535769822671 and parameters: {'class_weight': 'balanced', 'C': 0.001}. Best is trial 1 with value: 0.49

In [None]:
print('train:', quickf1(y_train, clf0.predict(X_train_tf)))
print('dev:  ', quickf1(y_dev  , clf0.predict(X_dev_tf)))
print('test:', quickf1(y_test  , clf0.predict(X_test_tf)))

print(clf0.estimators_[0].get_params())
print(logistic_study.best_params)

train: 0.997218352483867
dev:   0.5090691739207619
test: 0.50587304277196
{'C': 1.0, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 5, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
{'class_weight': 'balanced', 'C': 1.0}


In [None]:
quicksummary(y_test  , clf0.predict(X_test_tf))

{'precision': 0.6807660631448433,
 'recall': 0.4988075090928572,
 'f1-score': 0.50587304277196,
 'support': 54468,
 'accuracy': 0.9411948299919218}

Save model

In [None]:
from sklearn.pipeline import make_pipeline
import joblib

pipe = make_pipeline(vectorizer, clf0)
joblib.dump(pipe, 'result_model/model_lr.joblib')

['result_model/model_lr.joblib']

Save file predict

In [None]:
# Export the logistic-regression test predictions in VLSP text format:
# one "#id / text / {aspect, sentiment}, ..." record per review.
y_pred = clf0.predict(X_test_tf)

vlsp_results = []
for index, pred in enumerate(y_pred):
    acsa = []
    for aspect, code in zip(aspects, pred):
        sentiment = replacements[code]
        if sentiment:
            acsa.append('{' + aspect + ', ' + sentiment + '}')
    vlsp_results.append({
        'id': f'#{index + 1}',
        # `index` is a row position in y_pred, so the text is fetched by
        # position rather than by Series label.
        'text': X_test.iloc[index],
        'acsa': acsa,
    })

with open('result_data/model_lr.txt', 'w', encoding='utf-8') as f:
    for result in vlsp_results:
        # Fields are read by key: no reliance on dict value order and no
        # notebook-level variable named `id` hiding the builtin.
        f.write(f"{result['id']}\n{result['text']}\n{', '.join(result['acsa'])}\n\n")

###Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

def Multi_NB_objective(trial):
    """Optuna objective: fit one multinomial naive Bayes per aspect, score on dev.

    Samples the smoothing parameter ``alpha``, fits a multi-output
    ``MultinomialNB`` on the training matrix, stores the fitted model on the
    trial under ``"model"`` and returns the macro F1 on the dev set.

    :param trial: the Optuna trial supplying the hyper-parameters.
    :return: value of ``quickf1`` on the dev set.
    """
    alpha = trial.suggest_categorical(
        'alpha',
        [0.00001, 0.00005, 0.0001, 0.0005, 0.005, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100, 1000])

    model = MOC(MultinomialNB(alpha=alpha))
    model.fit(X_train_tf, y_train)
    trial.set_user_attr(key="model", value=model)

    return quickf1(y_dev, model.predict(X_dev_tf))

# TPE sampler with a fixed seed (22); the study maximizes the value returned
# by Multi_NB_objective (macro F1 on the dev set) over 20 trials.
sampler = TPESampler(seed=22)
Multi_NB_study = optuna.create_study(sampler=sampler, direction='maximize')
Multi_NB_study.optimize(Multi_NB_objective, n_trials = 20, callbacks=[callback])


# Fitted model of the best trial, stored on the study by `callback`.
clf2 = Multi_NB_study.user_attrs['best_model']

[I 2023-06-13 05:35:19,050] A new study created in memory with name: no-name-d1670050-234a-4a2e-9321-7c957b82e126
[I 2023-06-13 05:35:21,441] Trial 0 finished with value: 0.3740944778752464 and parameters: {'alpha': 0.0005}. Best is trial 0 with value: 0.3740944778752464.
[I 2023-06-13 05:35:24,052] Trial 1 finished with value: 0.274468298543855 and parameters: {'alpha': 1000}. Best is trial 0 with value: 0.3740944778752464.
[I 2023-06-13 05:35:26,182] Trial 2 finished with value: 0.39807376053220217 and parameters: {'alpha': 0.005}. Best is trial 2 with value: 0.39807376053220217.
[I 2023-06-13 05:35:28,008] Trial 3 finished with value: 0.3629124094733906 and parameters: {'alpha': 0.5}. Best is trial 2 with value: 0.39807376053220217.
[I 2023-06-13 05:35:30,177] Trial 4 finished with value: 0.28899862307761004 and parameters: {'alpha': 5}. Best is trial 2 with value: 0.39807376053220217.
[I 2023-06-13 05:35:32,035] Trial 5 finished with value: 0.3629124094733906 and parameters: {'alph

In [None]:
print('train:', quickf1(y_train, clf2.predict(X_train_tf)))
print('dev:  ', quickf1(y_dev  , clf2.predict(X_dev_tf)))
print('test:', quickf1(y_test  , clf2.predict(X_test_tf)))


print(clf2.estimators_[0].get_params())
print(Multi_NB_study.best_params)

In [None]:
quicksummary(y_test, clf2.predict(X_test_tf))

In [None]:
from sklearn.pipeline import make_pipeline
import joblib

pipe = make_pipeline(vectorizer, clf2)
joblib.dump(pipe, 'result_model/model_mnb.joblib')

In [None]:
# Export the naive-Bayes test predictions in VLSP text format:
# one "#id / text / {aspect, sentiment}, ..." record per review.
y_pred = clf2.predict(X_test_tf)

vlsp_results = []
for index, pred in enumerate(y_pred):
    acsa = []
    for aspect, code in zip(aspects, pred):
        sentiment = replacements[code]
        if sentiment:
            acsa.append('{' + aspect + ', ' + sentiment + '}')
    vlsp_results.append({
        'id': f'#{index + 1}',
        # `index` is a row position in y_pred, so the text is fetched by
        # position rather than by Series label.
        'text': X_test.iloc[index],
        'acsa': acsa,
    })

with open('result_data/model_mnb.txt', 'w', encoding='utf-8') as f:
    for result in vlsp_results:
        # Fields are read by key: no reliance on dict value order and no
        # notebook-level variable named `id` hiding the builtin.
        f.write(f"{result['id']}\n{result['text']}\n{', '.join(result['acsa'])}\n\n")

###Linear SVC

In [None]:
from sklearn.svm import LinearSVC

def linearsvc_objective(trial):
    """Optuna objective: fit one linear SVC per aspect, score on dev.

    Samples ``C``, ``class_weight`` and ``loss``, fits a multi-output
    ``LinearSVC`` on the training matrix, stores the fitted model on the
    trial under ``"model"`` and returns the macro F1 on the dev set.

    :param trial: the Optuna trial supplying the hyper-parameters.
    :return: value of ``quickf1`` on the dev set.
    """
    C = trial.suggest_categorical('C', [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 3.0, 5.0])
    class_weight = trial.suggest_categorical('class_weight', ['balanced', None])
    loss = trial.suggest_categorical('loss', ['hinge', 'squared_hinge'])

    model = MOC(LinearSVC(C=C, class_weight=class_weight, loss=loss,
                          max_iter=2000, random_state=5))
    model.fit(X_train_tf, y_train)
    trial.set_user_attr(key="model", value=model)

    return quickf1(y_dev, model.predict(X_dev_tf))

# TPE sampler with a fixed seed (22); the study maximizes the value returned
# by linearsvc_objective (macro F1 on the dev set) over 20 trials.
sampler = TPESampler(seed=22)
linearsvc_study = optuna.create_study(sampler=sampler, direction='maximize')
linearsvc_study.optimize(linearsvc_objective, n_trials = 20, callbacks=[callback])


# Fitted model of the best trial, stored on the study by `callback`.
clf3 = linearsvc_study.user_attrs['best_model']

In [None]:
print('train:', quickf1(y_train, clf3.predict(X_train_tf)))
print('dev:  ', quickf1(y_dev  , clf3.predict(X_dev_tf)))
print('test: ', quickf1(y_test , clf3.predict(X_test_tf)))

print(clf3.estimators_[0].get_params())
print(linearsvc_study.best_params)

train: 0.9959622256462535
dev:   0.4866129642555515
test:  0.48585293376683664
{'C': 0.1, 'class_weight': 'balanced', 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'hinge', 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': 5, 'tol': 0.0001, 'verbose': 0}
{'C': 0.1, 'class_weight': 'balanced', 'loss': 'hinge'}


In [None]:
quicksummary(y_test  , clf3.predict(X_test_tf))

In [None]:
from sklearn.pipeline import make_pipeline
import joblib

pipe = make_pipeline(vectorizer, clf3)
joblib.dump(pipe, 'result_model/model_linearsvc.joblib')

In [None]:
# Export the LinearSVC test predictions in VLSP text format:
# one "#id / text / {aspect, sentiment}, ..." record per review.
y_pred = clf3.predict(X_test_tf)

vlsp_results = []
for index, pred in enumerate(y_pred):
    acsa = []
    for aspect, code in zip(aspects, pred):
        sentiment = replacements[code]
        if sentiment:
            acsa.append('{' + aspect + ', ' + sentiment + '}')
    vlsp_results.append({
        'id': f'#{index + 1}',
        # `index` is a row position in y_pred, so the text is fetched by
        # position rather than by Series label.
        'text': X_test.iloc[index],
        'acsa': acsa,
    })

with open('result_data/model_linearsvc.txt', 'w', encoding='utf-8') as f:
    for result in vlsp_results:
        # Fields are read by key: no reliance on dict value order and no
        # notebook-level variable named `id` hiding the builtin.
        f.write(f"{result['id']}\n{result['text']}\n{', '.join(result['acsa'])}\n\n")

# 4. Prediction

In [None]:
def print_acsa_pred(replacements, categories, sentence_pred):
    """Print one ``=> category,sentiment`` line per detected aspect.

    :param replacements: mapping from label code to sentiment name; codes
        mapped to a falsy value (e.g. ``None``) are not printed.
    :param categories: aspect names, aligned with ``sentence_pred``.
    :param sentence_pred: label codes predicted for one sentence.
    """
    for category, code in zip(categories, sentence_pred):
        sentiment = replacements[code]
        if not sentiment:
            continue
        print(f'=> {category},{sentiment}')

In [None]:
ydf = clf0.predict(X_test_tf[33])
ydf

In [None]:
# Use the model's own prediction from the previous cell instead of a
# hand-typed vector: the typed list had 32 entries for 34 aspects, so the
# printed aspect names could be misaligned with the codes.
# predict() on a single row returns shape (1, n_aspects); flatten it to 1-D.
ydf = np.asarray(ydf).ravel()
assert len(ydf) == len(aspects), 'one label code per aspect is expected'

In [None]:
replacements = {0: None, 1: 'positive', 2: 'negative', 3: 'neutral'}
print('Example:', X_test[33])
print_acsa_pred(replacements, aspects, ydf)

Example: không gian phòng rộng rãi, sạch sẽ , thức ăn sáng ngon và dịch vụ rất tiện ích , kêu gọi món lên phòng rẻ , ngon và nhân viên rất thân thiện, không có điểm nào để chê ạ
=> Facilities#Quality,negative


# 5. VLSP FORMAT

Test File

In [None]:
# Build the gold-label records of the test set in VLSP layout.
vlsp_results = []
for index, pred in enumerate(y_test.values):
    acsa = []
    for aspect, code in zip(aspects, pred):
        sentiment = replacements[code]
        if sentiment:
            acsa.append('{' + aspect + ', ' + sentiment + '}')
    vlsp_results.append({
        'id': f'#{index + 1}',
        # `index` is a row position in y_test.values, so the text is fetched
        # by position rather than by Series label.
        'text': X_test.iloc[index],
        'acsa': acsa,
    })
vlsp_results[:3]

[{'id': '#1',
  'text': 'Khách sạn tuyệt vời, mình ở 3n2đ đi chơi không kịp báo dọn phòng nhưng khách sạn đã dọn giúp sạch sẽ tinh tươm trước khi mình về, mình book được giá tốt nữa. Rất hài lòng, nv nhiệt tình dễ thương nhẹ nhàng, có dịp quay lại mình sẽ ở lại đây.',
  'acsa': ['{Hotel#General, positive}',
   '{Hotel#Price, positive}',
   '{Hotel#Comfort, positive}',
   '{Rooms#Cleanliness, positive}',
   '{Service#General, positive}']},
 {'id': '#2',
  'text': 'Nhân viên phục vụ dễ thương! Khuyết điểm là khách sạn ít đèn quá tối, rất khó chịu. Vòi sen bồn rửa mặt ngắn quá, rửa mặt xong nước chảy tràn ra tùm lum. Phòng tắm không có bồn. Ăn sáng tạm ổn, nói chung với mức tiền đó thì chấp nhận được',
  'acsa': ['{Hotel#Price, neutral}',
   '{Hotel#Designs_Features, negative}',
   '{Hotel#Comfort, negative}',
   '{Room_amenities#Designs_Features, negative}',
   '{Service#General, positive}',
   '{Food_Drinks#Quality, neutral}']},
 {'id': '#3',
  'text': 'Khách sạn rất gần chợ đêm. Xung q

In [None]:
# Write the gold-label records: id, text and the comma-separated aspect list,
# followed by a blank line.
with open('result_data/y_test.txt', 'w', encoding='utf-8') as f:
    for result in vlsp_results:
        # Fields are read by key: no reliance on dict value order and no
        # notebook-level variable named `id` hiding the builtin.
        f.write(f"{result['id']}\n{result['text']}\n{', '.join(result['acsa'])}\n\n")