In [1]:
import glob
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import unicodedata
import re

In [2]:
class Flattening(object):
    '''
    Base class for flattening accented Vietnamese letters.

    A flattened word is a list in which every accented letter is replaced by
    its unaccented letter followed by its accent tokens (e.g. '<6>', '<S>').
    Subclasses choose which diacritics are split off by implementing
    get_unaccent, get_accents and invert.
    '''

    def __init__(self):
        # Accent token -> Unicode combining mark used to rebuild the letter.
        self.accent2unicode = {
            '<6>': '\u0302',  # combining circumflex
            '<8>': '\u0306',  # combining breve
            '<F>': '\u0300',  # combining grave
            '<S>': '\u0301',  # combining acute
            '<R>': '\u0309',  # combining hook above
            '<X>': '\u0303',  # combining tilde
            '<J>': '\u0323',  # combining dot below
        }
        # Code points (hex strings, the format used by unicodedata.decomposition)
        # of base letters that already carry a mark.
        self.circumflex_unicodes = ['00C2', '00E2', '00CA', '00EA', '00D4', '00F4']  # Â, â, Ê, ê, Ô, ô
        self.breve_unicodes = ['0102', '0103']  # Ă, ă
        self.underdot_unicodes = ['1EA0', '1EA1', '1EB8', '1EB9', '1ECC', '1ECD']  # Ạ, ạ, Ẹ, ẹ, Ọ, ọ
        # Every letter that flatten_letter splits. Kept as a list because
        # subclasses extend it in place.
        self.accent_letters = (
            'À Á Ả Ã Ạ Â Ầ Ấ Ẩ Ẫ Ậ Ă Ằ Ắ Ẳ Ẵ Ặ à á ả ã ạ â ầ ấ ẩ ẫ ậ ă ằ ắ ẳ ẵ ặ '
            'È É Ẻ Ẽ Ẹ Ê Ề Ế Ể Ễ Ệ è é ẻ ẽ ẹ ê ề ế ể ễ ệ '
            'Ì Í Ỉ Ĩ Ị ì í ỉ ĩ ị '
            'Ò Ó Ỏ Õ Ọ Ô Ồ Ố Ổ Ỗ Ộ Ơ Ờ Ớ Ở Ỡ Ợ ò ó ỏ õ ọ ô ồ ố ổ ỗ ộ ơ ờ ớ ở ỡ ợ '
            'Ù Ú Ủ Ũ Ụ Ư Ừ Ứ Ử Ữ Ự ù ú ủ ũ ụ ư ừ ứ ử ữ ự '
            'Ỳ Ý Ỷ Ỹ Ỵ ỳ ý ỷ ỹ ỵ'
        ).split()

    def get_unaccent(self, letter):
        '''Return the unaccented form of `letter`. Must be implemented by subclasses.'''
        raise NotImplementedError()

    def get_accents(self, letter):
        '''Return (mark_accent, vowel_accent) tokens for `letter`, each possibly None.

        Must be implemented by subclasses.
        '''
        raise NotImplementedError()

    def flatten_letter(self, letter):
        '''
        Split one letter into its unaccented letter and accent tokens.

        Return:
        - `letter` itself, unchanged (a str), when it is not in self.accent_letters
        - otherwise a list: [unaccent-letter, mark accent (if any), vowel accent (if any)]
        '''
        if letter not in self.accent_letters:
            return letter
        flattened_letter = [self.get_unaccent(letter)]
        mark_accent, vowel_accent = self.get_accents(letter)
        if mark_accent is not None:
            flattened_letter.append(mark_accent)
        if vowel_accent is not None:
            flattened_letter.append(vowel_accent)
        return flattened_letter

    def flatten_word(self, word):
        '''
        Types:
        ------
        - word: list of accent-letters
        Return:
        - flattened_word: list of unaccent-letters [and <accent-letters> (if any)]
        '''
        flattened_word = []
        for letter in word:
            flattened_letter = self.flatten_letter(letter)
            if isinstance(flattened_letter, str):
                # Entries that are not accent letters come back as the bare
                # string. Append it whole: extend() would iterate over it and
                # break a multi-character entry such as '<start>' into
                # single characters.
                flattened_word.append(flattened_letter)
            else:
                flattened_word.extend(flattened_letter)
        return flattened_word

    def invert(self, flattened_word):
        '''Rebuild the accented word from a flattened word. Must be implemented by subclasses.'''
        raise NotImplementedError()

In [3]:
class Flattening_1(Flattening):
    '''
    Flatten without đ, Đ, ơ, Ơ, ư, Ư

    Circumflex, breve and tone marks are split off into accent tokens.
    The horn letters ơ/ư keep their horn (only their tone mark is removed)
    and đ/Đ are passed through untouched.
    '''

    # (accented letters, replacement): every letter on the left is reduced to
    # the letter on the right by get_unaccent.
    _UNACCENT_GROUPS = (
        ('àáảãạâầấẩẫậăằắẳẵặ', 'a'),
        ('ÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶ', 'A'),
        ('èéẹẻẽêềếệểễ', 'e'),
        ('ÈÉẸẺẼÊỀẾỆỂỄ', 'E'),
        ('òóọỏõôồốộổỗ', 'o'),
        ('ÒÓỌỎÕÔỒỐỘỔỖ', 'O'),
        ('ơờớợởỡ', 'ơ'),
        ('ƠỜỚỢỞỠ', 'Ơ'),
        ('ìíịỉĩ', 'i'),
        ('ÌÍỊỈĨ', 'I'),
        ('ùúụủũ', 'u'),
        ('ÙÚỤỦŨ', 'U'),
        ('ưừứựửữ', 'ư'),
        ('ƯỪỨỰỬỮ', 'Ư'),
        ('ỳýỵỷỹ', 'y'),
        ('ỲÝỴỶỸ', 'Y'),
    )
    # Code point -> replacement letter, built once so get_unaccent is a single
    # str.translate call instead of one regex pass per group.
    _UNACCENT_TABLE = {ord(ch): base for chars, base in _UNACCENT_GROUPS for ch in chars}

    # Combining-mark code point (as reported by unicodedata.decomposition)
    # -> tone token.
    _TONE_TOKENS = {'0300': '<F>', '0301': '<S>', '0303': '<X>', '0309': '<R>', '0323': '<J>'}

    @staticmethod
    def _is_special_token(entry):
        '''Return True for multi-character bracketed entries such as '<start>'.'''
        return len(entry) > 1 and entry[0] == '<' and entry[-1] == '>'

    def get_unaccent(self, letter):
        '''Return `letter` with circumflex, breve and tone marks removed (horn is kept).'''
        return letter.translate(self._UNACCENT_TABLE)

    def get_accents(self, letter):
        '''
        Return (mark_accent, vowel_accent) for a single letter.

        - mark_accent: '<6>' (circumflex), '<8>' (breve) or None
        - vowel_accent: '<F>', '<S>', '<X>', '<R>', '<J>' or None

        A letter without a two-part canonical decomposition carries none of
        these marks and yields (None, None).
        '''
        parts = unicodedata.decomposition(letter).split()
        if len(parts) != 2:
            return None, None
        # The decomposition peels off one combining mark at a time, so a letter
        # carrying two marks (e.g. ậ -> ạ + circumflex) exposes one of them as
        # the combining code and the other through the intermediate base letter.
        base_code, combining_code = parts

        mark_accent = None
        if combining_code == '0302' or base_code in self.circumflex_unicodes:
            mark_accent = '<6>'  # VNI '<CIRCUMFLEX>'
        elif combining_code == '0306' or base_code in self.breve_unicodes:
            mark_accent = '<8>'  # '<BREVE>'

        vowel_accent = self._TONE_TOKENS.get(combining_code)
        if vowel_accent is None and base_code in self.underdot_unicodes:
            vowel_accent = '<J>'

        return mark_accent, vowel_accent

    def invert(self, flattened_word):
        '''
        Types:
        ------
        - flattened_word: list of unaccent-letters [and <accent-letters> (if any)]
        Return:
        - accent_word: list of accent-letters

        An accent token is merged into the entry before it. It is kept as a
        separate entry when there is nothing before it or when the previous
        entry is itself a bracketed token (e.g. '<start>').
        '''
        accent_word = []
        for letter in flattened_word:
            is_accent = len(letter) != 1 and letter in self.accent2unicode
            if is_accent and accent_word and not self._is_special_token(accent_word[-1]):
                # NFC composes the previous letter and the combining mark into
                # one precomposed character.
                accent_word[-1] = unicodedata.normalize(
                    'NFC', accent_word[-1] + self.accent2unicode[letter])
            else:
                accent_word.append(letter)
        return accent_word

In [4]:
class Flattening_2(Flattening):
    '''
    Flatten with đ, Đ, ơ, Ơ, ư, Ư

    In addition to circumflex, breve and tone marks, the horn of ơ/ư is split
    off as '<7>' and the stroke of đ/Đ as '<9>'.
    '''

    # Combining-mark code point (as reported by unicodedata.decomposition)
    # -> tone token.
    _TONE_TOKENS = {'0300': '<F>', '0301': '<S>', '0303': '<X>', '0309': '<R>', '0323': '<J>'}

    def __init__(self):
        super().__init__()
        # '<7>' is the combining horn. '<9>' (the stroke of đ/Đ) has no
        # combining mark and is handled explicitly in invert().
        self.accent2unicode.update({'<7>': '\u031B', '<9>': None})
        self._7_unicodes = ['01A0', '01A1', '01AF', '01B0']  # Ơ, ơ, Ư, ư
        self.accent_letters.extend(['đ', 'Đ'])

    @staticmethod
    def _is_special_token(entry):
        '''Return True for multi-character bracketed entries such as '<start>'.'''
        return len(entry) > 1 and entry[0] == '<' and entry[-1] == '>'

    def get_unaccent(self, letter):
        '''Return `letter` with every diacritic removed (đ/Đ become d/D).'''
        letter = letter.replace('đ', 'd').replace('Đ', 'D')
        # After NFD decomposition the diacritics are separate nonspacing marks
        # (category 'Mn'); dropping them leaves the bare letter.
        return ''.join(c for c in unicodedata.normalize('NFD', letter)
                       if unicodedata.category(c) != 'Mn')

    def get_accents(self, letter):
        '''
        Return (mark_accent, vowel_accent) for a single letter.

        - mark_accent: '<9>' (đ/Đ), '<6>' (circumflex), '<8>' (breve),
          '<7>' (horn) or None
        - vowel_accent: '<F>', '<S>', '<X>', '<R>', '<J>' or None

        A letter without a two-part canonical decomposition (other than đ/Đ)
        carries none of these marks and yields (None, None).
        '''
        # đ/Đ have no canonical decomposition, so they are recognised directly.
        if letter == 'đ' or letter == 'Đ':
            return '<9>', None

        parts = unicodedata.decomposition(letter).split()
        if len(parts) != 2:
            return None, None
        # The decomposition peels off one combining mark at a time, so a letter
        # carrying two marks (e.g. ớ -> ơ + acute) exposes one of them as the
        # combining code and the other through the intermediate base letter.
        base_code, combining_code = parts

        mark_accent = None
        if combining_code == '0302' or base_code in self.circumflex_unicodes:
            mark_accent = '<6>'  # VNI '<CIRCUMFLEX>'
        elif combining_code == '0306' or base_code in self.breve_unicodes:
            mark_accent = '<8>'  # '<BREVE>'
        elif combining_code == '031B' or base_code in self._7_unicodes:
            mark_accent = '<7>'

        vowel_accent = self._TONE_TOKENS.get(combining_code)
        if vowel_accent is None and base_code in self.underdot_unicodes:
            vowel_accent = '<J>'

        return mark_accent, vowel_accent

    def invert(self, flattened_word):
        '''
        Types:
        ------
        - flattened_word: list of unaccent-letters [and <accent-letters> (if any)]
        Return:
        - accent_word: list of accent-letters

        An accent token is merged into the entry before it. It is kept as a
        separate entry when there is nothing before it, when '<9>' does not
        follow d/D, or when the previous entry is itself a bracketed token
        (e.g. '<start>').
        '''
        accent_word = []
        for letter in flattened_word:
            if len(letter) == 1 or not accent_word or letter not in self.accent2unicode:
                accent_word.append(letter)
                continue

            previous = accent_word[-1]
            if letter == '<9>':
                # The stroke only applies to d/D.
                if previous == 'd':
                    accent_word[-1] = 'đ'
                elif previous == 'D':
                    accent_word[-1] = 'Đ'
                else:
                    accent_word.append(letter)
            elif self._is_special_token(previous):
                accent_word.append(letter)
            else:
                # NFC composes the previous letter and the combining mark into
                # one precomposed character.
                accent_word[-1] = unicodedata.normalize(
                    'NFC', previous + self.accent2unicode[letter])
        return accent_word

# Type 1: Flattening_1 (flatten without đ, Đ, ơ, Ơ, ư, Ư)

In [5]:
flattening = Flattening_1()

# Round-trip a few sample words: flatten each one, invert it back and print
# the word, its flattened form and the reconstruction side by side.
string1 = 'quý hóa hoàn khoáng gì gìn đoán đứng lặng HĐND ơn'
for raw_word in string1.split():
    word = re.findall(r'\w+', raw_word)[0]
    flattened_word = flattening.flatten_word(list(word))
    accent_word = flattening.invert(flattened_word)
    print(f'{word: <{10}} - {str(flattened_word): <{40}} - {str(accent_word)}')

# Special case: sequences containing tokens that cannot be merged into a letter
print()
s_cases = [['h', 'o', '<start>', 'a'], ['<9>', 'a', 'n'], ['q', '<9>', 'a', 'n']]
for case in s_cases:
    print(flattening.invert(case))

quý        - ['q', 'u', 'y', '<S>']                   - ['q', 'u', 'ý']
hóa        - ['h', 'o', '<S>', 'a']                   - ['h', 'ó', 'a']
hoàn       - ['h', 'o', 'a', '<F>', 'n']              - ['h', 'o', 'à', 'n']
khoáng     - ['k', 'h', 'o', 'a', '<S>', 'n', 'g']    - ['k', 'h', 'o', 'á', 'n', 'g']
gì         - ['g', 'i', '<F>']                        - ['g', 'ì']
gìn        - ['g', 'i', '<F>', 'n']                   - ['g', 'ì', 'n']
đoán       - ['đ', 'o', 'a', '<S>', 'n']              - ['đ', 'o', 'á', 'n']
đứng       - ['đ', 'ư', '<S>', 'n', 'g']              - ['đ', 'ứ', 'n', 'g']
lặng       - ['l', 'a', '<8>', '<J>', 'n', 'g']       - ['l', 'ặ', 'n', 'g']
HĐND       - ['H', 'Đ', 'N', 'D']                     - ['H', 'Đ', 'N', 'D']
ơn         - ['ơ', 'n']                               - ['ơ', 'n']

['h', 'o', '<start>', 'a']
['<9>', 'a', 'n']
['q', '<9>', 'a', 'n']


In [6]:
# Round-trip every ground-truth label through flatten -> invert and print any
# label that does not come back unchanged.
df = pd.read_csv('../data/VNOnDB/all_word.csv', sep='\t')
ground_truth = df['label'].astype(str)
accent_words = []
for word in ground_truth:
    restored = flattening.invert(flattening.flatten_word(list(word)))
    accent_word = ''.join(restored)
    accent_words.append(accent_word)
    if accent_word != word:
        print(word, '-', accent_word)
# True when every label survived the round trip.
sum(ground_truth==accent_words)==len(ground_truth)

True

# Type 2: Flattening_2 (flatten with đ, Đ, ơ, Ơ, ư, Ư)

In [7]:
flattening = Flattening_2()

# Round-trip a few sample words: flatten each one, invert it back and print
# the word, its flattened form and the reconstruction side by side.
string1 = 'quý hóa hoàn khoáng gì gìn đoán đứng lặng HĐND ơn'
for raw_word in string1.split():
    word = re.findall(r'\w+', raw_word)[0]
    flattened_word = flattening.flatten_word(list(word))
    accent_word = flattening.invert(flattened_word)
    print(f'{word: <{10}} - {str(flattened_word): <{40}} - {str(accent_word)}')

# Special case: sequences containing tokens that cannot be merged into a letter
print()
s_cases = [['h', 'o', '<start>', 'a'], ['<9>', 'a', 'n'], ['q', '<9>', 'a', 'n']]
for case in s_cases:
    print(flattening.invert(case))

quý        - ['q', 'u', 'y', '<S>']                   - ['q', 'u', 'ý']
hóa        - ['h', 'o', '<S>', 'a']                   - ['h', 'ó', 'a']
hoàn       - ['h', 'o', 'a', '<F>', 'n']              - ['h', 'o', 'à', 'n']
khoáng     - ['k', 'h', 'o', 'a', '<S>', 'n', 'g']    - ['k', 'h', 'o', 'á', 'n', 'g']
gì         - ['g', 'i', '<F>']                        - ['g', 'ì']
gìn        - ['g', 'i', '<F>', 'n']                   - ['g', 'ì', 'n']
đoán       - ['d', '<9>', 'o', 'a', '<S>', 'n']       - ['đ', 'o', 'á', 'n']
đứng       - ['d', '<9>', 'u', '<7>', '<S>', 'n', 'g'] - ['đ', 'ứ', 'n', 'g']
lặng       - ['l', 'a', '<8>', '<J>', 'n', 'g']       - ['l', 'ặ', 'n', 'g']
HĐND       - ['H', 'D', '<9>', 'N', 'D']              - ['H', 'Đ', 'N', 'D']
ơn         - ['o', '<7>', 'n']                        - ['ơ', 'n']

['h', 'o', '<start>', 'a']
['<9>', 'a', 'n']
['q', '<9>', 'a', 'n']


In [8]:
# Round-trip every ground-truth label through flatten -> invert and print any
# label that does not come back unchanged.
df = pd.read_csv('../data/VNOnDB/all_word.csv', sep='\t')
ground_truth = df['label'].astype(str)
accent_words = []
for word in ground_truth:
    restored = flattening.invert(flattening.flatten_word(list(word)))
    accent_word = ''.join(restored)
    accent_words.append(accent_word)
    if accent_word != word:
        print(word, '-', accent_word)
# True when every label survived the round trip.
sum(ground_truth==accent_words)==len(ground_truth)

True