In [88]:
import glob
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import unicodedata
import re

In [2]:
def split_letters_counter(all_letters_count):
    """Partition a character-frequency mapping into three Counters.

    Args:
        all_letters_count: mapping of single characters to their counts.

    Returns:
        A tuple ``(upper, lower, other)`` of Counters. ``upper`` holds the
        keys for which ``str.isupper()`` is true, ``lower`` those for which
        ``str.islower()`` is true, and ``other`` everything else (digits,
        punctuation, whitespace, ...).
    """
    uppercase, lowercase, remainder = Counter(), Counter(), Counter()
    for char, freq in all_letters_count.items():
        # Pick the destination bucket first, then store the count once.
        if char.isupper():
            bucket = uppercase
        elif char.islower():
            bucket = lowercase
        else:
            bucket = remainder
        bucket[char] = freq
    return uppercase, lowercase, remainder

In [3]:
def plot_bar(cnt, title, img_name):
    """Draw a bar chart of a frequency mapping and save it to disk.

    Args:
        cnt: mapping whose keys become the bar labels and whose values
            become the bar heights.
        title: text shown as the chart title.
        img_name: output path passed to ``Figure.savefig``.
    """
    figure, axes = plt.subplots(figsize=(20, 10))
    axes.bar(cnt.keys(), cnt.values())
    axes.set_title(title, fontsize=20)
    axes.set_xlabel('Letter', fontsize=10)
    axes.set_ylabel('Frequency', fontsize=10)
    figure.savefig(img_name, dpi=300)

In [102]:
# Whitespace-separated list of the precomposed Vietnamese vowels that carry a
# tone mark and/or a circumflex/breve/horn, grouped by base vowel (A, E, I, O,
# U, Y), upper case before lower case. It is consumed with ``.split()``, so the
# indentation that the backslash continuations pull into the string is
# harmless. Note that đ/Đ is not part of this list.
accent_letters = 'À Á Ả Ã Ạ Â Ầ Ấ Ẩ Ẫ Ậ Ă Ằ Ắ Ẳ Ẵ Ặ à á ả ã ạ â ầ ấ ẩ ẫ ậ ă ằ ắ ẳ ẵ ặ\
    È É Ẻ Ẽ Ẹ Ê Ề Ế Ể Ễ Ệ è é ẻ ẽ ẹ ê ề ế ể ễ ệ\
    Ì Í Ỉ Ĩ Ị ì í ỉ ĩ ị\
    Ò Ó Ỏ Õ Ọ Ô Ồ Ố Ổ Ỗ Ộ Ơ Ờ Ớ Ở Ỡ Ợ ò ó ỏ õ ọ ô ồ ố ổ ỗ ộ ơ ờ ớ ở ỡ ợ\
    Ù Ú Ủ Ũ Ụ Ư Ừ Ứ Ử Ữ Ự ù ú ủ ũ ụ ư ừ ứ ử ữ ự\
    Ỳ Ý Ỷ Ỹ Ỵ ỳ ý ỷ ỹ ỵ'

def get_unaccent(s):
    """Replace accented Vietnamese vowels in ``s`` with their base letter.

    Tone marks, circumflexes and breves are removed. The horned vowels keep
    their horn: every ơ-variant becomes ``ơ`` and every ư-variant becomes
    ``ư`` (likewise for upper case). Characters not listed in any pattern,
    such as ``đ``, are returned unchanged.

    Args:
        s: input string (any length).

    Returns:
        The string with the substitutions applied.
    """
    # (character class, replacement) pairs, applied in this order.
    substitutions = (
        (u'[àáảãạâầấẩẫậăằắẳẵặ]', 'a'),
        (u'[ÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶ]', 'A'),
        (u'[èéẹẻẽêềếệểễ]', 'e'),
        (u'[ÈÉẸẺẼÊỀẾỆỂỄ]', 'E'),
        (u'[òóọỏõôồốộổỗ]', 'o'),
        (u'[ÒÓỌỎÕÔỒỐỘỔỖ]', 'O'),
        (u'[ơờớợởỡ]', 'ơ'),
        (u'[ƠỜỚỢỞỠ]', 'Ơ'),
        (u'[ìíịỉĩ]', 'i'),
        (u'[ÌÍỊỈĨ]', 'I'),
        (u'[ùúụủũ]', 'u'),
        (u'[ÙÚỤỦŨ]', 'U'),
        (u'[ưừứựửữ]', 'ư'),
        (u'[ƯỪỨỰỬỮ]', 'Ư'),
        (u'[ỳýỵỷỹ]', 'y'),
        (u'[ỲÝỴỶỸ]', 'Y'),
    )
    for char_class, plain in substitutions:
        s = re.sub(char_class, plain, s)
    return s

def get_accents(s):
    """Return the quality mark and tone mark carried by one character.

    The character is inspected through ``unicodedata.decomposition``, which
    yields at most one decomposition step: a base code point followed by a
    combining code point (e.g. ``ấ`` -> ``00E2 0301``, i.e. ``â`` + acute).
    A second mark that is already folded into the base is recognised by
    looking the base code point up in the tables below.

    Args:
        s: a single character.

    Returns:
        A tuple ``(mark_accent, vowel_accent)`` where ``mark_accent`` is
        ``'<CIRCUMFLEX>'``, ``'<BREVE>'`` or ``None`` and ``vowel_accent`` is
        one of ``'<F>'`` (grave), ``'<S>'`` (acute), ``'<X>'`` (tilde),
        ``'<R>'`` (hook above), ``'<J>'`` (dot below) or ``None``.
        Characters without a two-part decomposition (plain ASCII letters,
        ``đ``, punctuation, ...) yield ``(None, None)``.
    """
    # Hex code points exactly as unicodedata.decomposition() spells them.
    circumflex_bases = {'00C2', '00E2', '00CA', '00EA', '00D4', '00F4'}  # Â â Ê ê Ô ô
    breve_bases = {'0102', '0103'}  # Ă ă
    dot_below_bases = {'1EA0', '1EA1', '1EB8', '1EB9', '1ECC', '1ECD'}  # Ạ ạ Ẹ ẹ Ọ ọ
    tone_by_combining = {
        '0300': '<F>',  # combining grave
        '0301': '<S>',  # combining acute
        '0303': '<X>',  # combining tilde
        '0309': '<R>',  # combining hook above
        '0323': '<J>',  # combining dot below
    }

    parts = unicodedata.decomposition(s).split()
    if len(parts) < 2:
        # No (base, combining) pair: the character carries no accent.
        return None, None
    base, combining = parts[0], parts[1]

    mark_accent = None
    if combining == '0302' or base in circumflex_bases:
        mark_accent = '<CIRCUMFLEX>'
    if combining == '0306' or base in breve_bases:
        mark_accent = '<BREVE>'

    vowel_accent = tone_by_combining.get(combining)
    if vowel_accent is None and base in dot_below_bases:
        # ậ, ặ, ệ, ộ decompose as (dotted vowel, circumflex/breve), so the
        # dot-below tone has to be read from the base code point.
        vowel_accent = '<J>'

    return mark_accent, vowel_accent
    
def flatten_accent(letter):
    """Break one character into its base letter, quality mark and tone mark.

    Args:
        letter: a single character.

    Returns:
        ``(base, mark_accent, vowel_accent)``. When ``letter`` is not one of
        the entries of ``accent_letters`` it is returned unchanged with both
        accents set to ``None``; otherwise the base comes from
        ``get_unaccent`` and the two accents from ``get_accents``.
    """
    if letter in accent_letters.split():
        base = get_unaccent(letter)
        mark, tone = get_accents(letter)
        return base, mark, tone
    return letter, None, None

# Demo: decompose every character of a sample sentence and print the parts.
# The sample is bound to `sample_text` rather than `str` so the builtin `str`
# type is not shadowed for the rest of the kernel session.
sample_text = 'Tôi là ai, đấu là đây'
for letter in sample_text:
    unaccent_letter, mark_accent, vowel_accent = flatten_accent(letter)
    print(letter, '=', unaccent_letter, '+', mark_accent, '+', vowel_accent)

T = T + None + None
ô = o + <CIRCUMFLEX> + None
i = i + None + None
  =   + None + None
l = l + None + None
à = a + None + <F>
  =   + None + None
a = a + None + None
i = i + None + None
, = , + None + None
  =   + None + None
đ = đ + None + None
ấ = a + <CIRCUMFLEX> + <S>
u = u + None + None
  =   + None + None
l = l + None + None
à = a + None + <F>
  =   + None + None
đ = đ + None + None
â = a + <CIRCUMFLEX> + None
y = y + None + None
