In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sb

# Importing the required packages
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

In [2]:
hangul_syllables = np.array([chr(code) for code in range(44032, 55204)])
hangul_syllables = hangul_syllables.reshape(19, 21, 28)

print(f"'가'와 관련된 음절 리스트: {hangul_syllables[0][0]}")
print(f"'개'와 관련된 음절 리스트: {hangul_syllables[0][1]}")

print()

print(f"'ㄱ'와 관련된 마지막 음절 리스트: {hangul_syllables[0][20]}")
print(f"'ㄲ'와 관련된 첫번째 음절 리스트: {hangul_syllables[1][0]}")

'가'와 관련된 음절 리스트: ['가' '각' '갂' '갃' '간' '갅' '갆' '갇' '갈' '갉' '갊' '갋' '갌' '갍' '갎' '갏' '감' '갑'
 '값' '갓' '갔' '강' '갖' '갗' '갘' '같' '갚' '갛']
'개'와 관련된 음절 리스트: ['개' '객' '갞' '갟' '갠' '갡' '갢' '갣' '갤' '갥' '갦' '갧' '갨' '갩' '갪' '갫' '갬' '갭'
 '갮' '갯' '갰' '갱' '갲' '갳' '갴' '갵' '갶' '갷']

'ㄱ'와 관련된 마지막 음절 리스트: ['기' '긱' '긲' '긳' '긴' '긵' '긶' '긷' '길' '긹' '긺' '긻' '긼' '긽' '긾' '긿' '김' '깁'
 '깂' '깃' '깄' '깅' '깆' '깇' '깈' '깉' '깊' '깋']
'ㄲ'와 관련된 첫번째 음절 리스트: ['까' '깍' '깎' '깏' '깐' '깑' '깒' '깓' '깔' '깕' '깖' '깗' '깘' '깙' '깚' '깛' '깜' '깝'
 '깞' '깟' '깠' '깡' '깢' '깣' '깤' '깥' '깦' '깧']


In [3]:
'''
chosung_list = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ',
                'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
'''
chosung_list_test = ['ㄱ', 'ㄴ', 'ㄷ',  'ㄹ', 'ㅁ', 'ㅂ',  'ㅅ',
                'ㅇ', 'ㅈ',  'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']


jungsung_list = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ',
                 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']

jongsung_list = ['', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
                 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ',
                 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']

In [4]:
def get_unicode_number(chosung_index, jungsung_index, jongsung_index):
    return ((chosung_index * 588) + (jungsung_index * 28) + jongsung_index) + 44032


# 박 / ㅂ:7 / ㅏ:0 / ㄱ:1
print(get_unicode_number(7, 0, 1), chr(get_unicode_number(7, 0, 1)))

# 놔 / ㄴ:2 / ㅘ:9 / '':0
print(get_unicode_number(2, 9, 0), chr(get_unicode_number(2, 9, 0)))

# 밝 / ㅂ:7 / ㅏ:0 / ㄺ:9
print(get_unicode_number(7, 0, 9), chr(get_unicode_number(7, 0, 9)))

48149 박
45460 놔
48157 밝


In [5]:
import re

class SeparateJaMo:
    """
    한글 자모를 분리하는 클래스

    target_text = 자모를 분리할 문자열
    blank_str = 공백을 처리할 문자
    remove_blank = 공백 제거 여부
    remove_special_character = 특수 문자 제거 여부
    refine_blank = 두개 이상의 공백 정제 여부
    refine_english = 영어 정제 여부
    """

    def __init__(self, original_text, blank_str=" ",
                 remove_blank=False, remove_special_character=False,
                 refine_blank=False, refine_english=False):
        self.__chosung_list = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ',
                               'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
        self.__jungsung_list = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ',
                                'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']
        self.__jongsung_list = ['', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
                                'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ',
                                'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
        self.__original_text = original_text
        self.__blank_str = blank_str
        self.__remove_blank = remove_blank
        self.__remove_special_character = remove_special_character
        self.__refine_blank = refine_blank
        self.__refine_english = refine_english
        self.__processed_text = self.__make_processed_text()
        self.__jamo_list = self.__make_jamo_list()

    def __make_processed_text(self):
        """
        옵션에 알맞게 문자열을 정제하여 반환
        """
        processed_text = self.__original_text
        if self.__refine_blank:
            processed_text = " ".join(
                [word.strip() for word in processed_text.split(" ") if not len(word.strip()) == 0]
            )
        if self.__remove_blank:
            processed_text = re.sub(r'\s', '', processed_text)
        if self.__remove_special_character:
            processed_text = re.sub(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', processed_text)
            processed_text = re.sub(r'\W \S', '', processed_text)
        if self.__refine_english:
            processed_text = re.sub(r'[^ㄱ-ㅎㅣ가-힣]+', '', processed_text)
        processed_text = re.sub(r'\s', self.__blank_str, processed_text)
        return processed_text

    def __make_jamo_list(self):
        """
        전체 자모 리스트 반환
        """
        jamo_list = list()
        for syllable in list(self.__processed_text):
            if re.match(r'[ㄱ-ㅎㅣ가-힣]+', syllable):
                syllable_code = ord(syllable)
                chosung_index = int((syllable_code - 44032) / 588)
                jungsung_index = int((syllable_code - 44032 - (chosung_index * 588)) / 28)
                jongsung_index = int(syllable_code - 44032 - (chosung_index * 588) - (jungsung_index * 28))
                jamo_list.append(
                    [
                        self.__chosung_list[chosung_index],
                        self.__jungsung_list[jungsung_index],
                        self.__jongsung_list[jongsung_index]
                    ]
                )
            else:
                jamo_list.append([syllable, syllable, syllable])
        return jamo_list

    def get_full_jamo_list(self):
        return self.__jamo_list

    def get_chosung_list(self):
        return [jamo_list[0] for jamo_list in self.__jamo_list]

    def get_jungsung_list(self):
        return [jamo_list[1] for jamo_list in self.__jamo_list]

    def get_jongsung_list(self):
        return [jamo_list[2] for jamo_list in self.__jamo_list]


if __name__ == "__main__":
    text = 'facebook의 창시자 마크 저커버그는       "사람과 사람의 연결이 곧 비즈니스" 라고 말했다.'
    separate_jamo = SeparateJaMo(text, remove_special_character=True)
    print(separate_jamo.get_full_jamo_list())
    print(separate_jamo.get_chosung_list())
    print(separate_jamo.get_jungsung_list())
    print(separate_jamo.get_jongsung_list())

[['f', 'f', 'f'], ['a', 'a', 'a'], ['c', 'c', 'c'], ['e', 'e', 'e'], ['b', 'b', 'b'], ['o', 'o', 'o'], ['o', 'o', 'o'], ['k', 'k', 'k'], ['ㅇ', 'ㅢ', ''], [' ', ' ', ' '], ['ㅊ', 'ㅏ', 'ㅇ'], ['ㅅ', 'ㅣ', ''], ['ㅈ', 'ㅏ', ''], [' ', ' ', ' '], ['ㅁ', 'ㅏ', ''], ['ㅋ', 'ㅡ', ''], [' ', ' ', ' '], ['ㅈ', 'ㅓ', ''], ['ㅋ', 'ㅓ', ''], ['ㅂ', 'ㅓ', ''], ['ㄱ', 'ㅡ', ''], ['ㄴ', 'ㅡ', 'ㄴ'], [' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' '], ['ㄹ', 'ㅏ', 'ㅁ'], ['ㄱ', 'ㅘ', ''], [' ', ' ', ' '], ['ㅅ', 'ㅏ', ''], ['ㄹ', 'ㅏ', 'ㅁ'], ['ㅇ', 'ㅢ', ''], [' ', ' ', ' '], ['ㅇ', 'ㅕ', 'ㄴ'], ['ㄱ', 'ㅕ', 'ㄹ'], ['ㅇ', 'ㅣ', ''], [' ', ' ', ' '], ['ㄱ', 'ㅗ', 'ㄷ'], [' ', ' ', ' '], ['ㅂ', 'ㅣ', ''], ['ㅈ', 'ㅡ', ''], ['ㄴ', 'ㅣ', ''], ['ㅅ', 'ㅡ', ''], [' ', ' ', ' '], ['ㄹ', 'ㅏ', ''], ['ㄱ', 'ㅗ', ''], [' ', ' ', ' '], ['ㅁ', 'ㅏ', 'ㄹ'], ['ㅎ', 'ㅐ', 'ㅆ'], ['ㄷ', 'ㅏ', '']]
['f', 'a', 'c', 'e', 'b', 'o', 'o', 'k', 'ㅇ', ' ', 'ㅊ', 'ㅅ', 'ㅈ', ' ', 'ㅁ', 'ㅋ', ' ', 'ㅈ', 'ㅋ', 'ㅂ', 'ㄱ', 'ㄴ', ' ', ' ', ' ', ' ', ' ', 'ㄹ', 'ㄱ', ' ', '

In [15]:
nums=['ㄱ','ㄴ','ㄷ','ㄹ','ㅁ','ㅂ','ㅅ','ㅇ','ㅈ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']
j=0
for num in nums:
    path = '../data/자음/%s.csv'%num
    num = pd.read_csv(path, engine='python', encoding='cp949')
    num.columns =["aX", "aY", "aZ", "gX","gY","gZ",'little','ring','middle','index','thumb']
    num.drop(['aX','aY','aZ','gX','gY','gZ'], axis=1, inplace=True)
    data = num
    
    for k in data.columns:     #data.columns[w:] if you have w column of line description 
        data[k] = data[k].fillna(data[k].median())
    data = data.filter(['little','ring','middle','index','thumb'])
    data.insert(5,'Character',j)
    j=j+1
    #finaldata = pd.concat([finaldata, data], sort=False, ignore_index=True)
   
    print(data)
#print(num)

     little  ring  middle  index  thumb  Character
0       264   312     224    472    418          0
1       268   307     226    466    413          0
2       268   310     226    458    416          0
3       269   314     227    470    414          0
4       270   302     228    470    418          0
..      ...   ...     ...    ...    ...        ...
649     423   521     402    446    346          0
650     409   511     388    426    333          0
651     399   511     387    433    332          0
652     399   511     389    434    334          0
653     394   509     389    435    337          0

[654 rows x 6 columns]
     little  ring  middle  index  thumb  Character
0       245   276     212    473    429          1
1       241   281     216    475    433          1
2       241   282     212    479    436          1
3       242   283     212    474    433          1
4       245   285     220    479    432          1
..      ...   ...     ...    ...    ...        ...
499    

In [113]:
nums=['ㄱ','ㄴ','ㄷ','ㄹ','ㅁ','ㅂ','ㅅ','ㅇ','ㅈ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']
finaldata_jy = pd.DataFrame(columns=(['aX', 'aY', 'aZ', 'gX','gY','gZ','index','middle','ring','little','thumb']))
j=0
for num in nums:
    path = '../data/자음/%s.csv'%num
    num_jy = pd.read_csv(path, engine='python', encoding='cp949')
    num_jy.columns =['aX', 'aY', 'aZ', 'gX','gY','gZ','little','ring','middle','index','thumb']
    data_jy = num_jy
    
    for k in data_jy.columns:     #data.columns[w:] if you have w column of line description 
        data_jy[k] = data_jy[k].fillna(data_jy[k].median())
    data_jy = data_jy.filter(['aX', 'aY', 'aZ', 'gX','gY','gZ','little','ring','middle','index','thumb'])
    data_jy.insert(11,'Character',j)
    j=j+1
    finaldata_jy = pd.concat([finaldata_jy,data_jy[180:]],sort=False,ignore_index = True)
    #print(finaldata)
#finaldata.isnull().any()

finaldata_jy

Unnamed: 0,aX,aY,aZ,gX,gY,gZ,index,middle,ring,little,thumb,Character
0,0.38,0.87,0.31,1.71,-2.14,-3.48,472,236,322,252,438,0.0
1,0.38,0.87,0.31,-2.44,-3.23,3.11,475,237,320,253,438,0.0
2,0.42,0.87,0.32,-0.06,-3.23,-2.32,475,239,318,251,432,0.0
3,0.40,0.86,0.30,-1.22,-3.23,-1.59,472,240,322,252,438,0.0
4,0.38,0.86,0.32,5.43,-3.11,-1.40,476,236,320,250,436,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4705,0.68,-0.66,0.35,0.49,-2.56,-2.81,300,223,313,252,401,13.0
4706,0.68,-0.66,0.36,-0.92,-3.36,-2.99,298,219,310,252,400,13.0
4707,0.68,-0.66,0.36,1.65,-2.44,-2.08,303,218,314,255,402,13.0
4708,0.68,-0.65,0.36,-2.81,-4.03,-2.62,301,222,311,251,400,13.0


In [114]:
nums=['ㄱ','ㄴ','ㄷ','ㄹ','ㅁ','ㅂ','ㅅ','ㅇ','ㅈ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']
finaldata_yj = pd.DataFrame(columns=(['aX', 'aY', 'aZ', 'gX','gY','gZ','index','middle','ring','little','thumb']))
j=0
for num in nums:
    path = '../data/자음/yj/%s.csv'%num
    num_yj = pd.read_csv(path, engine='python', encoding='cp949')
    num_yj.columns =['aX', 'aY', 'aZ', 'gX','gY','gZ','little','ring','middle','index','thumb']
    data_yj = num_yj
    
    for k in data_yj.columns:     #data.columns[w:] if you have w column of line description 
        data_yj[k] = data_yj[k].fillna(data_yj[k].median())
    data_yj = data_yj.filter(['aX', 'aY', 'aZ', 'gX','gY','gZ','little','ring','middle','index','thumb'])
    data_yj.insert(11,'Character',j)
    j=j+1
    finaldata_yj = pd.concat([finaldata_yj,data_yj[150:]],sort=False,ignore_index = True)
    #print(finaldata_yj)
#finaldata.isnull().any()

finaldata_yj

Unnamed: 0,aX,aY,aZ,gX,gY,gZ,index,middle,ring,little,thumb,Character
0,0.52,0.72,0.42,2.08,-1.95,-5.86,322,247,468,418,238,0.0
1,0.53,0.78,0.41,0.24,-3.30,-0.85,320,244,464,414,235,0.0
2,0.55,0.75,0.42,4.27,-4.46,0.06,319,246,463,419,235,0.0
3,0.57,0.71,0.40,-2.50,-2.93,-0.85,320,247,468,419,237,0.0
4,0.56,0.71,0.44,0.55,-5.00,-4.09,317,244,462,420,234,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4931,0.93,-0.38,-0.13,3.30,-3.36,-2.38,283,235,312,388,214,13.0
4932,0.92,-0.38,-0.12,1.59,-3.11,-2.62,295,235,316,385,211,13.0
4933,0.91,-0.38,-0.12,0.55,-2.32,-2.44,285,234,314,387,210,13.0
4934,0.91,-0.38,-0.13,-0.55,-3.72,-1.34,285,235,316,385,209,13.0


In [118]:
nums=['ㄱ','ㄴ','ㄷ','ㄹ','ㅁ','ㅂ','ㅅ','ㅇ','ㅈ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']
finaldata_nh = pd.DataFrame(columns=(['aX', 'aY', 'aZ', 'gX','gY','gZ','index','middle','ring','little','thumb']))
j=0
for num in nums:
    path = '../data/자음/nh/%s.csv'%num
    num_nh = pd.read_csv(path, engine='python', encoding='cp949')
    num_nh.columns =['aX', 'aY', 'aZ', 'gX','gY','gZ','little','ring','middle','index','thumb']
    data_nh = num_nh
    
    for k in data_nh.columns:     #data.columns[w:] if you have w column of line description 
        data_nh[k] = data_nh[k].fillna(data_nh[k].median())
    data_nh = data_nh.filter(['aX', 'aY', 'aZ', 'gX','gY','gZ','little','ring','middle','index','thumb'])
    data_nh.insert(11,'Character',j)
    j=j+1
    finaldata_nh = pd.concat([finaldata_nh,data_nh[150:]],sort=False,ignore_index = True)
    #print(finaldata_nh)
#finaldata.isnull().any()

finaldata_nh

Unnamed: 0,aX,aY,aZ,gX,gY,gZ,index,middle,ring,little,thumb,Character
0,0.29,0.91,0.08,5.49,9.22,-6.29,287,236,469,428,237,0.0
1,0.29,0.99,0.05,3.11,-1.83,0.12,289,237,473,426,239,0.0
2,0.27,0.93,0.10,-1.46,5.07,1.83,289,235,464,425,239,0.0
3,0.30,0.94,0.08,4.70,-4.88,-0.85,287,236,473,426,237,0.0
4,0.30,0.94,0.06,-0.98,-4.76,0.12,290,235,469,427,239,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5044,0.81,-0.53,0.30,0.85,-2.93,-2.81,276,222,305,397,229,13.0
5045,0.81,-0.52,0.31,-1.04,-2.99,-2.50,276,221,309,394,227,13.0
5046,0.80,-0.52,0.30,0.79,-3.23,-2.93,281,219,313,397,226,13.0
5047,0.80,-0.52,0.31,0.98,-3.05,-2.99,275,220,313,393,226,13.0


In [119]:
finaldata = pd.concat([finaldata_jy,finaldata_nh,finaldata_yj],sort=False,ignore_index = True)
finaldata

Unnamed: 0,aX,aY,aZ,gX,gY,gZ,index,middle,ring,little,thumb,Character
0,0.38,0.87,0.31,1.71,-2.14,-3.48,472,236,322,252,438,0.0
1,0.38,0.87,0.31,-2.44,-3.23,3.11,475,237,320,253,438,0.0
2,0.42,0.87,0.32,-0.06,-3.23,-2.32,475,239,318,251,432,0.0
3,0.40,0.86,0.30,-1.22,-3.23,-1.59,472,240,322,252,438,0.0
4,0.38,0.86,0.32,5.43,-3.11,-1.40,476,236,320,250,436,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
14690,0.93,-0.38,-0.13,3.30,-3.36,-2.38,283,235,312,388,214,13.0
14691,0.92,-0.38,-0.12,1.59,-3.11,-2.62,295,235,316,385,211,13.0
14692,0.91,-0.38,-0.12,0.55,-2.32,-2.44,285,234,314,387,210,13.0
14693,0.91,-0.38,-0.13,-0.55,-3.72,-1.34,285,235,316,385,209,13.0
