### import

In [1]:
import os 
import pandas as pd 
import numpy as np 
import re
import tqdm 
import matplotlib.pyplot as plt


from nltk import ngrams
import seaborn as sns
import matplotlib as mpl
import matplotlib.font_manager as fm
from collections import Counter
import sentencepiece as spm

### import Tokenizer

In [None]:
def _load_tokenizer(model_file):
    """Create a SentencePieceProcessor and load the model file at `model_file` into it."""
    processor = spm.SentencePieceProcessor()
    processor.Load(model_file)
    return processor


# spm_4000 is read from the Model directory ...
data_path = os.getenv('HOME') +'/PocLab/Model'
spm_4000 = _load_tokenizer(data_path + '/spm_dec_v.model')

# ... while every remaining model is read from the Preprocessing directory.
data_path = os.getenv('HOME') +'/PocLab/Preprocessing'
spm_8000 = _load_tokenizer(data_path + '/spm_dec_8009.model')
msp_4000 = _load_tokenizer(data_path + '/spm_dec_mecab_4009.model')
msp_8000 = _load_tokenizer(data_path + '/spm_dec_msp8009.model')
custom_msp_4000 = _load_tokenizer(data_path + '/spm_dec_custom_msp4009.model')
custom_msp_8000 = _load_tokenizer(data_path + '/spm_dec_custom_msp8009.model')


In [None]:
# Build the tokenizer lookup tables, keyed by tokenizer name, one per model group.
tokenizers_4000 = dict(spm=spm_4000, msp=msp_4000, custom_msp=custom_msp_4000)
tokenizers_8000 = dict(spm=spm_8000, msp=msp_8000, custom_msp=custom_msp_8000)

### unk count
- OOV 개수 확인

#### 4000

In [None]:
# Accumulate, over every dialogue, the token ids produced by each *_4000 tokenizer
# and the set of distinct whitespace-separated words.
tok_spm, tok_msp, tok_coustom = [], [], []
split_token = set()
dial = df_train['dial'].values
for utterance in tqdm.tqdm(dial):
    split_token.update(utterance.split())
    tok_spm += spm_4000.encode(utterance)
    tok_msp += msp_4000.encode(utterance)
    tok_coustom += custom_msp_4000.encode(utterance)

In [None]:
# Whitespace-split baseline: number of distinct words that do not fit in a
# 4000-entry vocabulary (clamped at 0 when the corpus has fewer distinct words).
# NOTE(review): the previous expression `4000 - len(split_token)*-1` evaluated to
# 4000 + len(split_token) because `*` binds tighter than `-`; the sign flip was
# presumably meant to apply to the whole difference — confirm.
print('split unk cnt : ', max(len(split_token) - 4000, 0))

# For each tokenizer, count how many produced token ids equal that tokenizer's unk id.
print('spm unk cnt : ', tok_spm.count(spm_4000.unk_id()))
print('msp unk cnt : ', tok_msp.count(msp_4000.unk_id()))
print('custom_msp unk cnt : ', tok_coustom.count(custom_msp_4000.unk_id()))

In [None]:
# Bar heights, in the same order as the x labels below.
# - 'split' is derived from the collected word set instead of a hard-coded number,
#   so the figure follows the data (NOTE(review): this replaces the literal 752112).
# - every tokenizer bar counts occurrences of *its own* unk id in *its own* token list;
#   the previous version was not valid Python (unbalanced parentheses), used
#   spm_4000's unk id for the msp list, and plotted the raw unk id for custom.
unk_counts = [
    max(len(split_token) - 4000, 0),
    tok_spm.count(spm_4000.unk_id()),
    tok_msp.count(msp_4000.unk_id()),
    tok_coustom.count(custom_msp_4000.unk_id()),
]

fig, ax = plt.subplots()
bar = ax.bar(x=['split','spm','msp','custom'], height=unk_counts)
ax.bar_label(bar,fontsize=12)
sns.despine()
plt.title('4000_data : Unknown token count by tokenizer')
plt.yticks(fontsize=10)
plt.show()

#### 8000
- msp_8000data는 없음

In [None]:
# Accumulate, over every dialogue, the token ids produced by each *_8000 tokenizer
# and the set of distinct whitespace-separated words.
tok_spm, tok_msp, tok_coustom = [], [], []
split_token = set()
dial = df_train['dial'].values
for utterance in tqdm.tqdm(dial):
    split_token.update(utterance.split())
    tok_spm += spm_8000.encode(utterance)
    tok_msp += msp_8000.encode(utterance)
    tok_coustom += custom_msp_8000.encode(utterance)

In [None]:
# Whitespace-split baseline: number of distinct words that do not fit in an
# 8000-entry vocabulary (clamped at 0 when the corpus has fewer distinct words).
# NOTE(review): the previous expression `8000 - len(split_token)*-1` evaluated to
# 8000 + len(split_token) because `*` binds tighter than `-`; the sign flip was
# presumably meant to apply to the whole difference — confirm.
print('split unk cnt : ', max(len(split_token) - 8000, 0))

# For each tokenizer, count how many produced token ids equal that tokenizer's unk id.
print('spm unk cnt : ', tok_spm.count(spm_8000.unk_id()))
print('msp unk cnt : ', tok_msp.count(msp_8000.unk_id()))
print('custom_msp unk cnt : ', tok_coustom.count(custom_msp_8000.unk_id()))

In [None]:
# Bar heights, in the same order as the x labels below.
# - 'split' is derived from the collected word set instead of the literal 752112
#   that had been copied from the 4000 plot (NOTE(review): confirm intended baseline).
# - every tokenizer bar counts occurrences of its own unk id in its own token list;
#   the previous version was not valid Python (unbalanced parentheses) and had lost
#   the `.count(...)` calls for msp and custom.
unk_counts = [
    max(len(split_token) - 8000, 0),
    tok_spm.count(spm_8000.unk_id()),
    tok_msp.count(msp_8000.unk_id()),
    tok_coustom.count(custom_msp_8000.unk_id()),
]

fig, ax = plt.subplots()
bar = ax.bar(x=['split','spm','msp','custom'], height=unk_counts)
ax.bar_label(bar,fontsize=12)
sns.despine()
plt.title('8000_data : Unknown token count by tokenizer')
plt.yticks(fontsize=10)
plt.show()

### subword fertility
- 토큰화된 단어마다 생성되는 부분단어의 평균 개수

In [None]:
# 4000_data
# One ratio per dialogue and tokenizer:
#   (number of tokens the tokenizer emits) / (number of whitespace-separated words).

tok_spm, tok_msp, tok_custom = [], [], []
dial = df_train['dial'].values
for sentence in tqdm.tqdm(dial):
    # Denominator: word count obtained by splitting on whitespace.
    n_words = len(sentence.split())

    # Numerators: token count per tokenizer, divided by the shared word count.
    tok_spm.append(len(spm_4000.encode(sentence)) / n_words)
    tok_msp.append(len(msp_4000.encode(sentence)) / n_words)
    tok_custom.append(len(custom_msp_4000.encode(sentence)) / n_words)

In [None]:
# Report the mean of the per-dialogue ratios for each tokenizer.
for caption, ratios in (
    ('spm subword fertility :', tok_spm),
    ('msp subword fertility :', tok_msp),
    ('custom_msp subword fertility :', tok_custom),
):
    print(caption, sum(ratios) / len(ratios))

In [None]:
# Mean per-dialogue fertility for each tokenizer, rounded to two decimals for the bar labels.
mean_fertility = [
    round(sum(ratios) / len(ratios), 2)
    for ratios in (tok_spm, tok_msp, tok_custom)
]

fig, ax = plt.subplots()
# Three heights need three labels: the previous two-label list made ax.bar fail
# with a shape mismatch and left the custom tokenizer unlabelled.
bar = ax.bar(x=['spm','msp','custom_msp'], height=mean_fertility)
ax.bar_label(bar,fontsize=12)
sns.despine()
plt.title('4000_data subword fertility by tokenizer')
plt.yticks(fontsize=10)
plt.show()

In [None]:
# 8000_data
# One ratio per dialogue and tokenizer:
#   (number of tokens the tokenizer emits) / (number of whitespace-separated words).
# Only spm_8000 and custom_msp_8000 are measured here; the msp variant is not
# computed in this cell, so tok_msp is left untouched.

tok_spm, tok_custom = [], []
dial = df_train['dial'].values
for sentence in tqdm.tqdm(dial):
    # Denominator: word count obtained by splitting on whitespace.
    n_words = len(sentence.split())

    # Numerators: token count per tokenizer, divided by the shared word count.
    tok_spm.append(len(spm_8000.encode(sentence)) / n_words)
    tok_custom.append(len(custom_msp_8000.encode(sentence)) / n_words)

In [None]:
# Report the mean of the per-dialogue ratios (msp is not measured for the 8000 models here).
for caption, ratios in (
    ('spm subword fertility :', tok_spm),
    ('custom_msp subword fertility :', tok_custom),
):
    print(caption, sum(ratios) / len(ratios))

In [None]:
# Mean per-dialogue fertility for the two tokenizers measured on the 8000 models,
# rounded to two decimals for the bar labels.
mean_fertility = [
    round(sum(ratios) / len(ratios), 2)
    for ratios in (tok_spm, tok_custom)
]

fig, ax = plt.subplots()
# The second bar is computed from tok_custom, so it is labelled 'custom_msp'
# (it was previously mislabelled 'msp').
bar = ax.bar(x=['spm','custom_msp'], height=mean_fertility)
ax.bar_label(bar,fontsize=12)
sns.despine()
plt.title('8000_data subword fertility by tokenizer')
plt.yticks(fontsize=10)
plt.show()

### proportion of continued words
- 말뭉치에서 적어도 두 개의 부분 토큰으로 분할된 토큰화된 단어의 비율 

In [None]:
# 4000_data
# Per dialogue: count the whitespace-separated words (tok_split) and, for each
# tokenizer, how many of those words are split into two or more subword pieces.
# Uses the tokenizers_4000 lookup (keys 'spm', 'msp', 'custom_msp') defined earlier.

# Words ignored entirely: zero-width space and the \x9f control character.
# (The old `k == ' '` test is dropped: str.split() never yields a bare space.)
skip_words = {'\u200b', '\x9f'}

tok_spm= []
tok_msp= []
tok_split = []
tok_custom= []
dial = df_train['dial'].values
for sentence in tqdm.tqdm(dial):
    total_cnt = 0
    continued = dict.fromkeys(tokenizers_4000, 0)
    for word in sentence.split():
        if word in skip_words:
            continue
        total_cnt += 1
        for name, tokenizer in tokenizers_4000.items():
            pieces = tokenizer.encode_as_pieces(word)
            # A bare leading '▁' piece is dropped before counting, as before.
            # The emptiness check avoids an IndexError when no pieces are returned.
            if pieces and pieces[0] == '▁':
                pieces = pieces[1:]
            if len(pieces) > 1:
                # Each tokenizer increments its own counter; previously the custom
                # tokenizer bumped the msp counter, leaving custom at 0.
                continued[name] += 1

    tok_spm.append(continued['spm'])
    tok_msp.append(continued['msp'])
    tok_split.append(total_cnt)
    tok_custom.append(continued['custom_msp'])

In [None]:
# Percentage of counted words that each tokenizer split into 2+ pieces.
for caption, per_dialogue in (
    ('spm proportion of continued words :', tok_spm),
    ('msp proportion of continued words :', tok_msp),
    ('custom_msp proportion of continued words :', tok_custom),
):
    print(caption, sum(per_dialogue) / sum(tok_split) * 100)

In [None]:
fig, ax = plt.subplots()

# Percentage of counted words split into 2+ pieces, rounded to two decimals per tokenizer.
continued_pct = [
    round(sum(per_dialogue) / sum(tok_split) * 100, 2)
    for per_dialogue in (tok_spm, tok_msp, tok_custom)
]
tokenizer_labels = ['spm', 'msp', 'custom_msp']

bar = ax.bar(x=tokenizer_labels, height=continued_pct)
ax.bar_label(bar, fontsize=12)
sns.despine()
plt.title('4000_data : proportion of continued words by tokenizer')
plt.yticks(fontsize=10)
plt.show()

In [None]:
# 8000_data
# Per dialogue: count the whitespace-separated words (tok_split) and, for each
# tokenizer, how many of those words are split into two or more subword pieces.
# Uses the tokenizers_8000 lookup (keys 'spm', 'msp', 'custom_msp') defined earlier.
#
# NOTE(review): the msp encoding line had been commented out while word_msp was
# still read below, so this cell either reused a stale word_msp from the 4000
# cell or raised NameError on a fresh kernel. msp_8000 is loaded at the top of
# the notebook and already used in the unk-count section, so it is used here too
# — confirm that model is the intended one.

# Words ignored entirely: zero-width space and the \x9f control character.
# (The old `k == ' '` test is dropped: str.split() never yields a bare space.)
skip_words = {'\u200b', '\x9f'}

tok_spm= []
tok_msp= []
tok_split = []
tok_custom= []
dial = df_train['dial'].values
for sentence in tqdm.tqdm(dial):
    total_cnt = 0
    continued = dict.fromkeys(tokenizers_8000, 0)
    for word in sentence.split():
        if word in skip_words:
            continue
        total_cnt += 1
        for name, tokenizer in tokenizers_8000.items():
            pieces = tokenizer.encode_as_pieces(word)
            # A bare leading '▁' piece is dropped before counting, as before.
            # The emptiness check avoids an IndexError when no pieces are returned.
            if pieces and pieces[0] == '▁':
                pieces = pieces[1:]
            if len(pieces) > 1:
                # Each tokenizer increments its own counter; previously the custom
                # tokenizer bumped the msp counter, leaving custom at 0.
                continued[name] += 1

    tok_spm.append(continued['spm'])
    tok_msp.append(continued['msp'])
    tok_split.append(total_cnt)
    tok_custom.append(continued['custom_msp'])

In [None]:
# Percentage of counted words that each tokenizer split into 2+ pieces.
for caption, per_dialogue in (
    ('spm proportion of continued words :', tok_spm),
    ('msp proportion of continued words :', tok_msp),
    ('custom_msp proportion of continued words :', tok_custom),
):
    print(caption, sum(per_dialogue) / sum(tok_split) * 100)

In [None]:
fig, ax = plt.subplots()

# Percentage of counted words split into 2+ pieces, rounded to two decimals per tokenizer.
continued_pct = [
    round(sum(per_dialogue) / sum(tok_split) * 100, 2)
    for per_dialogue in (tok_spm, tok_msp, tok_custom)
]
tokenizer_labels = ['spm', 'msp', 'custom_msp']

bar = ax.bar(x=tokenizer_labels, height=continued_pct)
ax.bar_label(bar, fontsize=12)
sns.despine()
plt.title('8000_data : proportion of continued words by tokenizer')
plt.yticks(fontsize=10)
plt.show()