In [1]:
import pandas as pd
import tokenizers
import glob
import os
import pickle

import torch
from torch.utils.data import Dataset, DataLoader

import sentencepiece as spm

## Train a SentencePiece (spm) model for each dataset

In [40]:
# Train one SentencePiece BPE model per corpus file, encode that same file with
# the freshly trained model, and pickle all encoded sentences in one dictionary
# laid out as tokenized_data[data_type][prefix] -> list of token-id lists.
#
# NOTE(review): a separate model is trained for every "test" file as well, so
# train and test ids come from different vocabularies — confirm this is intended.


def corpus_prefix_from_path(path):
    """Return the corpus prefix encoded in a data file name.

    The prefix is the file's base name with its last "_"-separated field
    removed, e.g. ".../em_formal_train.txt" -> "em_formal". Both the forward
    slash and the backslash are accepted as directory separators, so the
    result does not depend on the operating system that produced the path.
    """
    file_name = os.path.basename(path.replace("\\", "/"))
    return "_".join(file_name.split("_")[:-1])


tokenized_data = {'train': {}, 'test': {}}

# Command-line style template understood by SentencePieceTrainer.Train.
parameter = " ".join([
    '--input={}',
    '--pad_id={} --pad_piece={}',
    '--bos_id={} --bos_piece={}',
    '--eos_id={} --eos_piece={}',
    '--unk_id={} --unk_piece={}',
    '--user_defined_symbols={}',
    '--model_prefix={}',
    '--vocab_size={}',
    '--max_sentence_length={}',
    '--character_coverage={}',
    '--model_type={}',
])

# Special tokens shared by every model.
pad_id = 0
pad_piece = "[PAD]"
bos_id = 1
bos_piece = "[BOS]"
eos_id = 2
eos_piece = "[EOS]"
unk_id = 3
unk_piece = "[UNK]"
user_defined_symbols = "[SEP],[CLS],[MASK]"

# Training options; files whose name contains "kor_" use the Korean overrides.
default_vocab_size = 5000
default_character_coverage = 1.0
korean_vocab_size = 16000
korean_character_coverage = 0.9995
max_sentence_length = 300
model_type = 'bpe' # default: unigram

tokenizer_dir = "../data/tokenizer"

for name in ["gyafc", "korpora"]:
    for data_type in ["train", "test"]:
        # Sorted so the processing order (and the file name used when saving
        # below) does not depend on the file system's listing order.
        files = sorted(glob.glob(f"../data/processed/raw/{name}/*{data_type}.txt"))

        for train_input_file in files:
            # Match on the file name only, so directory names can never
            # trigger the filters below by accident.
            file_name = os.path.basename(train_input_file.replace("\\", "/"))

            # NOTE(review): presumably skips a GYAFC "fr" split — confirm.
            if "fr" in file_name:
                continue

            # Choose the options per file, so the Korean settings cannot leak
            # into a non-Korean file processed later in the same list.
            if 'kor_' in file_name:
                vocab_size = korean_vocab_size
                character_coverage = korean_character_coverage
            else:
                vocab_size = default_vocab_size
                character_coverage = default_character_coverage

            prefix = corpus_prefix_from_path(train_input_file)
            model_prefix = f'{tokenizer_dir}/{data_type}_{prefix}_spm_bpe'
            os.makedirs(tokenizer_dir, exist_ok=True)

            cmd = parameter.format(train_input_file,
                                   pad_id, pad_piece,
                                   bos_id, bos_piece,
                                   eos_id, eos_piece,
                                   unk_id, unk_piece,
                                   user_defined_symbols,
                                   model_prefix,
                                   vocab_size,
                                   max_sentence_length,
                                   character_coverage,
                                   model_type)
            spm.SentencePieceTrainer.Train(cmd)
            print(f"Train Compelte: {data_type} {prefix} model & vocab")

            sp = spm.SentencePieceProcessor()
            sp.Load(f"{model_prefix}.model")

            # Add BOS and EOS to every encoded sentence.
            sp.SetEncodeExtraOptions('bos:eos')

            # Encode every line as token ids (no padding is applied here).
            with open(train_input_file, "r", encoding="utf-8") as f:
                tokenized_data[data_type][prefix] = [sp.EncodeAsIds(line) for line in f]
                print(f"Make Complete: {data_type} {prefix} tokenized data")


# Save Data
if any(tokenized_data.values()):
    # NOTE(review): the file name is built from the data_type and prefix of the
    # LAST processed file, although the pickle holds every corpus. The name is
    # kept unchanged so existing readers of this file keep working; consider a
    # corpus-independent name.
    processed_path = f"../data/processed/tokenized/{data_type}_{prefix}_spm_bpe.pkl"
    os.makedirs(os.path.dirname(processed_path), exist_ok=True)
    with open(processed_path, 'wb') as file:
        pickle.dump(tokenized_data, file)
    print("Saving Tokenized Data is Done!")
else:
    # Without any processed file there is no prefix to name the pickle after.
    print("No corpus files were found; nothing was tokenized or saved.")

Train Compelte: train em_formal model & vocab
Make Complete: train em_formal tokenized data
Train Compelte: train em_informal model & vocab
Make Complete: train em_informal tokenized data
Train Compelte: test em_formal model & vocab
Make Complete: test em_formal tokenized data
Train Compelte: test em_informal model & vocab
Make Complete: test em_informal tokenized data
Train Compelte: train pair_eng model & vocab
Make Complete: train pair_eng tokenized data
Train Compelte: train pair_kor model & vocab
Make Complete: train pair_kor tokenized data
Train Compelte: test pair_eng model & vocab
Make Complete: test pair_eng tokenized data
Train Compelte: test pair_kor model & vocab
Make Complete: test pair_kor tokenized data
Saving Tokenized Data is Done!


In [32]:
# Load the BPE model trained on the `em_informal` training file.
spm_dir = "../data/tokenizer"
# The training cell writes "<data_type>_<prefix>_spm_bpe.model", so use that name.
src_lang_model_path = os.path.join(spm_dir, "train_em_informal_spm_bpe.model")

sp = spm.SentencePieceProcessor()
sp.Load(src_lang_model_path)
# Wrap encoded text in [BOS]/[EOS], matching how the training cell encodes.
sp.SetEncodeExtraOptions('bos:eos')

# Bind the name to the loaded processor itself; the value returned by Load()
# is not the model.
src_lang_model = sp

In [None]:
# Scratch cell: a leftover, incomplete `sp.` attribute lookup (a SyntaxError on
# "Run All") used to be here. It is intentionally left without code.

In [27]:
# Show how the loaded model splits a sample sentence into subword pieces.
sample_sentence = "Make Complete: train pair_kor tokenized data"
sp.EncodeAsPieces(sample_sentence)

['[BOS]',
 '▁',
 'Ma',
 'ke',
 '▁Co',
 'm',
 'p',
 'l',
 'e',
 't',
 'e',
 ':',
 '▁',
 't',
 'ra',
 'in',
 '▁',
 'p',
 'a',
 'i',
 'r',
 '_',
 'k',
 'or',
 '▁to',
 'ke',
 'n',
 'i',
 'z',
 'ed',
 '▁',
 'd',
 'a',
 't',
 'a',
 '[EOS]']

In [29]:
import pandas as pf