In [1]:
import pandas as pd
import tokenizers
import glob
import os
import pickle

import torch
from torch.utils.data import Dataset, DataLoader

import sentencepiece as spm

## Download Data

In [2]:
from Korpora import Korpora

In [3]:
download_path = "../kor_dataset/"
Korpora.fetch("korean_parallel_koen_news", root_dir=download_path)

[korean_parallel] download korean-english-park.train.tar.gz: 8.72MB [00:13, 656kB/s]                                   


decompress C:\workspace\kcc\kor_dataset\korean_parallel\korean-english-park.train.tar.gz


[korean_parallel] download korean-english-park.dev.tar.gz: 115kB [00:00, 574kB/s]                                      


decompress C:\workspace\kcc\kor_dataset\korean_parallel\korean-english-park.dev.tar.gz


[korean_parallel] download korean-english-park.test.tar.gz: 238kB [00:00, 1.10MB/s]                                    

decompress C:\workspace\kcc\kor_dataset\korean_parallel\korean-english-park.test.tar.gz





## Load Data

In [4]:
from Korpora import KoreanParallelKOENNewsKorpus
corpus = KoreanParallelKOENNewsKorpus()


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : KakaoBrain
    Repository : https://github.com/jungyeul/korean-parallel-corpora
    References :
        - Jungyeul Park, Jeen-Pyo Hong and Jeong-Won Cha (2016) Korean Language Resources for Everyone.
          In Proceedings of the 30th Pacific Asia Conference on Language, Information and Computation
          (PACLIC 30). October 28 - 30, 2016. Seoul, Korea. 
          (https://www.aclweb.org/anthology/Y16-2002/)

    # License
    Creative Commons Attribution Noncommercial No-Derivative-Works 3.0
    Details in https://creativecommons.org/licenses/by-nc-nd/3.0/



In [5]:
kor = corpus.test.get_all_texts()
eng = corpus.test.get_all_pairs()

In [7]:
with open("../data/processed/raw/korpora/pair_kor_test.txt", "w", encoding="utf-8") as f:
    for line in kor:
        f.write(f"{line}\n")
with open("../data/processed/raw/korpora/pair_eng_test.txt", "w", encoding="utf-8") as f:
    for line in eng:
        f.write(f"{line}\n")
    

## Make spm model of each data

In [82]:
files = glob.glob(f"./data/processed/raw/kopora/*{data_type}.txt")

In [84]:
tokenized_data = {}
for corpus in ["korpora", "gyafc"]:
    tokenized_data[corpus] = {}
    tokenized_data[corpus]['train'] = {}
    tokenized_data[corpus]['test'] = {}

    for data_type in ["train", "test"]:
        files = glob.glob(f"./data/processed/raw/{corpus}/*{data_type}.txt")
        
        parameter = '--input={} \
        --pad_id={} --pad_piece={} \
        --bos_id={} --bos_piece={} \
        --eos_id={} --eos_piece={} \
        --unk_id={} --unk_piece={} \
        --user_defined_symbols={} \
        --model_prefix={} \
        --vocab_size={} \
        --max_sentence_length={} \
        --character_coverage={} \
        --model_type={}'

        pad_id = 0
        pad_piece = "[PAD]"
        bos_id = 1
        bos_piece = "[BOS]"
        eos_id = 2
        eos_piece = "[EOS]"
        unk_id = 3
        unk_piece = "[UNK]"
        user_defined_symbols = "[SEP],[CLS],[MASK]"
        if corpus == "korpora":
            vocab_size = 2400
        elif corpus == "gyafc":
            vocab_size = 1800
        max_sentence_length = 9999
        character_coverage = 1.0  # default
        model_type = 'unigram'  # default: unigram

        for train_input_file in files:        
            prefix = (("_").join(train_input_file.split("\\")[-1].split("_")[:-1]))
            model_prefix = f'./data/tokenizer/{data_type}_{prefix}_spm'

            cmd = parameter.format(train_input_file,
                                   pad_id, pad_piece,
                                   bos_id, bos_piece,
                                   eos_id, eos_piece,
                                   unk_id, unk_piece,
                                   user_defined_symbols,
                                   model_prefix,
                                   vocab_size,
                                   max_sentence_length,
                                   character_coverage,
                                   model_type)
            spm.SentencePieceProcessor()
            spm.SentencePieceTrainer.Train(cmd)
            print(f"Train Compelte: {data_type} {prefix} model & vocab")

            sp = spm.SentencePieceProcessor()
            sp.Load(f"{model_prefix}.model")

            # BOS, EOS 추가
            sp.SetEncodeExtraOptions('bos:eos')

            # Tokenization And Padding
            with open(train_input_file, "r", encoding="utf-8") as f:
                tokenized_data[corpus][data_type][prefix] = [sp.EncodeAsIds(line) for line in f]
                print(f"Make Compelte: {data_type} {prefix} tokenized data")

    # Save Data
    processed_path = "./data/processed/tokenized/spm_tokenized_data.pkl"
    with open(processed_path, 'wb') as file:
        pickle.dump(tokenized_data, file)
    print("Saving Tokenized Data is Done!")

Train Compelte: train pair_eng model & vocab
Make Compelte: train pair_eng tokenized data
Train Compelte: train pair_kor model & vocab
Make Compelte: train pair_kor tokenized data
Train Compelte: test pair_eng model & vocab
Make Compelte: test pair_eng tokenized data
Train Compelte: test pair_kor model & vocab
Make Compelte: test pair_kor tokenized data
Saving Tokenized Data is Done!
Train Compelte: train em_formal model & vocab
Make Compelte: train em_formal tokenized data
Train Compelte: train em_informal model & vocab
Make Compelte: train em_informal tokenized data
Train Compelte: train fr_formal model & vocab
Make Compelte: train fr_formal tokenized data
Train Compelte: train fr_informal model & vocab
Make Compelte: train fr_informal tokenized data
Train Compelte: test em_formal model & vocab
Make Compelte: test em_formal tokenized data
Train Compelte: test em_informal model & vocab
Make Compelte: test em_informal tokenized data
Train Compelte: test fr_formal model & vocab
Make Com

In [85]:
tokenized_data.keys()

dict_keys(['korpora', 'gyafc'])

In [86]:
tokenized_data['korpora']['test']['pair_eng']

[[1,
  142,
  431,
  1014,
  14,
  15,
  8,
  1006,
  551,
  205,
  406,
  1208,
  33,
  8,
  385,
  13,
  883,
  669,
  7,
  16,
  677,
  1900,
  149,
  18,
  718,
  599,
  9,
  2],
 [1,
  26,
  77,
  35,
  11,
  183,
  178,
  847,
  30,
  181,
  1571,
  1190,
  149,
  7,
  63,
  372,
  13,
  1162,
  178,
  1756,
  7,
  18,
  49,
  8,
  1122,
  218,
  372,
  17,
  13,
  299,
  1264,
  16,
  8,
  1752,
  7,
  99,
  2089,
  19,
  15,
  8,
  62,
  1538,
  44,
  676,
  9,
  2],
 [1,
  553,
  8,
  12,
  36,
  2303,
  1756,
  16,
  8,
  2224,
  616,
  143,
  865,
  475,
  83,
  204,
  19,
  64,
  118,
  254,
  10,
  96,
  8,
  926,
  7,
  63,
  32,
  39,
  239,
  18,
  63,
  768,
  1208,
  25,
  178,
  218,
  9,
  2],
 [1,
  235,
  34,
  346,
  11,
  70,
  161,
  322,
  209,
  7,
  74,
  305,
  420,
  7,
  16,
  572,
  10,
  11,
  12,
  283,
  113,
  7,
  7,
  355,
  12,
  17,
  771,
  36,
  156,
  25,
  671,
  10,
  15,
  8,
  1923,
  16,
  969,
  7,
  123,
  66,
  87,
  17,
  363,
  310,
