In [1]:
from datasets import load_dataset, DatasetDict

# Load the Korean Wikipedia dataset by its dataset identifier; the resulting
# DatasetDict (displayed in the next cell) has 'train' and 'valid' splits with a
# single 'text' column.
raw_dataset = load_dataset('eaglewatch/Korean_Wikipedia_Dataset_for_GPT2_August_2022')



In [2]:
# Inspect the available splits, their columns and row counts.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 334420
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 83605
    })
})

In [3]:
# Eyeball the validation split as a pandas DataFrame.
# NOTE(review): this converts every row of the split just to display a preview;
# consider converting only a small slice if memory becomes a concern.
raw_dataset['valid'].to_pandas()

Unnamed: 0,text
0,《'''레이디언트 실버건'''》은 트레저가 제작한 1998년 진행형 슈팅 게임이다....
1,thumb '''2020년 하계 올림픽 아메리칸사모아 선수단'''은 2021년 일본...
2,"==선== '''선'''(善)이란, 《보살영락본업경》 하권의 〈7. 대중수학품(大衆..."
3,《'''유아 넥스트'''》(You're Next)는 2011년 공개된 미국의 공포 ...
4,《'''BEST OF SOUL'''》(베스트 오브 소울)은 보아가 일본에서 발매한 ...
...,...
83600,"별명: '''백룡부대'''), 또는 '''해병대 제9여단'''은 2015년 12월 ..."
83601,대한민국의 카드사다. == 연혁 == 1986년: '''익스프레스 크레티트 카드''...
83602,"'''애덤 로비텔'''(, 1978년 5월 28일 )은 미국의 공포 영화 전문 영화..."
83603,'''1993년 코파 오로'''(1993 Copa de Oro)는 1993년 7월 ...


In [4]:
# Shuffle both splits with a fixed seed so that Restart & Run All reproduces the
# same row order (and therefore the same tokenized chunks and training batches).
# No rows are dropped here: despite the variable name, both splits keep their
# full size.
SHUFFLE_SEED = 42

sampled_dataset = DatasetDict(
    {
        split: raw_dataset[split].shuffle(seed=SHUFFLE_SEED)
        for split in ("train", "valid")
    }
)

In [5]:
# Confirm that shuffling kept both splits and their row counts unchanged.
sampled_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 334420
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 83605
    })
})

In [6]:
from transformers import AutoTokenizer

# Load the pretrained KoGPT2 tokenizer; it is used below as the starting point
# for training a new tokenizer on the Wikipedia corpus.
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2')
# Display the tokenizer's configuration (vocab size, special tokens, ...).
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GPT2TokenizerFast(name_or_path='skt/kogpt2-base-v2', vocab_size=51200, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [7]:
def get_training_corpus(ds, batch_size=1000):
    """Lazily yield the 'text' column of ``ds`` in consecutive batches.

    Args:
        ds: A sized, sliceable dataset whose slices can be indexed with
            ``'text'`` (e.g. a Hugging Face ``Dataset`` with a 'text' column).
        batch_size: Number of rows per yielded batch. Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        A single-use generator. Each item is ``ds[start:start + batch_size]['text']``;
        the final batch may be shorter. An empty ``ds`` yields nothing.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently produce an empty corpus.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    return (
        ds[start:start + batch_size]['text']
        for start in range(0, len(ds), batch_size)
    )

# Build the batch generator over the raw (unshuffled) train split.
# NOTE: this is a generator, so it can be consumed only once; re-run this cell
# before iterating over the corpus a second time.
training_corpus = get_training_corpus(raw_dataset['train'])

In [8]:
%%time

# Train a new tokenizer on the Wikipedia text with a target vocabulary of 50257
# entries, and rebind `tokenizer` to the result.
# NOTE(review): re-running this cell alone starts from the already retrained
# tokenizer and from an exhausted `training_corpus` generator; re-run the two
# preceding cells first.
tokenizer = tokenizer.train_new_from_iterator(training_corpus, vocab_size=50257)

Wall time: 5min 7s


In [9]:
# Smoke-test the retrained tokenizer on a short Korean sentence.
sample_text = "너 때문에 비행기 시간도 뺐겼어"

# Show the sub-word pieces the sentence is split into.
tokenizer.tokenize(sample_text)

['▁너', '▁때문에', '▁비행기', '▁시간', '도', '▁', '뺐', '겼', '어']

In [10]:
# Encode the same sentence to ids; return_length=True adds a 'length' entry to
# the output (visible in the result below).
tokenizer(sample_text, return_length=True)

{'input_ids': [20383, 19253, 36610, 19552, 15318, 3377, 16139, 14770, 16621], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'length': [9]}

In [11]:
# Number of tokens per training example.
context_length = 128

def tokenize(batch):
    """Tokenize a batch of texts and keep only the full-length chunks.

    The module-level ``tokenizer`` is called on ``batch['text']`` with
    truncation to ``context_length`` and overflowing tokens returned, so one
    input text can contribute several chunks. Any chunk whose reported length
    is not exactly ``context_length`` is discarded.

    Args:
        batch: Mapping with a 'text' entry holding the texts to tokenize.

    Returns:
        dict: ``{"input_ids": [...]}`` containing the retained chunks.
    """
    encoded = tokenizer(
        batch['text'],
        truncation=True,
        max_length=context_length,
        return_length=True,
        return_overflowing_tokens=True,
    )

    # Pair each chunk with its token count and filter out the short ones.
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded['length'], encoded['input_ids'])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}

In [12]:
# Apply `tokenize` to every split in batches. The original columns are removed
# because `tokenize` returns only 'input_ids' and may return a different number
# of rows than it received. The column names are taken from the dataset that is
# actually being mapped.
tokenized_datasets = sampled_dataset.map(
    tokenize,
    batched=True,
    remove_columns=sampled_dataset['train'].column_names,
)

Map:   0%|          | 0/334420 [00:00<?, ? examples/s]

Map:   0%|          | 0/83605 [00:00<?, ? examples/s]

In [13]:
# Inspect the chunked splits: each row is now one 'input_ids' chunk.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1309161
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 323466
    })
})

In [14]:
# Write the retrained tokenizer's files into the local "kowiki" directory.
tokenizer.save_pretrained("kowiki")

('kowiki\\tokenizer_config.json',
 'kowiki\\special_tokens_map.json',
 'kowiki\\vocab.json',
 'kowiki\\merges.txt',
 'kowiki\\added_tokens.json',
 'kowiki\\tokenizer.json')

In [15]:
from transformers import LlamaConfig

# Instantiate LlamaConfig without arguments to inspect the library's default
# values; `configuration` is rebound to a much smaller setup further down.
configuration = LlamaConfig()

configuration

LlamaConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "transformers_version": "4.32.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [16]:
# Look up the retrained tokenizer's BOS/EOS ids and vocabulary size; the model
# configuration has to agree with these values.
tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.vocab_size

(375, 375, 50257)

In [17]:
# Small Llama configuration for training from scratch.
#
# The special-token ids and vocabulary size are read from the retrained
# tokenizer instead of being hard-coded: the tokenizer reports
# bos_token_id == eos_token_id == 375, so the previous literal 50256 pointed at
# an unrelated vocabulary entry and generation could never stop on the real EOS.
# The position limit is tied to `context_length`, the chunk size used when
# tokenizing the corpus.
#
# NOTE(review): pad_token_id=0 is kept as-is; it becomes the embedding layer's
# padding_idx. Confirm that id 0 is not a token the model needs to learn.
configuration = LlamaConfig(
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    hidden_act="silu",
    hidden_size=512,
    initializer_range=0.02,
    intermediate_size=1376,
    max_position_embeddings=context_length,
    model_type="llama",
    num_attention_heads=4,
    num_hidden_layers=4,
    pad_token_id=0,
    rms_norm_eps=1e-06,
    tie_word_embeddings=False,
    use_cache=True,
    vocab_size=tokenizer.vocab_size,
)

In [18]:
from transformers import LlamaForCausalLM

# Build the causal LM directly from the configuration; no pretrained checkpoint
# is loaded here, so the model still has to be trained.
model = LlamaForCausalLM(configuration)
# Display the module tree to verify layer sizes against the configuration.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(50257, 512, padding_idx=0)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=1376, bias=False)
          (up_proj): Linear(in_features=512, out_features=1376, bias=False)
          (down_proj): Linear(in_features=1376, out_features=512, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_he

In [19]:
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [20]:
# Move the model to the selected device.
model.to(device)
# The trailing literal makes `0` the cell's displayed value instead of the
# return value of `model.to(device)`.
0

0

In [21]:
# Baseline generation with the untrained model (expected to be gibberish).
prompt = "너 때문에 비행기 시간도 놓쳤어 "

# Tokenize and move the encoded tensors to the model's device.
inputs = tokenizer(prompt, return_tensors='pt').to(device)

# Pass the tokenizer's attention mask explicitly so generate() does not have to
# infer it from pad_token_id (which would mask any prompt token equal to that id).
generate_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=50,
)
generate_ids

tensor([[20383, 19253, 36610, 19552, 15318, 20689, 17224, 16621,  3377,   664,
           664,   664,   664,   664,   664,   664,   664,   664, 12764, 29428,
         11948, 10867, 11948, 10867, 11948, 26969, 26969, 26969, 12764, 12764,
         12764, 12764, 12764, 12764, 44613, 44613, 44613, 44613, 44613, 44613,
         44613, 44613, 21502, 21502, 21502, 21502, 21502, 21502, 48540, 26969]],
       device='cuda:0')

In [22]:
# Decode the generated ids back to text, dropping special tokens, and take the
# first (only) sequence of the batch.
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'너 때문에 비행기 시간도 놓쳤어 űűűűűűűűű輈 일종의螯繢螯繢螯 1910년 1910년 1910년輈輈輈輈輈輈 말미암 말미암 말미암 말미암 말미암 말미암 말미암 말미암천군천군천군천군천군천군 오치 1910년'

In [23]:
from transformers import DataCollatorForLanguageModeling

# The tokenizer's special tokens (see its repr earlier) include no pad token, so
# reuse the EOS token for padding.
# NOTE(review): this assignment happens after tokenizer.save_pretrained("kowiki"),
# so the files saved earlier do not include it.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal language modelling rather than masked LM; the
# collated batch carries `labels` alongside `input_ids` (checked in the cells below).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [24]:
# Collate the first three training rows and report the shape of every tensor in
# the resulting batch.
first_rows = [tokenized_datasets['train'][idx] for idx in range(3)]
out = data_collator(first_rows)

for key, tensor in out.items():
    print(f"{key}: {tensor.shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids: torch.Size([3, 128])
attention_mask: torch.Size([3, 128])
labels: torch.Size([3, 128])


In [25]:
# Compare the first 20 positions of input_ids, attention_mask and labels for the
# first collated example.
out['input_ids'][0][:20], out['attention_mask'][0][:20], out['labels'][0][:20]

(tensor([18768, 26053, 15993, 47638, 19078, 22440, 18918, 25691,   441, 33782,
         20682, 21388, 19066, 19904, 36632, 19522, 28093, 28320, 36261, 19170]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([18768, 26053, 15993, 47638, 19078, 22440, 18918, 25691,   441, 33782,
         20682, 21388, 19066, 19904, 36632, 19522, 28093, 28320, 36261, 19170]))

In [26]:
from transformers import TrainingArguments

# Training hyper-parameters. Every value below is actually passed to
# TrainingArguments; previously `learning_rate` and `num_epochs` were defined
# but ignored in favour of hard-coded literals.
batch_size = 32          # per-device batch size for both training and evaluation
logging_steps = 1000     # shared interval for logging, evaluation, checkpointing and warmup
learning_rate = 5e-4
num_epochs = 4           # matches the recorded run ('epoch': 4.0); the old unused value was 1

args = TrainingArguments(
    output_dir='kowikimodel',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='steps',
    eval_steps=logging_steps,
    logging_steps=logging_steps,
    save_steps=logging_steps,
    # Effective batch per device = batch_size * gradient_accumulation_steps.
    gradient_accumulation_steps=8,
    num_train_epochs=num_epochs,
    weight_decay=0.1,
    warmup_steps=logging_steps,
    lr_scheduler_type='cosine',
    learning_rate=learning_rate,
    fp16=True,
    push_to_hub=False
)

In [27]:
from transformers import Trainer

# Wire the model, tokenizer, training arguments, collator and the tokenized
# train/valid splits into a Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid']
)

In [28]:
# Run training; this is the long-running cell of the notebook (the recorded
# train_runtime below is about 32,000 seconds).
trainer.train()

Step,Training Loss,Validation Loss
1000,7.9613,6.150519
2000,5.4351,5.091812
3000,4.9132,4.831055
4000,4.722,4.691142
5000,4.6031,4.597617
6000,4.4645,4.537406
7000,4.4158,4.478937
8000,4.3704,4.43009
9000,4.3274,4.385536
10000,4.2891,4.346845


TrainOutput(global_step=20456, training_loss=4.487922886269233, metrics={'train_runtime': 32035.1339, 'train_samples_per_second': 163.466, 'train_steps_per_second': 0.639, 'total_flos': 1.5437314156658688e+17, 'train_loss': 4.487922886269233, 'epoch': 4.0})

In [39]:
# Generation with the trained model from a short greeting prompt.
prompt = "안녕"

# Tokenize and move the encoded tensors to the model's device.
inputs = tokenizer(prompt, return_tensors='pt').to(device)

# Pass the tokenizer's attention mask explicitly so generate() does not have to
# infer it from pad_token_id (which would mask any prompt token equal to that id).
generate_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=50,
)
generate_ids

tensor([[28838, 39420,  3737, 19862, 18823, 17894, 16555, 19396, 20778, 18804,
         14698, 16282, 23353,  3737, 19862, 18823, 17894, 16555, 19396, 20778,
         18804, 14698, 16282, 23353,  3737, 19862, 18823, 17894, 16555, 19396,
         20778, 18804, 14698, 16282, 23353,  3737, 19862, 18823, 17894, 16555,
         19396, 20778, 18804, 14698, 16282, 23353,  3737, 19862, 18823, 17894]],
       device='cuda:0')

In [40]:
# Decode the generated ids back to text, dropping special tokens, and take the
# first (only) sequence of the batch.
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'안녕하세요》 ... 나훈아 2014년 KBS2 《감성세대》 ... 나훈아 2014년 KBS2 《감성세대》 ... 나훈아 2014년 KBS2 《감성세대》 ... 나훈아 2014년 KBS2 《감성세대》 ... 나훈'