# Train an LLM from scratch

## Train a tokenizer

In [1]:
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path

In [2]:
# Settings shared by the tokenizer and model cells below.
VOCAB_SIZE = 4_000  # passed to both tokenizer.train() and the model config
PAD_TOKEN, UNK_TOKEN = "<pad>", "<unk>"  # special tokens registered with the tokenizer
MODEL_DIR = "gpt"  # directory the tokenizer files are written to and reloaded from

In [3]:
# Train a byte-level BPE tokenizer on the corpus file "ss.txt".
# Only this one file is used as training data; the recursive "**/*.txt" glob
# that used to precede this was never passed to train() and has been removed.
tokenizer = ByteLevelBPETokenizer()
tokenizer.enable_padding(pad_token=PAD_TOKEN)
tokenizer.unk_token = UNK_TOKEN
tokenizer.train(
    files="ss.txt",
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    # Register the special tokens so they are part of the learned vocabulary.
    special_tokens=[PAD_TOKEN, UNK_TOKEN],
)

In [4]:
# Sanity check: display how the trained tokenizer splits a short phrase.
sample_encoding = tokenizer.encode("hello my name is")
sample_encoding.tokens

['he', 'll', 'o', 'Ġmy', 'Ġname', 'Ġis']

In [5]:
# Create the output directory in pure Python so the cell is cross-platform and
# can be re-run without a shell error when the directory already exists.
Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)

# Write the tokenizer files into MODEL_DIR; the returned list of written paths
# is displayed as the cell output.
tokenizer.save_model(MODEL_DIR)

['gpt\\vocab.json', 'gpt\\merges.txt']

In [6]:
from transformers import GPT2Tokenizer

# Reload the files saved in MODEL_DIR through the transformers tokenizer class.
# This rebinds `tokenizer`, replacing the ByteLevelBPETokenizer trained earlier;
# every later cell uses this GPT2Tokenizer instance.
tokenizer = GPT2Tokenizer.from_pretrained(
    MODEL_DIR,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
)

  from .autonotebook import tqdm as notebook_tqdm


## Train a Language Model from scratch

### Set up the model

In [7]:
import torch

In [8]:
# Display whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

In [9]:
from transformers import OpenAIGPTConfig

# Hyperparameters for a deliberately small GPT model; the vocabulary size is
# tied to the VOCAB_SIZE the tokenizer was trained with.
gpt_hparams = {
    "vocab_size": VOCAB_SIZE,
    "n_positions": 512,
    "n_embd": 64,
    "n_layer": 10,
    "n_head": 8,
}
gpt_config = OpenAIGPTConfig(**gpt_hparams)

In [10]:
from transformers import OpenAIGPTLMHeadModel

# Build the language model directly from the config (no pretrained checkpoint
# is loaded), so training starts from scratch.
model = OpenAIGPTLMHeadModel(gpt_config)

In [11]:
# Display the model's parameter count as a quick size check.
model.num_parameters()

788608

### Build dataset

In [12]:
from transformers import LineByLineTextDataset

# Build the training set from the same corpus file the tokenizer was trained on.
# NOTE(review): LineByLineTextDataset is reported as deprecated in recent
# transformers releases — confirm and plan a migration if so.
dataset = LineByLineTextDataset(
    file_path="./ss.txt",
    tokenizer=tokenizer,
    block_size=128,
)



In [13]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for training; mlm=False turns masked-language-modeling off.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

### Set up the trainer

In [14]:
from transformers import Trainer, TrainingArguments

# Training configuration. Checkpoints are written to MODEL_DIR, the same
# directory that holds the tokenizer files.
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    overwrite_output_dir=True,
    num_train_epochs=500,
    # `per_gpu_train_batch_size` is deprecated and triggers a warning on every
    # use; `per_device_train_batch_size` is the supported name.
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=3,
    prediction_loss_only=True,
)

# Wire the model, data and collator into the Trainer that runs the training loop.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


In [15]:
%%time
# Run the training loop; %%time reports the CPU and wall time for this cell.
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
  3%|▎         | 505/17000 [00:20<10:39, 25.78it/s]

{'loss': 7.5573, 'grad_norm': 0.8311532139778137, 'learning_rate': 4.8529411764705885e-05, 'epoch': 14.71}


  6%|▌         | 1003/17000 [00:41<10:13, 26.09it/s]

{'loss': 6.6074, 'grad_norm': 0.720300555229187, 'learning_rate': 4.705882352941177e-05, 'epoch': 29.41}


  9%|▉         | 1504/17000 [01:01<10:03, 25.68it/s]

{'loss': 6.3003, 'grad_norm': 0.5680872797966003, 'learning_rate': 4.558823529411765e-05, 'epoch': 44.12}


 12%|█▏        | 2002/17000 [01:21<09:53, 25.25it/s]

{'loss': 6.1501, 'grad_norm': 0.9082577228546143, 'learning_rate': 4.411764705882353e-05, 'epoch': 58.82}


 15%|█▍        | 2503/17000 [01:41<09:48, 24.64it/s]

{'loss': 5.9923, 'grad_norm': 0.7637717127799988, 'learning_rate': 4.2647058823529415e-05, 'epoch': 73.53}


 18%|█▊        | 3004/17000 [02:01<09:18, 25.07it/s]

{'loss': 5.8457, 'grad_norm': 0.814420223236084, 'learning_rate': 4.11764705882353e-05, 'epoch': 88.24}


 21%|██        | 3505/17000 [02:22<08:42, 25.83it/s]

{'loss': 5.713, 'grad_norm': 1.2423053979873657, 'learning_rate': 3.970588235294117e-05, 'epoch': 102.94}


 24%|██▎       | 4003/17000 [02:42<09:15, 23.42it/s]

{'loss': 5.5965, 'grad_norm': 1.14613676071167, 'learning_rate': 3.8235294117647055e-05, 'epoch': 117.65}


 26%|██▋       | 4504/17000 [03:02<08:42, 23.91it/s]

{'loss': 5.4904, 'grad_norm': 1.183404803276062, 'learning_rate': 3.6764705882352945e-05, 'epoch': 132.35}


 29%|██▉       | 5005/17000 [03:22<07:37, 26.25it/s]

{'loss': 5.4007, 'grad_norm': 1.3056813478469849, 'learning_rate': 3.529411764705883e-05, 'epoch': 147.06}


 32%|███▏      | 5503/17000 [03:43<08:53, 21.55it/s]

{'loss': 5.3206, 'grad_norm': 1.2659324407577515, 'learning_rate': 3.382352941176471e-05, 'epoch': 161.76}


 35%|███▌      | 6002/17000 [04:04<07:48, 23.47it/s]

{'loss': 5.2475, 'grad_norm': 1.4524303674697876, 'learning_rate': 3.235294117647059e-05, 'epoch': 176.47}


 38%|███▊      | 6503/17000 [04:25<07:27, 23.45it/s]

{'loss': 5.1801, 'grad_norm': 1.2668864727020264, 'learning_rate': 3.0882352941176475e-05, 'epoch': 191.18}


 41%|████      | 7004/17000 [04:45<06:50, 24.35it/s]

{'loss': 5.119, 'grad_norm': 1.6782963275909424, 'learning_rate': 2.9411764705882354e-05, 'epoch': 205.88}


 44%|████▍     | 7505/17000 [05:05<06:18, 25.07it/s]

{'loss': 5.063, 'grad_norm': 1.2664724588394165, 'learning_rate': 2.7941176470588236e-05, 'epoch': 220.59}


 47%|████▋     | 8003/17000 [05:25<06:07, 24.50it/s]

{'loss': 5.0117, 'grad_norm': 1.4334924221038818, 'learning_rate': 2.647058823529412e-05, 'epoch': 235.29}


 50%|█████     | 8504/17000 [05:46<05:37, 25.17it/s]

{'loss': 4.9645, 'grad_norm': 1.869936227798462, 'learning_rate': 2.5e-05, 'epoch': 250.0}


 53%|█████▎    | 9002/17000 [06:06<05:25, 24.59it/s]

{'loss': 4.9183, 'grad_norm': 1.66985023021698, 'learning_rate': 2.3529411764705884e-05, 'epoch': 264.71}


 56%|█████▌    | 9503/17000 [06:27<05:44, 21.77it/s]

{'loss': 4.8788, 'grad_norm': 2.1525886058807373, 'learning_rate': 2.2058823529411766e-05, 'epoch': 279.41}


 59%|█████▉    | 10001/17000 [06:47<05:31, 21.09it/s]

{'loss': 4.8395, 'grad_norm': 2.0288259983062744, 'learning_rate': 2.058823529411765e-05, 'epoch': 294.12}


 62%|██████▏   | 10505/17000 [07:07<04:16, 25.34it/s]

{'loss': 4.8048, 'grad_norm': 2.0491950511932373, 'learning_rate': 1.9117647058823528e-05, 'epoch': 308.82}


 65%|██████▍   | 11003/17000 [07:28<04:12, 23.75it/s]

{'loss': 4.7737, 'grad_norm': 1.8106434345245361, 'learning_rate': 1.7647058823529414e-05, 'epoch': 323.53}


 68%|██████▊   | 11504/17000 [07:49<03:40, 24.92it/s]

{'loss': 4.7433, 'grad_norm': 1.756524920463562, 'learning_rate': 1.6176470588235296e-05, 'epoch': 338.24}


 71%|███████   | 12002/17000 [08:09<03:33, 23.41it/s]

{'loss': 4.7159, 'grad_norm': 1.9238861799240112, 'learning_rate': 1.4705882352941177e-05, 'epoch': 352.94}


 74%|███████▎  | 12503/17000 [08:30<02:57, 25.32it/s]

{'loss': 4.6919, 'grad_norm': 1.9287679195404053, 'learning_rate': 1.323529411764706e-05, 'epoch': 367.65}


 76%|███████▋  | 13004/17000 [08:50<02:51, 23.25it/s]

{'loss': 4.669, 'grad_norm': 2.3781075477600098, 'learning_rate': 1.1764705882352942e-05, 'epoch': 382.35}


 79%|███████▉  | 13505/17000 [09:11<02:24, 24.27it/s]

{'loss': 4.6516, 'grad_norm': 1.7483677864074707, 'learning_rate': 1.0294117647058824e-05, 'epoch': 397.06}


 82%|████████▏ | 14003/17000 [09:31<02:01, 24.76it/s]

{'loss': 4.6345, 'grad_norm': 2.188845634460449, 'learning_rate': 8.823529411764707e-06, 'epoch': 411.76}


 85%|████████▌ | 14504/17000 [09:52<01:38, 25.35it/s]

{'loss': 4.6209, 'grad_norm': 1.9429000616073608, 'learning_rate': 7.3529411764705884e-06, 'epoch': 426.47}


 88%|████████▊ | 15002/17000 [10:12<01:17, 25.79it/s]

{'loss': 4.6099, 'grad_norm': 2.1574392318725586, 'learning_rate': 5.882352941176471e-06, 'epoch': 441.18}


 91%|█████████ | 15503/17000 [10:32<00:58, 25.76it/s]

{'loss': 4.5994, 'grad_norm': 1.9453603029251099, 'learning_rate': 4.411764705882353e-06, 'epoch': 455.88}


 94%|█████████▍| 16004/17000 [10:52<00:39, 25.28it/s]

{'loss': 4.593, 'grad_norm': 2.0819239616394043, 'learning_rate': 2.9411764705882355e-06, 'epoch': 470.59}


 97%|█████████▋| 16505/17000 [11:12<00:18, 26.38it/s]

{'loss': 4.5888, 'grad_norm': 1.8762872219085693, 'learning_rate': 1.4705882352941177e-06, 'epoch': 485.29}


100%|██████████| 17000/17000 [11:32<00:00, 24.55it/s]

{'loss': 4.5851, 'grad_norm': 2.2988784313201904, 'learning_rate': 0.0, 'epoch': 500.0}
{'train_runtime': 692.5469, 'train_samples_per_second': 1555.851, 'train_steps_per_second': 24.547, 'train_loss': 5.19054292566636, 'epoch': 500.0}
CPU times: total: 3min 50s
Wall time: 11min 32s





TrainOutput(global_step=17000, training_loss=5.19054292566636, metrics={'train_runtime': 692.5469, 'train_samples_per_second': 1555.851, 'train_steps_per_second': 24.547, 'total_flos': 50486196245760.0, 'train_loss': 5.19054292566636, 'epoch': 500.0})

In [16]:
# Save the trained model next to the tokenizer files. MODEL_DIR is used instead
# of a hard-coded './gpt' so the save location cannot drift from the constant.
trainer.save_model(MODEL_DIR)

In [17]:
from transformers import pipeline

# Pick the device from the CUDA availability check instead of hard-coding
# 'cuda', so generation also works on a CPU-only machine.
generation_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build a text-generation pipeline around the trained model and tokenizer,
# then display the continuation of a sample prompt.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=generation_device)
generator("In loving thee")



[{'generated_text': "In loving thee to the world'st thou art, and thy self.     "}]