In [None]:
import os

from datasets import load_dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer

# Read both JSON-lines sources (the second carries an .xz extension) through the
# "json" loader and expose them as a single "train" split.
dataset = load_dataset(
    "json",
    data_files=[
        "/content/our_mega_over_true_dataset.jsonl",
        "/ds.jsonl.xz",
    ],
    split="train",
)

#--------[prompt_formatter.py]-------

# Tag that opens a message, keyed by the speaker's role. The insertion order
# matters: the values are later registered as special tokens in this order.
ROLE_TOKEN = dict(
    system='[SYS]',
    user='[USR]',
    web='[WEB]',
    bot='[BOT]',
)
S_END = '[/]'  # appended after every message; also registered as the EOS token
S_PAD = '[_]'  # registered as the padding token


def bulk_fmt(ds):
    """Render a batch of conversations as flat training strings.

    Args:
        ds: mapping with a 'messages' entry that holds one list per
            conversation; every message is a mapping with 'role' (a key of
            ROLE_TOKEN) and 'content' (the message text).

    Returns:
        A list with one string per conversation, in input order. Each message
        is rendered as <role tag><content><S_END> and the pieces are
        concatenated without separators.

    Raises:
        KeyError: if a message's role is not a key of ROLE_TOKEN.
    """
    return [
        ''.join(
            ROLE_TOKEN[turn['role']] + turn['content'] + S_END
            for turn in conversation
        )
        for conversation in ds['messages']
    ]

#-------------------------------------

# Start from the stock GPT-2 tokenizer and weights.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Register the chat markers with the tokenizer: S_END becomes the EOS token,
# S_PAD the padding token, and the role tags are added as extra special tokens.
tokenizer.add_special_tokens({
    'eos_token':                 S_END,
    'pad_token':                 S_PAD,
    'additional_special_tokens': list(ROLE_TOKEN.values())
})

# The vocabulary just grew, so the embedding matrix has to be resized to match.
model.resize_token_embeddings(len(tokenizer))

# Checkpoints are written here every 1000 steps (relative to the working directory).
OUTPUT_DIR = 'drive/MyDrive/gpt-oasst'

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    max_seq_length=512,
    tokenizer=tokenizer,
    formatting_func=bulk_fmt,
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        save_strategy = 'steps',
        save_steps = 1000,
        output_dir = OUTPUT_DIR
    ),
)

# Re-running this cell used to restart at step 0 and write over the checkpoints
# already in OUTPUT_DIR ("saved results may be invalid"). Continue from the
# newest checkpoint instead; with none present this is a normal fresh run.
# To deliberately retrain from scratch, point OUTPUT_DIR at an empty directory.
last_checkpoint = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
trainer.train(resume_from_checkpoint=last_checkpoint)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/82815 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,4.472
1000,3.2223
1500,3.1712
2000,3.1766
2500,3.0735
3000,3.0702
3500,2.962
4000,2.9877
4500,3.0484
5000,2.9662


Checkpoint destination directory drive/MyDrive/gpt-oasst/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory drive/MyDrive/gpt-oasst/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory drive/MyDrive/gpt-oasst/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory drive/MyDrive/gpt-oasst/checkpoint-4000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory drive/MyDrive/gpt-oasst/checkpoint-5000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory drive/MyDrive/gpt-oasst/checkpoint-6000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory drive/M

KeyboardInterrupt: 

In [None]:
import os
import shutil

from google.colab import files

# files.download() transfers one file, but a checkpoint is a directory.
# Pack the directory into a single zip archive and download that instead.
CHECKPOINT_DIR = "/content/drive/MyDrive/gpt-oasst/checkpoint-79000"
checkpoint_name = os.path.basename(CHECKPOINT_DIR)

archive_path = shutil.make_archive(
    os.path.join("/content", checkpoint_name),  # ".zip" is appended by make_archive
    "zip",
    root_dir=os.path.dirname(CHECKPOINT_DIR),
    base_dir=checkpoint_name,  # keep the checkpoint folder as the archive's top level
)
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Recursively pack the checkpoint directory into checkpoint-79000.zip, written
# to the current working directory.
!zip -r checkpoint-79000.zip /content/drive/MyDrive/gpt-oasst/checkpoint-79000

  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/ (stored 0%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/generation_config.json (deflated 24%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/model.safetensors (deflated 7%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/merges.txt (deflated 53%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/tokenizer.json (deflated 72%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/vocab.json (deflated 59%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/training_args.bin (deflated 51%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/scheduler.pt (deflated 55%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/trainer_state.json (deflated 80%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/config.json (deflated 51%)
  adding: content/drive/MyDrive/gpt-oasst/checkpoint-79000/tokenizer_config.json (deflated 79%)
  adding: content/drive/MyD

In [None]:
# Print the working directory that relative paths in this notebook resolve against.
!pwd

/content


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Fine-tuned checkpoint to load for inference.
CHECKPOINT_DIR = "/content/drive/MyDrive/gpt-oasst/checkpoint-79000"

model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_DIR)

# Tokenizers have no device placement, so the former `device="cuda:0"` argument
# is dropped. Model and inputs stay on the default device, as before.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

# Prompt in the training format; the trailing open [BOT] tag asks for the bot's turn.
BOT_TAG = '[BOT]'
input_text = "[USR]What time is it now?[/][WEB]now 10:22[/]" + BOT_TAG
input_ids = tokenizer(input_text, return_tensors="pt")

# The end-of-message id comes from the tokenizer stored with the checkpoint, so
# this cell does not depend on S_END from the training cell having been run.
outputs = model.generate(
    **input_ids,
    max_length=512,
    repetition_penalty=10.,
    encoder_repetition_penalty=10.,
    eos_token_id=tokenizer.eos_token_id,
)
yt = tokenizer.decode(outputs[0])

# Keep only the text after the first [BOT] tag. The end marker is removed only
# when it is actually there: if generation stopped on max_length instead, a
# blind [:-3] slice would cut off three characters of the answer.
reply = yt[yt.index(BOT_TAG) + len(BOT_TAG):]
end_marker = tokenizer.eos_token
if end_marker and reply.endswith(end_marker):
    reply = reply[:-len(end_marker)]
print(reply)

Setting `pad_token_id` to `eos_token_id`:50257 for open-end generation.


[USR]What time is it now?[/][WEB]now 10:22[/][BOT]I'm sorry, but the time is not right now.[/]


In [None]:
# Encode the end-of-message marker and show the first id the tokenizer returns
# for it. S_END is defined in the training cell, which must have run in this session.
tokenizer(S_END)['input_ids'][0]

[50257]

In [None]:
!xz --decompress ds.jsonl.xz

xz: ds.jsonl.xz: No such file or directory


In [None]:
!pip install accelerate
!pip install datasets
!pip install trl

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━