In [1]:
%%capture
!pip install transformers bitsandbytes datasets sentencepiece accelerate trl peft

In [2]:
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM, BitsAndBytesConfig
from transformers import set_seed as transformers_set_seed, TrainingArguments
from transformers.utils import logging
from trl import SFTTrainer
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, get_peft_model

# Load only the "train" split of the essays-with-instructions dataset.
dataset = load_dataset(
    path="ChristophSchuhmann/essays-with-instructions",
    split="train",
)
dataset



Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['instructions', 'titles', 'essays', 'urls', '__index_level_0__'],
    num_rows: 2064
})

In [3]:
def prompt_formatting(example):
  """Render one dataset row as a single training prompt.

  Args:
    example: Mapping that provides the 'instructions', 'titles' and
      'essays' fields of a row.

  Returns:
    A dict with one key, "text", holding the three fields on separate
    lines, each prefixed with its label.
  """
  return dict(
      text=f"Instructions: {example['instructions']}\nTitle: {example['titles']}\nEssays: {example['essays']}"
  )

# Format row by row and drop every original column, so only the "text"
# field produced by prompt_formatting remains.
train_dataset = dataset.map(
    function=prompt_formatting,
    remove_columns=dataset.column_names,
    batched=False,
)
train_dataset

Map:   0%|          | 0/2064 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 2064
})

In [4]:
model_name = "unsloth/llama-2-7b"

# Pick the 16-bit dtype from the GPU's capabilities instead of hard-coding
# bfloat16: use bfloat16 where it is supported, otherwise fall back to float16.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

# 4-bit NF4 quantization with double quantization; matmuls run in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantized base model; the same dtype is used for torch_dtype so it
# agrees with the bitsandbytes compute dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
    token=None,
)

# Tokenizer pads on the right and accepts sequences up to 4096 tokens.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=4096,
    padding_side="right",
    token=None,
)


config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [5]:
# Use the existing unk token as the padding token and keep EOS distinct from
# padding: collators that mask pad positions in the labels would otherwise
# mask EOS as well. The model config is set from the tokenizer so both always
# agree on a single pad id.
tokenizer.pad_token = tokenizer.unk_token
model.config.pad_token_id = tokenizer.pad_token_id

In [6]:
# Seed the RNGs before the adapter is attached to the model.
transformers_set_seed(132)

# Prepare the k-bit base model for training and enable gradient checkpointing.
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

# LoRA settings: rank 16, alpha 32, 5% dropout, no bias terms, applied to the
# listed attention and MLP projection modules.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

model = get_peft_model(model, lora_config)

In [9]:
# Choose the mixed-precision mode from the GPU's capabilities rather than
# forcing fp16: bf16 when supported, fp16 otherwise. Exactly one flag is set.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Supervised fine-tuning on the "text" column, capped at 1024 tokens per example.
trainer = SFTTrainer(model=model,
                     train_dataset=train_dataset,
                     dataset_text_field="text",
                     max_seq_length=1024,
                     tokenizer=tokenizer,
                     args=TrainingArguments(
                         per_device_train_batch_size=2,
                         gradient_accumulation_steps=1,
                         warmup_steps=10,
                         max_steps=1000,
                         learning_rate=3e-4,
                         bf16=use_bf16,
                         fp16=not use_bf16,
                         logging_steps=100,
                         output_dir=".",
                         optim = "adamw_8bit",
                         weight_decay=0.05,
                         lr_scheduler_type="linear",
                         seed=132
                     )
)

Map:   0%|          | 0/2064 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

Step,Training Loss
100,1.5518
200,1.674
300,1.6857
400,1.676
500,1.6632
600,1.672


