In [1]:
!pip install accelerate peft trl datasets bitsandbytes auto-gptq optimum -q

In [2]:
import torch
import torch.nn as nn
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM,GPTQConfig, TrainingArguments
from peft import LoraConfig,prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

dataset = datasets.load_dataset('iamtarun/python_code_instructions_18k_alpaca',split='train')



In [3]:
model_ckpt = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt
)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
quantization_config = GPTQConfig(bits=4,disable_exllama=True,tokenizer=tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    model_ckpt,
    quantization_config=quantization_config,
    device_map='auto')
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


Downloading model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [5]:
lora_config = LoraConfig(r=16,
                        lora_alpha=32,
                        lora_dropout=0.05,
                        bias='none',
                        task_type='CAUSAL_LM',
                        target_modules=[
                                    "q_proj",
                                    "k_proj",
                                    "v_proj",
                                    "o_proj",
                                    "gate_proj",
                                    "up_proj",
                                    "down_proj",
                                        ]
)
model = get_peft_model(model,lora_config)

In [6]:
training_args = TrainingArguments(output_dir='.',
                                 dataloader_drop_last=True,
                                 save_strategy='epoch',
                                 num_train_epochs=1,
                                 logging_steps=100,
                                 max_steps=1000,
                                 per_device_train_batch_size=1,
                                 learning_rate=3e-4,
                                 lr_scheduler_type='cosine',
                                 warmup_steps=100,
                                 fp16=True,
                                 #gradient_accumulation_steps=2,
                                 weight_decay=0.05,
                                 report_to=None,
                                 run_name='finetuning-mistral-7b')

In [7]:
trainer = SFTTrainer(model=model,
                    args=training_args,
                    train_dataset = dataset,
                    dataset_text_field='prompt',
                    max_seq_length=1024,
                    tokenizer=tokenizer,
                    packing=False)

  0%|          | 0/19 [00:00<?, ?ba/s]



In [8]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,0.6874
200,0.6325
300,0.6316
400,0.6257
500,0.625
600,0.6082
700,0.5988
800,0.5616
900,0.5419
1000,0.5493


TrainOutput(global_step=1000, training_loss=0.6062065391540528, metrics={'train_runtime': 2775.3909, 'train_samples_per_second': 0.36, 'train_steps_per_second': 0.36, 'total_flos': 218754446622720.0, 'train_loss': 0.6062065391540528, 'epoch': 0.05})