pip install -q git+https://github.com/huggingface/trl

In [1]:
import os

# Silence the tokenizers fork/parallelism warning before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Hugging Face Hub identifiers -------------------------------------------
# Base checkpoint that gets fine-tuned
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# Instruction dataset used for training and evaluation
dataset_name = "StarkWizard/cairo-instruct"
# Name under which the fine-tuned adapter is written and pushed
new_model = "StarkWizard/Mistral-7b-instruct-cairo-PEFT"
# NOTE(review): not referenced by any cell visible in this notebook section
hub_name = "StarkWizard/Mistral-7b-instruct-cairo-instruct"

# --- Training schedule ------------------------------------------------------
# Number of passes over the training split
nb_epochs = 3
# Step budget to tweak when tuning the model.
# NOTE(review): the visible TrainingArguments call does not receive this value.
max_steps = 1000

In [2]:
import wandb
import random

# Open a new Weights & Biases run that tracks this notebook.
wandb.init(
    project="mistral-cairo",  # W&B project the run is logged under
    config=dict(epochs=nb_epochs),  # hyperparameters / run metadata to record
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpierre-emmanuel-chaut[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
import os
 
# Tokenizer of the base checkpoint. The EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): left padding is kept as in the original; TRL's SFTTrainer
# reportedly recommends right padding for half-precision training — confirm
# that left padding is intentional.
tokenizer.padding_side ="left"

# 4-bit NF4 quantization with bfloat16 compute and no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the base model quantized on GPU 0.
# The 4-bit switch is carried by `bnb_config`; a separate `load_in_4bit=True`
# keyword is redundant, and newer transformers releases reject passing it
# together with `quantization_config`, so it is not repeated here.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    device_map={"": 0},  # place the whole model on device 0
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [4]:
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

# Adjust the loaded model's configuration and prepare it for k-bit training.
# The generation cache is switched off (presumably because it is not needed
# while training with gradient checkpointing).
model.config.use_cache=False
# NOTE(review): `pretraining_tp` looks like a Llama-style config field — verify
# that the Mistral implementation reads it.
model.config.pretraining_tp=1
# NOTE(review): the Mistral config's attention-window field appears to be named
# `sliding_window`; nothing in this notebook reads `window`, so this assignment
# may have no effect — confirm the intent before renaming.
model.config.window = 256 
# Enable gradient checkpointing on the model.
model.gradient_checkpointing_enable()
# Rebind `model` to the result of peft's k-bit training preparation helper.
model = prepare_model_for_kbit_training(model)


Loading Dataset

In [5]:
from datasets import load_dataset

# Load the dataset once and pick both splits from the returned mapping.
# The original issued two `force_redownload` calls, which downloaded and
# rebuilt the whole dataset twice (see the duplicated progress output).
# `verification_mode="no_checks"` is the replacement for the deprecated
# `ignore_verifications=True` flag.
dataset_splits = load_dataset(
    dataset_name,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
dataset_train = dataset_splits["train"]
dataset_test = dataset_splits["eval"]





Downloading readme:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/291k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3227 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/291k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3227 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [6]:



# LoRA adapter settings: rank 64, alpha 16, 10% dropout, no bias training,
# applied to the attention projections and the MLP gate projection.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
    ],
    inference_mode=False,
)

# Wrap the prepared base model with the LoRA adapters.
model = get_peft_model(model, peft_config)

# Trainer hyperparameters.
training_arguments = TrainingArguments(
    output_dir=new_model,
    # With push_to_hub=True and no hub_model_id, the Trainer derives the Hub
    # repository from the last path component of `output_dir`, which drops the
    # "StarkWizard/" namespace. Naming the repo explicitly keeps the Trainer's
    # own uploads in the same repository the following cell pushes to.
    hub_model_id=new_model,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch of 8 sequences per device
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    # No eval_steps is given; the run log shows an evaluation at every logging step.
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    num_train_epochs=nb_epochs,
    group_by_length=True,
    fp16=False,
    report_to="wandb",
    push_to_hub=True,
)

# Supervised fine-tuning trainer over the "text" column of both splits.
# NOTE(review): `model` is already wrapped by get_peft_model above, so passing
# `peft_config` again looks redundant — confirm against the TRL version in use
# before removing either one.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=None,
    neftune_noise_alpha=5,
)




Map:   0%|          | 0/3227 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



In [7]:

# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Upload the trained model held by the trainer to the `new_model` Hub repository
# (the cell output shows an adapter_model.bin upload).
trainer.model.push_to_hub(new_model)
# Upload the tokenizer to the same repository; as the last expression of the
# cell, its return value is what the notebook displays.
tokenizer.push_to_hub(new_model)

  0%|          | 0/1209 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.9176, 'learning_rate': 1.993114384372975e-05, 'epoch': 0.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.3322510719299316, 'eval_runtime': 1.1187, 'eval_samples_per_second': 4.469, 'eval_steps_per_second': 0.894, 'epoch': 0.12}
{'loss': 2.5813, 'learning_rate': 1.9694339696585942e-05, 'epoch': 0.25}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.9762418270111084, 'eval_runtime': 1.0882, 'eval_samples_per_second': 4.595, 'eval_steps_per_second': 0.919, 'epoch': 0.25}
{'loss': 2.2759, 'learning_rate': 1.929276146260306e-05, 'epoch': 0.37}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.43976092338562, 'eval_runtime': 1.0143, 'eval_samples_per_second': 4.93, 'eval_steps_per_second': 0.986, 'epoch': 0.37}
{'loss': 1.9833, 'learning_rate': 1.8733234741963262e-05, 'epoch': 0.5}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.2481772899627686, 'eval_runtime': 1.0866, 'eval_samples_per_second': 4.602, 'eval_steps_per_second': 0.92, 'epoch': 0.5}
{'loss': 1.8403, 'learning_rate': 1.802526977541951e-05, 'epoch': 0.62}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.107473850250244, 'eval_runtime': 1.0261, 'eval_samples_per_second': 4.873, 'eval_steps_per_second': 0.975, 'epoch': 0.62}
{'loss': 1.6999, 'learning_rate': 1.7180899799326968e-05, 'epoch': 0.74}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.0070340633392334, 'eval_runtime': 1.0996, 'eval_samples_per_second': 4.547, 'eval_steps_per_second': 0.909, 'epoch': 0.74}
{'loss': 1.5187, 'learning_rate': 1.6214476517475636e-05, 'epoch': 0.87}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8367398977279663, 'eval_runtime': 1.0681, 'eval_samples_per_second': 4.681, 'eval_steps_per_second': 0.936, 'epoch': 0.87}
{'loss': 1.3952, 'learning_rate': 1.5142426166076644e-05, 'epoch': 0.99}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7963680028915405, 'eval_runtime': 0.9965, 'eval_samples_per_second': 5.018, 'eval_steps_per_second': 1.004, 'epoch': 0.99}




{'loss': 1.3404, 'learning_rate': 1.3982970318050471e-05, 'epoch': 1.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7200286388397217, 'eval_runtime': 1.0305, 'eval_samples_per_second': 4.852, 'eval_steps_per_second': 0.97, 'epoch': 1.12}
{'loss': 1.2952, 'learning_rate': 1.2755816172089164e-05, 'epoch': 1.24}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6555883884429932, 'eval_runtime': 1.0985, 'eval_samples_per_second': 4.551, 'eval_steps_per_second': 0.91, 'epoch': 1.24}
{'loss': 1.292, 'learning_rate': 1.1481821590629984e-05, 'epoch': 1.36}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.660041093826294, 'eval_runtime': 1.0782, 'eval_samples_per_second': 4.637, 'eval_steps_per_second': 0.927, 'epoch': 1.36}
{'loss': 1.2547, 'learning_rate': 1.0182640580069249e-05, 'epoch': 1.49}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.598638653755188, 'eval_runtime': 1.0891, 'eval_samples_per_second': 4.591, 'eval_steps_per_second': 0.918, 'epoch': 1.49}
{'loss': 1.2256, 'learning_rate': 8.880355238966923e-06, 'epoch': 1.61}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5672248601913452, 'eval_runtime': 1.0651, 'eval_samples_per_second': 4.694, 'eval_steps_per_second': 0.939, 'epoch': 1.61}
{'loss': 1.2464, 'learning_rate': 7.597100429995461e-06, 'epoch': 1.73}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5793023109436035, 'eval_runtime': 1.0384, 'eval_samples_per_second': 4.815, 'eval_steps_per_second': 0.963, 'epoch': 1.73}
{'loss': 1.216, 'learning_rate': 6.354687555060303e-06, 'epoch': 1.86}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5899643898010254, 'eval_runtime': 1.0393, 'eval_samples_per_second': 4.811, 'eval_steps_per_second': 0.962, 'epoch': 1.86}
{'loss': 1.1889, 'learning_rate': 5.174233828262855e-06, 'epoch': 1.98}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.569914698600769, 'eval_runtime': 1.0876, 'eval_samples_per_second': 4.597, 'eval_steps_per_second': 0.919, 'epoch': 1.98}




{'loss': 1.1494, 'learning_rate': 4.075803347930245e-06, 'epoch': 2.11}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5493152141571045, 'eval_runtime': 1.0423, 'eval_samples_per_second': 4.797, 'eval_steps_per_second': 0.959, 'epoch': 2.11}
{'loss': 1.1279, 'learning_rate': 3.0780660683881625e-06, 'epoch': 2.23}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5586141347885132, 'eval_runtime': 1.1004, 'eval_samples_per_second': 4.544, 'eval_steps_per_second': 0.909, 'epoch': 2.23}
{'loss': 1.1415, 'learning_rate': 2.1979804679123108e-06, 'epoch': 2.35}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.548916220664978, 'eval_runtime': 1.0845, 'eval_samples_per_second': 4.611, 'eval_steps_per_second': 0.922, 'epoch': 2.35}
{'loss': 1.1566, 'learning_rate': 1.4505053065314612e-06, 'epoch': 2.48}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5443440675735474, 'eval_runtime': 0.9798, 'eval_samples_per_second': 5.103, 'eval_steps_per_second': 1.021, 'epoch': 2.48}
{'loss': 1.143, 'learning_rate': 8.483453729167623e-07, 'epoch': 2.6}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5394967794418335, 'eval_runtime': 1.0381, 'eval_samples_per_second': 4.817, 'eval_steps_per_second': 0.963, 'epoch': 2.6}
{'loss': 1.1344, 'learning_rate': 4.0173554188154273e-07, 'epoch': 2.73}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5391888618469238, 'eval_runtime': 1.0463, 'eval_samples_per_second': 4.779, 'eval_steps_per_second': 0.956, 'epoch': 2.73}
{'loss': 1.1182, 'learning_rate': 1.182668128528286e-07, 'epoch': 2.85}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.538561224937439, 'eval_runtime': 1.0593, 'eval_samples_per_second': 4.72, 'eval_steps_per_second': 0.944, 'epoch': 2.85}
{'loss': 1.1438, 'learning_rate': 2.7572861278046813e-09, 'epoch': 2.97}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.538438320159912, 'eval_runtime': 1.0759, 'eval_samples_per_second': 4.647, 'eval_steps_per_second': 0.929, 'epoch': 2.97}
{'train_runtime': 11391.8564, 'train_samples_per_second': 0.85, 'train_steps_per_second': 0.106, 'train_loss': 1.5134143095750074, 'epoch': 3.0}


adapter_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/StarkWizard/Mistral-7b-instruct-cairo-PEFT/commit/e6dfe392d076aea94bf1397f1f3924deb12fc343', commit_message='Upload tokenizer', commit_description='', oid='e6dfe392d076aea94bf1397f1f3924deb12fc343', pr_url=None, pr_revision=None, pr_num=None)