In [2]:
import os 
import torch 
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
os.environ["WANDB_NOTEBOOK_NAME"] = "SFT_mistral"

print(torch.cuda.device_count())
print(torch.cuda.get_device_name())

print(torch.cuda.mem_get_info())
print(torch.cuda.current_device())

2
Tesla V100-PCIE-32GB
(33337507840, 34079899648)
0


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import wandb
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login

  from .autonotebook import tqdm as notebook_tqdm
2024-02-08 19:44:54.651414: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-08 19:44:54.651466: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-08 19:44:54.653152: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-08 19:44:54.664640: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Use WeightsAndBiases 
to monitor system metrics and training process on the website

In [3]:
wandb.login()

wandb_project = "mistral-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

run = wandb.init(project='Fine tuning mistral 7B', job_type="training", anonymous="allow")

[34m[1mwandb[0m: Currently logged in as: [33mtrevahok[0m. Use [1m`wandb login --relogin`[0m to force relogin


##  Load dataset and model 
n22/Mistral-7B-Instruct-v0.1-sharded if you want to train on Colab 

In [4]:
base_model = "mistralai/Mistral-7B-v0.1" #
dataset_name, new_model = "philschmid/meta-shepherd-human-data", "trevahok/mistral_SFT_quantized"


In [5]:
dataset = load_dataset(dataset_name, split="train")

In [6]:
dataset['text'][:3]

['### Question: Jolene and Phil have four children, each with the same birthday.  They gave birth to their first child exactly 15 years ago.  They gave birth to their second child exactly one year after the birth of their first child.  They gave birth to their third child on the fourth birthday of their second child. Two years after the birth of their third child, they gave birth to their fourth child.  How old, in years, is their fourth child?\n          \n### Answer: The first child was born 15 years ago. The second child was born 1 year after the first child. The third child was born on the second child\'s fourth birthday. The fourth child was born 2 years after the third child. So the fourth child is 15 + 1 + 4 + 2 = 22 years old. The answer is 22.\n\n\n\n### Feedback: Firstly, The math is off in the previous to last sentence, since the ages of the children should be subtracted, not summed. 15 - 1 - 4 -2 = 8 is the right answer. Secondly, "So the fourth child is 15 + 1 + 4 + 2 = 22

## Quantization configuration

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type= "nf4", 
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)

## Load Model and tokenizer

In [14]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map= "auto",
    max_memory = { 0: '30.1GiB', 1: '30.1GiB', }
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()


Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.98s/it]


In [15]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token = True


## Add PEFT with LoRa for Training

In [16]:

model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
        r=32,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )
model = get_peft_model(model, peft_config)

## Training Arguments
Hyperparameters should beadjusted based on the hardware you using. 

In [17]:
training_arguments = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= 2,
    per_device_train_batch_size= 16, 
    auto_find_batch_size=True,
    gradient_accumulation_steps= 2,
    optim = "paged_adamw_8bit",
    save_steps= 200,
    logging_steps= 30,
    learning_rate= 2e-4,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    warmup_ratio= 0.3,
    group_by_length= True,
    lr_scheduler_type= "constant",
    report_to="wandb"
)
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

Map: 100%|██████████| 1317/1317 [00:00<00:00, 2704.74 examples/s]
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mtrevahok[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
30,1.474
60,1.2237


TrainOutput(global_step=82, training_loss=1.3039202573822766, metrics={'train_runtime': 2838.9806, 'train_samples_per_second': 0.928, 'train_steps_per_second': 0.029, 'total_flos': 3.297225324896256e+16, 'train_loss': 1.3039202573822766, 'epoch': 1.98})

In [19]:

trainer.model.save_pretrained("SFT_Mistral_with_EOS")
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Lin

In [59]:

def generate_feedback( question, answer):
    runtimeFlag = "cuda:0"
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.add_bos_token = True
    tokenizer.padding_side = "left"

    prompt =f"### Question: {question} \n### Answer: { answer} \n### Feedback: " 

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    streamer = TextStreamer(tokenizer, skip_prompt=False )

    output = trainer.model.generate(**inputs, streamer=streamer, max_new_tokens=200)

    return output

In [68]:
generate_feedback("What is 2+2 in an imaginary world where 2+2=99?", "99")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> ### Question: What is 2+2 in an imaginary world where 2+2=99? 
### Answer: 99 
### Feedback: 2+2=4 in the real world. The answer is incorrect because it says 2+2=99.</s>


tensor([[    1,   774, 22478, 28747,  1824,   349, 28705, 28750, 28806, 28750,
           297,   396, 28374,  1526,   970, 28705, 28750, 28806, 28750, 28746,
         28774, 28774, 28804, 28705,    13, 27332, 26307, 28747, 28705, 28774,
         28774, 28705,    13, 27332,  4615,   286,  1435, 28747, 28705, 28750,
         28806, 28750, 28746, 28781,   297,   272,  1353,  1526, 28723,   415,
          4372,   349, 16390,  1096,   378,  2627, 28705, 28750, 28806, 28750,
         28746, 28774, 28774, 28723,     2]], device='cuda:0')

 ## Compare against base model ( Mistral 7B)

In [48]:

vanilla_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # quantization_config=bnb_config,
    # device_map= {"" : 1} ,
    device_map= "auto",
    max_memory = { 0: '30.1GiB', 1: '30.1GiB', }
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.56s/it]


In [64]:
def generate_feedback_vanilla( question, answer):
    vanilla_model.eval()
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.add_bos_token = True
    tokenizer.padding_side = "right"
    runtimeFlag = "cuda:0"

    prompt = f"### Question: {question} \n### Answer: { answer} \n### Feedback: "

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    output = vanilla_model.generate(**inputs, streamer=streamer, max_new_tokens=200)

    return output



In [67]:
generate_feedback_vanilla("What 2+2 in an imaginary world where 2+2 = 99?", "99")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
### 
###


tensor([[    1,   774, 22478, 28747,  1824, 28705, 28750, 28806, 28750,   297,
           396, 28374,  1526,   970, 28705, 28750, 28806, 28750,   327, 28705,
         28774, 28774, 28804, 28705,    13, 27332, 26307, 28747, 28705, 28774,
         28774, 28705,    13, 27332,  4615,   286,  1435, 28747, 28705,    13,
         27332, 28705,    13, 27332, 28705,    13, 27332, 28705,    13, 27332,
         28705,    13, 27332, 28705,    13, 27332, 28705,    13, 27332, 28705,
            13, 27332, 28705,    13, 27332, 28705,    13, 27332, 28705,    13,
         27332, 28705,    13, 27332, 28705,    13, 27332, 28705,    13, 27332,
         28705,    13, 27332, 28705,    13, 27332, 28705,    13, 27332, 28705,
            13, 27332, 28705,    13, 27332, 28705,    13, 27332, 28705,    13,
         27332, 28705,    13, 27332, 28705,    13, 27332, 28705,    13, 27332,
         28705,    13, 27332, 28705,    13, 27332, 28705,    13, 27332, 28705,
            13, 27332, 28705,    13, 27332, 28705,  

In [None]:
generate_feedback_vanilla("What color is the sun?", "Blue")

In [None]:
generate_feedback("What color is the sun?", "Blue")

## Clean up RAM and GPU 

In [1]:
import torch 
import gc 

# del model, tokenizer 

torch.cuda.empty_cache()

gc.collect()

0