# MIXTRAL 8x7B - Mixture of Experts

### Install Required Packages

In [1]:
# !pip install -i https://pypi.org/simple/ bitsandbytes

In [2]:
# !pip show transformers

In [3]:
# pip install transformers==4.37

In [4]:
# !pip install transformers trl accelerate torch bitsandbytes peft
# !pip install datasets


#### Load HF Dataset

We need a dataset to fine-tune a model, for this example we will be using a subset of the `mosaicml/instruct-v3` dataset.

In [1]:
from datasets import load_dataset
import os

relative_path_to_root = '../../../'
dataset_folder = 'Datasets/qa_synthetic_0_2_0'
data_files = {
    'test': os.path.join(relative_path_to_root, dataset_folder, 'qa_test.csv'),
    'train': os.path.join(relative_path_to_root, dataset_folder, 'qa_train.csv')
}
instruct_tune_dataset = load_dataset("csv", data_files=data_files)

In [6]:
instruct_tune_dataset

DatasetDict({
    test: Dataset({
        features: ['index', 'document', 'generated_text', 'article_number_trudkod', 'C', 'Q', 'A'],
        num_rows: 179
    })
    train: Dataset({
        features: ['index', 'document', 'generated_text', 'article_number_trudkod', 'C', 'Q', 'A'],
        num_rows: 715
    })
})

In [2]:
def create_prompt(sample):
  bos_token = "<s>"
  # original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
  # system_message = "[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM."
  system_message = "[INST]"
  input = sample['C'] + " " + sample['Q']
  response = sample["A"]
  eos_token = "</s>"
  full_prompt = ""
  full_prompt += bos_token
  full_prompt += system_message
  full_prompt += input
  full_prompt += "[/INST]"
  full_prompt += response
  full_prompt += eos_token
  return full_prompt

### Loading the Base Model

Load the model in `4bit`, with double quantization, with `bfloat16` as the compute dtype.

In this case we are using the instruct-tuned model - instead of the base model. For fine-tuning a base model will need a lot more data!

In [1]:
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

In [2]:
import torch
compute_dtype = getattr(torch, "float16")
compute_dtype

torch.float16

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.bfloat16,
   bnb_4bit_use_double_quant=True,
)

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=nf4_config,
    device_map = 'auto'
)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

# tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [14]:
# def generate_response(prompt, model):
#   encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
#   model_inputs = encoded_input.to('cuda')

#   generated_ids = model.generate(**model_inputs,
#                                  max_new_tokens=512,
#                                  do_sample=True,
#                                  pad_token_id=tokenizer.eos_token_id)

#   decoded_output = tokenizer.batch_decode(generated_ids)

#   return decoded_output[0].replace(prompt, "")

### Tokenization

In [7]:
def tokenize_prompts(prompt):
    return tokenizer(create_prompt(prompt))

tokenized_train_dataset = instruct_tune_dataset["train"].map(tokenize_prompts)
tokenized_val_dataset = instruct_tune_dataset["test"].map(tokenize_prompts)

NameError: name 'instruct_tune_dataset' is not defined

In [15]:
sum(map(len, tokenized_train_dataset['input_ids']))
# /len(tokenized_train_dataset['input_ids'])

244129

### Setting up the Training
we will be using the `huggingface` and the `peft` library!

In [16]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# v3 alpha 16 r 64
# v4 alpha 32 r 128
# v5 alpha 64 r 32
peft_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.1,
    r=128,
    bias="none",
        target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    task_type="CAUSAL_LM"
)

In [17]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [18]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [19]:
print_trainable_parameters(model)

trainable params: 109051904 || all params: 23591653376 || trainable%: 0.46224782240544227


In [1]:
def activations_memory(num_layers, seq_len, batch_size, hidden_dim, num_heads, precision=2):
    "Returns amount of GPU VRAM (in GB) required to store intermediate activations for traditional Transformer Encoder block"
    mem_bytes = num_layers * precision * seq_len * batch_size * hidden_dim * (
        16 + 2/precision + 2*num_heads*seq_len/hidden_dim + num_heads*seq_len/(precision*hidden_dim))
    return round(mem_bytes / 10**9, 2)

In [4]:
activations_memory(32, 512, 4, 14366, 32) / 4

9.345

In [20]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MixtralForCausalLM(
      (model): MixtralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MixtralDecoderLayer(
            (self_attn): MixtralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): 

### Hyper-paramters for training
These parameters will depend on how long you want to run training for.
Most important to consider:

`num_train_epochs/max_steps`: How many iterations over the data you want to do, BE CAREFUL, don't try too many, you will over-fit!!!!!

`learning_rate`: Controls the speed of convergence


In [21]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    print(torch.cuda.device_count())
    model.is_parallelizable = True
    model.model_parallel = True

In [22]:
import wandb
# wandb.login(relogin=True)
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mai_d_ar[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [23]:
run = wandb.init(
    project="mixtral-adapter-4bit-v5",
    config={
        "lora_alpha":64,
        "lora_dropout":0.1,
        "r":32,
    }
)

In [24]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
model_save_path = 'Models/Mixtral_finetune_4bit_v5'
full_model_path = os.path.join(relative_path_to_root, model_save_path)
args = TrainingArguments(
  output_dir = full_model_path,
  num_train_epochs=25,
  per_device_train_batch_size = 20,
  warmup_steps = 0.03,
  logging_steps=2,
  save_strategy="epoch",
  evaluation_strategy="epoch",
  learning_rate=2.5e-5,
  bf16=True,
  load_best_model_at_end=True
)
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

Setting up the trainer.

`max_seq_length`: Context window size


In [25]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
  model=model,
  peft_config=peft_config,
  max_seq_length=max_seq_length,
  tokenizer=tokenizer,
  packing=True,
  formatting_func=create_prompt, # this will aplly the create_prompt mapping to all training and test dataset
  args=args,
  train_dataset=instruct_tune_dataset["train"],
  eval_dataset=instruct_tune_dataset["test"],
  callbacks=[early_stopping]
)

In [26]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,0.9328,0.945367
2,0.7802,0.80552
3,0.7152,0.73017
4,0.6545,0.678926
5,0.6337,0.656827
6,0.6082,0.643551
7,0.5823,0.633464
8,0.5537,0.627393
9,0.5473,0.624867
10,0.5327,0.622129




TrainOutput(global_step=312, training_loss=0.643589888436672, metrics={'train_runtime': 5062.8557, 'train_samples_per_second': 2.36, 'train_steps_per_second': 0.119, 'total_flos': 8.895470118528614e+17, 'train_loss': 0.643589888436672, 'epoch': 13.0})

In [27]:
trainer.save_model(os.path.join(full_model_path, 'final_model'))