# Fine-tune Mixtral 8x7B
This notebook is run on an A100 (40 GB) GPU.


In [None]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy


In [None]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

In [None]:
# Hub id of the base checkpoint, shared by the tokenizer and the model.
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Passing `load_in_4bit=True` straight to `from_pretrained` is deprecated;
# the 4-bit request is expressed through an explicit quantization config instead.
# All other quantization options are left at their defaults.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float16,
    device_map="auto",
    # attn_implementation="flash_attention_2",  # Optional: flash attention on a local GPU with the required libraries installed
)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [None]:
# The tokenizer ships without a pad token, so one has to be chosen.
# It must not be a token that occurs in the training text: the causal-LM data
# collator used for training replaces every label equal to the pad id with -100,
# so a real token such as "!" (or EOS) would silently be excluded from the loss.
# `<unk>` is used instead. NOTE(review): assumes the tokenizer defines `unk_token`.
tokenizer.pad_token = tokenizer.unk_token

In [None]:
# --- Tokenisation ---
# Maximum sequence length in tokens; our dataset has short text.
CUTOFF_LEN = 256

# --- LoRA hyperparameters ---
LORA_R = 8
LORA_DROPOUT = 0.1
# Alpha is kept at twice the rank.
LORA_ALPHA = LORA_R * 2

In [None]:
# LoRA adapter settings, collected in one place before building the config.
# Only the "expert" layers (w1, w2, w3) are targeted for training.
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["w1", "w2", "w3"],
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)
config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters described by `config`.
# NOTE: this rebinds `model`, so the cell should be run once per kernel session.
model = get_peft_model(model, config)

In [None]:
def print_trainable_parameters(m):
    """Print the trainable and total parameter counts of a model.

    Args:
        m: any object exposing ``parameters()`` that yields items with a
            ``numel()`` method and a ``requires_grad`` attribute.

    Prints one line with the trainable count, the total count, and the
    trainable share as a percentage. A model with no parameters reports
    0.0% instead of raising ZeroDivisionError.
    """
    # NOTE(review): for quantized weights numel() may reflect the packed storage
    # size rather than the logical parameter count — confirm before quoting totals.
    trainable_params = 0
    all_params = 0
    # Single pass over the parameters instead of walking them twice.
    for p in m.parameters():
        count = p.numel()
        all_params += count
        if p.requires_grad:
            trainable_params += count
    percent = 100 * trainable_params / all_params if all_params else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {percent}")

# Report how many of the adapter-wrapped model's parameters will be trained.
print_trainable_parameters(model)

trainable params: 113246208 || all params: 23595847680 || trainable%: 0.4799412571898752


In [None]:
# A good small dataset for a quick test run.
DATASET_ID = "harpreetsahota/modern-to-shakesperean-translation"
dataset = load_dataset(DATASET_ID)
print("dataset", dataset)

# Only the training split is used; no evaluation data.
train_data = dataset["train"]

dataset DatasetDict({
    train: Dataset({
        features: ['modern', 'shakespearean'],
        num_rows: 274
    })
})


In [None]:
def generate_prompt(user_query, sep="\n\n### "):
    """Build one Mixtral-Instruct style training string from a dataset row.

    Args:
        user_query: mapping with a "modern" (source text) and a
            "shakespearean" (target text) string.
        sep: unused; kept only so callers that pass it keep working.

    Returns:
        The string "[INST]<instruction>\\n<modern>[/INST]<shakespearean>".
    """
    # The "<s>" / "</s>" markers are deliberately NOT written into the text.
    # `tokenize` already appends `tokenizer.eos_token`, and the tokenizer call
    # adds BOS itself with its default settings (NOTE(review): confirm
    # add_bos_token is enabled for this tokenizer). Hard-coding the markers here
    # as well yields doubled BOS/EOS tokens in every training example.
    sys_msg = "Translate the given text to Shakespearean style."
    return "[INST]" + sys_msg + "\n" + user_query["modern"] + "[/INST]" + user_query["shakespearean"]

In [None]:
def tokenize(prompt):
    """Tokenise one prompt string to exactly CUTOFF_LEN tokens.

    The tokenizer's EOS token is appended to the text before encoding.
    Longer sequences are truncated to CUTOFF_LEN and shorter ones are padded
    up to it. Returns the tokenizer's output unchanged.
    """
    text_with_eos = prompt + tokenizer.eos_token
    encoding = tokenizer(
        text_with_eos,
        max_length=CUTOFF_LEN,
        truncation=True,
        padding="max_length",
    )
    return encoding

In [None]:
# Fixed seed so the shuffled example order is reproducible across kernel restarts.
SHUFFLE_SEED = 42

# Tokenise every training example and drop the raw text columns.
# The pipeline starts from `dataset["train"]` rather than from `train_data`, so
# re-running this cell is safe: the previous form failed on a second run because
# `train_data` no longer had the "modern" / "shakespearean" columns.
train_data = (
    dataset["train"]
    .shuffle(seed=SHUFFLE_SEED)
    .map(
        lambda example: tokenize(generate_prompt(example)),
        remove_columns=["modern", "shakespearean"],
    )
)

Map:   0%|          | 0/274 [00:00<?, ? examples/s]

In [None]:
# Optimisation, logging and checkpointing settings for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="mixtral-moe-lora-instruct-shapeskeare",
    num_train_epochs=6,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    optim="adamw_torch",
    logging_steps=2,
    save_strategy="epoch",  # one checkpoint per epoch
)

# Collator for language modelling with masked-LM mode turned off (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    data_collator=causal_lm_collator,
)

# Turn off the model's `use_cache` option before training starts.
model.config.use_cache = False


In [None]:
# Start the fine-tuning run configured on `trainer`.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
2,6.5751
4,5.3463
6,3.9095
8,3.36
10,2.6038
12,2.1995
14,2.0692
16,1.8696
18,1.9141
20,1.8137
