In [1]:
%%capture
!pip install accelerate==0.26.1 peft==0.7.1 bitsandbytes==0.42.0 transformers==4.35.2 trl==0.7.10 datasets==2.16.1 
!pip uninstall wandb -y

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from trl import SFTTrainer
import os



# Data preparation

In [3]:
# Fetch the emotion-classification corpus; the 'split' configuration provides
# the train / validation / test splits used throughout this notebook.
data = load_dataset("dair-ai/emotion", "split")

# Position i holds the class name for integer label i.
idx2label = [
    "sadness",
    "joy",
    "love",
    "anger",
    "fear",
    "surprise",
]

# Peek at one raw training record.
data["train"][0]

Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'text': 'i didnt feel humiliated', 'label': 0}

In [4]:
# Instruction placed in front of every input text when building ChatML prompts.
PROMPT = "Identify the sentiment in the text:"

def prepare_data_chatml_format(sample):
    """Rewrite ``sample['text']`` as one ChatML-style training example.

    The resulting text has this layout:

        <|im_start|>user
        {PROMPT} {text} <|im_end|>
        <|im_start|>assistant
        {label name}<|im_end|>

    Args:
        sample: Mapping with a ``'text'`` string and an integer ``'label'``
            that indexes into ``idx2label``.

    Returns:
        The same ``sample`` object, with ``'text'`` replaced by the formatted
        conversation.
    """
    user_turn = f"<|im_start|>user\n{PROMPT} {sample['text']} <|im_end|>\n"
    assistant_turn = f"<|im_start|>assistant\n{idx2label[sample['label']]}<|im_end|>"
    sample['text'] = user_turn + assistant_turn
    return sample

# Format the train and validation splits identically; the integer label is
# dropped because its name is now embedded in the text.
train_data, valid_data = (
    data[split_name].map(prepare_data_chatml_format, remove_columns=['label'])
    for split_name in ('train', 'validation')
)

# Show one formatted example as a sanity check.
print(train_data[0]['text'])

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

<|im_start|>user
Identify the sentiment in the text: i didnt feel humiliated <|im_end|>
<|im_start|>assistant
sadness<|im_end|>


# Setting Up Tokenizer, Model, BitsAndBytes Config and PEFT Config

In [5]:
model_id = "mistralai/Mistral-7B-v0.1"

# Tokenizer: reuse the EOS token for padding and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# 4-bit NF4 quantisation with double quantisation, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
)

# LoRA adapter hyper-parameters for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Load the quantised base model and let `device_map="auto"` place it.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

# Setting Up Trainer

In [6]:
# Optimisation settings: 2 samples per device x 4 accumulation steps gives 8
# samples per optimizer update on each device, for 6000 updates in total.
training_arguments = TrainingArguments(
    output_dir="./logs",
    max_steps=6000,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    optim="paged_adamw_32bit",
    fp16=True,
    logging_steps=1000,
    save_strategy="epoch",
)

# NOTE(review): `eval_dataset` is supplied but no evaluation strategy is set
# in the arguments above - confirm whether validation is meant to run.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_data,
    eval_dataset=valid_data,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
# Directory that receives the trained adapter; it is read back when merging.
finetuned_model_id = "./mistral-finetuned"

# Run the fine-tuning loop configured on `trainer`.
trainer.train()

# Persist the trained model held by the trainer to `finetuned_model_id`.
trainer.model.save_pretrained(finetuned_model_id)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1000,1.0282
2000,0.9934
3000,0.9356
4000,0.9422
5000,0.8759
6000,0.8708


# Merging the LoRA adapter with the base model

In [8]:
# Reload the base checkpoint in float16 (no quantisation config) so the
# adapter weights can be folded into it.
# `trust_remote_code=True` was dropped: the checkpoint loads with the
# library's built-in model class, so permission to execute repository code is
# unnecessary. The redundant `load_in_8bit=False` was dropped as well.
pretrained_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach the saved adapter from `finetuned_model_id`.
# `from_transformers=True` was removed because it is not a parameter of
# `PeftModel.from_pretrained`.
peft_model = PeftModel.from_pretrained(
    pretrained_model,
    finetuned_model_id,
    device_map="auto",
)

# Fold the adapter into the base weights. From here on `model` refers to the
# merged float16 model rather than the 4-bit model used for training.
model = peft_model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Inference

In [9]:
# Decoding settings for inference; at most 32 new tokens are generated and
# the EOS id doubles as the padding id.
# NOTE(review): `penalty_alpha` is a contrastive-search setting; confirm it
# has any effect while `do_sample=True`.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.5,
    top_k=5,
    penalty_alpha=0.6,
    repetition_penalty=1.2,
    max_new_tokens=32,
    pad_token_id=tokenizer.eos_token_id,
)

def generate_response(prompt):
    """Generate a completion for ``prompt`` and cut it at the end-of-turn marker.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``generation_config``.

    Args:
        prompt: Prompt string to complete.

    Returns:
        The decoded sequence (prompt followed by the completion), truncated
        right after the first ``<|im_end|>`` that appears beyond the prompt.
        If no such marker was generated (for example because the token budget
        ran out first), the whole decoded text is returned.
    """
    end_marker = '<|im_end|>'

    # Follow the model's own device instead of assuming CUDA is available.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, generation_config=generation_config)
    generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Start searching after the prompt so the marker that closes the user
    # turn is not mistaken for the end of the answer.
    marker_pos = generated_response.find(end_marker, len(prompt))
    if marker_pos == -1:
        # `str.index` would raise ValueError here; return what we have.
        return generated_response
    return generated_response[:marker_pos + len(end_marker)]


In [10]:
def prepare_prompt_chatml_format(sample):
    """Add a ``'prompt'`` field holding the inference prompt for ``sample``.

    The user turn uses the same layout as the training template in
    ``prepare_data_chatml_format`` - a space after ``PROMPT`` and another
    before ``<|im_end|>`` - so the model sees the same prefix at inference
    time as it did during fine-tuning. The prompt stops right after the
    assistant header so the model generates the label.

    Args:
        sample: Mapping with a ``'text'`` string.

    Returns:
        The same ``sample`` object with the new ``'prompt'`` key; ``'text'``
        is left unchanged.
    """
    sample['prompt'] = (
        f"<|im_start|>user\n{PROMPT} {sample['text']} <|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    return sample

# Add a 'prompt' column to every test record; the existing columns are kept.
test_data = data['test'].map(prepare_prompt_chatml_format)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [11]:
# Run the merged model on the first test prompt and print the truncated output.
sample_prompt = test_data[0]['prompt']
print(generate_response(sample_prompt))

<|im_start|>user
Identify the sentiment in the text:im feeling rather rotten so im not very ambitious right now<|im_end|>
<|im_start|>assistant
sadness<|im_end|>
