In [1]:
%%capture
!pip install accelerate==0.26.1 peft==0.7.1 transformers==4.35.2 trl==0.7.10 datasets==2.16.1 evaluate
!pip uninstall wandb -y

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from trl import SFTTrainer
import os

2024-04-09 13:03:49.667212: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-09 13:03:49.667333: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-09 13:03:49.795037: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data preparation

In [3]:
# Emotion corpus; the 'split' configuration provides train / validation / test splits.
data = load_dataset('dair-ai/emotion', 'split')

# Class names, indexed by the integer stored in each sample's 'label' field,
# together with the reverse (name -> integer) lookup.
idx2label = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
label2idx = dict(zip(idx2label, range(len(idx2label))))

# Peek at one raw training example.
data['train'][0]

Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'text': 'i didnt feel humiliated', 'label': 0}

In [4]:
# Instruction placed in front of every input text (note the trailing space).
PROMPT = "Identify the sentiment in the text: "


def prepare_data_chatml_format(sample):
    """Rewrite ``sample['text']`` as a two-turn ChatML training string.

    The resulting text has this layout (``PROMPT`` already ends with a space,
    so two spaces precede the query):

        <|im_start|>user
        {PROMPT} {query} <|im_end|>
        <|im_start|>assistant
        {label name}<|im_end|>

    Args:
        sample: a dataset row with a 'text' string and an integer 'label'.

    Returns:
        The same row, with 'text' replaced by the formatted string.
    """
    label_name = idx2label[sample['label']]
    user_turn = f"<|im_start|>user\n{PROMPT} {sample['text']} <|im_end|>\n"
    assistant_turn = f"<|im_start|>assistant\n{label_name}<|im_end|>"
    sample['text'] = user_turn + assistant_turn
    return sample


# The label is baked into the text, so the separate 'label' column is dropped.
train_data = data['train'].map(prepare_data_chatml_format, remove_columns=['label'])
valid_data = data['validation'].map(prepare_data_chatml_format, remove_columns=['label'])
print(train_data[0]['text'])

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

<|im_start|>user
Identify the sentiment in the text:  i didnt feel humiliated <|im_end|>
<|im_start|>assistant
sadness<|im_end|>


# Setting Up Tokenizer, Model and PEFT (LoRA) Config

In [5]:
model_id = "openai-community/gpt2"

# Base causal language model that the LoRA adapter will be trained on.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
model.config.pretraining_tp = 1
model.config.use_cache = False

# Matching tokenizer; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter hyper-parameters for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Setting Up Trainer

In [6]:
# Optimisation schedule: 8 samples per device step, gradients accumulated over
# 4 steps, evaluation and checkpointing once per epoch, and the best checkpoint
# reloaded when training finishes.
training_arguments = TrainingArguments(
    output_dir="./logs",
    num_train_epochs=5,
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Supervised fine-tuning on the 'text' column, one example per sequence (no packing).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_data,
    eval_dataset=valid_data,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
)



Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
# Run the fine-tuning loop.
trainer.train()

# Save the trained model held by the trainer to a local directory,
# from which it is reloaded for merging later on.
finetuned_model_id = "./lora_finetuned_model"
trainer.model.save_pretrained(finetuned_model_id)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.5138,1.255023
2,1.3256,1.239192
3,1.3054,1.231426
4,1.2913,1.22884
5,1.2846,1.22691


# Merging the LoRA with the base model

In [8]:
# Load a fresh copy of the base model. `trust_remote_code` is intentionally not
# set: the stock GPT-2 checkpoint needs no custom code, so there is no reason to
# grant permission to execute code shipped with a model repository.
pretrained_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)

# Attach the adapter saved after training. Only documented arguments are passed
# (the former `from_transformers=True` is not a `PeftModel.from_pretrained` option).
peft_model = PeftModel.from_pretrained(
    pretrained_model,
    finetuned_model_id,
    device_map="auto",
)

# Fold the LoRA weights into the base weights and drop the PEFT wrapper, so that
# `model` is a plain causal LM for the inference cells.
model = peft_model.merge_and_unload()

# Inference

In [9]:
generation_config = GenerationConfig(
    penalty_alpha=0.6, do_sample=True, 
    top_k=5,temperature=0.5,repetition_penalty=1.2, 
    max_new_tokens=32, pad_token_id=tokenizer.eos_token_id
)

def generate_response(prompt):
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs, generation_config=generation_config)
        generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        end_idx = generated_response.index('<|im_end|>', len(prompt)) + len('<|im_end|>')
        return generated_response[:end_idx]
    except:
        return ""


In [10]:
def prepare_prompt_chatml_format(sample):
    """Add a 'prompt' field holding the ChatML inference prompt for ``sample['text']``.

    The user turn must match the training template produced by
    ``prepare_data_chatml_format`` character for character, including the space
    after ``PROMPT`` and the space before ``<|im_end|>``. The previous version
    omitted both, so the model was queried with a format it was never trained on.
    The prompt stops right after the assistant header so the model generates the
    label.

    Args:
        sample: a dataset row with a 'text' string.

    Returns:
        The same row with an added 'prompt' string.
    """
    sample['prompt'] = f"<|im_start|>user\n{PROMPT} {sample['text']} <|im_end|>\n<|im_start|>assistant\n"
    return sample


test_data = data['test'].map(prepare_prompt_chatml_format)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [11]:
# Smoke test: generate the answer for the first test prompt and show it.
sample_prompt = test_data[0]['prompt']
sample_response = generate_response(sample_prompt)
print(sample_response)




# Evaluation

In [12]:
from tqdm.notebook import trange
from evaluate import load as load_metric

In [13]:
# Generate one raw response per test prompt, with a progress bar.
preds = []
for i in trange(len(test_data)):
    preds.append(generate_response(test_data[i]['prompt']))

# Gold integer labels of the test split.
references = data['test']['label']

  0%|          | 0/2000 [00:00<?, ?it/s]

In [14]:
def _extract_label(pred):
    """Return the stripped text between the assistant header and the next ``<|im_end|>``.

    Returns ``""`` when either marker is missing, e.g. for the empty string that
    ``generate_response`` returns when generation did not terminate. Markers are
    checked explicitly rather than through a bare ``except``, so unrelated errors
    are no longer silently converted into empty predictions.
    """
    _, header, tail = pred.partition('<|im_start|>assistant')
    if not header:
        return ""
    answer, terminator, _ = tail.partition('<|im_end|>')
    if not terminator:
        return ""
    return answer.strip()


# Label names parsed from the raw generations ("" when parsing failed).
predicted_labels = [_extract_label(pred) for pred in preds]

# Map names to integer ids; anything that is not a known label becomes -1.
predictions = [label2idx.get(label, -1) for label in predicted_labels]

In [15]:
f1_metric = load_metric('f1')

# Score exactly the six real classes, in idx2label order. Without an explicit
# `labels` list the metric scores whatever ids occur in the data: the former
# `scores['f1'][1:]` slice was only correct when at least one prediction was -1,
# and otherwise dropped 'sadness' and shifted every score onto the wrong label.
label_ids = list(range(len(idx2label)))
scores = f1_metric.compute(
    references=references, predictions=predictions, labels=label_ids, average=None
)
for label, score in zip(idx2label, scores['f1']):
    print(label, score)

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

sadness 0.0
joy 0.0
love 0.0
anger 0.0
fear 0.0
surprise 0.0


In [16]:
# Single summary number: F1 with the 'weighted' averaging mode.
scores = f1_metric.compute(
    predictions=predictions,
    references=references,
    average='weighted',
)
scores

{'f1': 0.0}