In [None]:
# transformers==4.42.4
# bitsandbytes==0.43.1
# accelerate==0.32.1
# peft==0.11.1

In [1]:
import os
import copy
from dataclasses import dataclass
import re
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from datasets import Dataset
from scipy.special import softmax
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BitsAndBytesConfig,
    LlamaPreTrainedModel,
    LlamaModel,
    AutoTokenizer,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import wandb
# Authenticate with Weights & Biases using the API key read from the
# WANDB_KEY environment variable (a missing variable raises KeyError here).
wandb.login(key=os.environ['WANDB_KEY'])

# Open the tracked run; the training arguments further down use
# report_to="wandb", so Trainer metrics are logged into this run.
wandb.init(
    project="lmsys",
    name='sft convo all ultra feedback',
    notes="same params, enlarged max length",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msolostringer[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jupyter/.netrc


### Configurations

In [3]:
# Paths, checkpoint name and column layout shared by the rest of the notebook.
TRAIN_CSV = "data/train.csv"
model_path = "unsloth/llama-3-8b-Instruct-bnb-4bit"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One-hot outcome columns. LabelEncoder sorts its classes and these names are
# already in sorted order, so the class ids are 0 (a), 1 (b), 2 (tie).
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
columns_to_vectorize = ["prompt", "response_a", "response_b"]

train = pd.read_csv(TRAIN_CSV)

# Fit the encoder on the fixed class list rather than on the data: the ids then
# stay 0/1/2 even when the frame (e.g. a debugging subset) lacks one outcome,
# instead of silently shifting away from the LABEL_IDS order.
label_encoder = LabelEncoder().fit(target_columns)
# idxmax picks the winning column name per row; transform maps it to its id.
train['label'] = label_encoder.transform(train[target_columns].idxmax(axis=1))
train = train[columns_to_vectorize + ['label']]

In [4]:
from tqdm import tqdm

### Tokenizer, dataset preparation, and metrics

In [5]:
# Load the tokenizer stored with the checkpoint named by `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.add_eos_token = True
tokenizer.padding_side = 'right'

# Vocabulary ids of the three answer words. Only the FIRST token id of each
# word is kept, and the list order ('a', 'b', 'tie') is the class order used
# when the model slices its logits down to these three entries.
# NOTE(review): this assumes each word encodes to a single, distinct token — confirm.
LABEL_IDS = [tokenizer(i, add_special_tokens=False)["input_ids"][0] for i in ['a', 'b', 'tie']]

# Token budget per example; tokenize() gives each of the two conversations
# MAX_LENGTH // 2 tokens (separators and the answer come on top of that).
MAX_LENGTH = 1700  # Adjust based on your requirements

def accumulate_and_truncate_conversation(prompts, responses, max_length, tokenizer, bot_name):
    """Render a multi-turn conversation as token ids within a token budget.

    Each (prompt, response) pair is rendered as a ``### User: "..."`` block
    followed by a ``### Bot <bot_name> Response: "..."`` block and encoded
    without special tokens. Turns are appended while they fit in `max_length`.

    The first turn that does not fit ends the loop: if its prompt still
    leaves room, the prompt plus a truncated response is appended; otherwise
    the turn is dropped entirely.

    Args:
        prompts: iterable of user messages.
        responses: iterable of bot replies, paired with `prompts` via zip.
        max_length: maximum number of token ids to return.
        tokenizer: callable returning a mapping with an "input_ids" list.
        bot_name: label inserted into the response header.

    Returns:
        A flat list of token ids, at most `max_length` long.
    """
    def encode(text):
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    accumulated = []
    for prompt, response in zip(prompts, responses):
        user_ids = encode(f'### User: "{prompt}"\n\n')
        reply_ids = encode(f'### Bot {bot_name} Response: "{response}"\n\n')

        # Room left for the reply once this turn's prompt has been placed.
        room_for_reply = max_length - len(accumulated) - len(user_ids)

        if len(reply_ids) <= room_for_reply:
            # The whole turn fits: keep it and move on to the next one.
            accumulated.extend(user_ids)
            accumulated.extend(reply_ids)
            continue

        # Budget exceeded: keep a clipped reply only if the prompt leaves room.
        if room_for_reply > 0:
            accumulated.extend(user_ids)
            accumulated.extend(reply_ids[:room_for_reply])
        break

    return accumulated

def _parse_turns(raw):
    """Decode one serialized conversation column into a list of turns.

    The text is parsed as JSON instead of being evaluated as Python source,
    so nothing embedded in the data can execute and JSON escapes (such as an
    escaped forward slash) decode to the intended characters. ``null``
    entries become empty strings. Text that is not valid JSON falls back to
    ``ast.literal_eval`` so a Python-repr list still loads.
    """
    try:
        # strict=False tolerates raw control characters inside strings.
        turns = json.loads(raw, strict=False)
    except ValueError:
        import ast
        turns = ast.literal_eval(raw)
    return ["" if turn is None else turn for turn in turns]


def tokenize(example, tokenizer):
    """Build one supervised example comparing Bot A's and Bot B's conversation.

    Token layout:
        BOS, "CONVERSATION WITH BOT A" banner, conversation A,
        "CONVERSATION WITH BOT B" banner, conversation B,
        closing question ending in "Answer: ", answer token, EOS.

    Each conversation gets a budget of MAX_LENGTH // 2 tokens. Banners, the
    question and the answer are added on top of that budget.

    Args:
        example: mapping with serialized-list strings under 'prompt',
            'response_a' and 'response_b', and an integer-like 'label'
            indexing LABEL_IDS.
        tokenizer: tokenizer providing __call__, bos_token_id and eos_token_id.

    Returns:
        dict with "input_ids", "attention_mask" (all ones) and "labels",
        where every position before the answer token is masked with -100.
    """
    # Both bots answered the same prompts, so the prompt column is parsed once.
    prompts = _parse_turns(example['prompt'])
    responses_a = _parse_turns(example['response_a'])
    responses_b = _parse_turns(example['response_b'])

    def encode(text):
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Add the separator for conversation with Bot A
    conversation_a_separator = encode('~~~~~~~~~~ CONVERSATION WITH BOT A ~~~~~~~~~~\n\n')
    conversation_a = accumulate_and_truncate_conversation(prompts, responses_a, MAX_LENGTH // 2, tokenizer, 'A')

    # Add the separator for conversation with Bot B
    conversation_b_separator = encode('\n\n~~~~~~~~~~ CONVERSATION WITH BOT B ~~~~~~~~~~\n\n')
    conversation_b = accumulate_and_truncate_conversation(prompts, responses_b, MAX_LENGTH // 2, tokenizer, 'B')

    # Add the final separator and the final question
    final_separator = encode('\n\n~~~~~~~~~~ \n\nWhich is the better response for the prompt? a or b or tie?\n\nAnswer: ')

    label_token_id = LABEL_IDS[int(example['label'])]

    # Everything before the answer is context only; build it once so that
    # input_ids and labels cannot drift apart in length.
    context = (
        [tokenizer.bos_token_id]
        + conversation_a_separator
        + conversation_a
        + conversation_b_separator
        + conversation_b
        + final_separator
    )
    answer = [label_token_id, tokenizer.eos_token_id]
    input_ids = context + answer

    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": [-100] * len(context) + answer
    }


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def load_data(df, tokenizer):
    """Turn a pandas DataFrame into a tokenized `datasets.Dataset`.

    Every row is passed through `tokenize` with the given tokenizer. All
    columns of the source frame are dropped, so the result holds only the
    fields that `tokenize` returns.
    """
    dataset = Dataset.from_pandas(df)
    source_columns = dataset.column_names
    return dataset.map(
        tokenize,
        fn_kwargs={'tokenizer': tokenizer},
        remove_columns=source_columns,
    )


def compute_metrics(pred):
    """Compute accuracy and log-loss for the 3-way a / b / tie prediction.

    Args:
        pred: pair ``(logits, labels)``. ``logits`` holds one row of class
            scores per answer position, in LABEL_IDS order. ``labels`` is the
            token-level label array, from which only entries equal to one of
            the LABEL_IDS are used.

    Returns:
        dict with 'accuracy' and 'log_loss'.
    """
    logits, labels = pred

    label_tokens_ids = np.array(LABEL_IDS)
    # Map each answer token id to its class index (its position in LABEL_IDS).
    index_mapping = {int(token_id): idx for idx, token_id in enumerate(label_tokens_ids)}

    # Keep only the answer tokens; -100 padding and EOS entries are ignored.
    answer_tokens = labels[np.isin(labels, label_tokens_ids)]
    y_true = np.array([index_mapping[int(token_id)] for token_id in answer_tokens])

    preds = logits.argmax(axis=-1)
    probs = softmax(logits, axis=-1)

    acc = accuracy_score(y_true, preds)
    # Pass the full class list explicitly: without it log_loss infers the
    # classes from y_true and raises when the evaluated rows happen to miss
    # one of them (probs always has one column per class).
    log_loss_ = log_loss(y_true, probs, labels=list(range(len(LABEL_IDS))))
    return {'accuracy': acc, 'log_loss': log_loss_}

In [7]:
# Extra training rows read from the pseudo-labeling directory; the file uses
# the same outcome columns as the main training CSV.
ultra = pd.read_csv('pseudo-labeling/ultrachat_s42_a0.5.csv')
# Fit on the fixed class list, not on the data: if this file never contains one
# outcome (e.g. no ties), a data-fitted encoder would renumber the remaining
# classes and the ids would no longer line up with LABEL_IDS.
label_encoder = LabelEncoder().fit(target_columns)
# idxmax picks the highest-scoring outcome column per row.
ultra['label'] = label_encoder.transform(ultra[target_columns].idxmax(axis=1))
ultra = ultra[columns_to_vectorize + ['label']]

In [8]:
from datasets import concatenate_datasets

In [9]:
n_splits = 5
fold_idx = 0  # which of the n_splits folds is held out for evaluation

ds = load_data(train, tokenizer)
ultra_ds = load_data(ultra, tokenizer)

# Interleaved K-fold split: row i is held out in fold i % n_splits.
# Each entry is (training row indices, held-out row indices).
folds = [
    (
        [row for row in range(len(ds)) if row % n_splits != held_out],
        list(range(held_out, len(ds), n_splits)),
    )
    for held_out in range(n_splits)
]

train_idx, eval_idx = folds[fold_idx]

# Validation uses only rows from the main training file; the extra rows in
# ultra_ds are prepended to the training split and never evaluated on.
eval_ds = ds.select(eval_idx)
train_ds = concatenate_datasets([ultra_ds, ds.select(train_idx)])

Map: 100%|██████████| 57477/57477 [02:47<00:00, 343.12 examples/s]
Map: 100%|██████████| 34190/34190 [01:58<00:00, 287.64 examples/s]


ValueError: Expected a list of Dataset objects or a list of IterableDataset objects, but element at position 0 is a dict.

In [11]:
# Sanity check: number of rows in the training split and in the held-out fold.
len(train_ds), len(eval_ds)

(80171, 11496)

In [13]:
# Decode the last training example back to text to eyeball the prompt layout.
print(tokenizer.decode(train_ds[-1]['input_ids']))

<|begin_of_text|>~~~~~~~~~~ CONVERSATION WITH BOT A ~~~~~~~~~~

### User: "three kids eat three apples in three days, how many apples will nine kids eat in nine days?"

### Bot A Response: "27 apples"



~~~~~~~~~~ CONVERSATION WITH BOT B ~~~~~~~~~~

### User: "three kids eat three apples in three days, how many apples will nine kids eat in nine days?"

### Bot B Response: "If three kids eat three apples in three days, each kid eats one apple in three days (since 3 apples \/ 3 kids = 1 apple per kid).

Now, let's find out how many apples each kid would eat in nine days. Since it takes each kid three days to eat one apple, in nine days, each kid would eat three apples (since 9 days \/ 3 days per apple = 3 apples per kid).

Now, we can calculate how many apples nine kids would eat in nine days:

9 kids * 3 apples per kid = 27 apples

So, nine kids would eat 27 apples in nine days."



~~~~~~~~~~ 

Which is the better response for the prompt? a or b or tie?

Answer: a<|eot_id|>


### Model

In [14]:
class Llama3ForSFT(LlamaPreTrainedModel):
    """Llama decoder with an LM head, trained as a 3-way answer classifier.

    The forward pass runs the decoder and LM head as usual. When `labels`
    are supplied, the loss is a cross-entropy restricted to the vocabulary
    entries listed in `LABEL_IDS`, taken only at positions whose (shifted)
    label is one of those ids. The returned logits are then those restricted
    rows, one row per answer position.
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        """Run the decoder and score the answer token.

        Args:
            labels: optional token-level labels. Positions whose next-token
                label is one of LABEL_IDS are treated as answer positions.
            (all other arguments are forwarded unchanged to the LlamaModel.)

        Returns:
            CausalLMOutputWithPast with
              * ``loss``: cross-entropy over the LABEL_IDS entries at the
                answer positions, or None when `labels` is None;
              * ``logits``: with labels, a (num_answer_positions,
                len(LABEL_IDS)) tensor in LABEL_IDS order; without labels,
                the full float32 LM logits.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            # Apply the LM head slice by slice and stitch the vocabulary back together.
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        # Without labels there is no answer position to select, so return the
        # full LM logits. Previously this path crashed on an unbound variable.
        output_logits = logits
        if labels is not None:
            # Shift so that tokens < n predict n, then flatten batch and time.
            shift_logits = logits[..., :-1, :].contiguous().view(-1, self.config.vocab_size)
            # Enable model parallelism
            shift_labels = labels[..., 1:].contiguous().view(-1).to(shift_logits.device)

            label_tokens_ids = torch.tensor(LABEL_IDS, device=shift_labels.device)
            # Positions whose target is one of the answer tokens.
            answer_mask = torch.isin(shift_labels, label_tokens_ids)

            # Token id -> class index (position in LABEL_IDS), computed on
            # device without a per-element Python loop.
            answer_tokens = shift_labels[answer_mask]
            true_labels = (answer_tokens.unsqueeze(-1) == label_tokens_ids).long().argmax(dim=-1)

            # Keep only the answer rows and the three answer columns.
            output_logits = shift_logits[answer_mask][:, label_tokens_ids]

            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(output_logits, true_labels)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=output_logits,
        )

In [15]:
# LoRA adapter settings: rank-16 adapters (alpha 32, dropout 0.05) attached
# only to the attention q/k/v projection layers; bias terms are not trained.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    target_modules=['q_proj', 'k_proj', 'v_proj',], 
)

# Load the checkpoint into the custom classifier-style head defined above.
# NOTE(review): no quantization config is passed here; the checkpoint name ends
# in "bnb-4bit" and the printed modules are Linear4bit, so the 4-bit settings
# presumably come from the checkpoint itself — confirm.
model = Llama3ForSFT.from_pretrained(
    model_path, 
    torch_dtype=torch.float16, 
)
# Disable the KV cache for training.
model.config.use_cache = False
# Prepare the quantized base model for training, then wrap it with the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
# Show the wrapped architecture and how many parameters are trainable.
print(model)
model.print_trainable_parameters()

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Llama3ForSFT(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4bit

#### Training Arguments

In [16]:
# Hyper-parameters for the single-epoch LoRA fine-tuning run.
args = TrainingArguments(
    output_dir='output_conversational',
    overwrite_output_dir=True,
    # `eval_strategy` is the current name of this option; the old
    # `evaluation_strategy` spelling is deprecated in the transformers version
    # pinned at the top of this notebook (4.42.4) and emits a FutureWarning.
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=200,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    logging_strategy="steps",
    logging_steps=10,
    warmup_steps=20,
    optim="adamw_8bit",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # effective batch of 4 * 2 = 8 per device
    num_train_epochs=1,
    fp16=True,
    # NOTE(review): `load_best_model_at_end` is not set here, so these two
    # settings presumably have no effect on which checkpoint is kept — confirm.
    metric_for_best_model="log_loss",
    greater_is_better=False,
    report_to="wandb",
)



### Training

In [None]:
# Wire the model, datasets, collator and metric function into the Trainer
# and launch fine-tuning.
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # Batches the variable-length examples produced by tokenize().
    # NOTE(review): presumably pads `labels` with -100 alongside the inputs — confirm.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


In [15]:
# Persist the trained (PEFT-wrapped) model and the tokenizer side by side so
# the directory can be loaded on its own later.
trainer.model.save_pretrained('output_conversational/final_model')
tokenizer.save_pretrained('output_conversational/final_model')

('output_conversational/final_model/tokenizer_config.json',
 'output_conversational/final_model/special_tokens_map.json',
 'output_conversational/final_model/tokenizer.json')