In [1]:
import os

import pandas as pd
from datasets import load_dataset, load_from_disk
import json
from matplotlib import pyplot as plt
import numpy as np

%matplotlib inline
data_dir = os.path.join("data/llm-classification-finetuning")
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["WANDB_PROJECT"] = "llm-classification-finetuning-finnal"
os.environ["WANDB_DIR"] = data_dir


In [2]:
from datasets import load_dataset

id2label = {0: "winner_model_a", 1: "winner_model_b", 2: "winner_tie"}
label2id = {v: k for k, v in id2label.items()}


def preprocess_function(examples):
    prompts = examples["prompt"]
    response_as = examples["response_a"]
    response_bs = examples["response_b"]
    winner_model_a = examples["winner_model_a"]
    winner_model_b = examples["winner_model_b"]
    winner_tie = examples["winner_tie"]
    ids = examples["id"]

    samples = []
    for (
        prompt,
        response_a,
        response_b,
        winner_model_a,
        winner_model_b,
        winner_tie,
        id,
    ) in zip(
        prompts,
        response_as,
        response_bs,
        winner_model_a,
        winner_model_b,
        winner_tie,
        ids,
    ):
        prompt = json.loads(prompt)
        response_a = json.loads(response_a)
        response_b = json.loads(response_b)
        if winner_model_a == 1:
            label = "winner_model_a"
        elif winner_model_b == 1:
            label = "winner_model_b"
        elif winner_tie == 1:
            label = "winner_tie"
        else:
            raise ValueError("Invalid label")

        prompt = "".join(prompt)
        response_a = "".join([r if r is not None else "" for r in response_a])
        response_b = "".join([r if r is not None else "" for r in response_b])

        sentences = [prompt, response_a, response_b]
        samples.append((id, sentences, label))

    return {
        "id": [id for id, _, _ in samples],
        "sentences": [text for _, text, _ in samples],
        "labels": [label2id[l] for _, _, l in samples],
    }


def preprocess_save_dataset(dataset):
    dataset = dataset["train"]
    dataset = dataset.map(
        preprocess_function,
        batched=True,
        batch_size=8,
        remove_columns=dataset.column_names,
    )
    # dataset = dataset.shuffle(seed=42).shard(num_shards=100, index=0)
    dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    dataset.save_to_disk(os.path.join(data_dir, "dataset_dialog"))


# preprocess_save_dataset(load_dataset(os.path.join(data_dir, "data_csv")))


In [3]:
from transformers import AutoTokenizer
from functools import partial
import os
import torch

data_dir = os.path.join("data/llm-classification-finetuning")


def tokenize_function(examples, tokenizer):
    encodings = []
    for sentence in examples["sentences"]:
        encoding = tokenizer(
            sentence,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        encodings.append(encoding)
    result = {}
    for key in encodings[0].keys():
        result[key] = torch.stack([encoding[key] for encoding in encodings])
    result["labels"] = examples["labels"]
    return result


def tokenize_dataset(tokenizer_name):
    t = AutoTokenizer.from_pretrained(
        tokenizer_name,
        use_fast=True,
    )
    if t.pad_token is None:
        t.pad_token = t.eos_token

    tokenizer = partial(
        tokenize_function,
        tokenizer=t,
    )
    dataset = load_from_disk(os.path.join(data_dir, "dataset_dialog"))
    tokenized_dataset = dataset.map(
        tokenizer, batched=True, remove_columns=dataset["train"].column_names
    )
    return tokenized_dataset

In [4]:
from torch import nn


class LongSeqClassifier(nn.Module):
    """
    A long sequence classifier that uses a pre-trained transformer model as the base model
    """

    def __init__(
        self,
        base_model,
        num_classes,
        base_model_require_grad=True,
        lstm_hidden_size=256,
        dropout=0.5,
    ):
        super().__init__()
        hidden_size = base_model.config.hidden_size
        self.model = base_model
        self.base_model_require_grad = base_model_require_grad
        self.word_lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.sentence_lstm = nn.LSTM(
            input_size=lstm_hidden_size * 2,
            hidden_size=lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(lstm_hidden_size * 2, num_classes)

        def xavier_init(layer):
            for name, param in layer.named_parameters():
                if "weight_ih" in name:
                    torch.nn.init.xavier_uniform_(param.data)
                elif "weight_hh" in name:
                    torch.nn.init.xavier_uniform_(param.data)
                elif "bias" in name:
                    torch.nn.init.zeros_(param.data)
                    # 设置forget gate的偏置为1
                    param.data[lstm_hidden_size : 2 * lstm_hidden_size].fill_(1)

        xavier_init(self.word_lstm)
        xavier_init(self.sentence_lstm)
        nn.init.uniform_(self.classifier.weight, a=-0.1, b=0.1)
        if self.classifier.bias is not None:
            nn.init.uniform_(self.classifier.bias, -0.1, 0.1)

    def forward(
        self,
        input_ids,
        attention_mask,
        labels=None,
    ):
        """
        Forward pass of the model
        """
        batch_size = input_ids.size(0)
        num_sentences = input_ids.size(1)
        input_ids = input_ids.view(-1, input_ids.size(-1))
        attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        inputs = {
            k: v for k, v in locals().items() if k in ["input_ids", "attention_mask"]
        }
        if self.base_model_require_grad:
            transformer_outputs = self.model(**inputs)
        else:
            self.model.eval()
            with torch.no_grad():
                transformer_outputs = self.model(**inputs)

        outputs = transformer_outputs.last_hidden_state

        word_lstm_output, _ = self.word_lstm(
            outputs
        )  # (batch_size * num_sentences, seq_len, lstm_hidden_size * 2)
        sentence_embeddings = []
        for i in range(batch_size * num_sentences):
            mask = attention_mask[i].bool()
            valid_output = word_lstm_output[i][mask]
            if len(valid_output) > 0:
                sentence_embedding = valid_output.mean(dim=0)
            else:
                sentence_embedding = torch.zeros(self.word_lstm.hidden_size * 2).to(
                    word_lstm_output.device
                )
            sentence_embeddings.append(sentence_embedding)

        sentence_embeddings = torch.stack(sentence_embeddings).view(
            batch_size, num_sentences, -1
        )

        sentence_lstm_output, _ = self.sentence_lstm(sentence_embeddings)
        finnal_output = self.dropout(sentence_lstm_output[:, -1, :])
        logits = self.classifier(finnal_output)

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, logits) if loss is not None else logits

In [5]:
from transformers import BitsAndBytesConfig, AutoModel
from peft import prepare_model_for_kbit_training, PeftModel

bit_and_byte_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModel.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    quantization_config=bit_and_byte_config,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
base_model = prepare_model_for_kbit_training(
    base_model, gradient_checkpointing_kwargs={"use_reentrant": False}
)
model = LongSeqClassifier(base_model, num_classes=3)
model = PeftModel.from_pretrained(
    model,
    "/home/nevermore/ml-workspace/kaggle-workspace/data/llm-classification-finetuning/output-Meta-Llama-3-8B-4bit-lora/model",
)
tokenized_dataset = tokenize_dataset("meta-llama/Meta-Llama-3-8B")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
from tqdm.auto import tqdm

classifier = model

classifier.to(torch.device("cuda"))
eval_dataset = tokenized_dataset["test"]
dataloader = DataLoader(
    eval_dataset,
    batch_size=8,
    pin_memory=True,
    collate_fn=default_data_collator,
)
classifier.eval()

llama_logits = []
with torch.no_grad():
    for inputs in tqdm(dataloader):
        inputs = {k: v.to(torch.device("cuda")) for k, v in inputs.items()}
        outputs = classifier(**inputs)
        llama_logits.append(outputs[1].detach().cpu().numpy())
llama_logits = np.concatenate(llama_logits, axis=0)

  0%|          | 0/719 [00:00<?, ?it/s]

In [5]:
from transformers import BitsAndBytesConfig, AutoModel
from peft import PeftModel, prepare_model_for_kbit_training

bit_and_byte_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModel.from_pretrained(
    "google/gemma-2-2b",
    quantization_config=bit_and_byte_config,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
base_model = prepare_model_for_kbit_training(
    base_model, gradient_checkpointing_kwargs={"use_reentrant": False}
)
model = LongSeqClassifier(base_model, num_classes=3)
model = PeftModel.from_pretrained(
    model,
    "/home/nevermore/ml-workspace/kaggle-workspace/data/llm-classification-finetuning/output-gemma-2-2b-4bit-lora/model",
)
classifier = model
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
from tqdm.auto import tqdm
import numpy as np

classifier = model

classifier.to(torch.device("cuda"))
eval_dataset = tokenized_dataset["test"]
dataloader = DataLoader(
    eval_dataset,
    batch_size=16,
    pin_memory=True,
    collate_fn=default_data_collator,
)
classifier.eval()

gemma_logits = []
with torch.no_grad():
    for inputs in tqdm(dataloader):
        inputs = {k: v.to(torch.device("cuda")) for k, v in inputs.items()}
        outputs = classifier(**inputs)
        gemma_logits.append(outputs[1].detach().cpu().numpy())
gemma_logits = np.concatenate(gemma_logits, axis=0)

  0%|          | 0/360 [00:00<?, ?it/s]

In [15]:
from sklearn.metrics import log_loss
from scipy.optimize import minimize


def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)


labels = np.array(tokenized_dataset["test"]["labels"])
print(log_loss(labels, softmax(llama_logits)), log_loss(labels, softmax(gemma_logits)))


def ensembling(logits_list, W):
    return np.sum(
        [W[i] * softmax(logits) for i, logits in enumerate(logits_list)], axis=0
    )


log_loss(labels, softmax(ensembling([llama_logits, gemma_logits], [0.5, 0.5])))


minimize(  # minimize log loss
    lambda W: log_loss(labels, softmax(ensembling([llama_logits, gemma_logits], W))),
    x0=[0.5, 0.5],
    method="Nelder-Mead",
)

0.9979836762643524 1.1193134470481056


       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 0.9964845564404504
             x: [ 3.100e+00 -6.333e-01]
           nit: 51
          nfev: 102
 final_simplex: (array([[ 3.100e+00, -6.333e-01],
                       [ 3.100e+00, -6.333e-01],
                       [ 3.100e+00, -6.332e-01]]), array([ 9.965e-01,  9.965e-01,  9.965e-01]))

In [18]:
ensembling([llama_logits, gemma_logits], [3.100, -0.6333])

array([[0.2883788 , 0.9152256 , 1.2630954 ],
       [0.838242  , 1.071621  , 0.55683696],
       [0.33881384, 0.97839296, 1.1494932 ],
       ...,
       [1.9319441 , 0.1695876 , 0.3651681 ],
       [0.84046996, 1.0711923 , 0.55503774],
       [1.5497681 , 0.354356  , 0.5625758 ]], dtype=float32)

In [9]:
from transformers import LlamaConfig

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 57477
    })
})

In [8]:
from accelerate.utils import write_basic_config

write_basic_config()

PosixPath('/home/nevermore/.cache/huggingface/accelerate/default_config.yaml')

In [6]:
model

PeftModel(
  (base_model): LoraModel(
    (model): LongSeqClassifier(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj):

In [2]:
4096 * 4096 * 0.9

15099494.4

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

data_csv_split_dir = os.path.join(data_dir, "data_csv_split")
os.makedirs(data_csv_split_dir, exist_ok=True)
df = pd.read_csv(os.path.join(data_dir, "data_csv", "train.csv"))
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df.to_csv(os.path.join(data_dir, "data_csv_split", "train.csv"), index=False)
test_df.to_csv(os.path.join(data_dir, "data_csv_split", "test.csv"), index=False)


In [10]:
(
    pd.read_csv(os.path.join(data_dir, "data_csv_split", "train.csv")).shape,
    pd.read_csv(os.path.join(data_dir, "data_csv_split", "test.csv")).shape,
)


((51729, 9), (5748, 9))

In [12]:
pd.read_csv(os.path.join(data_dir, "data_csv", "train.csv")).shape

(57477, 9)

In [14]:
load_dataset("csv", data_dir=os.path.join(data_dir, "data_csv_split"))

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 51729
    })
    test: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 5748
    })
})

In [15]:
from transformers import AutoTokenizer, AutoModel

model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
tokenizer.save_pretrained(os.path.join(data_dir, "tokenizer", "gemma-2-9b"))
model.save_pretrained(os.path.join(data_dir, "full_precision", "gemma-2-9b"))

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

[2024-11-11 19:43:15,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [16]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
tokenizer.save_pretrained(os.path.join(data_dir, "tokenizer", "llama-3-8b-instruct"))
model.save_pretrained(os.path.join(data_dir, "full_precision", "llama-3-8b-instruct"))


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]