In [None]:
import os

import pandas as pd
from datasets import load_dataset, load_from_disk
import json
from matplotlib import pyplot as plt
import numpy as np

%matplotlib inline
data_dir = os.path.join("data/llm-classification-finetuning")
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["WANDB_PROJECT"] = "llm-classification-finetuning-finnal"
os.environ["WANDB_DIR"] = data_dir


In [6]:
from datasets import load_dataset

id2label = {0: "winner_model_a", 1: "winner_model_b", 2: "winner_tie"}
label2id = {v: k for k, v in id2label.items()}


def preprocess_function(examples):
    prompts = examples["prompt"]
    response_as = examples["response_a"]
    response_bs = examples["response_b"]
    winner_model_a = examples["winner_model_a"]
    winner_model_b = examples["winner_model_b"]
    winner_tie = examples["winner_tie"]
    ids = examples["id"]

    samples = []
    for (
        prompt,
        response_a,
        response_b,
        winner_model_a,
        winner_model_b,
        winner_tie,
        id,
    ) in zip(
        prompts,
        response_as,
        response_bs,
        winner_model_a,
        winner_model_b,
        winner_tie,
        ids,
    ):
        prompt = json.loads(prompt)
        response_a = json.loads(response_a)
        response_b = json.loads(response_b)
        if winner_model_a == 1:
            label = "winner_model_a"
        elif winner_model_b == 1:
            label = "winner_model_b"
        elif winner_tie == 1:
            label = "winner_tie"
        else:
            raise ValueError("Invalid label")

        prompt = "".join(prompt)
        response_a = "".join([r if r is not None else "" for r in response_a])
        response_b = "".join([r if r is not None else "" for r in response_b])

        sentences = [prompt, response_a, response_b]
        samples.append((id, sentences, label))

    return {
        "id": [id for id, _, _ in samples],
        "sentences": [text for _, text, _ in samples],
        "labels": [label2id[l] for _, _, l in samples],
    }


def preprocess_save_dataset(dataset):
    dataset = dataset["train"]
    dataset = dataset.map(
        preprocess_function,
        batched=True,
        batch_size=8,
        remove_columns=dataset.column_names,
    )
    # dataset = dataset.shuffle(seed=42).shard(num_shards=100, index=0)
    dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    dataset.save_to_disk(os.path.join(data_dir, "dataset_dialog"))


# preprocess_save_dataset(load_dataset(os.path.join(data_dir, "data_csv")))
dataset = load_from_disk(os.path.join(data_dir, "dataset_dialog"))

In [7]:
from torch import nn
import torch
from transformers import PreTrainedModel


class LongSeqClassifier(nn.Module):
    def __init__(
        self,
        base_model: PreTrainedModel,
        num_classes,
        lstm_hidden_size=256,
        dropout=0.5,
    ):
        super(LongSeqClassifier, self).__init__()
        hidden_size = base_model.config.hidden_size
        self.base_model = base_model
        self.word_lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.sentence_lstm = nn.LSTM(
            input_size=lstm_hidden_size * 2,
            hidden_size=lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(lstm_hidden_size * 2, num_classes)

        def xavier_init(layer):
            for name, param in layer.named_parameters():
                if "weight_ih" in name:
                    torch.nn.init.xavier_uniform_(param.data)
                elif "weight_hh" in name:
                    torch.nn.init.xavier_uniform_(param.data)
                elif "bias" in name:
                    torch.nn.init.zeros_(param.data)
                    # 设置forget gate的偏置为1
                    param.data[lstm_hidden_size : 2 * lstm_hidden_size].fill_(1)

        xavier_init(self.word_lstm)
        xavier_init(self.sentence_lstm)
        nn.init.uniform_(self.classifier.weight, a=-0.1, b=0.1)
        if self.classifier.bias is not None:
            nn.init.uniform_(self.classifier.bias, -0.1, 0.1)

    def forward(self, input_ids, attention_mask, labels=None):
        batch_size = input_ids.size(0)
        num_sentences = input_ids.size(1)
        input_ids = input_ids.view(-1, input_ids.size(-1))
        attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        inputs = {
            k: v for k, v in locals().items() if k in ["input_ids", "attention_mask"]
        }
        self.base_model.eval()
        with torch.no_grad():
            outputs = self.base_model(**inputs).last_hidden_state

        word_lstm_output, _ = self.word_lstm(
            outputs
        )  # (batch_size * num_sentences, seq_len, lstm_hidden_size * 2)
        sentence_embeddings = []
        for i in range(batch_size * num_sentences):
            mask = attention_mask[i].bool()
            valid_output = word_lstm_output[i][mask]
            if len(valid_output) > 0:
                sentence_embedding = valid_output.mean(dim=0)
            else:
                sentence_embedding = torch.zeros(self.word_lstm.hidden_size * 2).to(
                    word_lstm_output.device
                )
            sentence_embeddings.append(sentence_embedding)

        sentence_embeddings = torch.stack(sentence_embeddings).view(
            batch_size, num_sentences, -1
        )

        sentence_lstm_output, _ = self.sentence_lstm(sentence_embeddings)
        finnal_output = self.dropout(sentence_lstm_output[:, -1, :])
        logits = self.classifier(finnal_output)

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
            return loss, logits
        return logits

In [8]:
from transformers import AutoModel

identifier = "llama-3B-orgin"
base_model_name = "meta-llama/llama-3.2-3B-Instruct"
base_mode = AutoModel.from_pretrained(
    base_model_name, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
)
classifier = LongSeqClassifier(base_mode, num_classes=3)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
from transformers import AutoTokenizer
from functools import partial


def tokenize_function(examples, tokenizer):
    encodings = []
    for sentence in examples["sentences"]:
        encoding = tokenizer(
            sentence,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        encodings.append(encoding)
    result = {}
    for key in encodings[0].keys():
        result[key] = torch.stack([encoding[key] for encoding in encodings])
    result["labels"] = examples["labels"]
    return result


t = AutoTokenizer.from_pretrained(
    base_model_name,
    use_fast=True,
)
if t.pad_token is None:
    t.pad_token = t.eos_token
tokenizer = partial(
    tokenize_function,
    tokenizer=t,
)
tokenized_dataset = dataset.map(
    tokenizer, batched=True, remove_columns=dataset["train"].column_names
)

In [10]:
from transformers import TrainingArguments, Trainer, default_data_collator

batch_size = 16

args = TrainingArguments(
    run_name=f"run-{identifier}",
    output_dir=os.path.join(data_dir, f"output-{identifier}"),
    logging_dir=os.path.join(data_dir, f"logs-{identifier}"),
    eval_strategy="steps",
    eval_steps=0.2,
    save_only_model=True,
    save_steps=0.2,
    save_strategy="steps",
    save_total_limit=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=100,
    logging_strategy="steps",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=200,
    report_to=["tensorboard", "wandb"],
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    greater_is_better=False,
    bf16=True,
)
trainer = Trainer(
    model=classifier,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=default_data_collator,
)
trainer.train()

[2024-10-31 15:05:10,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msting_nevermore[0m ([33msting_nevermore-personal[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
647,1.0815,1.060604
1294,1.0536,1.045273
1941,1.0346,1.031837
2588,1.0442,1.027467


TrainOutput(global_step=3234, training_loss=1.0577747012334764, metrics={'train_runtime': 6211.4653, 'train_samples_per_second': 8.328, 'train_steps_per_second': 0.521, 'total_flos': 0.0, 'train_loss': 1.0577747012334764, 'epoch': 1.0})

In [11]:
trainer.save_model(os.path.join(data_dir, f"model-{identifier}"))

In [12]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import default_data_collator

classifier.to(torch.device("cuda"))

eval_dataset = tokenized_dataset["test"]
dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    pin_memory=True,
    collate_fn=default_data_collator,
)
classifier.eval()

logits = []
with torch.no_grad():
    for inputs in tqdm(dataloader):
        inputs = {k: v.to(torch.device("cuda")) for k, v in inputs.items()}
        outputs = classifier(**inputs)
        logits.append(outputs[1].detach().cpu().numpy())
logits = np.concatenate(logits, axis=0)

  0%|          | 0/360 [00:00<?, ?it/s]

In [13]:
from torch.nn import CrossEntropyLoss

CrossEntropyLoss()(torch.tensor(logits), torch.tensor(eval_dataset["labels"]))

tensor(1.0275)

In [14]:
bert = AutoModel.from_pretrained("roberta-large")
classifier = BertForLongSeqClassification(bert, lstm_hidden_size=256, drop_out=0.5)
load_model(
    classifier,
    os.path.join(data_dir, "model-roberta-large", "model.safetensors"),
    device="cuda",
)
classifier.to(torch.device("cuda"))

eval_dataset = tokenized_dataset["test"]
dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    pin_memory=True,
    collate_fn=default_data_collator,
)
classifier.eval()

logits = []
with torch.no_grad():
    for inputs in tqdm(dataloader):
        inputs = {k: v.to(torch.device("cuda")) for k, v in inputs.items()}
        outputs = classifier(**inputs)
        logits.append(outputs[1].detach().cpu().numpy())
logits = np.concatenate(logits, axis=0)
roberta_logits = logits

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'BertForLongSeqClassification' is not defined

In [None]:
from sklearn.metrics import log_loss


def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)


labels = np.array(tokenized_dataset["test"]["labels"])
log_loss(labels, softmax(bert_logits)), log_loss(labels, softmax(roberta_logits))

(1.0628800346190332, 1.0619226026429176)

In [None]:
def ensembling(logits_list, W):
    return np.sum(
        [W[i] * softmax(logits) for i, logits in enumerate(logits_list)], axis=0
    )


log_loss(labels, softmax(ensembling([bert_logits, roberta_logits], [0.5, 0.5])))

1.078454545235536

In [None]:
from scipy.optimize import minimize

minimize(  # minimize log loss
    lambda W: log_loss(labels, softmax(ensembling([bert_logits, roberta_logits], W))),
    x0=[0.5, 0.5],
    method="Nelder-Mead",
)

       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 1.0600688823375397
             x: [-6.653e-01  4.028e+00]
           nit: 58
          nfev: 119
 final_simplex: (array([[-6.653e-01,  4.028e+00],
                       [-6.653e-01,  4.028e+00],
                       [-6.653e-01,  4.029e+00]]), array([ 1.060e+00,  1.060e+00,  1.060e+00]))

In [3]:
from transformers import AutoModel

model = AutoModel.from_pretrained("google/gemma-2-9b-it")

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
model

Gemma2Model(
  (embed_tokens): Embedding(256000, 3584, padding_idx=0)
  (layers): ModuleList(
    (0-41): 42 x Gemma2DecoderLayer(
      (self_attn): Gemma2Attention(
        (q_proj): Linear(in_features=3584, out_features=4096, bias=False)
        (k_proj): Linear(in_features=3584, out_features=2048, bias=False)
        (v_proj): Linear(in_features=3584, out_features=2048, bias=False)
        (o_proj): Linear(in_features=4096, out_features=3584, bias=False)
        (rotary_emb): Gemma2RotaryEmbedding()
      )
      (mlp): Gemma2MLP(
        (gate_proj): Linear(in_features=3584, out_features=14336, bias=False)
        (up_proj): Linear(in_features=3584, out_features=14336, bias=False)
        (down_proj): Linear(in_features=14336, out_features=3584, bias=False)
        (act_fn): PytorchGELUTanh()
      )
      (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
      (pre_feedforward_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
      (post_feedforward_layernorm): Gemma2RMSNorm((3584