In [1]:
%%writefile eval.py

from torch.utils.data import DataLoader
from transformers import AutoModel
from peft import PeftModel
from transformers import BitsAndBytesConfig
import torch
import torch.nn as nn
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
from functools import partial
from typing import Optional
from accelerate import Accelerator
from transformers import AutoTokenizer
import os
import pandas as pd
from datasets import load_dataset
import json

from transformers import HfArgumentParser, default_data_collator
from dataclasses import dataclass, field
from tqdm.auto import tqdm


@dataclass
class Params:
    """Command-line options for the evaluation script.

    The four path options have no default and must be supplied; the
    remaining options fall back to the defaults declared here.
    """

    # Directory of the pretrained base model (passed on as base_model_dir).
    base_model_dir: str
    # Directory of the LoRA adapter (passed on as lora_model_dir).
    lora_model_dir: str
    # Directory handed to the dataset loader (passed on as dataset_dir).
    dataset_dir: str
    # Destination of the predictions CSV (passed on as output_path).
    output_path: str
    # Flag forwarded to eval_loop as load_in_4bit.
    load_in_4bit: bool = True
    # Batch size forwarded to eval_loop.
    batch_size: int = 4


def _join_turns(raw_json):
    """Decode a JSON-encoded list of turns and concatenate them, skipping nulls."""
    return "".join(turn if turn is not None else "" for turn in json.loads(raw_json))


def preprocess_function(examples):
    """Turn a batch of raw rows into ``[prompt, response_a, response_b]`` triples.

    Args:
        examples: Batched mapping with the keys ``"prompt"``, ``"response_a"``,
            ``"response_b"`` and ``"id"``. The three text columns hold
            JSON-encoded lists of turns.

    Returns:
        A dict with ``"id"`` (the row ids, in order) and ``"sentences"`` (one
        three-element list of concatenated strings per row).
    """
    sample_ids = []
    sentences = []
    for prompt, response_a, response_b, sample_id in zip(
        examples["prompt"],
        examples["response_a"],
        examples["response_b"],
        examples["id"],
    ):
        # Null turns are treated as empty strings for the prompt as well as
        # for both responses, so a missing turn never aborts the whole batch.
        sentences.append(
            [
                _join_turns(prompt),
                _join_turns(response_a),
                _join_turns(response_b),
            ]
        )
        sample_ids.append(sample_id)

    return {"id": sample_ids, "sentences": sentences}


def tokenize_function(examples, tokenizer, max_seq_length=512):
    """Tokenize every sentence group in the batch and stack the results.

    Each entry of ``examples["sentences"]`` is passed to ``tokenizer`` in one
    call, padded/truncated to ``max_seq_length`` and returned as PyTorch
    tensors. For every key of the tokenizer output, the per-entry tensors
    are stacked along a new leading dimension.
    """
    per_sample = [
        tokenizer(
            group,
            padding="max_length",
            truncation=True,
            max_length=max_seq_length,
            return_tensors="pt",
        )
        for group in examples["sentences"]
    ]
    # The key set is taken from the first encoding, as all encodings come
    # from the same tokenizer call signature.
    return {
        field_name: torch.stack([enc[field_name] for enc in per_sample])
        for field_name in per_sample[0].keys()
    }


def preprocess_dataset(
    dataset: Dataset,
    tokenizer_name_or_path: str,
    max_seq_length: int,
    accelerator: Accelerator,
):
    """Convert a raw dataset into stacked token tensors.

    Loads the tokenizer from ``tokenizer_name_or_path`` (falling back to the
    EOS token when no pad token is defined), then maps the dataset twice:
    first through ``preprocess_function`` and then through
    ``tokenize_function``. Each map drops the columns that existed before it.
    Both maps run inside ``accelerator.main_process_first()``.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.pad_token = hf_tokenizer.eos_token

    tokenize_batch = partial(
        tokenize_function,
        tokenizer=hf_tokenizer,
        max_seq_length=max_seq_length,
    )

    with accelerator.main_process_first():
        joined = dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=dataset.column_names,
        )
        tokenized = joined.map(
            tokenize_batch,
            batched=True,
            remove_columns=joined.column_names,
        )
    return tokenized


def prepare_dataset(
    dataset_dir: str,
    tokenizer_name_or_path: str,
    max_seq_length: int,
    accelerator: Optional[Accelerator],
):
    """Load the ``"test"`` split of the CSV data in ``dataset_dir`` and preprocess it.

    The split is handed to ``preprocess_dataset`` together with the tokenizer
    location, the maximum sequence length and the accelerator.
    """
    csv_splits = load_dataset("csv", data_dir=dataset_dir)
    test_split = csv_splits["test"]
    return preprocess_dataset(
        test_split,
        tokenizer_name_or_path,
        max_seq_length,
        accelerator,
    )


class LstmTextClassifier(nn.Module):
    """
    Hierarchical sequence classifier on top of a pre-trained transformer.

    Every sentence of a sample is encoded by the base model; its token states
    go through a bidirectional word-level LSTM and are mean-pooled over the
    non-padding tokens. The resulting sentence vectors go through a second
    bidirectional LSTM, and the output at the last sentence position is
    passed through dropout and a linear classification layer.
    """

    def __init__(
        self,
        base_model,
        num_classes,
        base_model_require_grad=True,
        lstm_hidden_size=256,
        dropout=0.5,
    ):
        """
        Args:
            base_model: Encoder exposing ``config.hidden_size`` and returning
                an object with ``last_hidden_state`` when called.
            num_classes: Number of output classes.
            base_model_require_grad: When False, the base model is run in
                eval mode under ``torch.no_grad()`` during ``forward``.
            lstm_hidden_size: Hidden size (per direction) of both LSTMs.
            dropout: Dropout probability applied before the classifier.
        """
        super().__init__()
        hidden_size = base_model.config.hidden_size
        self.model = base_model
        self.base_model_require_grad = base_model_require_grad
        self.word_lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.sentence_lstm = nn.LSTM(
            input_size=lstm_hidden_size * 2,
            hidden_size=lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(lstm_hidden_size * 2, num_classes)

        def xavier_init(layer):
            # Xavier-uniform for input and recurrent weights; zero biases
            # except for the forget-gate slice.
            for name, param in layer.named_parameters():
                if "weight_ih" in name or "weight_hh" in name:
                    torch.nn.init.xavier_uniform_(param.data)
                elif "bias" in name:
                    torch.nn.init.zeros_(param.data)
                    # Set the forget-gate bias to 1
                    param.data[lstm_hidden_size : 2 * lstm_hidden_size].fill_(1)

        xavier_init(self.word_lstm)
        xavier_init(self.sentence_lstm)
        nn.init.uniform_(self.classifier.weight, a=-0.1, b=0.1)
        if self.classifier.bias is not None:
            nn.init.uniform_(self.classifier.bias, -0.1, 0.1)

    def forward(
        self,
        input_ids,
        attention_mask,
        labels=None,
    ):
        """
        Forward pass of the model.

        Args:
            input_ids: Token ids; the first two dimensions are read as
                (batch, sentences) and the last one as tokens.
            attention_mask: Mask with the same layout as ``input_ids``;
                non-zero entries mark tokens included in the pooling.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed.

        Returns:
            ``SequenceClassifierOutput`` with ``loss`` (or None) and ``logits``.
        """
        batch_size = input_ids.size(0)
        num_sentences = input_ids.size(1)
        # Fold the sentence dimension into the batch for the encoder.
        input_ids = input_ids.view(-1, input_ids.size(-1))
        attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if self.base_model_require_grad:
            transformer_outputs = self.model(**inputs)
        else:
            self.model.eval()
            with torch.no_grad():
                transformer_outputs = self.model(**inputs)

        outputs = transformer_outputs.last_hidden_state

        word_lstm_output, _ = self.word_lstm(
            outputs
        )  # (batch_size * num_sentences, seq_len, lstm_hidden_size * 2)

        # Masked mean over the token axis, done for all sentences at once.
        # Sentences without any valid token divide a zero sum by a count
        # clamped to 1 and therefore pool to a zero vector that shares the
        # dtype and device of the LSTM output.
        valid_tokens = attention_mask.bool().unsqueeze(-1)
        token_counts = valid_tokens.sum(dim=1).clamp(min=1).to(word_lstm_output.dtype)
        pooled = word_lstm_output.masked_fill(~valid_tokens, 0).sum(dim=1) / token_counts

        sentence_embeddings = pooled.view(batch_size, num_sentences, -1)

        sentence_lstm_output, _ = self.sentence_lstm(sentence_embeddings)
        final_output = self.dropout(sentence_lstm_output[:, -1, :])
        logits = self.classifier(final_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)


def eval_loop(
    accelerator: Optional[Accelerator],
    base_model_dir: str,
    lora_model_dir: str,
    load_in_4bit: bool,
    dataset_dir: str,
    batch_size: int,
    output_path: str,
):
    """Run the classifier over the test split and write its logits to a CSV.

    Args:
        accelerator: Accelerator used for dataset preparation, device
            placement and gathering. NOTE(review): annotated Optional, but
            the body dereferences it unconditionally, so None is not usable.
        base_model_dir: Location of the base model; also used as the
            tokenizer location.
        lora_model_dir: Location of the adapter loaded with ``PeftModel``.
        load_in_4bit: When True the base model is loaded with a 4-bit NF4
            bitsandbytes configuration; when False no quantization config
            is passed.
        dataset_dir: Directory containing the CSV data, including
            ``test.csv`` (its ``id`` column is copied into the output).
        batch_size: DataLoader batch size.
        output_path: Destination CSV; written by the main process only.
    """
    dataset = prepare_dataset(dataset_dir, base_model_dir, 512, accelerator)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        collate_fn=default_data_collator,
    )
    # Honour the caller's flag instead of always quantizing.
    quantization_config = None
    if load_in_4bit:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
    base_model = AutoModel.from_pretrained(
        base_model_dir,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
    )
    classifier = LstmTextClassifier(base_model, num_classes=3)
    classifier = PeftModel.from_pretrained(classifier, lora_model_dir)
    classifier, dataloader = accelerator.prepare(classifier, dataloader)

    def flatten_parameters(model):
        if isinstance(model, nn.LSTM):
            model.flatten_parameters()

    classifier.apply(flatten_parameters)

    pbar = tqdm(
        total=len(dataloader),
        desc="Evaluating",
        disable=not accelerator.is_main_process,
    )
    logits = []
    with torch.no_grad():
        for batch in dataloader:
            outputs = classifier(**batch)
            outputs = outputs.logits
            # Every process takes part in the gather; only the main process
            # keeps the gathered logits.
            gathered = accelerator.gather_for_metrics(outputs)
            if accelerator.is_main_process:
                logits.append(gathered.cpu())
            pbar.update()
        pbar.close()
        if accelerator.is_main_process:
            logits = torch.cat(logits)
            # Ids come straight from test.csv; the DataLoader is not
            # shuffled, so rows are expected to line up with the logits.
            test_csv = pd.read_csv(os.path.join(dataset_dir, "test.csv"))
            df = pd.DataFrame(
                {
                    "id": test_csv["id"],
                    "winner_model_a": logits[:, 0],
                    "winner_model_b": logits[:, 1],
                    "winner_tie": logits[:, 2],
                }
            )
            df.to_csv(output_path, index=False)


def main():
    """Parse the command line into ``Params`` and run ``eval_loop`` with it."""
    (cli_params,) = HfArgumentParser((Params,)).parse_args_into_dataclasses()
    eval_loop(
        accelerator=Accelerator(),
        base_model_dir=cli_params.base_model_dir,
        lora_model_dir=cli_params.lora_model_dir,
        load_in_4bit=cli_params.load_in_4bit,
        dataset_dir=cli_params.dataset_dir,
        batch_size=cli_params.batch_size,
        output_path=cli_params.output_path,
    )


# Script entry point.
if __name__ == "__main__":
    main()


Overwriting eval.py


In [2]:
import os

# Absolute path of the DeepSpeed JSON file, resolved against the current
# working directory so the accelerate config can reference it.
deep_speed_config_path = os.path.join(os.getcwd(), "zero_stage3_config.json")

# Accelerate launch configuration: one local machine, two processes,
# DeepSpeed as the distributed backend.
accelerate_config_text = f"""
compute_environment: LOCAL_MACHINE
deepspeed_config:
 deepspeed_config_file: {deep_speed_config_path}
 zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {{}}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
num_machines: 1
num_processes: 2
use_cpu: false
"""

with open("default_config.yaml", "w") as f:
    f.write(accelerate_config_text)
print("default_config.yaml created")

default_config.yaml created


In [3]:
# DeepSpeed configuration: fp16 enabled, ZeRO optimization stage 3, batch
# sizes left as "auto".
zero_stage3_config_text = """{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 3,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}"""

with open("zero_stage3_config.json", "w") as f:
    f.write(zero_stage3_config_text)
print("zero_stage3_config.json created")

zero_stage3_config.json created


In [4]:
# Settings for the Gemma-2-9b-it evaluation run. dataset_dir and
# gemma_output_path are read again by later cells of this notebook.
base_model_dir = "/root/autodl-fs/pretrained/gemma-2-9b-it"
lora_model_dir = "/root/autodl-tmp/lora/gemma-2-9b-it-llmc"
dataset_dir = "/root/autodl-fs/datasets/llm-classification"
batch_size = 8
gemma_output_path = "/root/autodl-tmp/workspace/gemma-2-9b-it-predictions.csv"

# Launch eval.py through accelerate with default_config.yaml; the {name}
# placeholders are filled in from the Python variables above.
!accelerate launch --config_file default_config.yaml eval.py \
    --base_model_dir {base_model_dir} \
    --lora_model_dir {lora_model_dir} \
    --dataset_dir {dataset_dir} \
    --batch_size {batch_size} \
    --output_path {gemma_output_path} 

[2024-11-14 14:17:15,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-14 14:17:21,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-14 14:17:21,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-14 14:17:22,075] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-14 14:17:22,075] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-11-14 14:17:22,312] [INFO] [comm.py:652:init_distributed] cdb=None
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards:   0%|                          | 0/8 [00:00<?, ?it/s]`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████████████| 8/8 [00:10<00:00,  1.30s/it]
Loading checkpoint shards: 100%|██████████████████| 8/8 [00:

In [5]:
# Settings for the Llama-3-8b-instruct evaluation run. This rebinds
# base_model_dir and lora_model_dir; llama_output_path is read again by a
# later cell of this notebook.
base_model_dir = "/root/autodl-fs/pretrained/llama-3-8b-instruct"
lora_model_dir = "/root/autodl-tmp/lora/llama-3-8b-instruct-llmc"
dataset_dir = "/root/autodl-fs/datasets/llm-classification"
batch_size = 8
llama_output_path = "/root/autodl-tmp/workspace/llama-3-8b-instruct-predictions.csv"

# Launch eval.py through accelerate with default_config.yaml; the {name}
# placeholders are filled in from the Python variables above.
!accelerate launch --config_file default_config.yaml eval.py \
    --base_model_dir {base_model_dir} \
    --lora_model_dir {lora_model_dir} \
    --dataset_dir {dataset_dir} \
    --batch_size {batch_size} \
    --output_path {llama_output_path} 

[2024-11-14 14:33:16,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-14 14:33:22,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-14 14:33:22,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-14 14:33:23,290] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-11-14 14:33:23,290] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-11-14 14:33:23,323] [INFO] [comm.py:652:init_distributed] cdb=None
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards:   0%|                          | 0/7 [00:00<?, ?it/s]`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████████████| 7/7 [00:09<00:00,  1.34s/it]
Loading checkpoint shards: 100%|██████████████████| 7/7 [00:

In [18]:
import pandas as pd
import numpy as np


def softmax(x):
    """Return the row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating, so large inputs
    do not overflow.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exp_shifted = np.exp(x - row_max)
    row_totals = exp_shifted.sum(axis=1, keepdims=True)
    return exp_shifted / row_totals


def get_labels(test_csv_path=None):
    """Return integer class labels derived from the winner columns of a CSV.

    Args:
        test_csv_path: CSV file to read. When omitted, ``test.csv`` inside
            the notebook-level ``dataset_dir`` is used.

    Returns:
        A NumPy array with one label per row: 0 where ``winner_model_a`` is
        1, otherwise 1 where ``winner_model_b`` is 1, otherwise 2.
    """
    if test_csv_path is None:
        test_csv_path = os.path.join(dataset_dir, "test.csv")
    test_df = pd.read_csv(test_csv_path)
    # np.select takes the first matching condition, which keeps the
    # "model a wins, else model b wins, else tie" precedence.
    return np.select(
        [
            (test_df["winner_model_a"] == 1).to_numpy(),
            (test_df["winner_model_b"] == 1).to_numpy(),
        ],
        [0, 1],
        default=2,
    )


def ensembling(logits_list, W):
    """Return the weighted sum of the per-model softmax outputs.

    The i-th entry of ``logits_list`` is converted with ``softmax`` and
    scaled by ``W[i]``; the scaled arrays are summed element-wise.
    """
    weighted_probs = []
    for position, model_logits in enumerate(logits_list):
        weighted_probs.append(W[position] * softmax(model_logits))
    return np.sum(weighted_probs, axis=0)


def get_logits(submission_path):
    """Read a predictions CSV and return its three winner columns as an array.

    The columns are returned in the order ``winner_model_a``,
    ``winner_model_b``, ``winner_tie``.
    """
    winner_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
    predictions = pd.read_csv(submission_path)
    return predictions[winner_columns].values


In [11]:
# Uncomment to fit the ensemble weights W (order: [llama, gemma]) by minimizing log loss against the labels

# from sklearn.metrics import log_loss
# from scipy.optimize import minimize

# labels = get_labels()
# gemma_logits = get_logits(gemma_output_path)
# llama_logits = get_logits(llama_output_path)


# print(log_loss(labels, softmax(gemma_logits)))
# print(log_loss(labels, softmax(llama_logits)))

# minimize(  # minimize log loss
#     lambda W: log_loss(labels, softmax(ensembling([llama_logits, gemma_logits], W))),
#     x0=[0.5, 0.5],
#     method="Nelder-Mead",
# )

1.0005517163000228
1.0148125257681215


       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 0.9949258105664686
             x: [ 9.294e-01  2.841e+00]
           nit: 41
          nfev: 80
 final_simplex: (array([[ 9.294e-01,  2.841e+00],
                       [ 9.293e-01,  2.841e+00],
                       [ 9.293e-01,  2.841e+00]]), array([ 9.949e-01,  9.949e-01,  9.949e-01]))

In [19]:
# Per-model logits written by the two evaluation runs.
llama_logits = get_logits(llama_output_path)
gemma_logits = get_logits(gemma_output_path)

# Ensemble weights, given in the same [llama, gemma] order as the logits.
ensemble_weights = [9.294e-01, 2.841e00]
finnal_logits = softmax(ensembling([llama_logits, gemma_logits], ensemble_weights))

# Row ids are taken from the test CSV.
test_df = pd.read_csv(os.path.join(dataset_dir, "test.csv"))
submission = pd.DataFrame(
    {
        "id": test_df["id"],
        "winner_model_a": finnal_logits[:, 0],
        "winner_model_b": finnal_logits[:, 1],
        "winner_tie": finnal_logits[:, 2],
    }
)
submission.to_csv("submission.csv", index=False)
