# Packages

In [None]:
%pip install /kaggle/input/lmsys-packages/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
%pip install /kaggle/input/lmsys-packages/xformers-0.0.24042abc8.d20240802-cp310-cp310-linux_x86_64.whl
!pip install transformers peft accelerate bitsandbytes \-U --no-index --find-links /kaggle/input/lmsys-wheel-files

In [None]:
# Copy the attached module directory into the working directory as `human_pref`,
# the package name the prediction scripts written later in this notebook import from.
!cp -r /kaggle/input/lmsys-modules-0805 human_pref

In [None]:
import os

# Walk /kaggle/input and print the full path of every file found beneath it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Prepare test file

In [None]:
%%writefile prepare_test_file.py

import pandas as pd

TEST_CSV = "/kaggle/input/llm-classification-finetuning/test.csv"


def add_placeholder_labels(df):
    """Return a copy of ``df`` with constant label columns appended.

    The three ``winner_*`` columns are filled with fixed values (1, 0, 0); they
    are not real labels. Presumably the downstream dataset class expects these
    columns to exist -- confirm against ``human_pref.data.dataset``.
    """
    return df.assign(winner_model_a=1, winner_model_b=0, winner_tie=0)


def swap_responses(df):
    """Return a copy of ``df`` with ``response_a`` and ``response_b`` exchanged.

    Both right-hand sides are read from the untouched input frame, so the swap
    does not depend on whether column assignment copies or writes through a
    view, and the caller's frame is left unmodified. Column order is preserved.
    """
    return df.assign(response_a=df["response_b"], response_b=df["response_a"])


def main():
    """Write ``test.parquet`` and its response-swapped twin ``test_swap.parquet``."""
    df = add_placeholder_labels(pd.read_csv(TEST_CSV))
    df.to_parquet("test.parquet", index=False)
    swap_responses(df).to_parquet("test_swap.parquet", index=False)


# Guarded so the helpers can be imported without touching the file system;
# running the file as a script behaves exactly as before.
if __name__ == "__main__":
    main()

###/kaggle/input/llm-finetuning-dataset/train_folds_lmsys.csv
'''
import pandas as pd

# 讀取主要的訓練資料集和 reward 分數資訊
full_data = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")  # 包含 id, prompt, response_a, response_b 等欄位
rewards_data = pd.read_csv("/kaggle/input/llm-finetuning-dataset/train_folds_lmsys.csv")  # 包含 id, fold

# 合併 reward 分數到主要資料集中
merged_data = full_data.merge(rewards_data, on='id', how='inner')
merged_data.rename(columns={'fold': 'reward'}, inplace=True)  # 將 'fold' 欄位重命名為 'reward'

# 檢查合併後的資料
print(merged_data.head())

# 儲存合併後的資料
merged_data.to_parquet("train_with_rewards.parquet", index=False)

'''


'''
# %%writefile prepare_test_file.py
import pandas as pd

# 讀取原始測試資料和 reward 資料
test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")
folds_df = pd.read_csv("/kaggle/input/llm-finetuning-dataset/train_folds_lmsys.csv")

# 合併 reward 資料到測試資料中
merged_df = test_df.merge(folds_df, on="id", how="inner")

# 假設 fold 欄位即為 reward 分數，設置 winner labels
merged_df["winner_model_a"] = (merged_df["fold"] == 0).astype(int)
merged_df["winner_model_b"] = (merged_df["fold"] == 1).astype(int)
merged_df["winner_tie"] = (merged_df["fold"] == 2).astype(int)

# 保存資料
merged_df.to_parquet("test.parquet", index=False)

# 創建 response_a 和 response_b 調換版本的資料集
merged_df["response_a"], merged_df["response_b"] = merged_df["response_b"], merged_df["response_a"]
merged_df.to_parquet("test_swap.parquet", index=False)
'''


In [None]:
# Run the script written by the previous cell; it produces test.parquet and test_swap.parquet.
!python prepare_test_file.py

# Inference: gemma2-9b

In [None]:
%%writefile predict_m0.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from human_pref.models.modeling_gemma2 import Gemma2ForSequenceClassification
from human_pref.data.processors import ProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device


model_name_or_path = "/kaggle/input/lmsys-checkpoints-0-0805"
csv_path = "test.parquet"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
processor = ProcessorPAB(
    tokenizer=tokenizer,
    max_length=4096,
    support_system_role=False,
)
dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)
# Each item yielded by this loader is iterated as a sequence of micro-batches below.
dataloader = DataLoader(
    dataset,
    batch_size=80,
    num_workers=4,
    collate_fn=ShardedMaxTokensCollator(
        max_tokens=8192, base_collator=VarlenCollator()
    ),
)


# Model for pipelined inference: embeddings and the first half of the layers
# are placed on cuda:0, the second half plus final norm and score head on cuda:1.
num_hidden_layers = 42
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
for i in range(num_hidden_layers // 2):
    device_map[f"model.layers.{i}"] = "cuda:0"
for i in range(num_hidden_layers // 2, num_hidden_layers):
    device_map[f"model.layers.{i}"] = "cuda:1"

model = Gemma2ForSequenceClassification.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Rotary inverse frequencies, one copy per device so neither pipeline stage
# has to read a tensor that lives on the other GPU.
config = model.config
dim = config.head_dim
inv_freq = 1.0 / (
    config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
)
inv_freq0 = inv_freq.to("cuda:0")
inv_freq1 = inv_freq.to("cuda:1")


def run_pipelined_inference(model, dataloader, inv_freq0, inv_freq1):
    """Run two-stage pipelined inference and return per-micro-batch logits on CPU.

    For every micro-batch except the first, ``forward_part2`` is issued for the
    previous micro-batch (inputs on cuda:1) before ``forward_part1`` is issued
    for the current one (inputs on cuda:0). The logits are copied to CPU only
    after both calls and the device transfer have been issued, matching the
    original statement order. After the loop the last pending micro-batch is
    flushed through ``forward_part2``.

    Returns a list of logits tensors in micro-batch order; the list is empty
    when the loader yields no micro-batches.
    """
    outs = []
    prev_hidden_states = None
    prev_seq_info = None
    for batch in tqdm(dataloader):
        for micro_batch in batch:
            input_ids = to_device(micro_batch["input_ids"], "cuda:0")
            seq_info = dict(
                cu_seqlens=micro_batch["cu_seqlens"],
                position_ids=micro_batch["position_ids"],
                max_seq_len=micro_batch["max_seq_len"],
                attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
            )
            seq_info = to_device(seq_info, "cuda:0")

            logits = None
            with torch.no_grad(), torch.cuda.amp.autocast():
                if prev_hidden_states is not None:
                    # Second stage for the previous micro-batch.
                    logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
                # First stage for the current micro-batch.
                hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, hidden_states], "cuda:1"
            )
            if logits is not None:
                outs.append(logits.cpu())

    # Flush the last micro-batch, which has only been through the first stage.
    if prev_hidden_states is not None:
        with torch.no_grad(), torch.cuda.amp.autocast():
            logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
        outs.append(logits.cpu())
    return outs


outs = run_pipelined_inference(model, dataloader, inv_freq0, inv_freq1)
if not outs:
    # Previously this case surfaced as a NameError on `prev_hidden_states`.
    raise RuntimeError(f"no micro-batches were produced from {csv_path}")

pred = torch.cat(outs, dim=0)
# Up-cast before softmax so the probabilities are not computed at reduced precision.
prob = pred.float().softmax(-1)
print(dataset.evaluate(prob.numpy()))

# Save a plain NumPy array rather than relying on implicit tensor conversion.
np.save('prob_m0.npy', prob.numpy())

In [None]:
# Run the gemma2-9b inference script as a separate Python process; it writes prob_m0.npy.
!python predict_m0.py

# Inference: llama3-8b

In [None]:
%%writefile predict_m3.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer
import torch.nn as nn
import torch.nn.functional as F
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from transformers import LlamaModel,LlamaPreTrainedModel
from human_pref.models.modeling_llama import LlamaForSequenceClassification
from human_pref.data.processors import ProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device


model_name_or_path = "/kaggle/input/lmsys-checkpoints-3-0805"
# This model scores the response-swapped copy of the test set.
csv_path = "test_swap.parquet"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Mark the "sequence longer than maximum" warning as already emitted so it is not printed.
tokenizer.deprecation_warnings[
    "sequence-length-is-longer-than-the-specified-maximum"
] = True

processor = ProcessorPAB(
    tokenizer=tokenizer,
    max_length=4096,
    support_system_role=True,
)

dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)

# Each item yielded by this loader is iterated as a sequence of micro-batches below.
dataloader = DataLoader(
    dataset,
    batch_size=80,
    num_workers=4,
    collate_fn=ShardedMaxTokensCollator(
        max_tokens=8192, base_collator=VarlenCollator()
    ),
)

# Model for pipelined inference: embeddings and the first half of the layers
# are placed on cuda:0, the second half plus final norm and score head on cuda:1.
num_hidden_layers = 32
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
for i in range(num_hidden_layers // 2):
    device_map[f"model.layers.{i}"] = "cuda:0"
for i in range(num_hidden_layers // 2, num_hidden_layers):
    device_map[f"model.layers.{i}"] = "cuda:1"

model = LlamaForSequenceClassification.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Rotary inverse frequencies, one copy per device so neither pipeline stage
# has to read a tensor that lives on the other GPU.
config = model.config
dim = config.hidden_size // config.num_attention_heads

inv_freq = 1.0 / (
    config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
)

inv_freq0 = inv_freq.to("cuda:0")
inv_freq1 = inv_freq.to("cuda:1")


def run_pipelined_inference(model, dataloader, inv_freq0, inv_freq1):
    """Run two-stage pipelined inference and return per-micro-batch logits on CPU.

    For every micro-batch except the first, ``forward_part2`` is issued for the
    previous micro-batch (inputs on cuda:1) before ``forward_part1`` is issued
    for the current one (inputs on cuda:0). The logits are copied to CPU only
    after both calls and the device transfer have been issued, matching the
    original statement order. After the loop the last pending micro-batch is
    flushed through ``forward_part2``.

    Returns a list of logits tensors in micro-batch order; the list is empty
    when the loader yields no micro-batches.
    """
    outs = []
    prev_hidden_states = None
    prev_seq_info = None
    for batch in tqdm(dataloader):
        for micro_batch in batch:
            input_ids = to_device(micro_batch["input_ids"], "cuda:0")
            seq_info = dict(
                cu_seqlens=micro_batch["cu_seqlens"],
                position_ids=micro_batch["position_ids"],
                max_seq_len=micro_batch["max_seq_len"],
                attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
            )
            seq_info = to_device(seq_info, "cuda:0")

            logits = None
            with torch.no_grad(), torch.cuda.amp.autocast():
                if prev_hidden_states is not None:
                    # Second stage for the previous micro-batch.
                    logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
                # First stage for the current micro-batch.
                hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, hidden_states], "cuda:1"
            )
            if logits is not None:
                outs.append(logits.cpu())

    # Flush the last micro-batch, which has only been through the first stage.
    if prev_hidden_states is not None:
        with torch.no_grad(), torch.cuda.amp.autocast():
            logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
        outs.append(logits.cpu())
    return outs


outs = run_pipelined_inference(model, dataloader, inv_freq0, inv_freq1)
if not outs:
    # Previously this case surfaced as a NameError on `prev_hidden_states`.
    raise RuntimeError(f"no micro-batches were produced from {csv_path}")

pred = torch.cat(outs, dim=0)
# Up-cast before softmax so the probabilities are not computed at reduced precision.
prob = pred.float().softmax(-1)
print(dataset.evaluate(prob.numpy()))

# Save a plain NumPy array rather than relying on implicit tensor conversion.
np.save('prob_m3.npy', prob.numpy())

In [None]:
# Run the llama3-8b inference script as a separate Python process; it writes prob_m3.npy.
!python predict_m3.py

In [None]:
import numpy as np

# Sanity check: load the probabilities saved by predict_m3.py and show the first five rows.
prob = np.load('prob_m3.npy')

print(prob[:5])


In [None]:
#!pip install transformers peft accelerate bitsandbytes -U --no-index --find-links /kaggle/input/lmsys-wheel-files

In [None]:
'''
%%time

tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

data = pd.DataFrame()
data["id"] = test["id"]
data["input_ids"], data["attention_mask"] = tokenize(tokenizer, test["prompt"], test["response_a"], test["response_b"])
data["length"] = data["input_ids"].apply(len)

aug_data = pd.DataFrame()
aug_data["id"] = test["id"]
# swap response_a & response_b
aug_data['input_ids'], aug_data['attention_mask'] = tokenize(tokenizer, test["prompt"], test["response_b"], test["response_a"])
aug_data["length"] = aug_data["input_ids"].apply(len)
'''

In [None]:
'''
#%%writefile predict_lora.py


import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

assert torch.cuda.device_count() == 2
@dataclass
class Config:
    gemma_dir = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    lora_dir = '/kaggle/input/73zap2gx/checkpoint-5748'
    max_length = 2048
    batch_size = 4
    device = torch.device("cuda")    
    tta = False  # test time augmentation. <prompt>-<model-b's response>-<model-a's response>
    spread_max_length = False  # whether to apply max_length//3 on each input or max_length on the concatenated input

cfg = Config()
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
def process_text(text: str) -> str:
    return " ".join(eval(text, {"null": ""}))

test.loc[:, 'prompt'] = test['prompt'].apply(process_text)
test.loc[:, 'response_a'] = test['response_a'].apply(process_text)
test.loc[:, 'response_b'] = test['response_b'].apply(process_text)

display(test.head(5))

def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    prompt = ["<prompt>: " + p for p in prompt]
    response_a = ["\n\n<response_a>: " + r_a for r_a in response_a]
    response_b = ["\n\n<response_b>: " + r_b for r_b in response_b]
    if spread_max_length:
        prompt = tokenizer(prompt, max_length=max_length//3, truncation=True, padding=False).input_ids
        response_a = tokenizer(response_a, max_length=max_length//3, truncation=True, padding=False).input_ids
        response_b = tokenizer(response_b, max_length=max_length//3, truncation=True, padding=False).input_ids
        input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        attention_mask = [[1]* len(i) for i in input_ids]
    else:
        text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = tokenizer(text, max_length=max_length, truncation=True, padding=False)
        input_ids = tokenized.input_ids
        attention_mask = tokenized.attention_mask
    return input_ids, attention_mask
    
# Load base model on GPU 0
device_0 = torch.device('cuda:0')
model_0 = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir,
    device_map=device_0,
    use_cache=False,
)

# Load base model on GPU 1
device_1 = torch.device('cuda:1')
model_1 = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir,
    device_map=device_1,
    use_cache=False,
)

model_0 = PeftModel.from_pretrained(model_0, cfg.lora_dir)
model_1 = PeftModel.from_pretrained(model_1, cfg.lora_dir)
'''

In [None]:
'''
#%%writefile predict_qlora.py
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    a_win, b_win, tie = [], [], []
    
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        tmp = df.iloc[start_idx:end_idx]
        input_ids = tmp["input_ids"].to_list()
        attention_mask = tmp["attention_mask"].to_list()
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {"input_ids": input_ids, "attention_mask": attention_mask},
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        outputs = model(**inputs.to(device))
        proba = outputs.logits.softmax(-1).cpu()
        
        a_win.extend(proba[:, 0].tolist())
        b_win.extend(proba[:, 1].tolist())
        tie.extend(proba[:, 2].tolist())
    
    df["winner_model_a"] = a_win
    df["winner_model_b"] = b_win
    df["winner_tie"] = tie
    
    return df

'''

In [None]:
#!python predict_qlora.py

In [None]:
'''
st = time.time()

# sort by input length to fully leverage dynaminc padding
data = data.sort_values("length", ascending=False)
# the total #tokens in sub_1 and sub_2 should be more or less the same
sub_1 = data.iloc[0::2].copy()
sub_2 = data.iloc[1::2].copy()

with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

result_df = pd.concat(list(results), axis=0)
display(result_df)

proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

st = time.time()

if cfg.tta:
    data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    tta_result_df = pd.concat(list(results), axis=0)
    # recall TTA's order is flipped
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values 
    # average original result and TTA result.
    proba = (proba + tta_proba) / 2

print(f"elapsed time: {time.time() - st}")

result_df.loc[:, "winner_model_a"] = proba[:, 0]
result_df.loc[:, "winner_model_b"] = proba[:, 1]
result_df.loc[:, "winner_tie"] = proba[:, 2]
submission_df = result_df[["id", 'winner_model_a', 'winner_model_b', 'winner_tie']]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)
submission_df = pd.DataFrame(submission_df)

prob_qlora = submission_df[["winner_model_a", "winner_model_b", "winner_tie"]].to_numpy()

print(prob_qlora)
np.save('prob_qlora.npy', proba)
'''

In [None]:
# !python predict_lorasciia.py
# display(result_df)

# Inference: Sentence Transformer + FAISS similarity features

In [None]:
!pip install /kaggle/input/some-pack/faiss_cpu_downloads/faiss_cpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --no-index --find-links=/kaggle/input/some-pack/sentence_transformers_packages sentence-transformers

In [None]:
%%writefile predict_fias.py

from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoModel
from transformers import AutoTokenizer
from sklearn.preprocessing import MinMaxScaler
import faiss
# Competition test set; its prompt / response_a / response_b columns are embedded and tokenised below.
test_data = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
# Directory of the sentence-transformers model used to embed prompts and responses.
model_load_path = '/kaggle/input/some-pack/sentence-transformer-model' 
sentence_model = SentenceTransformer(model_load_path)


# Custom DeBERTa-based model
class CustomDebertaModel(nn.Module):
    """Text encoder combined with a small MLP over similarity features.

    The first-token embedding of the encoder output is used as the text
    representation. The similarity features are projected to the encoder's
    hidden size, passed through multi-head attention as key/value (with the
    text embedding as query), and the attention output is concatenated with
    the text embedding before classification.

    Attribute names are kept stable because the checkpoint used by this script
    is a pickled module object, not just a state dict.
    """

    def __init__(self, model_name, num_labels, feature_dim=2, dropout_rate=0.05):
        super().__init__()
        # Text tower: pretrained encoder.
        self.base_model = AutoModel.from_pretrained(model_name)
        hidden_size = self.base_model.config.hidden_size

        # Feature tower: feature_dim -> 128 -> hidden_size.
        self.feature_fc = nn.Sequential(
            nn.Linear(feature_dim, 128),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(128, hidden_size),
            nn.ReLU(),
        )

        # Attention between the text embedding (query) and the feature embedding (key/value).
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=4,
            batch_first=True,
        )

        self.dropout = nn.Dropout(p=dropout_rate)

        # Classification head over [text embedding ; attention output].
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_size, num_labels),
        )

    def forward(self, input_ids, attention_mask, similarity_features, labels=None):
        """Return ``{"logits": ...}`` and, when ``labels`` is given, a cross-entropy ``"loss"``."""
        encoder_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Embedding of the first token of every sequence.
        cls_vec = encoder_out.last_hidden_state[:, 0, :]

        feature_vec = self.feature_fc(similarity_features)

        # Both query and key/value are length-1 sequences: [batch_size, 1, hidden_size].
        # NOTE(review): with a single key position the attention weights are always 1,
        # so the output does not depend on the query -- confirm this is intended.
        feature_seq = feature_vec.unsqueeze(1)
        attended, _ = self.attention(cls_vec.unsqueeze(1), feature_seq, feature_seq)

        fused = torch.cat([cls_vec, attended.squeeze(1)], dim=1)
        logits = self.classifier(self.dropout(fused))

        if labels is None:
            return {"logits": logits}
        return {"logits": logits, "loss": nn.CrossEntropyLoss()(logits, labels)}
        
# Compute semantic-similarity scores with FAISS
def _encode_unit_norm(texts):
    """Embed ``texts`` with the module-level ``sentence_model`` and L2-normalise each row."""
    vectors = np.array(sentence_model.encode(texts))
    return vectors / np.linalg.norm(vectors, axis=1, keepdims=True)


def compute_semantic_features_with_faiss(df):
    """Add ``similarity_a`` / ``similarity_b`` columns to ``df`` and return it.

    Prompts and both responses are embedded and unit-normalised, so the inner
    product used by the index equals cosine similarity. All prompt embeddings
    are added to one flat inner-product index and each response is searched
    against it with ``k=1``.

    NOTE(review): the search returns each response's best score against *any*
    prompt in ``df``, not necessarily the prompt on the same row. This is kept
    unchanged because the checkpoint was presumably trained on features built
    the same way -- confirm before switching to a row-wise dot product.

    The frame is modified in place (as before) and also returned.
    """
    # Same encoding order as before: prompts, then response A, then response B.
    prompt_embeddings = _encode_unit_norm(df['prompt'].tolist())
    response_a_embeddings = _encode_unit_norm(df['response_a'].tolist())
    response_b_embeddings = _encode_unit_norm(df['response_b'].tolist())

    # One index serves both searches; the earlier reset-and-re-add of the same
    # prompt vectors rebuilt an identical index.
    index_flat = faiss.IndexFlatIP(prompt_embeddings.shape[1])
    index_flat.add(prompt_embeddings)

    scores_a, _ = index_flat.search(response_a_embeddings, k=1)
    scores_b, _ = index_flat.search(response_b_embeddings, k=1)

    # Column 0 of the (n, 1) score matrix; unlike squeeze() this stays 1-D for a single row.
    df['similarity_a'] = scores_a[:, 0]
    df['similarity_b'] = scores_b[:, 0]

    return df
    
test_data = compute_semantic_features_with_faiss(test_data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The checkpoint is a complete pickled module, so CustomDebertaModel must be
# defined above and the file must come from a trusted source: unpickling can
# execute arbitrary code.
# - map_location lets a checkpoint saved on GPU load on a CPU-only machine.
# - weights_only=False is stated explicitly because newer PyTorch releases
#   default to weights-only loading, which rejects full-module pickles.
model = torch.load(
    "/kaggle/input/akemiiiiii/custom_model_dir/custom_model_complete.pth",
    map_location=device,
    weights_only=False,
)
model.to(device)
print("Custom model loaded successfully!")


tokenizer_path = "/kaggle/input/akemiiiiii/custom_model_dir"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

def preprocess_test_data(row):
    """Tokenise one test row and attach its two similarity scores.

    ``row`` must provide ``prompt``, ``response_a``, ``response_b``,
    ``similarity_a`` and ``similarity_b``. The text is tokenised with the
    module-level ``tokenizer`` (truncated / padded to 512 tokens, PyTorch
    tensors), and each similarity score is stored on the returned mapping as a
    one-element float32 tensor.
    """
    input_text = f"Prompt: {row['prompt']} Response A: {row['response_a']} Response B: {row['response_b']}"
    encoded = tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    for column in ("similarity_a", "similarity_b"):
        encoded[column] = torch.tensor([row[column]], dtype=torch.float32)
    return encoded

# Tokenise every test row up front; each element is the mapping returned by preprocess_test_data.
processed_test_data = [preprocess_test_data(row) for _, row in test_data.iterrows()]
# Custom test dataset
class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves already-built items from an in-memory sequence."""

    def __init__(self, data):
        # `data` is stored as-is; it must support len() and integer indexing.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Initialise the dataset
test_dataset = TestDataset(processed_test_data)

# Custom collate_fn for the test DataLoader
def collate_fn_test(batch):
    """Merge per-row items into one batch dictionary.

    ``input_ids`` and ``attention_mask`` tensors are concatenated along dim 0.
    For each item, ``similarity_a`` and ``similarity_b`` are joined into one
    vector, and those vectors are stacked into ``similarity_features`` with
    one row per item.
    """
    def _concat_field(key):
        return torch.cat([item[key] for item in batch])

    feature_rows = [
        torch.cat([item["similarity_a"], item["similarity_b"]], dim=0)
        for item in batch
    ]
    return {
        "input_ids": _concat_field("input_ids"),
        "attention_mask": _concat_field("attention_mask"),
        "similarity_features": torch.stack(feature_rows),
    }

# Build the DataLoader (order is preserved: shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn_test, shuffle=False)

# Model inference
model.eval()
predictions = []
input_keys = ("input_ids", "attention_mask", "similarity_features")

with torch.no_grad():
    for batch in test_dataloader:
        # Move the three model inputs of this batch to the inference device.
        model_inputs = {key: batch[key].to(device) for key in input_keys}

        # Forward pass, then class probabilities over the last dimension.
        logits = model(**model_inputs)["logits"]
        batch_probs = torch.softmax(logits, dim=-1)
        predictions.append(batch_probs.cpu().numpy())

# Concatenate the per-batch results into one array
predictions = np.concatenate(predictions, axis=0)

# Inspect the predictions
print(predictions)

np.save('prob_faiss.npy', predictions)

In [None]:
# Run the script written by the previous cell; it writes prob_faiss.npy.
!python predict_fias.py

# Make submission

In [None]:
'''
%%writefile make_submission.py
import numpy as np
import pandas as pd

df = pd.read_parquet("test.parquet")
preds = np.average(
    [
        np.load("prob_m0.npy"),
        np.load("prob_m3.npy")[:, [1, 0, 2]],
    ],
    axis=0,
    weights=[2, 1],
)
sub = pd.DataFrame({
    "id": df["id"],
    "winner_model_a": preds[:, 0],
    "winner_model_b": preds[:, 1],
    "winner_tie": preds[:, 2],
})
sub.to_csv("submission.csv", index=False)
print(sub.head())
'''

In [None]:
## Model ensembling

import numpy as np
import pandas as pd
# Not used in this cell; kept in case weight optimisation elsewhere relies on it.
from scipy.optimize import minimize


df = pd.read_parquet("test.parquet")

prob_m0 = np.load("prob_m0.npy")  # Gemma2
# Llama3 scored the response-swapped test set, so swap the A/B columns back.
prob_m3 = np.load("prob_m3.npy")[:, [1, 0, 2]]
prob_faiss = np.load("prob_faiss.npy")  # faiss

# Fail early with a readable message if any model's output is misaligned with
# the test rows or contains NaN/inf, instead of writing a broken submission.
for model_name, model_prob in {
    "prob_m0": prob_m0,
    "prob_m3": prob_m3,
    "prob_faiss": prob_faiss,
}.items():
    assert model_prob.shape == (len(df), 3), (
        f"{model_name} has shape {model_prob.shape}, expected {(len(df), 3)}"
    )
    assert np.isfinite(model_prob).all(), f"{model_name} contains non-finite values"

# Combine predictions with weights
# Adjust weights as needed for optimal performance
preds = np.average(
    [
        prob_m0,       # Gemma2 results
        prob_m3,       # Llama3 results
        prob_faiss     # faiss results
    ],
    axis=0,
    weights=[0.5, 0.3, 0.2]  # Weights for each model
)

# Create submission DataFrame
sub = pd.DataFrame({
    "id": df["id"],
    "winner_model_a": preds[:, 0],
    "winner_model_b": preds[:, 1],
    "winner_tie": preds[:, 2],
})

# Save to CSV
sub.to_csv("submission.csv", index=False)
sub.head()
