In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install /kaggle/input/some-pack/faiss_cpu_downloads/faiss_cpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
pip install --no-index --find-links=/kaggle/input/some-pack/sentence_transformers_packages sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Local directory that holds the sentence-transformer weights
model_load_path = '/kaggle/input/some-pack/sentence-transformer-model'
sentence_model = SentenceTransformer(model_load_path)
print("Model loaded successfully!")

# Smoke test: encode one sentence and show the embedding followed by its shape
sentences = ["This is a test sentence."]
embeddings = sentence_model.encode(sentences)
print(embeddings, embeddings.shape, sep="\n")

In [None]:
# Data of the trained model: directory under the attached Kaggle input dataset.
# NOTE(review): the checkpoint path used further down is spelled out in full
# rather than built from this variable.
merged_model_dir = "/kaggle/input/pppppppp/merged_model_dir"

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel
import numpy as np

class CustomDebertaModel(nn.Module):
    """Classifier that joins a pretrained encoder with extra numeric features.

    The hidden state at sequence position 0 is concatenated with a linear
    projection of the extra features, and one linear layer maps the
    concatenation to class logits.
    """

    def __init__(self, model_name, num_labels, feature_dim=2):
        """Create the encoder, the feature projection and the classifier head.

        Args:
            model_name: Name or path passed to ``AutoModel.from_pretrained``.
            num_labels: Number of output classes.
            feature_dim: Width of the extra feature vector given to ``forward``.
        """
        super().__init__()
        # Pretrained backbone
        self.base_model = AutoModel.from_pretrained(model_name)
        hidden_size = self.base_model.config.hidden_size
        # Project the extra features to the encoder's hidden size
        self.feature_fc = nn.Linear(feature_dim, hidden_size)
        # The head sees text embedding + projected features, hence twice the width
        self.classifier = nn.Linear(hidden_size * 2, num_labels)

    def forward(self, input_ids, attention_mask, similarity_features, labels=None):
        """Run the model and return a dict with ``logits`` (and ``loss`` if labelled).

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            similarity_features: Extra features fed to the projection layer.
            labels: Optional class targets; when given, the cross-entropy loss
                is added to the result under ``"loss"``.

        Returns:
            ``{"logits": ...}`` or ``{"logits": ..., "loss": ...}``.
        """
        encoder_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Embedding of the first token of every sequence
        first_token = encoder_out.last_hidden_state[:, 0, :]
        projected = self.feature_fc(similarity_features)
        logits = self.classifier(torch.cat((first_token, projected), dim=1))
        if labels is None:
            return {"logits": logits}
        return {"logits": logits, "loss": nn.CrossEntropyLoss()(logits, labels)}

### Extracting similarity features with FAISS ###

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the competition test set. Later cells read its id, prompt, response_a
# and response_b columns.
test_data = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

In [None]:
import faiss

# Compute semantic similarity scores with FAISS
def compute_semantic_features_with_faiss(df):
    """Attach prompt/response similarity scores to ``df`` using a FAISS index.

    Prompts and both responses are embedded with the module-level
    ``sentence_model`` and L2-normalised, so the inner product computed by
    ``faiss.IndexFlatIP`` equals cosine similarity. The prompt vectors are
    indexed once and each response embedding is searched with ``k=1``.

    NOTE(review): the index holds every prompt in ``df``, so each score is the
    similarity to the best-matching prompt in the frame, which is not
    guaranteed to be the prompt on the same row. That behaviour is kept as-is
    so the features stay what the original code produced; confirm it matches
    how the features were built for training.

    Args:
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b``
            columns.

    Returns:
        The same ``df`` object (modified in place) with ``similarity_a`` and
        ``similarity_b`` columns added.
    """
    def _unit_embeddings(texts):
        # Embed the texts and scale every row to unit length.
        # FAISS operates on float32, C-contiguous arrays.
        vectors = np.ascontiguousarray(sentence_model.encode(texts), dtype=np.float32)
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Clamp the norm so an all-zero embedding yields zeros instead of NaN
        return vectors / np.maximum(norms, np.finfo(np.float32).tiny)

    if len(df) == 0:
        # Nothing to embed or search; still create the output columns
        df['similarity_a'] = np.empty(0, dtype=np.float32)
        df['similarity_b'] = np.empty(0, dtype=np.float32)
        return df

    prompt_embeddings = _unit_embeddings(df['prompt'].tolist())
    response_a_embeddings = _unit_embeddings(df['response_a'].tolist())
    response_b_embeddings = _unit_embeddings(df['response_b'].tolist())

    # Inner-product index over the prompt vectors. Searching does not modify
    # the index, so it is built once and reused for both response sets.
    index_flat = faiss.IndexFlatIP(prompt_embeddings.shape[1])
    index_flat.add(prompt_embeddings)

    # search() returns (distances, ids), each shaped (n, k). Taking column 0
    # keeps a 1-D result even when there is a single row.
    df['similarity_a'] = index_flat.search(response_a_embeddings, k=1)[0][:, 0]
    df['similarity_b'] = index_flat.search(response_b_embeddings, k=1)[0][:, 0]

    return df

In [None]:
# Add the similarity_a / similarity_b feature columns to the test frame
test_data = compute_semantic_features_with_faiss(test_data)

In [None]:
# Local directory of the pretrained backbone
model_name = "/kaggle/input/amodellll/deberta-v3-small-local"
# Three output classes; they are written out as winner_model_a / winner_model_b /
# winner_tie when the submission is built
num_labels = 3
# Two extra input features: similarity_a and similarity_b
feature_dim = 2

# Choose the device first so the checkpoint can be mapped straight onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

custom_model = CustomDebertaModel(model_name, num_labels, feature_dim)

state_dict_path = "/kaggle/input/pppppppp/merged_model_dir/custom_model.pth"
# map_location lets a checkpoint saved from a GPU load on a CPU-only session;
# without it torch.load tries to restore tensors onto their original device.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints
# (newer PyTorch versions offer weights_only=True for this).
custom_model.load_state_dict(torch.load(state_dict_path, map_location=device))
custom_model.to(device)

print("Custom model loaded successfully!")

In [None]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import DataLoader

# Tokenizer loaded from the same local directory as the backbone (model_name)
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/amodellll/deberta-v3-small-local")

def preprocess_test_data(row):
    """Tokenize one test row and attach its two similarity scores.

    Args:
        row: Mapping-like record providing ``prompt``, ``response_a``,
            ``response_b``, ``similarity_a`` and ``similarity_b``.

    Returns:
        The output of the module-level ``tokenizer`` for the combined text
        (truncated/padded to 512 tokens, PyTorch tensors), with
        ``similarity_a`` and ``similarity_b`` added as one-element float32
        tensors.
    """
    # The prompt and both responses are packed into one sequence
    text = f"Prompt: {row['prompt']} Response A: {row['response_a']} Response B: {row['response_b']}"
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
        "return_tensors": "pt",
    }
    encoded = tokenizer(text, **tokenizer_options)
    for feature_name in ("similarity_a", "similarity_b"):
        encoded[feature_name] = torch.tensor([row[feature_name]], dtype=torch.float32)
    return encoded

# Tokenize every test row, keeping the frame's row order
processed_test_data = list(
    map(preprocess_test_data, (record for _index, record in test_data.iterrows()))
)

In [None]:
# Custom dataset for the test data
class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves items from an indexable collection."""

    def __init__(self, data):
        """Keep a reference to ``data``; items are returned as stored."""
        self.data = data

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        return self.data[idx]

    def __len__(self):
        """Return the number of stored items."""
        return len(self.data)

# Initialise the dataset from the preprocessed test rows
test_dataset = TestDataset(processed_test_data)

# Custom collate_fn for the DataLoader
def collate_fn_test(batch):
    """Merge per-sample tensors into one batch dict.

    Args:
        batch: Sequence of samples, each providing ``input_ids``,
            ``attention_mask``, ``similarity_a`` and ``similarity_b`` tensors.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` concatenated along the
        first dimension, and ``similarity_features`` holding one
        ``[similarity_a, similarity_b]`` row per sample.
    """
    ids_parts, mask_parts, feature_rows = [], [], []
    for sample in batch:
        ids_parts.append(sample["input_ids"])
        mask_parts.append(sample["attention_mask"])
        # Join the two scores of this sample into a single feature row
        feature_rows.append(torch.cat((sample["similarity_a"], sample["similarity_b"]), dim=0))
    return {
        "input_ids": torch.cat(ids_parts),
        "attention_mask": torch.cat(mask_parts),
        "similarity_features": torch.stack(feature_rows),
    }

# Create the DataLoader; shuffle=False keeps the batches in dataset order
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn_test, shuffle=False)

In [None]:
# Model inference
custom_model.eval()
predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        # Move the tensors the model consumes onto the selected device
        model_inputs = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "similarity_features")
        }
        logits = custom_model(**model_inputs)["logits"]
        # Turn logits into per-class probabilities and collect them on the CPU
        predictions.append(torch.softmax(logits, dim=-1).cpu().numpy())

# Merge the per-batch results into a single array
predictions = np.concatenate(predictions, axis=0)

# Inspect the predictions
print(predictions)

In [None]:
# One probability column per class, in the order of the model's output columns
class_columns = ("winner_model_a", "winner_model_b", "winner_tie")
submission = pd.DataFrame({
    "id": test_data["id"],
    **{column: predictions[:, position] for position, column in enumerate(class_columns)},
})
submission.to_csv("submission.csv", index=False)

In [None]:
# Show the submission frame as the cell output
submission