# Student Information
* Họ và tên: Nguyễn Văn Lê Bá Thành
* MSSV: 22127390
* Lớp: HP2-K34

# Installing Packages

In [None]:
%pip install /kaggle/input/lmsys-packages/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
%pip install /kaggle/input/lmsys-packages/xformers-0.0.24042abc8.d20240802-cp310-cp310-linux_x86_64.whl

In [None]:
%pip install /kaggle/input/some-pack/faiss_cpu_downloads/faiss_cpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
%pip install --no-index --find-links=/kaggle/input/some-pack/sentence_transformers_packages sentence-transformers

In [None]:
!cp -r /kaggle/input/lmsys-modules-0805 human_pref

## Check input files

In [None]:
import os

# Recursively list every file mounted under /kaggle/input so the dataset
# paths used later in the notebook can be checked by eye.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Prepare test file

In [None]:
%%writefile test.py
import pandas as pd

# Read the competition test set from the Kaggle input directory
df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")

# Append placeholder label columns (model A always marked as the winner).
# NOTE(review): presumably needed only because the downstream dataset code
# expects label columns to exist; the values carry no information.
df = df.assign(winner_model_a=1, winner_model_b=0, winner_tie=0)

# First output: the test set in its original A/B order
df.to_parquet("test.parquet", index=False)

# Second output: the same rows with response_a and response_b exchanged.
# Both right-hand sides are read from `df` before `assign` builds the new
# frame, so this is a true swap and the column order is unchanged.
swapped = df.assign(response_a=df["response_b"], response_b=df["response_a"])
swapped.to_parquet("test_swap.parquet", index=False)

In [None]:
!python test.py

# Inference: gemma2-9b

In [None]:
%%writefile gemma.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from human_pref.models.modeling_gemma2 import Gemma2ForSequenceClassification
from human_pref.data.processors import ProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device

# --- Configuration ---
model_name_or_path = "/kaggle/input/lmsys-checkpoints-0-0805"  # Gemma2 checkpoint directory
csv_path = "test.parquet"  # Test set written by test.py (original A/B order)

# --- Tokenizer, processor and dataset ---
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Converts each prompt / response pair into model inputs (max_length=4096)
processor = ProcessorPAB(tokenizer=tokenizer, max_length=4096, support_system_role=False)

dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)

# Each DataLoader item is a list of micro-batches: the sharded collator
# regroups the 80 samples it receives under a max_tokens=8192 budget.
token_budget_collator = ShardedMaxTokensCollator(
    max_tokens=8192,
    base_collator=VarlenCollator(),
)
dataloader = DataLoader(
    dataset,
    batch_size=80,
    num_workers=4,
    collate_fn=token_budget_collator,
)

# --- Pipeline parallelism across 2 GPUs ---
num_hidden_layers = 42  # Total number of transformer layers in the model
first_layer_on_gpu1 = num_hidden_layers // 2

# Embeddings live on cuda:0; the final norm and the score head on cuda:1
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
# Lower half of the layer stack on cuda:0, upper half on cuda:1
device_map.update(
    {
        f"model.layers.{layer}": "cuda:0" if layer < first_layer_on_gpu1 else "cuda:1"
        for layer in range(num_hidden_layers)
    }
)

# Load the float16 weights directly onto their assigned devices
model = Gemma2ForSequenceClassification.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# --- Rotary embedding frequencies ---
config = model.config
dim = config.head_dim  # dimension of attention heads
rope_exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
inv_freq = 1.0 / (config.rope_theta ** rope_exponents)
# One copy per GPU so each half of the model can read a local tensor
inv_freq0 = inv_freq.to("cuda:0")
inv_freq1 = inv_freq.to("cuda:1")

# --- Inference loop using pipelined execution ---
# The model is split in two halves (see device_map): forward_part1 is fed
# inputs that live on cuda:0 and forward_part2 is fed inputs moved to cuda:1.
# Each iteration finishes the PREVIOUS micro-batch with part 2 and starts the
# CURRENT one with part 1, so logits are produced one step behind the inputs.
#
# FIX: forward_part1 previously received inv_freq1 (the cuda:1 copy) even
# though its other inputs are on cuda:0. It now receives inv_freq0, matching
# the device of its inputs and the equivalent loop in llama.py.
is_first = True       # True until the first micro-batch has gone through part 1
hidden_states = None
outs = []             # Per-micro-batch logits, collected on the CPU in input order

# Loop through all batches
for batch in tqdm(dataloader):
    # Each batch is a list of micro-batches
    for micro_batch in batch:
        # Move input tokens to GPU 0
        input_ids = to_device(micro_batch["input_ids"], "cuda:0")

        # Sequence metadata for the packed (variable-length) micro-batch
        seq_info = dict(
            cu_seqlens=micro_batch["cu_seqlens"],
            position_ids=micro_batch["position_ids"],
            max_seq_len=micro_batch["max_seq_len"],
            attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
        )
        seq_info = to_device(seq_info, "cuda:0")

        # First micro-batch: there is no previous state yet, so only run part 1
        if is_first:
            with torch.no_grad(), torch.cuda.amp.autocast():
                prev_hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)
            is_first = False
            # Hand the intermediate state over to GPU 1 for the next iteration
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, prev_hidden_states], "cuda:1"
            )
            continue

        # Part 2 for the previous micro-batch, part 1 for the current one
        with torch.no_grad(), torch.cuda.amp.autocast():
            # Final logits for the previous micro-batch (inputs on cuda:1)
            logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)

            # Hidden states for the current micro-batch (inputs on cuda:0)
            hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

            # The current micro-batch becomes "previous" for the next iteration
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, hidden_states], "cuda:1"
            )
            outs.append(logits.cpu())  # Store prediction logits on CPU

# --- Final prediction for the last micro-batch ---
# The loop always leaves one micro-batch that has only been through part 1.
# NOTE: this assumes the dataloader yielded at least one micro-batch.
with torch.no_grad(), torch.cuda.amp.autocast():
    logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
    outs.append(logits.cpu())

# Join the per-micro-batch logits and convert them to class probabilities
prob = torch.cat(outs, dim=0).softmax(-1)

# Print whatever LMSYSDataset.evaluate reports for these predictions
print(dataset.evaluate(prob.numpy()))

# Persist the probabilities for the ensembling step in the notebook
np.save('prob_m0.npy', prob)


In [None]:
!python gemma.py

# Inference: llama3-8b

In [None]:
%%writefile llama.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from human_pref.models.modeling_llama import LlamaForSequenceClassification
from human_pref.data.processors import ProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device

# --- Configurations ---
model_name_or_path = "/kaggle/input/lmsys-checkpoints-3-0805"  # LLaMA checkpoint directory
csv_path = "test_swap.parquet"  # Test set with response_a / response_b exchanged

# --- Tokenizer and processor ---
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Mark the over-length warning as already shown so the tokenizer stays quiet
tokenizer.deprecation_warnings[
    "sequence-length-is-longer-than-the-specified-maximum"
] = True

# Converts each prompt / response pair into model inputs (max_length=4096)
processor = ProcessorPAB(tokenizer=tokenizer, max_length=4096, support_system_role=True)

# --- Dataset ---
dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)

# --- DataLoader with token-budget batching ---
# Each DataLoader item is a list of micro-batches built by the sharded
# collator from 80 samples under a max_tokens=8192 budget.
token_budget_collator = ShardedMaxTokensCollator(
    max_tokens=8192,
    base_collator=VarlenCollator(),
)
dataloader = DataLoader(
    dataset,
    batch_size=80,
    num_workers=4,
    collate_fn=token_budget_collator,
)

# --- Device mapping for pipelined parallelism ---
num_hidden_layers = 32  # LLaMA-3 has 32 transformer layers
first_layer_on_gpu1 = num_hidden_layers // 2

# Embeddings on cuda:0; final norm and score head on cuda:1
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
# Lower half of the layer stack on cuda:0, upper half on cuda:1
device_map.update(
    {
        f"model.layers.{layer}": "cuda:0" if layer < first_layer_on_gpu1 else "cuda:1"
        for layer in range(num_hidden_layers)
    }
)

# Load the float16 weights directly onto their assigned devices
model = LlamaForSequenceClassification.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# --- Rotary position encodings ---
config = model.config
dim = config.hidden_size // config.num_attention_heads  # per-head dimension

rope_exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
inv_freq = 1.0 / (config.rope_theta ** rope_exponents)
# One copy per GPU so each half of the model can read a local tensor
inv_freq0 = inv_freq.to("cuda:0")
inv_freq1 = inv_freq.to("cuda:1")

# --- Pipelined Inference across 2 GPUs ---
# forward_part1 is fed inputs on cuda:0 and forward_part2 is fed inputs moved
# to cuda:1. Each iteration finishes the PREVIOUS micro-batch with part 2 and
# starts the CURRENT one with part 1, so logits lag the inputs by one step.
is_first = True  # True until the first micro-batch has gone through part 1
hidden_states = None
outs = []  # Per-micro-batch logits, collected on the CPU in input order

# Each DataLoader item is a list of micro-batches
for batch in tqdm(dataloader):
    for micro_batch in batch:
        # Move the token ids to GPU 0
        input_ids = to_device(micro_batch["input_ids"], "cuda:0")

        # Sequence metadata for the packed (variable-length) micro-batch
        seq_info = dict(
            cu_seqlens=micro_batch["cu_seqlens"],
            position_ids=micro_batch["position_ids"],
            max_seq_len=micro_batch["max_seq_len"],
            attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
        )
        seq_info = to_device(seq_info, "cuda:0")

        if is_first:
            # First micro-batch: no previous state exists, so only run part 1
            with torch.no_grad(), torch.cuda.amp.autocast():
                prev_hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)
            is_first = False

            # Hand the intermediate state over to GPU 1 for the next iteration
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, prev_hidden_states], "cuda:1"
            )
            continue

        # Steady state:
        # - part 2 finishes the previous micro-batch (inputs on cuda:1)
        # - part 1 starts the current micro-batch (inputs on cuda:0)
        with torch.no_grad(), torch.cuda.amp.autocast():
            logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
            hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

            # The current micro-batch becomes "previous" for the next iteration
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, hidden_states], "cuda:1"
            )
            outs.append(logits.cpu())  # Keep logits on the CPU

# --- Process final micro-batch (no part1 needed) ---
# The loop always leaves one micro-batch that has only been through part 1.
# NOTE(review): assumes the dataloader yielded at least one micro-batch;
# otherwise prev_hidden_states is undefined here.
with torch.no_grad(), torch.cuda.amp.autocast():
    logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
    outs.append(logits.cpu())

# --- Evaluate and Save ---
# Join the per-micro-batch logits and convert them to class probabilities
prob = torch.cat(outs, dim=0).softmax(-1)

# Print whatever LMSYSDataset.evaluate reports for these predictions
print(dataset.evaluate(prob.numpy()))

# Persist the probabilities; columns are still in the swapped A/B order
np.save('prob_m3.npy', prob)


In [None]:
!python llama.py

In [None]:
import numpy as np

# Sanity check: reload the probabilities that llama.py saved to disk
prob = np.load('prob_m3.npy')

# Show only the first five rows to keep the cell output short
print(prob[:5])


# Inference: Faiss

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoTokenizer
from sklearn.preprocessing import MinMaxScaler
import faiss

# Sentence-embedding model stored as a Kaggle dataset (loaded from a local path)
model_load_path = '/kaggle/input/some-pack/sentence-transformer-model' 
sentence_model = SentenceTransformer(model_load_path)

# Raw competition test set; similarity feature columns are added to it later
test_data = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

class CustomDebertaModel(nn.Module):
    """Two-tower classifier: a pretrained text encoder fused with similarity features.

    The text tower is whatever ``AutoModel.from_pretrained(model_name)`` loads
    (DeBERTa, going by the class name). The feature tower is a small MLP that
    lifts ``feature_dim`` similarity scores to the encoder's hidden size. The
    two are combined through a multi-head attention layer and classified by an
    MLP head.

    Attribute names and module creation order are part of the contract: the
    notebook restores a fully pickled instance of this class with torch.load.

    Args:
        model_name: Name or path passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes.
        feature_dim: Number of similarity features per sample (default 2).
        dropout_rate: Dropout probability used in every dropout layer.
    """

    def __init__(self, model_name, num_labels, feature_dim=2, dropout_rate=0.1):
        super().__init__()

        # Text tower
        self.base_model = AutoModel.from_pretrained(model_name)
        hidden = self.base_model.config.hidden_size

        # Feature tower: feature_dim -> 128 -> hidden
        self.feature_fc = nn.Sequential(
            nn.Linear(feature_dim, 128),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(128, hidden),
            nn.ReLU(),
        )

        # Text embedding (query) attends over the similarity embedding (key/value)
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden,
            num_heads=4,
            batch_first=True,
        )

        # Regularization applied to the fused representation
        self.dropout = nn.Dropout(p=dropout_rate)

        # Classification head over [text ; attended similarity] -> num_labels
        self.classifier = nn.Sequential(
            nn.Linear(hidden * 2, hidden),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden, num_labels),
        )

    def forward(self, input_ids, attention_mask, similarity_features, labels=None):
        """Run both towers and classify.

        Args:
            input_ids: Token ids forwarded to the base model.
            attention_mask: Attention mask forwarded to the base model.
            similarity_features: Tensor fed to the feature MLP, one row per sample.
            labels: Optional class indices; when given, a cross-entropy loss is added.

        Returns:
            dict with ``"logits"`` and, only when ``labels`` is not None, ``"loss"``.
        """
        # Text tower: embedding of the first token of each sequence
        encoder_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_vector = encoder_out.last_hidden_state[:, 0, :]

        # Feature tower, shaped as a one-token sequence for the attention layer
        sim_token = self.feature_fc(similarity_features).unsqueeze(1)

        # NOTE(review): key/value hold a single token, so the attention weights
        # are trivially 1 and the output does not vary with the query.
        attended, _ = self.attention(cls_vector.unsqueeze(1), sim_token, sim_token)

        # Fuse both views, regularize, classify
        fused = torch.cat([cls_vector, attended.squeeze(1)], dim=1)
        logits = self.classifier(self.dropout(fused))

        if labels is None:
            return {"logits": logits}

        # Training / evaluation path: also report the cross-entropy loss
        return {"logits": logits, "loss": nn.CrossEntropyLoss()(logits, labels)}

In [None]:
from sklearn.preprocessing import MinMaxScaler
import faiss

# Re-read the raw test set so the similarity columns are added to a fresh frame
test_data = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

# Compute semantic similarity scores using FAISS
def compute_semantic_features_with_faiss(df):
    """Add ``similarity_a`` / ``similarity_b`` columns to ``df`` and return it.

    Prompts and both responses are embedded with the module-level
    ``sentence_model`` and L2-normalized, so inner product equals cosine
    similarity. All prompt vectors go into one FAISS inner-product index;
    each response is then scored by its top-1 match in that index.

    NOTE(review): the search runs over ALL prompts in ``df``, so each score is
    the similarity to the nearest prompt in the frame, not necessarily the
    response's own prompt. Kept as-is because the downstream model was
    presumably trained on features built the same way; confirm before changing.

    Args:
        df: DataFrame with text columns ``prompt``, ``response_a``, ``response_b``.

    Returns:
        The same DataFrame object, mutated in place, with the two new columns.
    """

    def _unit_embeddings(texts):
        # Encode, then scale every row to unit length (float32, as FAISS expects)
        vectors = np.array(sentence_model.encode(texts))
        vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
        return np.ascontiguousarray(vectors, dtype=np.float32)

    # Build the prompt index once; it is read-only for both searches below,
    # so there is no need to reset and re-add the same vectors per response.
    prompt_embeddings = _unit_embeddings(df['prompt'].tolist())
    index_flat = faiss.IndexFlatIP(prompt_embeddings.shape[1])
    index_flat.add(prompt_embeddings)

    for response_col, score_col in (
        ('response_a', 'similarity_a'),
        ('response_b', 'similarity_b'),
    ):
        response_embeddings = _unit_embeddings(df[response_col].tolist())
        # search() returns (scores, ids), each of shape (n_rows, k). Taking
        # column 0 keeps a 1-D array even for a single-row frame, where
        # .squeeze() would collapse it to a 0-d scalar.
        scores, _ids = index_flat.search(response_embeddings, k=1)
        df[score_col] = scores[:, 0]

    return df

In [None]:
# Add the similarity_a / similarity_b columns (the function also mutates the frame in place)
test_data = compute_semantic_features_with_faiss(test_data)

In [None]:
# Choose the inference device first so the checkpoint can be mapped onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained custom PyTorch model.
# The file is used directly as a module below, so it holds a fully pickled
# model object rather than a state_dict (presumably a CustomDebertaModel,
# whose class must already be defined in this kernel).
# - map_location: without it, a checkpoint saved from a GPU cannot be loaded
#   on the CPU fallback selected above.
# - weights_only=False: stated explicitly because full-object loading needs
#   it, and newer PyTorch releases default to weights_only=True.
# SECURITY NOTE: full-object torch.load unpickles arbitrary code; only load
# checkpoints from a trusted source.
model = torch.load(
    "/kaggle/input/akemiiiiii/custom_model_dir/custom_model_complete.pth",
    map_location=device,
    weights_only=False,
)
model.to(device)
model.eval()  # Important: set to evaluation mode

print("Custom model loaded successfully!")

# Load the tokenizer stored alongside the model
tokenizer_path = "/kaggle/input/akemiiiiii/custom_model_dir"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Function to preprocess a single test sample
def preprocess_test_data(row):
    """Tokenize one test row and attach its similarity features.

    Args:
        row: Mapping-like record with keys ``prompt``, ``response_a``,
            ``response_b``, ``similarity_a`` and ``similarity_b``.

    Returns:
        The tokenizer output (padded / truncated to 512 tokens, PyTorch
        tensors) with an extra ``"similarity_features"`` entry of shape [1, 2].
    """
    prompt, answer_a, answer_b = row['prompt'], row['response_a'], row['response_b']

    # Single text that carries the prompt and both candidate responses
    text = f"Prompt: {prompt} Response A: {answer_a} Response B: {answer_b}"

    encoded = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )

    # Keep a leading batch dimension of 1 so samples can be concatenated later
    encoded["similarity_features"] = torch.tensor(
        [[row["similarity_a"], row["similarity_b"]]],
        dtype=torch.float32,
    )
    return encoded

# Tokenize every test row up front; rows are visited in frame order, so
# processed_test_data[i] corresponds to the i-th row of test_data.
processed_test_data = []
for _row_label, sample_row in test_data.iterrows():
    processed_test_data.append(preprocess_test_data(sample_row))

In [None]:
class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves already-preprocessed samples from memory."""

    def __init__(self, data):
        # data: indexable collection of preprocessed samples, stored as-is
        self.data = data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the stored sample at position ``idx`` unchanged."""
        return self.data[idx]

# Wrap the pre-tokenized samples so a DataLoader can batch them
test_dataset = TestDataset(processed_test_data)

def collate_fn_test(batch):
    """Merge per-sample tensors into one batch.

    Every sample already carries a leading dimension of 1, so each field is
    concatenated along dim 0 rather than stacked.

    Args:
        batch: List of samples, each with ``input_ids``, ``attention_mask``
            and ``similarity_features`` tensors.

    Returns:
        dict with the same three keys, each holding the concatenated tensor.
    """
    fields = ("input_ids", "attention_mask", "similarity_features")
    return {
        field: torch.cat([sample[field] for sample in batch], dim=0)
        for field in fields
    }

# DataLoader: batches of 8 samples; shuffle=False keeps the dataset order
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn_test, shuffle=False)

In [None]:
# Inference only: evaluation mode (deactivates dropout, etc.)
model.eval()

# One probability array per batch, in dataloader order
batch_probs = []

# No gradients are needed for prediction
with torch.no_grad():
    for batch in test_dataloader:
        # Move each field to the model's device and run the forward pass
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            similarity_features=batch["similarity_features"].to(device),
        )

        # Logits -> class probabilities, brought back to the CPU as NumPy
        class_probs = torch.softmax(outputs["logits"], dim=-1)
        batch_probs.append(class_probs.cpu().numpy())

# Single array covering the whole test set
predictions = np.concatenate(batch_probs, axis=0)

# Print the final predictions
print(predictions)

# Persist the probabilities for the ensembling step
np.save('prob_faiss.npy', predictions)

# Create submission file

In [None]:
# Row ids are taken from the parquet file written by test.py
df = pd.read_parquet("test.parquet")

label_columns = ["winner_model_a", "winner_model_b", "winner_tie"]

prob_m0 = np.load("prob_m0.npy")                 # Gemma2
# Llama3 scored the swapped file, so exchange its A and B columns back
prob_m3 = np.load("prob_m3.npy")[:, [1, 0, 2]]
prob_faiss = np.load("prob_faiss.npy")           # faiss

# (probabilities, ensemble weight) per model; adjust the weights as needed
ensemble = [
    (prob_m0, 0.7),
    (prob_m3, 0.2),
    (prob_faiss, 0.1),
]
preds = np.average(
    [member_probs for member_probs, _ in ensemble],
    axis=0,
    weights=[member_weight for _, member_weight in ensemble],
)

# Submission frame: id followed by one probability column per class
sub = pd.DataFrame(
    {
        "id": df["id"],
        **{column: preds[:, position] for position, column in enumerate(label_columns)},
    }
)

# Save to CSV and show the first rows
sub.to_csv("submission.csv", index=False)
print(sub.head())


# Reference
- LMSYS - Chatbot Arena Human Preference Predictions 2nd place solution - https://www.kaggle.com/competitions/lmsys-chatbot-arena/discussion/527685
- Blue - https://www.kaggle.com/code/blue0924/finetuning-test2