In [None]:
# Import essential libraries 
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import xgboost as xgb
from torch.utils.data import DataLoader

# Number of dataset items grouped into each DataLoader batch during embedding extraction
batch_size = 32


# Dataset preparation

In [None]:
# Map-style dataset over prompt / response_a / response_b rows
class LmsysDataset:
    """Index-addressable dataset that turns one row into a single model input.

    Args:
        data: pandas DataFrame with "prompt", "response_a" and "response_b"
            columns (accessed positionally through ``.iloc``).
        target: optional DataFrame with "winner_model_a", "winner_model_b" and
            "winner_tie" columns, positionally aligned with ``data``. When
            omitted, every item gets an all-zero placeholder label.
        tokenizer: optional tokenizer exposing ``encode_plus``. When omitted,
            items carry the raw combined text instead of token tensors.
    """

    def __init__(self, data, target=None, tokenizer=None):
        self.data, self.target, self.tokenizer = data, target, tokenizer

    def __len__(self):
        """Return the number of rows in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the item at position ``idx``.

        Returns:
            ``(input_ids, attention_mask, y)`` when a tokenizer is set,
            otherwise ``(text, y)``. ``y`` is a 3-element tensor.
        """
        def process_text(text):
            # Drop the surrounding brackets, split on the `","` separator and
            # strip leftover quotes, then join the pieces with single spaces.
            # NOTE(review): presumably the fields are serialized lists of
            # strings -- confirm against the CSV.
            pieces = text.strip('[]').split('","')
            return ' '.join(piece.strip('"') for piece in pieces)

        row = self.data.iloc[idx]
        prompt = row["prompt"]
        response_a = row["response_a"]
        response_b = row["response_b"]

        if self.target is None:
            # No labels available: emit a placeholder
            y = torch.tensor([0, 0, 0])
        else:
            label_row = self.target.iloc[idx]
            y = torch.tensor([label_row["winner_model_a"],
                              label_row["winner_model_b"],
                              label_row["winner_tie"]])

        # One flat string: prompt first, then both candidate responses
        segments = (
            "prompt: " + process_text(prompt),
            " model_a: " + process_text(response_a),
            " model_b: " + process_text(response_b),
        )
        text = "".join(segments)

        if self.tokenizer is None:
            return text, y

        # Fixed-length encoding: truncated / padded to 2048 tokens
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            padding='max_length',
            max_length=2048,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; remove it
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, y


# Embedding generation

In [None]:
# Read the test CSV and keep only the three text columns the dataset consumes
df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")
data = df.loc[:, ["prompt", "response_a", "response_b"]]

# Restore the tokenizer and the embedding model from local files
tokenizer = AutoTokenizer.from_pretrained(
    "/kaggle/input/lmsys-gte/gte_tokenizer",
)
model = AutoModel.from_pretrained(
    "/kaggle/input/base_custom_gte/transformers/default/1",
    trust_remote_code=True,
)

# Wrap the rows in a dataset; tokenization happens when an item is indexed
dataset = LmsysDataset(data=data, tokenizer=tokenizer)


In [None]:
# Attention-mask-weighted average of token embeddings
def mean_pooling(last_hidden_state, attention_mask):
    """Average token vectors along dim 1, weighting each token by its mask value.

    Args:
        last_hidden_state: token embeddings; dim 1 is reduced.
            Assumes shape (batch, seq_len, hidden) -- TODO confirm with caller.
        attention_mask: mask with one fewer trailing dim than
            ``last_hidden_state``; it is broadcast over the last dim.

    Returns:
        Tensor of pooled embeddings with dim 1 removed.
    """
    # Give the mask a trailing axis and stretch it to the embedding shape
    mask = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
    weighted_total = (last_hidden_state * mask).sum(dim=1)
    # Clamp the denominator so an all-zero mask does not divide by zero
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_count
    
# Run one batch through the model and pool its token embeddings
def process_batch(model, batch, device):
    """Compute pooled embeddings for a single batch.

    Args:
        model: callable taking ``(input_ids, attention_mask=...)`` and
            returning an object with a ``last_hidden_state`` attribute.
        batch: ``(input_ids, attention_mask, labels)`` triple of tensors.
        device: device the inputs are moved to before the forward pass.

    Returns:
        ``(embeddings, labels)``: the mean-pooled embeddings as a tensor and
        the labels converted to a NumPy array.
    """
    token_ids, mask, targets = batch
    token_ids = token_ids.to(device)
    mask = mask.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        model_out = model(token_ids, attention_mask=mask)

    pooled = mean_pooling(model_out.last_hidden_state, mask)
    return pooled, targets.numpy()


In [None]:
# Set device to GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# shuffle=False keeps the batches in dataset order, so the collected rows
# stay positionally aligned with the source DataFrame
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

data_list = []

# Process each batch, collect embeddings and labels
for batch in dataloader:
    embeddings, labels = process_batch(model, batch, device)
    # Copy the whole batch to host memory once; calling .tolist() on each row
    # of a device tensor would trigger a separate device-to-host copy per row
    emb_rows = embeddings.cpu().tolist()
    label_rows = labels.tolist()
    for emb, label in zip(emb_rows, label_rows):
        data_list.append({
            "emb": emb,
            "label": label
        })

# One record per input row: "emb" is a list of floats, "label" a list of ints
emb_df = pd.DataFrame(data_list, columns=["emb", "label"])


# Inference

In [None]:
# Restore the trained XGBoost classifier from its saved model file
xgb_model = xgb.XGBClassifier()
xgb_model.load_model('/kaggle/input/lmsys-xgb/xgb_model.json')

# Stack the per-row embedding lists into a 2-D feature matrix, then get
# class probabilities for every row
X = np.vstack(emb_df['emb'].tolist())
y_pred = xgb_model.predict_proba(X)


In [None]:
# Start from the id column and attach one probability column per outcome
submission = df[['id']].assign(
    winner_model_a=y_pred[:, 0],
    winner_model_b=y_pred[:, 1],
    winner_tie=y_pred[:, 2],
)

# Write the submission file without the DataFrame index
submission.to_csv("submission.csv", index=False)


In [None]:
# Display the submission table as a final visual check
submission

# Experimentation and Optimization

Feel free to experiment with the following ideas to enhance your model's performance:

- **Different Pretrained Models**: Try using various pretrained models from the Hugging Face model hub to see which one works best for your dataset.

- **Grid Search for XGBoost**: Implement a grid search to optimize the hyperparameters of the XGBoost classifier. Consider parameters like `max_depth`, `learning_rate`, `n_estimators`, and `subsample`.

- **Feature Engineering**: Explore different ways to preprocess and engineer features from the text data. Experiment with techniques like adding more context, using different tokenization strategies, or using the embeddings differently (for example, embedding the prompt and each response separately, or trying a pooling strategy other than mean pooling).

- **Model Ensembling**: Combine predictions from multiple models (e.g., stacking or voting) to potentially improve accuracy and robustness.

- **Cross-Validation**: Use cross-validation to get a more reliable estimate of model performance and to detect overfitting.

- **Hyperparameter Tuning for Other Models**: Besides XGBoost, consider tuning hyperparameters for any other models you incorporate.

- **Error Analysis**: Perform an analysis of the predictions to identify common misclassifications and refine your approach accordingly.

Explore these ideas to see how they impact your results!
