In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import numpy as np
from datasets import load_dataset

def prepare_datasets():
    """
    Load the MSLR-WEB10K dataset using Hugging Face's `datasets` library.

    Fetches the ``philipphager/MSLR-WEB10k`` dataset via ``load_dataset`` and
    converts its ``train``, ``validation`` and ``test`` splits to NumPy arrays.

    Returns:
        X_train, y_train, X_valid, y_valid, X_test, y_test
    """
    ds = load_dataset("philipphager/MSLR-WEB10k")

    def process_split(split):
        """
        Extract features and labels from one dataset split.

        Args:
            split (Dataset): A split of the dataset (train, validation, test)
                exposing ``"features"`` and ``"relevance_label"`` columns.
        Returns:
            X (ndarray): Feature array built from the ``"features"`` column.
            y (ndarray): Relevance labels from the ``"relevance_label"`` column.
        """
        relevance = np.array(split["relevance_label"])

        # Convert every row exactly once; the resulting arrays are reused for
        # both the shape summary and the final stacked array.
        rows = [np.array(f) for f in split["features"]]

        # Report only the distinct row shapes. Printing one shape per row
        # floods the notebook output on a dataset of this size.
        distinct_shapes = sorted({row.shape for row in rows})
        print("Distinct feature shapes:", distinct_shapes)

        # NOTE(review): if more than one shape is reported above, the rows are
        # ragged and this stacking step cannot produce a regular numeric array
        # -- confirm the dataset layout before training on the result.
        features = np.array(rows)

        return features, relevance

    X_train, y_train = process_split(ds["train"])
    X_valid, y_valid = process_split(ds["validation"])
    X_test, y_test = process_split(ds["test"])
    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [8]:

# Define a simple neural network model for ranking
class RankNet(nn.Module):
    """
    Small feed-forward scorer: Linear(input_dim, 64) -> ReLU -> Linear(64, 1).

    Args:
        input_dim (int): Number of input features per item.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """
        Score each item in ``x``.

        Args:
            x (Tensor): Input whose last dimension is ``input_dim``.
        Returns:
            Tensor: Scores with the trailing size-1 dimension removed, so a
            ``(batch, input_dim)`` input yields a ``(batch,)`` output.
        """
        # Squeeze only the last dimension. A bare ``squeeze()`` would also
        # drop the batch dimension when the batch holds a single item, turning
        # the output into a 0-d scalar that cannot be indexed by the losses.
        return self.model(x).squeeze(-1)

# ListNet loss function
def listnet_loss(y_pred, y_true):
    """
    ListNet (top-one) loss: cross-entropy between two softmax distributions.

    Both inputs are normalised with a softmax over dimension 0, and the
    cross-entropy of the predicted distribution against the target
    distribution is returned.

    Args:
        y_pred (Tensor): Predicted scores.
        y_true (Tensor): Ground-truth relevance values, same shape as ``y_pred``.
    Returns:
        Tensor: Scalar loss.
    """
    target_dist = torch.softmax(y_true, dim=0)
    # log_softmax is computed in a numerically stable way. Taking
    # log(softmax(x)) underflows to log(0) = -inf when scores are far apart,
    # which makes the whole loss infinite.
    pred_log_dist = torch.log_softmax(y_pred, dim=0)
    return -torch.sum(target_dist * pred_log_dist)

# ListMLE loss function
def listmle_loss(y_pred, y_true):
    """
    ListMLE loss: negative Plackett-Luce log-likelihood of the target ordering.

    Items are ordered by descending ``y_true``. With the predicted scores in
    that order written as ``s_1, ..., s_n`` the loss is

        sum_i [ logsumexp(s_i, ..., s_n) - s_i ]

    i.e. at every position the item that should come next competes only
    against the items that have not been placed yet.

    Args:
        y_pred (Tensor): 1-D predicted scores.
        y_true (Tensor): 1-D ground-truth relevance values, same length.
    Returns:
        Tensor: Scalar loss.
    """
    _, target_order = torch.sort(y_true, descending=True)
    ordered_scores = y_pred[target_order]
    # logsumexp over each suffix: flip, take the running logcumsumexp, flip
    # back. Normalising every position over the *full* list instead would make
    # the sum independent of the ordering, so the labels would have no effect.
    suffix_logsumexp = torch.logcumsumexp(ordered_scores.flip(0), dim=0).flip(0)
    return torch.sum(suffix_logsumexp - ordered_scores)

# Train the model
def train_model(model, loss_fn, X_train, y_train, X_valid, y_valid, lr=0.001, epochs=10, batch_size=128):
    """
    Train ``model`` with Adam on shuffled mini-batches, then report validation NDCG.

    Args:
        model (nn.Module): Scoring model; called on a float32 feature batch.
        loss_fn (callable): ``loss_fn(y_pred, y_true) -> scalar Tensor``.
        X_train, y_train: Training features and labels (array-like).
        X_valid, y_valid: Validation features and labels (array-like).
        lr (float): Adam learning rate.
        epochs (int): Number of passes over the training data.
        batch_size (int): Mini-batch size for the training loader.

    Prints the mean batch loss after every epoch and the validation NDCG at
    the end. Returns nothing.

    NOTE(review): batches are drawn by a shuffling DataLoader and the final
    NDCG treats the whole validation set as a single ranked list. Per-query
    training and evaluation would need query identifiers, which this function
    does not receive -- confirm this is the intended protocol.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            # Force a 1-D score vector: a model that squeezes every size-1
            # dimension returns a 0-d scalar for a final batch of one item,
            # which the listwise losses cannot index.
            y_pred = model(X_batch).reshape(-1)
            loss = loss_fn(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

    # Evaluate on validation set
    model.eval()
    # no_grad avoids recording an autograd graph for the whole validation set,
    # which is pure memory overhead during inference.
    with torch.no_grad():
        y_valid_pred = model(torch.tensor(X_valid, dtype=torch.float32)).reshape(-1).numpy()
    ndcg = ndcg_score([y_valid], [y_valid_pred])
    print(f"Validation NDCG: {ndcg:.4f}")


In [None]:

# Main function
if __name__ == "__main__":
    # Seed torch so weight initialisation and DataLoader shuffling are
    # repeatable across Restart & Run All.
    torch.manual_seed(42)

    # Prepare datasets
    X_train, y_train, X_valid, y_valid, X_test, y_test = prepare_datasets()
    # Read the input width from the data instead of hard-coding it
    # (the original hard-coded 136 for MSLR-WEB10K).
    feature_count = X_train.shape[-1]

    # Train ListNet
    print("Training ListNet...")
    listnet_model = RankNet(input_dim=feature_count)
    train_model(listnet_model, listnet_loss, X_train, y_train, X_valid, y_valid)

    # Train ListMLE
    print("Training ListMLE...")
    listmle_model = RankNet(input_dim=feature_count)
    train_model(listmle_model, listmle_loss, X_train, y_train, X_valid, y_valid)

    # Evaluate on test set
    listnet_model.eval()
    listmle_model.eval()
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    # Old (misleading) name for the test *features*; kept in case later cells use it.
    y_test_tensor = X_test_tensor
    # Inference only: skip autograd bookkeeping and keep the scores 1-D.
    with torch.no_grad():
        y_pred_listnet = listnet_model(X_test_tensor).reshape(-1).numpy()
        y_pred_listmle = listmle_model(X_test_tensor).reshape(-1).numpy()

    # NOTE(review): as in validation, the whole test set is scored as one
    # ranked list rather than per query -- confirm this is intended.
    ndcg_listnet = ndcg_score([y_test], [y_pred_listnet])
    ndcg_listmle = ndcg_score([y_test], [y_pred_listmle])

    print(f"Test NDCG (ListNet): {ndcg_listnet:.4f}")
    print(f"Test NDCG (ListMLE): {ndcg_listmle:.4f}")
