In [None]:
!pip install pytorch-tabnet2

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import torch
from pytorch_tabnet import TabNetRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn

In [None]:
# Load data
# All three competition CSVs are read from the same Kaggle input directory,
# so the directory is named once instead of being repeated in every path.
DATA_DIR = "/kaggle/input/playground-series-s5e5"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# Preview the first rows of the training frame.
train.head()

In [None]:
# Label-encode the categorical column 'Sex'. The encoder is fitted on the
# training frame and reused for the test frame so both share one mapping.
# Note: this overwrites the column in `train` and `test` in place.
cat_features = ['Sex']
for col in cat_features:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])
    test[col] = encoder.transform(test[col])

# Features are every column except the row identifier and the target;
# the target is 'Calories'.
features = train.drop(columns=['id', 'Calories'])
target = train['Calories']

# Hold out 10% of the rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

X_train.head()

In [None]:
class MSLELoss(nn.Module):
    """
    Mean Squared Logarithmic Error (MSLE) loss.

    Predictions and targets go through the same transform: values are
    floored at 0, shifted by 1, floored at ``epsilon`` and passed through
    the natural logarithm. The loss is the mean squared difference of the
    two transformed tensors:

        MSLE = mean( (log(prediction + 1) - log(target + 1))^2 )

    Because values are floored at 0 first, the shifted value is already
    >= 1, so the ``epsilon`` floor only changes anything when
    ``epsilon`` is greater than 1.

    Args:
        epsilon (float): Strictly positive lower bound applied to
                         ``value + 1`` immediately before the logarithm.
                         Default: 1e-8
    """
    def __init__(self, epsilon: float = 1e-8):
        super().__init__()
        # A non-positive floor would not protect the logarithm.
        assert epsilon > 0, "epsilon must be positive"
        self.epsilon = epsilon
        # Mean squared error applied to the log-transformed tensors.
        self.mse = nn.MSELoss()

    def _log_shifted(self, values: torch.Tensor) -> torch.Tensor:
        """Return log(max(max(values, 0) + 1, epsilon)) element-wise."""
        non_negative = values.clamp(min=0.)
        shifted = non_negative + 1
        return shifted.clamp(min=self.epsilon).log()

    def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
        """
        Compute the MSLE between predictions and targets.

        Args:
            y_pred (torch.Tensor): Model outputs; negative entries are
                                   treated as 0.
            y_true (torch.Tensor): Ground-truth values; negative entries are
                                   treated as 0.

        Returns:
            torch.Tensor: Scalar MSLE loss.

        Raises:
            ValueError: If ``y_pred`` and ``y_true`` differ in shape.
        """
        shapes_agree = y_pred.shape == y_true.shape
        if not shapes_agree:
            raise ValueError(
                f"Input shapes must match. Got pred: {y_pred.shape}, true: {y_true.shape}"
            )

        # Same transform on both sides, then plain MSE in log space.
        return self.mse(self._log_shifted(y_pred), self._log_shifted(y_true))

In [None]:
# Describe the categorical features to TabNet. The positions and
# cardinalities are derived from the data instead of being hard-coded, so
# they stay correct if the column order or the list of categorical
# features changes.
cat_idxs = [X_train.columns.get_loc(name) for name in cat_features]
# Columns were label-encoded above, so the number of distinct codes in the
# training frame is the embedding vocabulary size.
cat_dims = [int(train[name].nunique()) for name in cat_features]

model = TabNetRegressor(cat_idxs=cat_idxs, cat_dims=cat_dims)

# Targets are reshaped to a single column (n_samples, 1) for the regressor.
# The model is optimised with the custom MSLE loss and monitored with RMSLE
# on the held-out validation split.
model.fit(
    X_train.values,
    y_train.values.reshape(-1, 1),
    eval_set=[(X_val.values, y_val.values.reshape(-1, 1))],
    eval_metric=['rmsle'],
    max_epochs=500,  # previously 300
    patience=100,  # previously 50
    batch_size=1024 * 64,
    loss_fn=MSLELoss(),
)

In [None]:
# Make predictions
# The test features are every column except the row identifier. The
# prediction array is indexed as 2-D; its first column is kept as the
# predicted target.
y_test = model.predict(test.drop(columns=['id']).values)[:, 0]
# Floor predictions at 0 (and cap them at a very loose upper bound) so no
# negative value reaches the submission.
y_test = np.clip(y_test, 0, 999999)

In [None]:
# Create submission file
# Pair every test id with its predicted 'Calories' value.
submission_columns = {"id": test["id"], "Calories": y_test}
submission = pd.DataFrame(submission_columns)

# Write without the index so the CSV holds only the two columns above.
submission.to_csv("submission.csv", index=False)

# Show the resulting frame as the cell output.
submission