In [None]:
pip install pytorch-tabnet category_encoders scikit-learn pandas

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cublas_cu1

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import lightgbm as lgb

In [None]:
# Fix the global NumPy and PyTorch RNG seeds so repeated runs are comparable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7914325fc570>

In [None]:
# 📥 Load Data
# Read both CSV files from the working directory in one pass.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:

# 🔢 Column Definitions
# Raw numeric measurement columns (coerced to numbers and scaled later on).
num_cols = [
    'temperature',
    'irradiance',
    'humidity',
    'panel_age',
    'maintenance_count',
    'soiling_ratio',
    'voltage',
    'current',
    'module_temperature',
    'cloud_coverage',
    'wind_speed',
    'pressure',
]

# Raw categorical columns (handled as strings and encoded later on).
cat_cols = [
    'string_id',
    'error_code',
    'installation_type',
]

In [None]:
# 🧼 Handle Missing & Type Casting
# Coerce every numeric column to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce') so they are imputed together with real gaps.
for col in num_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

# Impute BOTH frames with the training means. Using the test set's own means
# would preprocess the two frames with different statistics.
train_num_means = train_df[num_cols].mean()
train_df[num_cols] = train_df[num_cols].fillna(train_num_means)
test_df[num_cols] = test_df[num_cols].fillna(train_num_means)

# astype(str) renders missing values as the literal "nan", which is then mapped
# to an explicit "missing" category. (A genuine "nan" string in the data is
# mapped the same way.)
for col in cat_cols:
    train_df[col] = train_df[col].astype(str).replace("nan", "missing")
    test_df[col] = test_df[col].astype(str).replace("nan", "missing")

In [None]:
def create_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    The input frame is not modified. It must contain the columns ``voltage``,
    ``current``, ``panel_age``, ``soiling_ratio``, ``module_temperature``,
    ``temperature``, ``irradiance``, ``cloud_coverage``, ``wind_speed``,
    ``humidity`` and ``pressure``.

    Added columns: ``power_output``, ``degradation_rate``, ``temp_diff``,
    ``irradiance_adjusted``, ``temp_irradiance_interaction``,
    ``age_soiling_temp``, ``wind_cooling_effect``, ``humidity_pressure``,
    ``efficiency_drop_rate``, ``high_temp_flag`` and ``age_bucket``.

    ``age_bucket`` is the ordinal bin of ``panel_age`` with edges 1, 5, 10 and
    20. The outer bins are open-ended, so ages below 0 fall in bucket 0 and
    ages above 20 in bucket 4 instead of silently becoming NaN. Ages inside
    [0, 20] get the same bucket as with the closed ``[0, 1, 5, 10, 20]`` bins.
    """
    df = df.copy()

    # Shared intermediates, computed once and reused below.
    power = df['voltage'] * df['current']
    temp_delta = df['module_temperature'] - df['temperature']
    age_soiling = df['panel_age'] * df['soiling_ratio']

    df['power_output'] = power
    df['degradation_rate'] = age_soiling
    df['temp_diff'] = temp_delta
    df['irradiance_adjusted'] = df['irradiance'] * (1 - df['cloud_coverage'] / 100)
    df['temp_irradiance_interaction'] = df['module_temperature'] * df['irradiance']
    df['age_soiling_temp'] = age_soiling * df['module_temperature']
    df['wind_cooling_effect'] = df['wind_speed'] * temp_delta
    df['humidity_pressure'] = df['humidity'] * df['pressure']
    # The small epsilon keeps the ratio finite when irradiance is exactly 0.
    df['efficiency_drop_rate'] = power / (df['irradiance'] + 1e-6)
    df['high_temp_flag'] = (df['module_temperature'] > 45).astype(int)

    unbounded = float("inf")
    df['age_bucket'] = pd.cut(
        df['panel_age'],
        bins=[-unbounded, 1, 5, 10, 20, unbounded],
        labels=False,
    )
    return df

# Run the same feature engineering over both frames so their columns match.
train_df, test_df = (create_features(frame) for frame in (train_df, test_df))

In [None]:
target = 'efficiency'

# Columns added by create_features().
engineered_cols = [
    'power_output', 'degradation_rate', 'temp_diff',
    'irradiance_adjusted', 'temp_irradiance_interaction',
    'age_soiling_temp', 'wind_cooling_effect', 'humidity_pressure',
    'efficiency_drop_rate', 'high_temp_flag', 'age_bucket'
]
all_features = num_cols + engineered_cols + cat_cols

# Take explicit copies: X and X_test are assigned into by later cells, and a
# bare column selection would leave them tied to train_df / test_df
# (SettingWithCopy warnings, ambiguous write-through).
X = train_df[all_features].copy()
y = train_df[target]
X_test = test_df[all_features].copy()


In [None]:
# 🔤 Encode Categorical Features
# Work on independent copies so the assignments below never write through to
# (or warn about) the frames X / X_test were selected from.
X = X.copy()
X_test = X_test.copy()

encoder = TargetEncoder(cols=cat_cols)

# Replace the columns wholesale instead of `X.loc[:, cat_cols] = ...`.
# A `.loc` assignment writes into the existing object-dtype columns, so the
# encoded floats would stay `object` and `X.values` would later be an object
# array (-> "default_collate ... found object" when handed to TabNet).
# NOTE(review): the encoder is fitted on all of X, i.e. before the
# train/validation split further down, so validation targets leak into the
# encoded features. Consider fitting on the training split only.
X[cat_cols] = encoder.fit_transform(X[cat_cols], y).astype(float)

# Transform test data with the encoder fitted above.
X_test[cat_cols] = encoder.transform(X_test[cat_cols]).astype(float)

In [None]:
# Independent copies so the writes below never touch (or warn about) the
# frames X / X_test were derived from.
X = X.copy()
X_test = X_test.copy()

# Standardise the raw numeric columns with statistics fitted on X only, then
# reuse the fitted scaler for the test frame.
# The columns are replaced wholesale rather than via `.loc[:, num_cols] = ...`:
# a `.loc` write goes into the existing columns in place and cannot change
# their dtype, which raises incompatible-dtype warnings for integer columns.
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train on log1p(target); predictions are mapped back with expm1 at evaluation.
y_train_trans, y_val_trans = map(np.log1p, (y_train, y_val))

In [None]:
tabnet = TabNetRegressor(
    # Network hyperparameters
    n_d=32,
    n_a=32,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-4,
    mask_type="entmax",
    # Optimiser and learning-rate schedule
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=10, gamma=0.9),
    # Logging cadence (a progress line every 10 epochs) and reproducibility
    verbose=10,
    seed=42,
)


In [None]:
# Fill remaining gaps with medians learned from the training split only, so
# the validation split is preprocessed with the same statistics as training.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)

In [None]:
# Impute numerics with medians learned from the training split only.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)

# Impute any remaining object-dtype columns with the training split's mode.
# A dedicated name is used here: rebinding the global `cat_cols` would clobber
# the categorical column list that earlier cells depend on when re-run.
object_cols = X_train.select_dtypes(include='object').columns
for col in object_cols:
    train_mode = X_train[col].mode()[0]
    X_train[col] = X_train[col].fillna(train_mode)
    X_val[col] = X_val[col].fillna(train_mode)

# Final check
print("Remaining NaNs in X_train:", X_train.isna().sum().sum())
print("Remaining NaNs in X_val:", X_val.isna().sum().sum())

Remaining NaNs in X_train: 0
Remaining NaNs in X_val: 0


In [None]:
# Fail fast if any missing value survived imputation in either split.
assert not X_train.isna().to_numpy().any()
assert not X_val.isna().to_numpy().any()

In [None]:
# Hand TabNet plain float arrays. `.values` returns an object array as soon as
# one column has object dtype, which fails inside the DataLoader with
# "default_collate: ... found object"; to_numpy(dtype=float) converts such
# columns, or raises a clear error up front if they are not numeric.
tabnet.fit(
    X_train=X_train.to_numpy(dtype=float),
    y_train=y_train_trans.to_numpy(dtype=float).reshape(-1, 1),
    eval_set=[(
        X_val.to_numpy(dtype=float),
        y_val_trans.to_numpy(dtype=float).reshape(-1, 1),
    )],
    eval_metric=['rmse'],
    max_epochs=300,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    drop_last=False
)

epoch 0  | loss: 1.04828 | val_0_rmse: 2.29478 |  0:00:06s
epoch 10 | loss: 0.009   | val_0_rmse: 0.10093 |  0:00:40s
epoch 20 | loss: 0.00801 | val_0_rmse: 0.08949 |  0:01:09s
epoch 30 | loss: 0.00743 | val_0_rmse: 0.08846 |  0:01:37s
epoch 40 | loss: 0.00717 | val_0_rmse: 0.0869  |  0:02:04s
epoch 50 | loss: 0.00705 | val_0_rmse: 0.08616 |  0:02:32s
epoch 60 | loss: 0.00704 | val_0_rmse: 0.08947 |  0:03:00s
epoch 70 | loss: 0.0069  | val_0_rmse: 0.08606 |  0:03:27s
epoch 80 | loss: 0.00671 | val_0_rmse: 0.08534 |  0:03:55s
epoch 90 | loss: 0.00667 | val_0_rmse: 0.08495 |  0:04:23s
epoch 100| loss: 0.00662 | val_0_rmse: 0.08691 |  0:04:50s
epoch 110| loss: 0.0066  | val_0_rmse: 0.08475 |  0:05:17s
epoch 120| loss: 0.0065  | val_0_rmse: 0.08435 |  0:05:45s
epoch 130| loss: 0.00646 | val_0_rmse: 0.08449 |  0:06:13s
epoch 140| loss: 0.00644 | val_0_rmse: 0.08447 |  0:06:40s
epoch 150| loss: 0.00641 | val_0_rmse: 0.08415 |  0:07:08s
epoch 160| loss: 0.00645 | val_0_rmse: 0.08499 |  0:07:3

In [None]:

# 📈 Evaluate
# Predict on a float array (see `.values` vs object-dtype columns: an object
# array cannot be collated into tensors), then undo the log1p target transform
# so the RMSE is reported on the original target scale.
val_preds_log = tabnet.predict(X_val.to_numpy(dtype=float)).flatten()
val_preds = np.expm1(val_preds_log)
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"✅ TabNet RMSE: {val_rmse:.4f}")

✅ TabNet RMSE: 0.1095


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object