In [1]:
!pip install pytorch-tabnet torch


Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [2]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

def process_time(df):
    """Parse the ``time`` column and derive clock-component features.

    The frame is modified in place: ``time`` is replaced by its datetime
    representation and the integer columns ``hour``, ``minute`` and
    ``second`` are added (in that order).

    Args:
        df: DataFrame with a ``time`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object that was passed in.
    """
    timestamps = pd.to_datetime(df['time'])
    df['time'] = timestamps
    # One derived column per clock component, read from the .dt accessor.
    for component in ('hour', 'minute', 'second'):
        df[component] = getattr(timestamps.dt, component)
    return df

def encode_activities(train_df, test_df, activity_cols):
    """Label-encode the activity columns with one encoder shared by all columns.

    The encoder vocabulary is the union of every non-null value found in
    ``activity_cols`` of the training frame (and of the test frame, when one
    is given) plus the placeholder ``'missing'``, which replaces null values
    before encoding. Both frames are modified in place.

    Args:
        train_df: Training DataFrame containing ``activity_cols``.
        test_df: Test DataFrame containing ``activity_cols``, or ``None``.
        activity_cols: Names of the columns to encode.

    Returns:
        The tuple ``(train_df, test_df)``; ``test_df`` is ``None`` if it was
        passed as ``None``.
    """
    frames = [train_df] if test_df is None else [train_df, test_df]

    # Collect every label that can occur so transform() never meets an unseen value.
    vocabulary = {'missing'}
    for col in activity_cols:
        for frame in frames:
            vocabulary.update(frame[col].dropna().unique())

    encoder = LabelEncoder()
    encoder.fit(list(vocabulary))

    for col in activity_cols:
        for frame in frames:
            frame[col] = encoder.transform(frame[col].fillna('missing'))

    return train_df, test_df

def prepare_data(train_path, test_path=None):
    """Load the CSV file(s) and turn them into model-ready NumPy arrays.

    Steps, in order: parse the ``time`` column into hour/minute/second
    features, mean-impute the numerical columns, label-encode the activity
    columns, and standardise the numerical columns. Imputation means and
    scaling statistics are computed from the training frame only and then
    applied to the test frame.

    Args:
        train_path: Path to the training CSV. It must contain a ``time``
            column and the ``bg+1:00`` target column.
        test_path: Optional path to a test CSV with the same feature columns.
            When omitted (or falsy) no test matrix is built.

    Returns:
        A tuple ``(X_train, y_train, X_test, feature_cols)``. ``y_train`` has
        shape ``(n_samples, 1)``; ``X_test`` is ``None`` when no test path was
        given; ``feature_cols`` lists the column names in the order used for
        the feature matrices.
    """
    train_df = process_time(pd.read_csv(train_path))
    test_df = process_time(pd.read_csv(test_path)) if test_path else None

    # Identify column types from the prefix of the column name.
    bg_cols = [col for col in train_df.columns if col.startswith('bg-')]
    insulin_cols = [col for col in train_df.columns if col.startswith('insulin-')]
    carbs_cols = [col for col in train_df.columns if col.startswith('carbs-')]
    hr_cols = [col for col in train_df.columns if col.startswith('hr-')]
    steps_cols = [col for col in train_df.columns if col.startswith('steps-')]
    cals_cols = [col for col in train_df.columns if col.startswith('cals-')]
    activity_cols = [col for col in train_df.columns if col.startswith('activity-')]

    print("Handling missing values...")
    numerical_cols = bg_cols + insulin_cols + carbs_cols + hr_cols + steps_cols + cals_cols
    # Compute the training means once, before anything is imputed, and reuse
    # them for both frames. A column that is entirely missing in the training
    # data has no mean; fall back to 0.0 so no NaN reaches the scaler.
    train_means = train_df[numerical_cols].mean().fillna(0.0)
    train_df[numerical_cols] = train_df[numerical_cols].fillna(train_means)
    if test_df is not None:
        test_df[numerical_cols] = test_df[numerical_cols].fillna(train_means)

    print("Encoding activities...")
    train_df, test_df = encode_activities(train_df, test_df, activity_cols)

    print("Scaling features...")
    # Fit the scaler on the training frame only; the test frame reuses its statistics.
    scaler = StandardScaler()
    train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
    if test_df is not None:
        test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

    # Feature order: numerical columns, encoded activities, then time components.
    feature_cols = numerical_cols + activity_cols + ['hour', 'minute', 'second']

    X_train = train_df[feature_cols].values
    # Column-vector target, the 2-D layout the regressor is trained with.
    y_train = train_df['bg+1:00'].values.reshape(-1, 1)

    X_test = test_df[feature_cols].values if test_df is not None else None
    return X_train, y_train, X_test, feature_cols

def train_tabnet(X_train, y_train, patience=20, max_epochs=300,
                 val_size=0.2, random_state=42):
    """Train a TabNetRegressor with a held-out validation split.

    Args:
        X_train: 2-D feature array.
        y_train: Target array. A 1-D array is reshaped to ``(n_samples, 1)``
            because TabNetRegressor expects 2-D targets.
        patience: Early-stopping patience in epochs. Half of this value is
            used as the patience of the learning-rate scheduler.
        max_epochs: Upper bound on the number of training epochs.
        val_size: Fraction of the rows held out for validation.
        random_state: Seed for the train/validation split, so the same split
            can be reproduced by the caller.

    Returns:
        The fitted ``TabNetRegressor``.
    """
    y_train = np.asarray(y_train)
    if y_train.ndim == 1:
        y_train = y_train.reshape(-1, 1)

    # Split data for validation
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state
    )

    # Initialize tabnet
    model = TabNetRegressor(
        n_d=64,
        n_a=64,
        n_steps=5,
        gamma=1.5,
        n_independent=2,
        n_shared=2,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=1e-2),
        mask_type='entmax',
        lambda_sparse=1e-3,
        # Halve the learning rate when the monitored metric stops improving;
        # the scheduler reacts twice as fast as early stopping does.
        scheduler_params=dict(
            mode="min",
            patience=patience // 2,
            min_lr=1e-5,
            factor=0.5,
        ),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=10
    )

    # Train, monitoring the validation split for early stopping.
    model.fit(
        X_train_split, y_train_split,
        eval_set=[(X_val, y_val)],
        max_epochs=max_epochs,
        patience=patience,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=0,
        drop_last=False
    )

    return model

def main():
    """Run the full pipeline: prepare data, train, write a submission, report metrics.

    Reads the train and test CSVs from the Kaggle input directory, trains the
    TabNet model, writes ``submission.csv`` to the working directory, and
    prints RMSE and R2 on the validation split.
    """
    train_path = '/kaggle/input/brist1d/train.csv'
    test_path = '/kaggle/input/brist1d/test.csv'

    X_train, y_train, X_test, _ = prepare_data(train_path, test_path)

    print(f"Data shapes: X_train: {X_train.shape}, y_train: {y_train.shape}")

    model = train_tabnet(X_train, y_train)

    print("Making predictions...")
    preds = model.predict(X_test)

    # Only the id column is needed for the submission, so read just that one.
    test_ids = pd.read_csv(test_path, usecols=['id'])['id']
    submission = pd.DataFrame({
        'id': test_ids,
        # ravel() always gives a 1-D array, even for a single-row test set.
        'bg+1:00': np.ravel(preds)
    })
    submission.to_csv('submission.csv', index=False)

    # Evaluate on validation set: splitting again with the same test_size and
    # random_state that train_tabnet uses by default reproduces its held-out rows.
    _, X_val, _, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    val_preds = model.predict(X_val)

    print("\nValidation Metrics:")
    print("RMSE:", np.sqrt(mean_squared_error(y_val, val_preds)))
    print("R2 Score:", r2_score(y_val, val_preds))

# Entry point: run the whole pipeline when this code is executed directly
# (as a script, or as a notebook cell, where __name__ is "__main__").
if __name__ == "__main__":
    main()

Handling missing values...
Encoding activities...
Scaling features...
Data shapes: X_train: (177024, 507), y_train: (177024, 1)
epoch 0  | loss: 10.69026| val_0_mse: 8.75371 |  0:00:11s
epoch 10 | loss: 4.27385 | val_0_mse: 4.31102 |  0:01:55s
epoch 20 | loss: 3.60645 | val_0_mse: 3.91229 |  0:03:42s
epoch 30 | loss: 2.78944 | val_0_mse: 3.61913 |  0:05:26s
epoch 40 | loss: 2.02134 | val_0_mse: 3.2076  |  0:07:11s
epoch 50 | loss: 1.58616 | val_0_mse: 2.95995 |  0:08:57s
epoch 60 | loss: 1.30268 | val_0_mse: 2.69373 |  0:10:44s
epoch 70 | loss: 1.11255 | val_0_mse: 2.59059 |  0:12:29s
epoch 80 | loss: 0.98643 | val_0_mse: 2.42659 |  0:14:14s
epoch 90 | loss: 0.88335 | val_0_mse: 2.29879 |  0:16:01s
epoch 100| loss: 0.83308 | val_0_mse: 2.2167  |  0:17:47s
epoch 110| loss: 0.77337 | val_0_mse: 2.15345 |  0:19:32s
epoch 120| loss: 0.72371 | val_0_mse: 2.07075 |  0:21:16s
epoch 130| loss: 0.6903  | val_0_mse: 2.02353 |  0:22:59s
epoch 140| loss: 0.65278 | val_0_mse: 1.95731 |  0:24:43s
ep