# Experiment 001: Extended Temporal and Physics-Informed Features

**Goal**: Improve baseline by adding longer temporal features and physics-informed features.
**Target**: 0.35-0.40 MAE (20-30% improvement over baseline 0.4589)

**Key improvements**:
1. Extended lags: u_in_lag_5, u_in_lag_10, u_out_lag_5, u_out_lag_10
2. Pressure lags (target encoding): pressure_lag_1, pressure_lag_2, pressure_lag_5
3. Rate of change: u_in_diff_2, u_in_diff_5
4. EMAs: u_in_ewm_5, u_in_ewm_10
5. Physics features: RC = R*C, time_normalized = time_step/(R*C), stiffness = 1/C
6. GroupKFold CV to prevent breath leakage

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

In [2]:
# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Training data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"\nColumns: {list(train.columns)}")

Training data shape: (5432400, 8)
Test data shape: (603600, 7)

Columns: ['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure']


In [3]:
# Create extended temporal features
def create_extended_features(df):
    """Create extended temporal and physics-informed features"""
    
    # Sort by breath_id and time_step to ensure proper ordering
    df = df.sort_values(['breath_id', 'time_step']).reset_index(drop=True)
    
    # ----- Extended Lags -----
    for lag in [1, 2, 3, 5, 10]:
        df[f'u_in_lag_{lag}'] = df.groupby('breath_id')['u_in'].shift(lag)
        df[f'u_out_lag_{lag}'] = df.groupby('breath_id')['u_out'].shift(lag)
    
    # ----- Pressure Lags (Target Encoding) -----
    # Only for training data
    if 'pressure' in df.columns:
        for lag in [1, 2, 5]:
            df[f'pressure_lag_{lag}'] = df.groupby('breath_id')['pressure'].shift(lag)
    
    # ----- Rate of Change Features -----
    df['u_in_diff_1'] = df.groupby('breath_id')['u_in'].diff(1)
    df['u_in_diff_2'] = df.groupby('breath_id')['u_in'].diff(2)
    df['u_in_diff_5'] = df.groupby('breath_id')['u_in'].diff(5)
    
    # ----- Rolling Statistics with Different Windows -----
    for window in [5, 10, 20]:
        df[f'u_in_rolling_mean_{window}'] = df.groupby('breath_id')['u_in'].rolling(window, min_periods=1).mean().reset_index(0, drop=True)
        df[f'u_in_rolling_std_{window}'] = df.groupby('breath_id')['u_in'].rolling(window, min_periods=1).std().reset_index(0, drop=True)
        df[f'u_in_rolling_max_{window}'] = df.groupby('breath_id')['u_in'].rolling(window, min_periods=1).max().reset_index(0, drop=True)
        df[f'u_in_rolling_min_{window}'] = df.groupby('breath_id')['u_in'].rolling(window, min_periods=1).min().reset_index(0, drop=True)
    
    # ----- Exponential Moving Averages -----
    for span in [5, 10, 20]:
        df[f'u_in_ewm_{span}'] = df.groupby('breath_id')['u_in'].ewm(span=span, min_periods=1).mean().reset_index(0, drop=True)
    
    # ----- Cumulative Features -----
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    df['time_cumsum'] = df.groupby('breath_id')['time_step'].cumsum()
    
    # ----- Physics-Informed Features -----
    # RC time constant (lung mechanics)
    df['RC'] = df['R'] * df['C']
    
    # Normalized time (time relative to lung time constant)
    df['time_normalized'] = df['time_step'] / (df['R'] * df['C'])
    
    # Lung stiffness (inverse of compliance)
    df['stiffness'] = 1.0 / df['C']
    
    # Work done (integral of pressure * flow) - approximate with u_in
    df['u_in_integral'] = df.groupby('breath_id')['u_in'].cumsum() * df['time_step']
    
    # ----- Interaction Features -----
    df['u_in_times_R'] = df['u_in'] * df['R']
    df['u_in_times_C'] = df['u_in'] * df['C']
    df['u_in_times_RC'] = df['u_in'] * df['RC']
    
    # Time since start of breath
    df['time_since_start'] = df['time_step'] - df.groupby('breath_id')['time_step'].transform('first')
    
    # Breath position (0 to 1 within breath)
    df['breath_position'] = df.groupby('breath_id').cumcount() / df.groupby('breath_id').size()
    
    return df

print("Creating extended features for training data...")
train_extended = create_extended_features(train.copy())

print("Creating extended features for test data...")
test_extended = create_extended_features(test.copy())

print(f"\nTraining data with extended features: {train_extended.shape}")
print(f"Test data with extended features: {test_extended.shape}")

Creating extended features for training data...


Creating extended features for test data...



Training data with extended features: (5432400, 50)
Test data with extended features: (603600, 46)


In [4]:
# Define feature columns (exclude IDs, target, and any columns with too many NaNs)
exclude_cols = ['id', 'breath_id', 'pressure', 'u_in_lag_10', 'u_out_lag_10']

feature_cols = [col for col in train_extended.columns if col not in exclude_cols]

print(f"Number of features: {len(feature_cols)}")
print(f"\nSample features: {feature_cols[:20]}")

Number of features: 45

Sample features: ['R', 'C', 'time_step', 'u_in', 'u_out', 'u_in_lag_1', 'u_out_lag_1', 'u_in_lag_2', 'u_out_lag_2', 'u_in_lag_3', 'u_out_lag_3', 'u_in_lag_5', 'u_out_lag_5', 'pressure_lag_1', 'pressure_lag_2', 'pressure_lag_5', 'u_in_diff_1', 'u_in_diff_2', 'u_in_diff_5', 'u_in_rolling_mean_5']


In [5]:
# Prepare data for training
X = train_extended[feature_cols].copy()
y = train_extended['pressure'].copy()

# Create groups for GroupKFold (one group per breath)
groups = train_extended['breath_id']

print(f"Training data shape: X={X.shape}, y={y.shape}")
print(f"Number of unique breaths: {groups.nunique()}")

# Check for NaN values
nan_counts = X.isnull().sum()
if nan_counts.sum() > 0:
    print(f"\nColumns with NaN values:")
    print(nan_counts[nan_counts > 0].sort_values(ascending=False).head())
    
    # Fill NaN values with 0 (common approach for lag features)
    X = X.fillna(0)
    print("Filled NaN values with 0")
else:
    print("\nNo NaN values found")

Training data shape: X=(5432400, 45), y=(5432400,)
Number of unique breaths: 67905



Columns with NaN values:
breath_position    5364495
u_out_lag_5         339525
u_in_diff_5         339525
u_in_lag_5          339525
pressure_lag_5      339525
dtype: int64


Filled NaN values with 0


In [6]:
# Set up GroupKFold cross-validation
gkf = GroupKFold(n_splits=5)

# Create splits
folds = list(gkf.split(X, y, groups))

print("Cross-validation splits created:")
for i, (train_idx, val_idx) in enumerate(folds):
    train_breaths = groups.iloc[train_idx].nunique()
    val_breaths = groups.iloc[val_idx].nunique()
    print(f"Fold {i+1}: Train={train_breaths} breaths, Val={val_breaths} breaths")

Cross-validation splits created:
Fold 1: Train=54324 breaths, Val=13581 breaths


Fold 2: Train=54324 breaths, Val=13581 breaths
Fold 3: Train=54324 breaths, Val=13581 breaths


Fold 4: Train=54324 breaths, Val=13581 breaths
Fold 5: Train=54324 breaths, Val=13581 breaths


In [7]:
# Train LightGBM model with cross-validation
fold_scores = []
predictions = np.zeros(len(train_extended))
feature_importance_list = []

print("Training LightGBM model with 5-fold GroupKFold...")

for fold, (train_idx, val_idx) in enumerate(folds):
    print(f"\nFold {fold + 1}/5")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    
    # Model parameters
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': 42
    }
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=10000,
        valid_sets=[val_data],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )
    
    # Predict on validation set
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    
    # Calculate MAE (focusing on inspiratory phase where u_out=0)
    val_data_fold = train_extended.iloc[val_idx]
    inspiratory_mask = val_data_fold['u_out'] == 0
    
    if inspiratory_mask.sum() > 0:
        fold_score = mean_absolute_error(y_val[inspiratory_mask], val_pred[inspiratory_mask])
        print(f"Fold {fold + 1} MAE (inspiratory): {fold_score:.4f}")
        fold_scores.append(fold_score)
    else:
        print(f"Fold {fold + 1}: No inspiratory samples in validation set")
    
    # Store predictions
    predictions[val_idx] = val_pred
    
    # Store feature importance
    importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importance(importance_type='gain'),
        'fold': fold + 1
    })
    feature_importance_list.append(importance)

# Calculate overall CV score
if len(fold_scores) > 0:
    cv_score = np.mean(fold_scores)
    cv_std = np.std(fold_scores)
    print(f"\n{'='*50}")
    print(f"Overall CV MAE: {cv_score:.4f} ± {cv_std:.4f}")
    print(f"Individual folds: {[f'{s:.4f}' for s in fold_scores]}")
    
    # Compare to baseline
    baseline_score = 0.4589
    improvement = (baseline_score - cv_score) / baseline_score * 100
    print(f"Improvement over baseline: {improvement:.1f}%")
else:
    print("\nNo valid folds with inspiratory samples found")

Training LightGBM model with 5-fold GroupKFold...

Fold 1/5


Training until validation scores don't improve for 50 rounds


[100]	valid_0's l1: 0.34468


[200]	valid_0's l1: 0.285665


[300]	valid_0's l1: 0.264693


[400]	valid_0's l1: 0.250602


[500]	valid_0's l1: 0.240629


[600]	valid_0's l1: 0.233304


[700]	valid_0's l1: 0.227795


[800]	valid_0's l1: 0.222366


[900]	valid_0's l1: 0.218167


[1000]	valid_0's l1: 0.214074


[1100]	valid_0's l1: 0.210948


[1200]	valid_0's l1: 0.208032


[1300]	valid_0's l1: 0.205758


[1400]	valid_0's l1: 0.203309


[1500]	valid_0's l1: 0.200729


[1600]	valid_0's l1: 0.199019


[1700]	valid_0's l1: 0.197138


[1800]	valid_0's l1: 0.195209


[1900]	valid_0's l1: 0.193624


[2000]	valid_0's l1: 0.192169


[2100]	valid_0's l1: 0.190742


[2200]	valid_0's l1: 0.189482


[2300]	valid_0's l1: 0.188109


[2400]	valid_0's l1: 0.186833


[2500]	valid_0's l1: 0.185752


[2600]	valid_0's l1: 0.184412


[2700]	valid_0's l1: 0.183207


[2800]	valid_0's l1: 0.182191


[2900]	valid_0's l1: 0.181307


[3000]	valid_0's l1: 0.180405


[3100]	valid_0's l1: 0.179569


[3200]	valid_0's l1: 0.178695


[3300]	valid_0's l1: 0.177756


[3400]	valid_0's l1: 0.176939


[3500]	valid_0's l1: 0.176125


[3600]	valid_0's l1: 0.175491


[3700]	valid_0's l1: 0.174943


[3800]	valid_0's l1: 0.174182


[3900]	valid_0's l1: 0.173523


[4000]	valid_0's l1: 0.172912


[4100]	valid_0's l1: 0.172331


[4200]	valid_0's l1: 0.171879


[4300]	valid_0's l1: 0.171331


[4400]	valid_0's l1: 0.170802


[4500]	valid_0's l1: 0.17034


[4600]	valid_0's l1: 0.169872


[4700]	valid_0's l1: 0.169381


[4800]	valid_0's l1: 0.168873


[4900]	valid_0's l1: 0.168486


[5000]	valid_0's l1: 0.168056


[5100]	valid_0's l1: 0.167663


[5200]	valid_0's l1: 0.167284


[5300]	valid_0's l1: 0.16691


[5400]	valid_0's l1: 0.166598


[5500]	valid_0's l1: 0.166217


[5600]	valid_0's l1: 0.165912


[5700]	valid_0's l1: 0.165595


[5800]	valid_0's l1: 0.165314


[5900]	valid_0's l1: 0.165008


[6000]	valid_0's l1: 0.164675


[6100]	valid_0's l1: 0.164323


[6200]	valid_0's l1: 0.164018


[6300]	valid_0's l1: 0.1637


[6400]	valid_0's l1: 0.163517


[6500]	valid_0's l1: 0.1632


[6600]	valid_0's l1: 0.162921


[6700]	valid_0's l1: 0.162589


[6800]	valid_0's l1: 0.162355


[6900]	valid_0's l1: 0.162067


[7000]	valid_0's l1: 0.161736


[7100]	valid_0's l1: 0.161501


[7200]	valid_0's l1: 0.161264


[7300]	valid_0's l1: 0.16097


[7400]	valid_0's l1: 0.160719


[7500]	valid_0's l1: 0.160477


[7600]	valid_0's l1: 0.160294


[7700]	valid_0's l1: 0.160086


[7800]	valid_0's l1: 0.159934


[7900]	valid_0's l1: 0.159737


[8000]	valid_0's l1: 0.159582


[8100]	valid_0's l1: 0.159356


[8200]	valid_0's l1: 0.159209


[8300]	valid_0's l1: 0.159027


[8400]	valid_0's l1: 0.158839


[8500]	valid_0's l1: 0.158622


[8600]	valid_0's l1: 0.158452


[8700]	valid_0's l1: 0.15823


[8800]	valid_0's l1: 0.158071


[8900]	valid_0's l1: 0.157868


[9000]	valid_0's l1: 0.157705


[9100]	valid_0's l1: 0.157573


[9200]	valid_0's l1: 0.157375


[9300]	valid_0's l1: 0.157221


[9400]	valid_0's l1: 0.157049


[9500]	valid_0's l1: 0.156892


[9600]	valid_0's l1: 0.156759


[9700]	valid_0's l1: 0.156613


[9800]	valid_0's l1: 0.15652


[9900]	valid_0's l1: 0.156382


[10000]	valid_0's l1: 0.156218
Did not meet early stopping. Best iteration is:
[9999]	valid_0's l1: 0.156218


Fold 1 MAE (inspiratory): 0.1766

Fold 2/5


Training until validation scores don't improve for 50 rounds


[100]	valid_0's l1: 0.345321


[200]	valid_0's l1: 0.287346


[300]	valid_0's l1: 0.265501


[400]	valid_0's l1: 0.251983


[500]	valid_0's l1: 0.242335


[600]	valid_0's l1: 0.234367


[700]	valid_0's l1: 0.228314


[800]	valid_0's l1: 0.22304


[900]	valid_0's l1: 0.218633


[1000]	valid_0's l1: 0.214861


[1100]	valid_0's l1: 0.211719


[1200]	valid_0's l1: 0.208894


[1300]	valid_0's l1: 0.206193


[1400]	valid_0's l1: 0.203775


[1500]	valid_0's l1: 0.201324


[1600]	valid_0's l1: 0.19943


[1700]	valid_0's l1: 0.197549


[1800]	valid_0's l1: 0.195684


[1900]	valid_0's l1: 0.194113


[2000]	valid_0's l1: 0.1925


[2100]	valid_0's l1: 0.190705


[2200]	valid_0's l1: 0.189407


[2300]	valid_0's l1: 0.188155


[2400]	valid_0's l1: 0.187076


[2500]	valid_0's l1: 0.186003


[2600]	valid_0's l1: 0.18484


[2700]	valid_0's l1: 0.183769


[2800]	valid_0's l1: 0.182642


[2900]	valid_0's l1: 0.181812


[3000]	valid_0's l1: 0.180872


[3100]	valid_0's l1: 0.17999


[3200]	valid_0's l1: 0.179025


[3300]	valid_0's l1: 0.178119


[3400]	valid_0's l1: 0.177283


[3500]	valid_0's l1: 0.176685


[3600]	valid_0's l1: 0.176031


[3700]	valid_0's l1: 0.175344


[3800]	valid_0's l1: 0.174786


[3900]	valid_0's l1: 0.174243


[4000]	valid_0's l1: 0.17372


[4100]	valid_0's l1: 0.173137


[4200]	valid_0's l1: 0.17263


[4300]	valid_0's l1: 0.171984


[4400]	valid_0's l1: 0.171493


[4500]	valid_0's l1: 0.171011


[4600]	valid_0's l1: 0.17041


[4700]	valid_0's l1: 0.169878


[4800]	valid_0's l1: 0.169423


[4900]	valid_0's l1: 0.169011


[5000]	valid_0's l1: 0.168552


[5100]	valid_0's l1: 0.16823


[5200]	valid_0's l1: 0.167897


[5300]	valid_0's l1: 0.167409


[5400]	valid_0's l1: 0.16707


[5500]	valid_0's l1: 0.166748


[5600]	valid_0's l1: 0.166431


[5700]	valid_0's l1: 0.166095


[5800]	valid_0's l1: 0.16574


[5900]	valid_0's l1: 0.165401


[6000]	valid_0's l1: 0.165114


[6100]	valid_0's l1: 0.164794


[6200]	valid_0's l1: 0.164476


[6300]	valid_0's l1: 0.164261


[6400]	valid_0's l1: 0.163912


[6500]	valid_0's l1: 0.163672


[6600]	valid_0's l1: 0.163431


[6700]	valid_0's l1: 0.163176


[6800]	valid_0's l1: 0.162907


[6900]	valid_0's l1: 0.162664


[7000]	valid_0's l1: 0.162435


[7100]	valid_0's l1: 0.162152


[7200]	valid_0's l1: 0.161899


[7300]	valid_0's l1: 0.161644


[7400]	valid_0's l1: 0.161392


[7500]	valid_0's l1: 0.161149


[7600]	valid_0's l1: 0.160928


[7700]	valid_0's l1: 0.16071


[7800]	valid_0's l1: 0.160506


[7900]	valid_0's l1: 0.160341


[8000]	valid_0's l1: 0.16013


[8100]	valid_0's l1: 0.159947


[8200]	valid_0's l1: 0.159724


[8300]	valid_0's l1: 0.159511


[8400]	valid_0's l1: 0.159331


[8500]	valid_0's l1: 0.159197


[8600]	valid_0's l1: 0.158996


[8700]	valid_0's l1: 0.158775


[8800]	valid_0's l1: 0.15861


[8900]	valid_0's l1: 0.158422


[9000]	valid_0's l1: 0.158194


[9100]	valid_0's l1: 0.158045


[9200]	valid_0's l1: 0.15783


[9300]	valid_0's l1: 0.157688


[9400]	valid_0's l1: 0.157501


[9500]	valid_0's l1: 0.157329


[9600]	valid_0's l1: 0.15718


[9700]	valid_0's l1: 0.157072


[9800]	valid_0's l1: 0.15695


[9900]	valid_0's l1: 0.156775


[10000]	valid_0's l1: 0.156622
Did not meet early stopping. Best iteration is:
[10000]	valid_0's l1: 0.156622


Fold 2 MAE (inspiratory): 0.1767

Fold 3/5


Training until validation scores don't improve for 50 rounds


[100]	valid_0's l1: 0.3447


[200]	valid_0's l1: 0.286809


[300]	valid_0's l1: 0.264962


[400]	valid_0's l1: 0.251656


[500]	valid_0's l1: 0.24182


[600]	valid_0's l1: 0.234052


[700]	valid_0's l1: 0.227914


[800]	valid_0's l1: 0.222955


[900]	valid_0's l1: 0.218389


[1000]	valid_0's l1: 0.214573


[1100]	valid_0's l1: 0.211532


[1200]	valid_0's l1: 0.208572


[1300]	valid_0's l1: 0.205472


[1400]	valid_0's l1: 0.203001


[1500]	valid_0's l1: 0.200766


[1600]	valid_0's l1: 0.198935


[1700]	valid_0's l1: 0.197117


[1800]	valid_0's l1: 0.195455


[1900]	valid_0's l1: 0.193636


[2000]	valid_0's l1: 0.19186


[2100]	valid_0's l1: 0.190369


[2200]	valid_0's l1: 0.189


[2300]	valid_0's l1: 0.187665


[2400]	valid_0's l1: 0.186347


[2500]	valid_0's l1: 0.185175


[2600]	valid_0's l1: 0.184035


[2700]	valid_0's l1: 0.182856


[2800]	valid_0's l1: 0.182001


[2900]	valid_0's l1: 0.181195


[3000]	valid_0's l1: 0.180136


[3100]	valid_0's l1: 0.179269


[3200]	valid_0's l1: 0.178556


[3300]	valid_0's l1: 0.177669


[3400]	valid_0's l1: 0.176951


[3500]	valid_0's l1: 0.176253


[3600]	valid_0's l1: 0.175591


[3700]	valid_0's l1: 0.174804


[3800]	valid_0's l1: 0.174119


[3900]	valid_0's l1: 0.173479


[4000]	valid_0's l1: 0.172874


[4100]	valid_0's l1: 0.172258


[4200]	valid_0's l1: 0.171853


[4300]	valid_0's l1: 0.171236


[4400]	valid_0's l1: 0.170779


[4500]	valid_0's l1: 0.170203


[4600]	valid_0's l1: 0.169731


[4700]	valid_0's l1: 0.169154


[4800]	valid_0's l1: 0.168743


[4900]	valid_0's l1: 0.168332


[5000]	valid_0's l1: 0.167901


[5100]	valid_0's l1: 0.16754


[5200]	valid_0's l1: 0.167192


[5300]	valid_0's l1: 0.166788


[5400]	valid_0's l1: 0.166423


[5500]	valid_0's l1: 0.166081


[5600]	valid_0's l1: 0.165703


[5700]	valid_0's l1: 0.165269


[5800]	valid_0's l1: 0.16493


[5900]	valid_0's l1: 0.164561


[6000]	valid_0's l1: 0.16428


[6100]	valid_0's l1: 0.163943


[6200]	valid_0's l1: 0.163675


[6300]	valid_0's l1: 0.163286


[6400]	valid_0's l1: 0.163044


[6500]	valid_0's l1: 0.16273


[6600]	valid_0's l1: 0.162475


[6700]	valid_0's l1: 0.162118


[6800]	valid_0's l1: 0.16183


[6900]	valid_0's l1: 0.161607


[7000]	valid_0's l1: 0.161361


[7100]	valid_0's l1: 0.161116


[7200]	valid_0's l1: 0.160902


[7300]	valid_0's l1: 0.160677


[7400]	valid_0's l1: 0.160459


[7500]	valid_0's l1: 0.160265


[7600]	valid_0's l1: 0.160076


[7700]	valid_0's l1: 0.159854


[7800]	valid_0's l1: 0.159612


[7900]	valid_0's l1: 0.159444


[8000]	valid_0's l1: 0.159293


[8100]	valid_0's l1: 0.159057


[8200]	valid_0's l1: 0.158811


[8300]	valid_0's l1: 0.158611


[8400]	valid_0's l1: 0.158439


[8500]	valid_0's l1: 0.158238


[8600]	valid_0's l1: 0.158062


[8700]	valid_0's l1: 0.157918


[8800]	valid_0's l1: 0.157733


[8900]	valid_0's l1: 0.157548


[9000]	valid_0's l1: 0.157371


[9100]	valid_0's l1: 0.157211


[9200]	valid_0's l1: 0.157047


[9300]	valid_0's l1: 0.15691


[9400]	valid_0's l1: 0.156771


[9500]	valid_0's l1: 0.156599


[9600]	valid_0's l1: 0.156467


[9700]	valid_0's l1: 0.156369


[9800]	valid_0's l1: 0.156223


[9900]	valid_0's l1: 0.155987


[10000]	valid_0's l1: 0.155828
Did not meet early stopping. Best iteration is:
[10000]	valid_0's l1: 0.155828


Fold 3 MAE (inspiratory): 0.1760

Fold 4/5


Training until validation scores don't improve for 50 rounds


[100]	valid_0's l1: 0.34614


[200]	valid_0's l1: 0.286663


[300]	valid_0's l1: 0.265699


[400]	valid_0's l1: 0.252225


[500]	valid_0's l1: 0.24172


[600]	valid_0's l1: 0.233602


[700]	valid_0's l1: 0.227471


[800]	valid_0's l1: 0.222776


[900]	valid_0's l1: 0.21866


[1000]	valid_0's l1: 0.214648


[1100]	valid_0's l1: 0.211018


[1200]	valid_0's l1: 0.208417


[1300]	valid_0's l1: 0.206178


[1400]	valid_0's l1: 0.203635


[1500]	valid_0's l1: 0.201299


[1600]	valid_0's l1: 0.199362


[1700]	valid_0's l1: 0.197199


[1800]	valid_0's l1: 0.195566


[1900]	valid_0's l1: 0.193756


[2000]	valid_0's l1: 0.192097


[2100]	valid_0's l1: 0.190817


[2200]	valid_0's l1: 0.189343


[2300]	valid_0's l1: 0.187964


[2400]	valid_0's l1: 0.186711


[2500]	valid_0's l1: 0.185443


[2600]	valid_0's l1: 0.184379


[2700]	valid_0's l1: 0.183333


[2800]	valid_0's l1: 0.182432


[2900]	valid_0's l1: 0.181421


[3000]	valid_0's l1: 0.180697


[3100]	valid_0's l1: 0.179829


[3200]	valid_0's l1: 0.179134


[3300]	valid_0's l1: 0.178359


[3400]	valid_0's l1: 0.177467


[3500]	valid_0's l1: 0.176713


[3600]	valid_0's l1: 0.176015


[3700]	valid_0's l1: 0.175382


[3800]	valid_0's l1: 0.174786


[3900]	valid_0's l1: 0.174112


[4000]	valid_0's l1: 0.173543


[4100]	valid_0's l1: 0.17302


[4200]	valid_0's l1: 0.172452


[4300]	valid_0's l1: 0.171945


[4400]	valid_0's l1: 0.17145


[4500]	valid_0's l1: 0.170956


[4600]	valid_0's l1: 0.170465


[4700]	valid_0's l1: 0.169872


[4800]	valid_0's l1: 0.169466


[4900]	valid_0's l1: 0.169073


[5000]	valid_0's l1: 0.168663


[5100]	valid_0's l1: 0.168217


[5200]	valid_0's l1: 0.16786


[5300]	valid_0's l1: 0.167524


[5400]	valid_0's l1: 0.167091


[5500]	valid_0's l1: 0.166668


[5600]	valid_0's l1: 0.166352


[5700]	valid_0's l1: 0.165964


[5800]	valid_0's l1: 0.165559


[5900]	valid_0's l1: 0.165211


[6000]	valid_0's l1: 0.164892


[6100]	valid_0's l1: 0.16458


[6200]	valid_0's l1: 0.164279


[6300]	valid_0's l1: 0.164035


[6400]	valid_0's l1: 0.163764


[6500]	valid_0's l1: 0.163513


[6600]	valid_0's l1: 0.163245


[6700]	valid_0's l1: 0.16294


[6800]	valid_0's l1: 0.162729


[6900]	valid_0's l1: 0.162493


[7000]	valid_0's l1: 0.162154


[7100]	valid_0's l1: 0.161922


[7200]	valid_0's l1: 0.161699


[7300]	valid_0's l1: 0.161463


[7400]	valid_0's l1: 0.161225


[7500]	valid_0's l1: 0.160949


[7600]	valid_0's l1: 0.160754


[7700]	valid_0's l1: 0.160505


[7800]	valid_0's l1: 0.160281


[7900]	valid_0's l1: 0.16011


[8000]	valid_0's l1: 0.15996


[8100]	valid_0's l1: 0.15973


[8200]	valid_0's l1: 0.159494


[8300]	valid_0's l1: 0.159325


[8400]	valid_0's l1: 0.15908


[8500]	valid_0's l1: 0.158911


[8600]	valid_0's l1: 0.158696


[8700]	valid_0's l1: 0.158563


[8800]	valid_0's l1: 0.158372


[8900]	valid_0's l1: 0.158203


[9000]	valid_0's l1: 0.158069


[9100]	valid_0's l1: 0.157926


[9200]	valid_0's l1: 0.157802


[9300]	valid_0's l1: 0.157619


[9400]	valid_0's l1: 0.157465


[9500]	valid_0's l1: 0.157316


[9600]	valid_0's l1: 0.15717


[9700]	valid_0's l1: 0.157036


[9800]	valid_0's l1: 0.156885


[9900]	valid_0's l1: 0.15672


[10000]	valid_0's l1: 0.156577
Did not meet early stopping. Best iteration is:
[9995]	valid_0's l1: 0.156575


Fold 4 MAE (inspiratory): 0.1770

Fold 5/5


Training until validation scores don't improve for 50 rounds


[100]	valid_0's l1: 0.345581


[200]	valid_0's l1: 0.286888


[300]	valid_0's l1: 0.2651


[400]	valid_0's l1: 0.251937


[500]	valid_0's l1: 0.241391


[600]	valid_0's l1: 0.234327


[700]	valid_0's l1: 0.22811


[800]	valid_0's l1: 0.223122


[900]	valid_0's l1: 0.218159


[1000]	valid_0's l1: 0.214626


[1100]	valid_0's l1: 0.211206


[1200]	valid_0's l1: 0.208226


[1300]	valid_0's l1: 0.20561


[1400]	valid_0's l1: 0.203039


[1500]	valid_0's l1: 0.201038


[1600]	valid_0's l1: 0.19899


[1700]	valid_0's l1: 0.197076


[1800]	valid_0's l1: 0.194994


[1900]	valid_0's l1: 0.193246


[2000]	valid_0's l1: 0.191715


[2100]	valid_0's l1: 0.190094


[2200]	valid_0's l1: 0.189022


[2300]	valid_0's l1: 0.187732


[2400]	valid_0's l1: 0.186677


[2500]	valid_0's l1: 0.18545


[2600]	valid_0's l1: 0.18439


[2700]	valid_0's l1: 0.183327


[2800]	valid_0's l1: 0.182489


[2900]	valid_0's l1: 0.181494


[3000]	valid_0's l1: 0.180523


[3100]	valid_0's l1: 0.179662


[3200]	valid_0's l1: 0.178932


[3300]	valid_0's l1: 0.177991


[3400]	valid_0's l1: 0.17733


[3500]	valid_0's l1: 0.176537


[3600]	valid_0's l1: 0.175846


[3700]	valid_0's l1: 0.17514


[3800]	valid_0's l1: 0.174458


[3900]	valid_0's l1: 0.173845


[4000]	valid_0's l1: 0.173241


[4100]	valid_0's l1: 0.172672


[4200]	valid_0's l1: 0.172129


[4300]	valid_0's l1: 0.171636


[4400]	valid_0's l1: 0.171112


[4500]	valid_0's l1: 0.170687


[4600]	valid_0's l1: 0.170186


[4700]	valid_0's l1: 0.169747


[4800]	valid_0's l1: 0.169348


[4900]	valid_0's l1: 0.168945


[5000]	valid_0's l1: 0.168562


[5100]	valid_0's l1: 0.168215


[5200]	valid_0's l1: 0.167722


[5300]	valid_0's l1: 0.167257


[5400]	valid_0's l1: 0.166969


[5500]	valid_0's l1: 0.166477


[5600]	valid_0's l1: 0.166071


[5700]	valid_0's l1: 0.165699


[5800]	valid_0's l1: 0.165394


[5900]	valid_0's l1: 0.165133


[6000]	valid_0's l1: 0.164837


[6100]	valid_0's l1: 0.164522


[6200]	valid_0's l1: 0.164212


[6300]	valid_0's l1: 0.16395


[6400]	valid_0's l1: 0.163698


[6500]	valid_0's l1: 0.163439


[6600]	valid_0's l1: 0.163177


[6700]	valid_0's l1: 0.162831


[6800]	valid_0's l1: 0.162592


[6900]	valid_0's l1: 0.16235


[7000]	valid_0's l1: 0.162044


[7100]	valid_0's l1: 0.161809


[7200]	valid_0's l1: 0.16154


[7300]	valid_0's l1: 0.161301


[7400]	valid_0's l1: 0.16106


[7500]	valid_0's l1: 0.160857


[7600]	valid_0's l1: 0.160651


[7700]	valid_0's l1: 0.160438


[7800]	valid_0's l1: 0.160241


[7900]	valid_0's l1: 0.15997


[8000]	valid_0's l1: 0.159779


[8100]	valid_0's l1: 0.159585


[8200]	valid_0's l1: 0.159439


[8300]	valid_0's l1: 0.159221


[8400]	valid_0's l1: 0.159045


[8500]	valid_0's l1: 0.158857


[8600]	valid_0's l1: 0.158693


[8700]	valid_0's l1: 0.158513


[8800]	valid_0's l1: 0.158326


[8900]	valid_0's l1: 0.158163


[9000]	valid_0's l1: 0.158035


[9100]	valid_0's l1: 0.157854


[9200]	valid_0's l1: 0.157727


[9300]	valid_0's l1: 0.157574


[9400]	valid_0's l1: 0.15741


[9500]	valid_0's l1: 0.157263


[9600]	valid_0's l1: 0.157119


[9700]	valid_0's l1: 0.15696


[9800]	valid_0's l1: 0.15684


[9900]	valid_0's l1: 0.156746


[10000]	valid_0's l1: 0.156512
Did not meet early stopping. Best iteration is:
[10000]	valid_0's l1: 0.156512


Fold 5 MAE (inspiratory): 0.1769

Overall CV MAE: 0.1766 ± 0.0004
Individual folds: ['0.1766', '0.1767', '0.1760', '0.1770', '0.1769']
Improvement over baseline: 61.5%


In [8]:
# Analyze feature importance
feature_importance = pd.concat(feature_importance_list, ignore_index=True)

# Calculate mean importance across folds
mean_importance = feature_importance.groupby('feature')['importance'].mean().sort_values(ascending=False)

print("Top 20 most important features:")
print(mean_importance.head(20))

# Save feature importance
mean_importance.to_csv('/home/code/exp_001_feature_importance.csv')
print(f"\nFeature importance saved to: /home/code/exp_001_feature_importance.csv")

Top 20 most important features:
feature
pressure_lag_1        1.746752e+09
pressure_lag_2        3.691427e+08
u_out_lag_2           6.300612e+07
u_in_diff_1           3.503024e+07
u_in_diff_2           2.575988e+07
pressure_lag_5        2.052691e+07
u_in_lag_2            1.592686e+07
u_in_lag_1            1.171369e+07
time_step             6.889669e+06
R                     6.531747e+06
time_normalized       6.038115e+06
u_in_times_R          4.153061e+06
u_in_rolling_std_5    3.825203e+06
u_in_lag_3            3.367306e+06
time_since_start      2.724785e+06
time_cumsum           2.215127e+06
u_in_diff_5           1.929533e+06
u_in_lag_5            1.482421e+06
RC                    1.461192e+06
u_in                  1.444286e+06
Name: importance, dtype: float64

Feature importance saved to: /home/code/exp_001_feature_importance.csv


In [None]:
# Prepare test data for prediction
X_test = test_extended[feature_cols].copy()

# Fill NaN values
X_test = X_test.fillna(0)

print(f"Test data shape: {X_test.shape}")

# Train final model on full training data
print("\nTraining final model on full training data...")

final_train_data = lgb.Dataset(X, label=y)

final_model = lgb.train(
    params,
    final_train_data,
    num_boost_round=10000,
    callbacks=[
        lgb.early_stopping(50),
        lgb.log_evaluation(100)
    ]
)

# Make predictions on test set
test_predictions = final_model.predict(X_test, num_iteration=final_model.best_iteration)

print(f"Test predictions shape: {test_predictions.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

In [None]:
# Create submission file
submission = pd.DataFrame({
    'id': test_extended['id'],
    'pressure': test_predictions
})

# Ensure correct format
print(f"Submission shape: {submission.shape}")
print(f"\nFirst 5 rows:")
print(submission.head())
print(f"\nLast 5 rows:")
print(submission.tail())

# Save submission
submission_path = '/home/code/submission_candidates/exp_001_submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")