# Experiment 001: Extended Temporal and Physics-Informed Features

**Goal**: Improve baseline by adding longer temporal features and physics-informed features.
**Target**: 0.35-0.40 MAE (20-30% improvement over baseline 0.4589)

**Key improvements**:
1. Extended lags: u_in_lag_5, u_in_lag_10, u_out_lag_5, u_out_lag_10
2. Pressure lags (target encoding): pressure_lag_1, pressure_lag_2, pressure_lag_5
3. Rate of change: u_in_diff_2, u_in_diff_5
4. EMAs: u_in_ewm_5, u_in_ewm_10
5. Physics features: RC = R*C, time_normalized = time_step/(R*C), stiffness = 1/C
6. GroupKFold CV to prevent breath leakage

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

In [2]:
# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Training data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"\nColumns: {list(train.columns)}")

Training data shape: (5432400, 8)
Test data shape: (603600, 7)

Columns: ['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure']


In [3]:
# Create extended temporal features
def create_extended_features(df):
    """Create extended temporal and physics-informed features"""
    
    # Sort by breath_id and time_step to ensure proper ordering
    df = df.sort_values(['breath_id', 'time_step']).reset_index(drop=True)
    
    # ----- Extended Lags -----
    for lag in [1, 2, 3, 5, 10]:
        df[f'u_in_lag_{lag}'] = df.groupby('breath_id')['u_in'].shift(lag)
        df[f'u_out_lag_{lag}'] = df.groupby('breath_id')['u_out'].shift(lag)
    
    # ----- Pressure Lags (Target Encoding) -----
    # Only for training data
    if 'pressure' in df.columns:
        for lag in [1, 2, 5]:
            df[f'pressure_lag_{lag}'] = df.groupby('breath_id')['pressure'].shift(lag)
    
    # ----- Rate of Change Features -----
    df['u_in_diff_1'] = df.groupby('breath_id')['u_in'].diff(1)
    df['u_in_diff_2'] = df.groupby('breath_id')['u_in'].diff(2)
    df['u_in_diff_5'] = df.groupby('breath_id')['u_in'].diff(5)
    
    # ----- Rolling Statistics with Different Windows -----
    for window in [5, 10, 20]:
        df[f'u_in_rolling_mean_{window}'] = df.groupby('breath_id')['u_in'].rolling(window, min_periods=1).mean().reset_index(0, drop=True)
        df[f'u_in_rolling_std_{window}'] = df.groupby('breath_id')['u_in'].rolling(window, min_periods=1).std().reset_index(0, drop=True)
        df[f'u_in_rolling_max_{window}'] = df.groupby('breath_id')['u_in'].rolling(window, min_periods=1).max().reset_index(0, drop=True)
        df[f'u_in_rolling_min_{window}'] = df.groupby('breath_id')['u_in'].rolling(window, min_periods=1).min().reset_index(0, drop=True)
    
    # ----- Exponential Moving Averages -----
    for span in [5, 10, 20]:
        df[f'u_in_ewm_{span}'] = df.groupby('breath_id')['u_in'].ewm(span=span, min_periods=1).mean().reset_index(0, drop=True)
    
    # ----- Cumulative Features -----
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    df['time_cumsum'] = df.groupby('breath_id')['time_step'].cumsum()
    
    # ----- Physics-Informed Features -----
    # RC time constant (lung mechanics)
    df['RC'] = df['R'] * df['C']
    
    # Normalized time (time relative to lung time constant)
    df['time_normalized'] = df['time_step'] / (df['R'] * df['C'])
    
    # Lung stiffness (inverse of compliance)
    df['stiffness'] = 1.0 / df['C']
    
    # Work done (integral of pressure * flow) - approximate with u_in
    df['u_in_integral'] = df.groupby('breath_id')['u_in'].cumsum() * df['time_step']
    
    # ----- Interaction Features -----
    df['u_in_times_R'] = df['u_in'] * df['R']
    df['u_in_times_C'] = df['u_in'] * df['C']
    df['u_in_times_RC'] = df['u_in'] * df['RC']
    
    # Time since start of breath
    df['time_since_start'] = df['time_step'] - df.groupby('breath_id')['time_step'].transform('first')
    
    # Breath position (0 to 1 within breath)
    df['breath_position'] = df.groupby('breath_id').cumcount() / df.groupby('breath_id').size()
    
    return df

print("Creating extended features for training data...")
train_extended = create_extended_features(train.copy())

print("Creating extended features for test data...")
test_extended = create_extended_features(test.copy())

print(f"\nTraining data with extended features: {train_extended.shape}")
print(f"Test data with extended features: {test_extended.shape}")

Creating extended features for training data...


Creating extended features for test data...



Training data with extended features: (5432400, 50)
Test data with extended features: (603600, 46)


In [4]:
# Define feature columns (exclude IDs, target, and any columns with too many NaNs)
exclude_cols = ['id', 'breath_id', 'pressure', 'u_in_lag_10', 'u_out_lag_10']

feature_cols = [col for col in train_extended.columns if col not in exclude_cols]

print(f"Number of features: {len(feature_cols)}")
print(f"\nSample features: {feature_cols[:20]}")

Number of features: 45

Sample features: ['R', 'C', 'time_step', 'u_in', 'u_out', 'u_in_lag_1', 'u_out_lag_1', 'u_in_lag_2', 'u_out_lag_2', 'u_in_lag_3', 'u_out_lag_3', 'u_in_lag_5', 'u_out_lag_5', 'pressure_lag_1', 'pressure_lag_2', 'pressure_lag_5', 'u_in_diff_1', 'u_in_diff_2', 'u_in_diff_5', 'u_in_rolling_mean_5']


In [None]:
# Prepare data for training
X = train_extended[feature_cols].copy()
y = train_extended['pressure'].copy()

# Create groups for GroupKFold (one group per breath)
groups = train_extended['breath_id']

print(f"Training data shape: X={X.shape}, y={y.shape}")
print(f"Number of unique breaths: {groups.nunique()}")

# Check for NaN values
nan_counts = X.isnull().sum()
if nan_counts.sum() > 0:
    print(f"\nColumns with NaN values:")
    print(nan_counts[nan_counts > 0].sort_values(ascending=False).head())
    
    # Fill NaN values with 0 (common approach for lag features)
    X = X.fillna(0)
    print("Filled NaN values with 0")
else:
    print("\nNo NaN values found")

In [None]:
# Set up GroupKFold cross-validation
gkf = GroupKFold(n_splits=5)

# Create splits
folds = list(gkf.split(X, y, groups))

print("Cross-validation splits created:")
for i, (train_idx, val_idx) in enumerate(folds):
    train_breaths = groups.iloc[train_idx].nunique()
    val_breaths = groups.iloc[val_idx].nunique()
    print(f"Fold {i+1}: Train={train_breaths} breaths, Val={val_breaths} breaths")

In [None]:
# Train LightGBM model with cross-validation
fold_scores = []
predictions = np.zeros(len(train_extended))
feature_importance_list = []

print("Training LightGBM model with 5-fold GroupKFold...")

for fold, (train_idx, val_idx) in enumerate(folds):
    print(f"\nFold {fold + 1}/5")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    
    # Model parameters
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': 42
    }
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=10000,
        valid_sets=[val_data],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )
    
    # Predict on validation set
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    
    # Calculate MAE (focusing on inspiratory phase where u_out=0)
    val_data_fold = train_extended.iloc[val_idx]
    inspiratory_mask = val_data_fold['u_out'] == 0
    
    if inspiratory_mask.sum() > 0:
        fold_score = mean_absolute_error(y_val[inspiratory_mask], val_pred[inspiratory_mask])
        print(f"Fold {fold + 1} MAE (inspiratory): {fold_score:.4f}")
        fold_scores.append(fold_score)
    else:
        print(f"Fold {fold + 1}: No inspiratory samples in validation set")
    
    # Store predictions
    predictions[val_idx] = val_pred
    
    # Store feature importance
    importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importance(importance_type='gain'),
        'fold': fold + 1
    })
    feature_importance_list.append(importance)

# Calculate overall CV score
if len(fold_scores) > 0:
    cv_score = np.mean(fold_scores)
    cv_std = np.std(fold_scores)
    print(f"\n{'='*50}")
    print(f"Overall CV MAE: {cv_score:.4f} Â± {cv_std:.4f}")
    print(f"Individual folds: {[f'{s:.4f}' for s in fold_scores]}")
    
    # Compare to baseline
    baseline_score = 0.4589
    improvement = (baseline_score - cv_score) / baseline_score * 100
    print(f"Improvement over baseline: {improvement:.1f}%")
else:
    print("\nNo valid folds with inspiratory samples found")

In [None]:
# Analyze feature importance
feature_importance = pd.concat(feature_importance_list, ignore_index=True)

# Calculate mean importance across folds
mean_importance = feature_importance.groupby('feature')['importance'].mean().sort_values(ascending=False)

print("Top 20 most important features:")
print(mean_importance.head(20))

# Save feature importance
mean_importance.to_csv('/home/code/exp_001_feature_importance.csv')
print(f"\nFeature importance saved to: /home/code/exp_001_feature_importance.csv")

In [None]:
# Prepare test data for prediction
X_test = test_extended[feature_cols].copy()

# Fill NaN values
X_test = X_test.fillna(0)

print(f"Test data shape: {X_test.shape}")

# Train final model on full training data
print("\nTraining final model on full training data...")

final_train_data = lgb.Dataset(X, label=y)

final_model = lgb.train(
    params,
    final_train_data,
    num_boost_round=10000,
    callbacks=[
        lgb.early_stopping(50),
        lgb.log_evaluation(100)
    ]
)

# Make predictions on test set
test_predictions = final_model.predict(X_test, num_iteration=final_model.best_iteration)

print(f"Test predictions shape: {test_predictions.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

In [None]:
# Create submission file
submission = pd.DataFrame({
    'id': test_extended['id'],
    'pressure': test_predictions
})

# Ensure correct format
print(f"Submission shape: {submission.shape}")
print(f"\nFirst 5 rows:")
print(submission.head())
print(f"\nLast 5 rows:")
print(submission.tail())

# Save submission
submission_path = '/home/code/submission_candidates/exp_001_submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")