# Ventilator Pressure Prediction - Baseline Model

This notebook implements a baseline LightGBM model for predicting ventilator pressure.

## Approach
- Use LightGBM with time series features
- Basic feature engineering: lags, rolling statistics
- 5-fold cross-validation split by breath (all time steps of a breath stay in the same fold)
- Predict pressure for each time step

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings
# raised by any library are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility.
# SEED is reused later for the KFold shuffle and LightGBM's `random_state`.
SEED = 42
np.random.seed(SEED)  # seeds NumPy's global random state

## Load Data

In [2]:
# Read the train and test CSV files into DataFrames
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

# Report shapes first, then column names, for both splits
train_columns = train_df.columns.tolist()
test_columns = test_df.columns.tolist()
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Training columns: {train_columns}")
print(f"Test columns: {test_columns}")

# Preview the first rows of the training frame
print("\nTraining data info:")
print(train_df.head())

# Count the distinct breath_id values in each split
n_train_breaths = train_df['breath_id'].nunique()
n_test_breaths = test_df['breath_id'].nunique()
print(f"\nUnique breaths in train: {n_train_breaths}")
print(f"Unique breaths in test: {n_test_breaths}")

Training data shape: (5432400, 8)
Test data shape: (603600, 7)
Training columns: ['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure']
Test columns: ['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out']

Training data info:
   id  breath_id  R   C  time_step      u_in  u_out  pressure
0   1      85053  5  10   0.000000  4.174419      0  6.118700
1   2      85053  5  10   0.033812  7.050149      0  5.907794
2   3      85053  5  10   0.067497  7.564931      0  7.313837
3   4      85053  5  10   0.101394  8.103306      0  8.227765
4   5      85053  5  10   0.135344  8.502619      0  9.422901

Unique breaths in train: 67905
Unique breaths in test: 7545


## Feature Engineering

Create basic time series features:
- Lag features (previous `u_in` and `u_out` values within the same breath)
- Rolling statistics
- Interaction features

In [None]:
def create_features(df, is_train=True):
    """Build per-breath time-series features.

    The frame is sorted by ``breath_id`` and ``time_step`` and re-indexed, then
    extended with: lagged ``u_in``/``u_out`` values, rolling mean/std of
    ``u_in``, the first difference and cumulative sum of ``u_in``, products of
    ``R``, ``C`` and ``u_in``, and the time elapsed since the first step of the
    breath. Every sequential feature is computed within its ``breath_id`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``u_out``, ``R``
        and ``C``. The caller's frame is not modified; a sorted copy with the
        extra columns is returned.
    is_train : bool, default True
        Accepted so existing call sites keep working; it is not used, because
        no feature is derived from the target column.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The sorted frame with the feature columns added, and the names of all
        of its columns except ``id``, ``breath_id`` and ``pressure``.
    """
    # sort_values returns a new frame, so the input is left untouched.
    df = df.sort_values(['breath_id', 'time_step']).reset_index(drop=True)

    # Group once and reuse the grouped views instead of re-grouping per feature.
    by_breath = df.groupby('breath_id')
    u_in_by_breath = by_breath['u_in']
    u_out_by_breath = by_breath['u_out']

    # Lag features: the first `lag` rows of a breath have no predecessor -> 0.
    for lag in [1, 2, 3]:
        df[f'u_in_lag_{lag}'] = u_in_by_breath.shift(lag).fillna(0)
        df[f'u_out_lag_{lag}'] = u_out_by_breath.shift(lag).fillna(0)

    # Rolling statistics of u_in. The grouped rolling result is indexed by
    # (breath_id, row label); dropping level 0 restores row-label alignment.
    for window in [5, 10]:
        rolling = u_in_by_breath.rolling(window, min_periods=1)
        df[f'u_in_rolling_mean_{window}'] = rolling.mean().reset_index(level=0, drop=True)
        # The sample std of a single observation is NaN (first row of each
        # breath); fill with 0 like the other sequential features.
        df[f'u_in_rolling_std_{window}'] = (
            rolling.std().reset_index(level=0, drop=True).fillna(0)
        )

    # Rate of change of u_in (0 on the first row of each breath)
    df['u_in_diff'] = u_in_by_breath.diff().fillna(0)

    # Interaction features
    df['R_C_interaction'] = df['R'] * df['C']
    df['u_in_R_interaction'] = df['u_in'] * df['R']
    df['u_in_C_interaction'] = df['u_in'] * df['C']

    # Time since start of breath: vectorized group minimum instead of a
    # per-group Python lambda.
    df['time_since_start'] = df['time_step'] - by_breath['time_step'].transform('min')

    # Cumulative sum of u_in within breath
    df['u_in_cumsum'] = u_in_by_breath.cumsum()

    # Everything except identifiers and the target is a model feature.
    excluded = {'id', 'breath_id', 'pressure'}
    feature_cols = [col for col in df.columns if col not in excluded]

    return df, feature_cols

# Build the engineered feature set for the training split
print("Creating features for training data...")
train_df, feature_cols = create_features(train_df, is_train=True)

# Apply the same transformation to the test split; its column list is not needed
print("Creating features for test data...")
test_df = create_features(test_df, is_train=False)[0]

n_features = len(feature_cols)
feature_preview = feature_cols[:10]  # only the first 10 names are shown
print(f"Number of features: {n_features}")
print(f"Feature columns: {feature_preview}...")

## Prepare Data for Training

In [None]:
# Assemble the regression target and the model inputs
y = train_df['pressure']
X = train_df[feature_cols]

print(f"Training features shape: {X.shape}")
print(f"Training target shape: {y.shape}")

# Report how many NaNs are present before imputation
n_missing_features = X.isnull().sum().sum()
n_missing_target = y.isnull().sum()
print(f"\nMissing values in features: {n_missing_features}")
print(f"Missing values in target: {n_missing_target}")

# Replace any remaining NaNs with 0 in both the train and test features
X = X.fillna(0)
test_df[feature_cols] = test_df[feature_cols].fillna(0)

## Cross-Validation Setup

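Split with KFold over the unique `breath_id`s rather than over individual rows. The time steps within a breath are sequential and strongly correlated, while breaths are treated as independent of one another, so keeping every breath entirely inside one fold prevents leakage between the training and validation sets.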

In [None]:
# Breath-level cross-validation: KFold partitions the unique breath_ids, and
# each partition is then expanded to the row labels of the breaths it contains.
n_splits = 5
breath_ids = train_df['breath_id'].unique()

kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

folds = []
row_breath = train_df['breath_id']
for train_idx, val_idx in kf.split(breath_ids):
    # Boolean row masks for the breaths assigned to each side of the split
    in_train = row_breath.isin(breath_ids[train_idx]).to_numpy()
    in_val = row_breath.isin(breath_ids[val_idx]).to_numpy()

    # Store row labels (not breath ids) for each side
    folds.append((train_df.index[in_train], train_df.index[in_val]))

print(f"Created {len(folds)} folds")
print(f"Total breaths: {len(breath_ids)}")
for fold_no, (train_idx, val_idx) in enumerate(folds, start=1):
    train_breath_count = train_df.loc[train_idx, 'breath_id'].unique().size
    val_breath_count = train_df.loc[val_idx, 'breath_id'].unique().size
    print(f"Fold {fold_no}: {train_breath_count} train breaths, {val_breath_count} val breaths")

## Train Model with Cross-Validation

In [None]:
# Hyper-parameters handed to lgb.train for every fold
lgb_params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 100,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': SEED,
    'n_jobs': -1
}

# Out-of-fold predictions, fold-averaged test predictions, per-fold MAE
oof_predictions = np.zeros(len(train_df))
test_predictions = np.zeros(len(test_df))
fold_scores = []

# The test feature matrix is identical for every fold, so slice it once.
X_test = test_df[feature_cols]

print("Training LightGBM model with cross-validation...")

for fold, (train_idx, val_idx) in enumerate(folds):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Rows of this fold; the stored labels equal positions because train_df
    # was re-indexed from 0 during feature engineering.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Fit with early stopping monitored on this fold's validation rows
    model = lgb.train(
        lgb_params,
        lgb.Dataset(X_train, label=y_train),
        num_boost_round=10000,
        valid_sets=[lgb.Dataset(X_val, label=y_val)],
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(100)
        ]
    )
    best_iter = model.best_iteration

    # Out-of-fold predictions for the held-out rows
    val_pred = model.predict(X_val, num_iteration=best_iter)
    oof_predictions[val_idx] = val_pred

    # Fold-level MAE
    val_score = mean_absolute_error(y_val, val_pred)
    fold_scores.append(val_score)
    print(f"Fold {fold + 1} MAE: {val_score:.4f}")

    # Each fold contributes an equal share to the averaged test prediction
    test_pred = model.predict(X_test, num_iteration=best_iter)
    test_predictions += test_pred / n_splits

# MAE over all out-of-fold predictions, plus the per-fold spread
cv_score = mean_absolute_error(y, oof_predictions)
print(f"\nOverall CV MAE: {cv_score:.4f}")
print(f"Fold scores: {fold_scores}")
print(f"Mean ± Std: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")

## Feature Importance

In [None]:
# Gain-based importance taken from `model`, which at this point is the booster
# left over from the final iteration of the training loop (last fold only).
gain = model.feature_importance(importance_type='gain')
feature_importance = (
    pd.DataFrame({'feature': feature_cols, 'importance': gain})
    .sort_values('importance', ascending=False)
)

print("Top 15 most important features:")
print(feature_importance.head(15))

## Create Submission

In [None]:
# Pair each test row id with its fold-averaged prediction.
# test_df was re-sorted by (breath_id, time_step) during feature engineering and
# test_predictions follows that same row order, so ids and predictions line up
# even though the rows are no longer in their original file order.
submission = pd.DataFrame({
    'id': test_df['id'],
    'pressure': test_predictions
})

# Load the sample first so the submission can be validated against it and
# aligned to its row order before anything is written to disk.
sample_submission = pd.read_csv('/home/data/sample_submission.csv')

# A set comparison alone cannot detect duplicated or missing rows, so also
# require equal length and unique ids.
ids_match = (
    len(submission) == len(sample_submission)
    and submission['id'].is_unique
    and set(submission['id']) == set(sample_submission['id'])
)

if ids_match:
    # Emit rows in exactly the sample's id order, so a grader that compares
    # files positionally sees the same layout as one that joins on id.
    submission = (
        submission.set_index('id')
        .loc[sample_submission['id']]
        .reset_index()
    )

# Ensure the submission has the correct format
print(f"Submission shape: {submission.shape}")
print(f"Submission head:")
print(submission.head())

# Save submission
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Show the sample for a visual comparison of the format
print(f"\nSample submission shape: {sample_submission.shape}")
print(f"Sample submission head:")
print(sample_submission.head())

# Check if IDs match
if ids_match:
    print("✓ Submission IDs match sample submission IDs")
else:
    print("⚠ Warning: Submission IDs don't match sample submission IDs")