# Sepsis Prediction Model - Cloud Training

## Instructions
1. **Upload Data**: Click the 'Files' icon on the left sidebar and upload your `Dataset.csv` file.
2. **Run All**: Go to 'Runtime' > 'Run all'.
3. **Download Model**: Training saves two files: `sepsis_xgboost.pkl` (the trained model) and `sepsis_xgboost.pkl.features` (the list of feature names the model was trained on). Download BOTH.

In [None]:
# 1. Install Dependencies
!pip install xgboost scikit-learn pandas numpy joblib matplotlib

In [None]:
# 2. Imports
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report
import joblib
import os
import matplotlib.pyplot as plt

print("Libraries loaded.")

In [None]:
# 3. Data Loader Class (Engineers Features)
class DataLoader:
    """Load a sepsis CSV, impute and engineer features, and split it by patient.

    Intended call order: ``load_data()`` -> ``preprocess()`` -> ``split_data()``.
    The CSV must provide ``Patient_ID`` and ``Hour`` columns (used for
    ordering and grouping) and a ``SepsisLabel`` column (the target).
    """

    def __init__(self, filepath):
        """Remember *filepath*; nothing is read until ``load_data()`` is called."""
        self.filepath = filepath
        # Raw frame as read from disk; populated by load_data().
        self.raw_df = None
        # Imputed + feature-engineered frame; populated by preprocess().
        self.processed_df = None

    def load_data(self):
        """Read the CSV at ``self.filepath`` into ``self.raw_df`` and return it."""
        print(f"Loading data from {self.filepath}...")
        self.raw_df = pd.read_csv(self.filepath)
        print(f"Data loaded: {self.raw_df.shape}")
        return self.raw_df

    def preprocess(self):
        """Impute missing values and add per-patient time-series features.

        Steps:
          1. Sort rows by ``Patient_ID`` then ``Hour``.
          2. Forward-fill within each patient, then fill what is left with
             the column median.
          3. Derive ``MAP`` from ``SBP``/``DBP`` when ``MAP`` is absent.
          4. For every vital present, add ``<col>_Lag1``, ``<col>_Delta`` and
             ``<col>_RollMean6h`` (6-row rolling mean within the patient).

        Returns:
            The processed DataFrame (also stored on ``self.processed_df``).

        Raises:
            RuntimeError: if ``load_data()`` has not been called yet.
        """
        if self.raw_df is None:
            raise RuntimeError(
                "No data loaded: call load_data() before preprocess()."
            )

        print("Preprocessing data...")
        df = self.raw_df.copy()
        df.sort_values(by=['Patient_ID', 'Hour'], inplace=True)

        # 1. Imputation
        print("Imputing missing values...")
        # groupby(...).ffill() returns the frame without the grouping column,
        # so keep a handle on it and re-attach it (aligned on the index).
        patient_ids = df['Patient_ID']
        df = df.groupby('Patient_ID').ffill()
        df['Patient_ID'] = patient_ids
        # numeric_only=True: without it, pandas >= 2.0 raises TypeError as
        # soon as the CSV contains any non-numeric column.
        # NOTE(review): these medians are computed over the whole dataset,
        # i.e. before split_data() separates train/val/test, so held-out rows
        # influence the fill values. SepsisLabel is imputed here as well.
        df = df.fillna(df.median(numeric_only=True))

        # 2. Feature Engineering
        print("Engineering Time-Series Features...")
        vitals = ['HR', 'MAP', 'SBP', 'O2Sat', 'Temp', 'Resp']
        if 'MAP' not in df.columns and 'SBP' in df.columns and 'DBP' in df.columns:
            df['MAP'] = (df['SBP'] + 2 * df['DBP']) / 3

        existing_vitals = [v for v in vitals if v in df.columns]

        for col in existing_vitals:
            per_patient = df.groupby('Patient_ID')[col]

            # Previous reading of the same patient; NaN on a patient's first row.
            lag = per_patient.shift(1)
            # Cold-start assumption: a first row has "no change", so its lag
            # is the current value and its delta is 0.
            df[f'{col}_Lag1'] = lag.fillna(df[col])
            df[f'{col}_Delta'] = (df[col] - lag).fillna(0)

            # Rolling mean over up to 6 rows of the same patient. The result
            # is indexed by (Patient_ID, original index); dropping level 0
            # lets the assignment align on the original index.
            rolled = per_patient.rolling(window=6, min_periods=1).mean()
            df[f'{col}_RollMean6h'] = rolled.reset_index(level=0, drop=True)
            df[f'{col}_RollMean6h'] = df[f'{col}_RollMean6h'].fillna(df[col])

        self.processed_df = df
        print(f"Preprocessing complete. Features: {df.shape[1]}")
        return self.processed_df

    def split_data(self, test_size=0.2, val_size=0.1):
        """Split the processed data into train/val/test without sharing patients.

        Args:
            test_size: fraction of patients held out for the test set.
            val_size: fraction of patients (relative to the full dataset)
                held out for validation.

        Returns:
            ``(X_train, y_train, X_val, y_val, X_test, y_test)`` where the
            ``X`` frames exclude ``SepsisLabel`` and ``Patient_ID``.

        Raises:
            RuntimeError: if ``preprocess()`` has not been called yet.
            ValueError: if the fractions are not positive or leave no room
                for a training set.
        """
        if self.processed_df is None:
            raise RuntimeError(
                "No processed data: call preprocess() before split_data()."
            )
        if not (0 < test_size < 1 and 0 < val_size and test_size + val_size < 1):
            raise ValueError(
                "test_size and val_size must be positive fractions whose sum "
                f"is below 1 (got test_size={test_size}, val_size={val_size})."
            )

        print("Splitting data by Patient_ID...")
        X = self.processed_df.drop(columns=['SepsisLabel', 'Patient_ID'])
        y = self.processed_df['SepsisLabel']
        groups = self.processed_df['Patient_ID']

        # First cut: carve the test patients out of the full dataset.
        splitter_test = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=42)
        train_val_idx, test_idx = next(splitter_test.split(X, y, groups))
        X_train_val = X.iloc[train_idx] if False else X.iloc[train_val_idx]
        y_train_val = y.iloc[train_val_idx]
        groups_train_val = groups.iloc[train_val_idx]
        X_test = X.iloc[test_idx]
        y_test = y.iloc[test_idx]

        # Second cut: val_size is expressed relative to the full dataset, so
        # rescale it to the share of what remains after removing the test set.
        relative_val_size = val_size / (1 - test_size)
        splitter_val = GroupShuffleSplit(test_size=relative_val_size, n_splits=1, random_state=42)
        train_idx, val_idx = next(splitter_val.split(X_train_val, y_train_val, groups_train_val))

        X_train = X_train_val.iloc[train_idx]
        y_train = y_train_val.iloc[train_idx]
        X_val = X_train_val.iloc[val_idx]
        y_val = y_train_val.iloc[val_idx]

        print(f"Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")
        return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
# 4. Training Class (Corrected: No Early Stopping)
class SepsisModel:
    """Train, evaluate and persist an XGBoost binary classifier.

    ``train()`` must be called before ``evaluate()`` or ``save_model()``.
    """

    def __init__(self):
        # Fitted xgb.XGBClassifier; stays None until train() has run.
        self.model = None

    def _require_model(self):
        """Raise a clear error when a method needs a model that is not there."""
        if self.model is None:
            raise RuntimeError("No trained model: call train() first.")

    def train(self, X_train, y_train, X_val, y_val):
        """Fit the classifier on the training split.

        The validation split is passed only as an evaluation set so its AUC
        is reported per boosting round; no early stopping is configured.

        Args:
            X_train, y_train: training features and 0/1 labels.
            X_val, y_val: validation features and 0/1 labels.

        Raises:
            ValueError: if ``y_train`` contains no positive (1) samples, in
                which case the class-balance weight would be undefined.
        """
        print("Initializing XGBoost...")
        n_negative = int(np.sum(y_train == 0))
        n_positive = int(np.sum(y_train == 1))
        if n_positive == 0:
            raise ValueError(
                "y_train contains no positive samples; cannot compute "
                "scale_pos_weight."
            )
        # Weight positives by the negative/positive ratio to offset class
        # imbalance.
        ratio = n_negative / n_positive

        # use_label_encoder is intentionally not passed: current XGBoost
        # releases ignore it and only emit an "unused parameter" warning.
        self.model = xgb.XGBClassifier(
            objective='binary:logistic',
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            scale_pos_weight=ratio,
            eval_metric='auc',
            random_state=42,
        )
        print("Training...")
        self.model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=True,
        )

    def evaluate(self, X_test, y_test):
        """Print test-set metrics and return them.

        Returns:
            A dict with keys ``'auc'``, ``'auprc'`` and ``'confusion_matrix'``
            holding the same values that are printed.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        self._require_model()
        print("\nEvaluating...")
        y_probs = self.model.predict_proba(X_test)[:, 1]
        y_preds = self.model.predict(X_test)

        auc = roc_auc_score(y_test, y_probs)
        auprc = average_precision_score(y_test, y_probs)
        matrix = confusion_matrix(y_test, y_preds)
        print(f"Test AUC: {auc:.4f}")
        print(f"Test AUPRC: {auprc:.4f}")
        print(matrix)
        return {'auc': auc, 'auprc': auprc, 'confusion_matrix': matrix}

    def save_model(self, path='sepsis_xgboost.pkl'):
        """Dump the model to *path* and its feature names to ``path + '.features'``.

        Saving the feature names is best-effort: a failure there is reported
        as a warning and does not undo or fail the model dump.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        self._require_model()
        print(f"Saving to {path}...")
        joblib.dump(self.model, path)

        # SAVE FEATURES
        features_path = path + ".features"
        try:
            feature_names = self.model.get_booster().feature_names
            if feature_names is None:
                print("Warning: model has no feature names; "
                      "the feature signature file will contain None.")
            joblib.dump(feature_names, features_path)
            print(f"Feature signature saved to {path}.features")
        except Exception as e:  # deliberate best-effort, see docstring
            print(f"Warning: Could not save feature names: {e}")

In [None]:
# 5. Execution Pipeline
if __name__ == "__main__":
    # Run the full pipeline only when the uploaded dataset is present in the
    # working directory; otherwise tell the user how to provide it.
    if os.path.exists('Dataset.csv'):
        # Load -> preprocess -> patient-level split.
        loader = DataLoader('Dataset.csv')
        loader.load_data()
        loader.preprocess()
        splits = loader.split_data()
        X_train, y_train, X_val, y_val, X_test, y_test = splits

        # Fit, report test metrics, then persist model + feature signature.
        trainer = SepsisModel()
        trainer.train(X_train, y_train, X_val, y_val)
        trainer.evaluate(X_test, y_test)
        trainer.save_model('sepsis_xgboost.pkl')
        print("Done! Please download sepsis_xgboost.pkl AND sepsis_xgboost.pkl.features")
    else:
        print("ERROR: Dataset.csv not found. Please upload it using the Files tab.")