# Install necessary packages

In [6]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Part 2: Time Series Features & Tree-Based Models

**Objective:** Extract basic time-series features from heart rate data, train Random Forest and XGBoost models, and compare their performance.

## 1. Setup

Import necessary libraries.

In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

## 2. Data Loading

Load the dataset.

In [8]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data with timestamp parsed as datetime
    """
    df = pd.read_csv(file_path, parse_dates = ["timestamp"])    
    return df  

## 3. Feature Engineering

Implement `extract_rolling_features` to calculate rolling mean and standard deviation for the `heart_rate`.

In [14]:
def extract_rolling_features(df, window_size_seconds):
    """
    Calculate rolling mean and standard deviation for heart rate.
    """
    df = df.sort_values("timestamp").set_index("timestamp")

    # Calculate rolling stats
    rolling = df["heart_rate"].rolling(f"{window_size_seconds}s")
    df["hr_rolling_mean"] = rolling.mean()
    df["hr_rolling_std"] = rolling.std()

    # Clean NaNs
    df["hr_rolling_mean"].fillna(method="bfill", inplace=True)
    df["hr_rolling_mean"].fillna(method="ffill", inplace=True)
    df["hr_rolling_std"].fillna(method="bfill", inplace=True)
    df["hr_rolling_std"].fillna(method="ffill", inplace=True)
    
    return df.reset_index().copy()


## 4. Data Preparation

Implement `prepare_data_part2` using the newly engineered features.

In [None]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Prepare data for modeling with time-series features.
    """
    # Select features and target
    features = ["age", "systolic_bp", "diastolic_bp", "glucose_level", "bmi", 
            "hr_rolling_mean", "hr_rolling_std"]
    X = df_with_features[features]
    y = df_with_features["disease_outcome"]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Handle missing values
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X.columns, index=X_train.index)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X.columns, index=X_test.index)

    return X_train, X_test, y_train, y_test

## 5. Random Forest Model

Implement `train_random_forest`.

In [15]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Train a Random Forest classifier.
    """
    model = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=random_state
    )
    model.fit(X_train, y_train)

    return model

## 6. XGBoost Model

Implement `train_xgboost`.

In [16]:
import xgboost as xgb

def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    model = xgb.XGBClassifier(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    random_state=random_state,
    use_label_encoder=False,
    eval_metric="logloss"
)
    model.fit(X_train, y_train)

    return model

## 7. Model Comparison

Calculate and compare AUC scores for both models.

In [17]:
def model_compare (rf_model, xgb_model, X_test, y_test):
    rf_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
    xgb_auc = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")

    return {"rf_auc": rf_auc, "xgb_auc": xgb_auc}

## 8. Save Results

Save the AUC scores to a text file.

In [19]:
def save_results(metrics):
    os.makedirs("results", exist_ok=True)

    result_str = "\n".join([f"{k}: {v:.4f}" for k, v in metrics.items()]) + "\n"

    with open("results/results_part2.txt", "w") as f:
        f.write(result_str)


## 9. Main Execution

Run the complete workflow.

In [20]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    
    # 5. Compare model performance
    metrics = model_compare(rf_model, xgb_model, X_test, y_test)
    
    # 6. Save results
    save_results(metrics)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["hr_rolling_mean"].fillna(method="bfill", inplace=True)
  df["hr_rolling_mean"].fillna(method="bfill", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["hr_rolling_mean"].fillna(method="ffill", inplace=True)
  df["hr_rolling_mean"].fillna(method="ffill", inplac

Random Forest AUC: 0.7808
XGBoost AUC: 0.7686


Parameters: { "use_label_encoder" } are not used.

