# Install necessary packages

In [None]:
%pip install -r requirements.txt

# Part 2: Time Series Features & Tree-Based Models

**Objective:** Extract basic time-series features from heart rate data, train Random Forest and XGBoost models, and compare their performance.

## 1. Setup

Import necessary libraries.

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

## 2. Data Loading

Load the dataset.

In [3]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    df = pd.read_csv(file_path)
    
    return df

## 3. Feature Engineering

Implement `extract_rolling_features` to calculate rolling mean and standard deviation for the `heart_rate`.

In [4]:
def extract_rolling_features(df, window_size_seconds):
    """
    Calculate rolling mean and standard deviation for heart rate.
    
    Args:
        df: DataFrame with timestamp and heart_rate columns
        window_size_seconds: Size of the rolling window in seconds
        
    Returns:
        DataFrame with added hr_rolling_mean and hr_rolling_std columns
    """
     # 1. Sort by timestamp
    df_sorted = df.sort_values('timestamp')

    # 2. Convert 'timestamp' to datetime if not already
    if not np.issubdtype(df_sorted['timestamp'].dtype, np.datetime64):
        df_sorted['timestamp'] = pd.to_datetime(df_sorted['timestamp'])

    # 3. Set timestamp as index
    df_indexed = df_sorted.set_index('timestamp')

    # 4. Create rolling window and calculate statistics
    rolling_window = df_indexed['heart_rate'].rolling(window=f'{window_size_seconds}s')
    df_indexed['hr_rolling_mean'] = rolling_window.mean()
    df_indexed['hr_rolling_std'] = rolling_window.std()

    # 5. Reset index to bring 'timestamp' back as a column
    df_result = df_indexed.reset_index()

    # 6. Handle NaN values — here we use backward fill as an example
    df_result = df_result.fillna(method='bfill')

    return df_result


In [23]:
# Assuming you have a DataFrame with 'timestamp' and 'heart_rate' columns:
df_with_features = extract_rolling_features(df, window_size_seconds=60)
print(df_with_features[['timestamp', 'heart_rate', 'hr_rolling_mean', 'hr_rolling_std']].head())

   timestamp  heart_rate  hr_rolling_mean  hr_rolling_std
0 2023-01-01   65.241827        65.241827        7.187153
1 2023-01-01   75.405997        70.323912        7.187153
2 2023-01-01   71.244470        70.630764        5.109800
3 2023-01-01   76.660696        72.138247        5.147497
4 2023-01-02   63.534376        63.534376        2.716776


  df_result = df_result.fillna(method='bfill')


## 4. Data Preparation

Implement `prepare_data_part2` using the newly engineered features.

In [24]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Prepare data for modeling with time-series features.
    
    Args:
        df_with_features: DataFrame with original and rolling features
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # 1. Select relevant features (original + rolling)
    feature_columns = [
        'age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi',
        'heart_rate', 'hr_rolling_mean', 'hr_rolling_std'
    ]
    X = df_with_features[feature_columns]

    # 2. Select target variable
    y = df_with_features['disease_outcome']

    # 3. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 4. Handle missing values with mean imputation
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    return X_train, X_test, y_train, y_test

In [26]:
# Prepare the data
X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)

# Show basic info to confirm it worked
print("✅ Data preparation successful!")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Optionally, show the first row
print("\n🔍 First row of X_train:")
print(X_train[0])

✅ Data preparation successful!
X_train shape: (5860, 8)
X_test shape: (1466, 8)
y_train shape: (5860,)
y_test shape: (1466,)

🔍 First row of X_train:
[ 61.         123.36531426  76.21565544  89.52354595  22.99350618
  75.4382151   75.4382151    1.13331571]


## 5. Random Forest Model

Implement `train_random_forest`.

In [28]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Train a Random Forest classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of trees in the forest
        max_depth: Maximum depth of the trees
        random_state: Random seed for reproducibility
        
    Returns:
        Trained Random Forest model
    """
    # Initialize the model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state
    )

    # Train the model
    model.fit(X_train, y_train)

    return model

In [29]:
rf_model = train_random_forest(X_train, y_train)

# Show basic confirmation
print("✅ Random Forest model trained!")
print(f"Number of features used: {X_train.shape[1]}")
print(f"Feature importances: {rf_model.feature_importances_}")

✅ Random Forest model trained!
Number of features used: 8
Feature importances: [0.15229812 0.07483407 0.1966758  0.19108635 0.09438751 0.13900973
 0.11412556 0.03758286]


## 6. XGBoost Model

Implement `train_xgboost`.

In [31]:
def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    """
    Train an XGBoost classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of boosting rounds
        learning_rate: Boosting learning rate
        max_depth: Maximum depth of a tree
        random_state: Random seed for reproducibility
        
    Returns:
        Trained XGBoost model
    """
     # Initialize the XGBClassifier
    model = xgb.XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
        use_label_encoder=False,
        eval_metric='logloss'  # To suppress warning in classification
    )

    # Fit the model
    model.fit(X_train, y_train)

    return model

In [32]:
xgb_model = train_xgboost(X_train, y_train)

# Confirm training
print("✅ XGBoost model trained!")
print(f"Number of features: {X_train.shape[1]}")
print(f"First few feature importances: {xgb_model.feature_importances_[:5]}")

✅ XGBoost model trained!
Number of features: 8
First few feature importances: [0.09524207 0.07333629 0.22293133 0.17960101 0.07984374]


Parameters: { "use_label_encoder" } are not used.



## 7. Model Comparison

Calculate and compare AUC scores for both models.

In [33]:
def compare_models_auc(rf_model, xgb_model, X_test, y_test):
    """
    Compare AUC scores of Random Forest and XGBoost models.
    
    Args:
        rf_model: Trained Random Forest model
        xgb_model: Trained XGBoost model
        X_test: Test features
        y_test: True labels for the test set
    
    Returns:
        Dictionary with AUC scores for both models and the name of the better model
    """
    # 1. Generate probability predictions
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]

    # 2. Calculate AUC scores
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)

    # 3. Compare performance
    better_model = "Random Forest" if rf_auc > xgb_auc else "XGBoost"

    # Print results
    print("✅ AUC Score Comparison:")
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC:       {xgb_auc:.4f}")
    print(f"🏆 Better Model:    {better_model}")

    return {
        "random_forest_auc": rf_auc,
        "xgboost_auc": xgb_auc,
        "better_model": better_model
    }

In [34]:
results = compare_models_auc(rf_model, xgb_model, X_test, y_test)

✅ AUC Score Comparison:
Random Forest AUC: 0.9722
XGBoost AUC:       0.9967
🏆 Better Model:    XGBoost


## 8. Save Results

Save the AUC scores to a text file.

In [35]:
def save_auc_scores(results_dict, filename="results/results_part2.txt"):
    """
    Save AUC scores to a text file.
    
    Args:
        results_dict: Dictionary with AUC scores and model comparison
        filename: Path to the output file
    """
    # 1. Create 'results' directory if it doesn't exist
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # 2. Format AUC scores and model comparison
    lines = [
        f"Random Forest AUC: {results_dict['random_forest_auc']:.4f}",
        f"XGBoost AUC:       {results_dict['xgboost_auc']:.4f}",
        f"Better Model:      {results_dict['better_model']}"
    ]

    # 3. Write scores to file
    with open(filename, "w") as f:
        f.write("\n".join(lines))

    print(f"✅ AUC scores saved to {filename}")

In [36]:
save_auc_scores(results)

✅ AUC scores saved to results/results_part2.txt


## 9. Main Execution

Run the complete workflow.

In [37]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    
    # 5. Calculate AUC scores
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    
    # 6. Save results
    # (Your code for saving results)

  df_result = df_result.fillna(method='bfill')


Random Forest AUC: 0.9743
XGBoost AUC: 0.9965


Parameters: { "use_label_encoder" } are not used.

