# Part 2: Time Series Features & Tree-Based Models

Extract basic time-series features from heart rate data, train Random Forest and XGBoost models, and compare their performance.

In [None]:
%pip install -r requirements.txt

## 1. Setup
Import necessary libraries.

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

## 2. Data Loading
Load the dataset.

In [2]:
def load_data(file_path):
    """
    Read the synthetic health dataset from disk.

    Args:
        file_path: Location of the CSV file to read

    Returns:
        DataFrame holding the file's contents, with the ``timestamp``
        column parsed as datetime
    """
    # Columns that pandas should convert to datetime while reading.
    datetime_columns = ["timestamp"]
    return pd.read_csv(file_path, parse_dates=datetime_columns)

In [None]:
#test_df = load_data('data/synthetic_health_data.csv')
#test_df

Unnamed: 0,patient_id,timestamp,age,systolic_bp,diastolic_bp,glucose_level,bmi,smoker_status,heart_rate,disease_outcome
0,1,2023-01-29 00:00:00.000000,57,113.063416,84.069561,117.475210,25.085796,no,62.719587,0
1,1,2023-01-31 07:33:55.507789,57,121.598849,89.672279,85.120875,24.120608,no,76.314434,0
2,1,2023-02-02 00:15:11.379377,57,126.623222,87.619685,,24.819332,no,62.427785,0
3,1,2023-02-04 09:37:12.589164,57,136.999366,89.199774,118.755648,25.039598,no,61.612981,0
4,1,2023-02-04 20:56:52.838198,57,127.546919,92.644673,98.882007,24.895024,no,77.649615,0
...,...,...,...,...,...,...,...,...,...,...
7321,150,2023-03-18 09:08:49.029823,54,115.038254,79.241741,84.586944,29.968156,no,73.599447,0
7322,150,2023-03-20 14:38:22.129593,54,116.389186,70.464818,91.476621,29.519510,no,64.162701,0
7323,150,2023-03-23 09:26:04.210673,54,123.419606,88.213054,96.985434,29.786678,no,71.641423,0
7324,150,2023-03-27 14:17:19.255961,54,,69.539940,85.670800,29.188655,no,72.781243,0


## 3. Feature Engineering

Implement `extract_rolling_features` to calculate the rolling mean and standard deviation of the `heart_rate` column over a time-based window.

In [5]:
def extract_rolling_features(df, window_size_seconds):
    """
    Add time-based rolling mean and standard deviation of heart rate.

    Rows are ordered by ``timestamp`` and the statistics are computed over a
    trailing window of ``window_size_seconds`` seconds. The window runs over
    the whole frame in timestamp order; rows are not grouped by any other
    column (e.g. a patient identifier).

    Args:
        df: DataFrame with timestamp (datetime) and heart_rate columns.
            The input frame is left unmodified.
        window_size_seconds: Size of the rolling window in seconds

    Returns:
        New DataFrame sorted by timestamp with a fresh integer index and
        added hr_rolling_mean and hr_rolling_std columns. Missing values in
        every column are back-filled from the next row that has a value;
        a gap with no later value (e.g. a trailing NaN) stays NaN.
    """
    # 1-2. Sort by time and index on it so the window can be expressed as a
    #      duration rather than a row count.
    by_time = df.sort_values('timestamp').set_index('timestamp')

    # 3. Build the time-based window once and derive both statistics from it.
    hr_window = by_time['heart_rate'].rolling(window=f'{window_size_seconds}s')

    # 4. Attach the new feature columns.
    by_time['hr_rolling_mean'] = hr_window.mean()
    by_time['hr_rolling_std'] = hr_window.std()

    # 5. Restore timestamp as a regular column.
    df_result = by_time.reset_index()

    # 6. Back-fill NaNs (the std of a single-observation window is NaN).
    #    DataFrame.bfill() replaces the deprecated fillna(method='bfill'),
    #    which emits a FutureWarning and is rejected by newer pandas.
    return df_result.bfill()

In [None]:

#test_rolling_df = extract_rolling_features(df, 1)
#test_rolling_df

  df_result = df_result.fillna(method='bfill')


Unnamed: 0,timestamp,patient_id,age,systolic_bp,diastolic_bp,glucose_level,bmi,smoker_status,heart_rate,disease_outcome,hr_rolling_mean,hr_rolling_std
0,2023-01-01 00:00:00.000000,9,42,109.271202,84.872277,112.205979,28.723817,former,65.241827,0,65.241827,7.187153
1,2023-01-01 00:00:00.000000,72,73,117.499246,80.624776,113.018091,22.543094,no,75.405997,0,70.323912,7.187153
2,2023-01-01 00:00:00.000000,131,26,125.396859,75.090352,108.999820,23.975127,yes,71.244470,0,70.630764,5.109800
3,2023-01-01 00:00:00.000000,116,54,124.532627,78.492678,78.559751,25.783712,no,76.660696,0,72.138247,5.147497
4,2023-01-02 00:00:00.000000,56,63,111.304514,67.796714,97.859274,27.611658,no,63.534376,0,63.534376,2.716776
...,...,...,...,...,...,...,...,...,...,...,...,...
7321,2023-07-20 21:01:11.701224,109,53,130.694575,74.261794,107.862114,21.732533,no,71.660907,0,71.660907,
7322,2023-07-23 01:45:47.772214,5,46,108.477241,88.742732,99.719007,33.010116,no,65.398427,0,65.398427,
7323,2023-07-23 06:55:40.341291,16,41,124.742455,83.196582,117.215513,27.210364,no,73.129839,0,73.129839,
7324,2023-08-03 10:54:11.976978,109,53,130.912449,69.467888,118.815102,22.172972,no,67.511809,0,67.511809,


## 4. Data Preparation
Implement `prepare_data_part2` using the newly engineered features.

In [8]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Build train/test splits from the frame that carries the rolling features.

    Args:
        df_with_features: DataFrame with original and rolling features
        test_size: Proportion of rows held out for testing
        random_state: Random seed for reproducibility

    Returns:
        X_train, X_test, y_train, y_test
    """
    target_col = 'disease_outcome'
    # Predictors: the raw measurements plus the two engineered rolling columns.
    predictor_cols = [
        'age',
        'systolic_bp',
        'diastolic_bp',
        'glucose_level',
        'bmi',
        'heart_rate',
        'hr_rolling_mean',
        'hr_rolling_std',
    ]

    # Keep only rows where every predictor and the target are present.
    complete_rows = df_with_features.dropna(subset=[*predictor_cols, target_col])

    # Shuffled random split of features and target.
    return tuple(
        train_test_split(
            complete_rows[predictor_cols],
            complete_rows[target_col],
            test_size=test_size,
            random_state=random_state,
            shuffle=True,
        )
    )

In [None]:
#X_train, X_test, y_train, y_test = prepare_data_part2(rolling_df)
#X_train
#X_test
#y_train
#y_test

## 5. Random Forest Model
Implement `train_random_forest`.

In [12]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Fit a Random Forest classifier on the training data.

    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of trees in the forest
        max_depth: Maximum depth of the trees
        random_state: Random seed for reproducibility

    Returns:
        The fitted RandomForestClassifier
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    # fit() trains in place and hands back the same estimator.
    forest.fit(X_train, y_train)
    return forest

In [None]:
#test_rf = train_random_forest(X_train, y_train)
#test_rf

## 6. XGBoost Model
Implement `train_xgboost`.

In [16]:
def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    """
    Fit an XGBoost classifier on the training data.

    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of boosting rounds
        learning_rate: Boosting learning rate
        max_depth: Maximum depth of a tree
        random_state: Random seed for reproducibility

    Returns:
        The fitted XGBClassifier
    """
    booster_params = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'random_state': random_state,
        'eval_metric': 'logloss',
    }
    booster = xgb.XGBClassifier(**booster_params)
    # fit() trains in place and hands back the same estimator.
    booster.fit(X_train, y_train)
    return booster

In [None]:
#test_xgb = train_xgboost(X_train, y_train)
#test_xgb

## 7. Model Comparison
Calculate and compare AUC scores for both models.

In [19]:
def model_comparison(rf_model, xgb_model, X_test, y_test):
    """
    Score both classifiers on the held-out data using ROC AUC.

    Args:
        rf_model: Random Forest model
        xgb_model: XGBoost model
        X_test: X test dataset
        y_test: y test dataset

    Returns:
        Tuple (rf_auc, xgb_auc) of AUC scores for the RF and XGB models
    """
    def _auc_for(model):
        # Column 1 of predict_proba is the probability of the second class.
        second_class_probs = model.predict_proba(X_test)[:, 1]
        return roc_auc_score(y_test, second_class_probs)

    return _auc_for(rf_model), _auc_for(xgb_model)

In [None]:
#test_rf_auc, test_xgb_auc = model_comparison(test_rf, test_xgb, X_test, y_test)

## 8. Save Results
Save the AUC scores to a text file.

In [23]:
def save_auc_scores(rf_auc, xgb_auc):
    """
    Write both AUC scores to 'results/results_part2.txt'.

    The 'results' directory is created when missing and an existing file
    is overwritten. Each score is written on its own line, formatted to
    four decimal places.

    Args:
        rf_auc: Random Forest AUC
        xgb_auc: XGBoost AUC

    Returns:
        None
    """
    # Make sure the output directory is there before opening the file.
    os.makedirs('results', exist_ok=True)

    # One line per model, four decimal places each.
    report = (
        f"Random Forest AUC: {rf_auc:.4f}\n"
        f"XGBoost AUC: {xgb_auc:.4f}\n"
    )

    with open('results/results_part2.txt', 'w') as out_file:
        out_file.write(report)

In [None]:
#save_auc_scores(test_rf_auc, test_xgb_auc)

## 9. Main Execution
Run the complete workflow.

In [27]:
# Main execution: run the full Part 2 workflow end to end.
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)

    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)

    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)

    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)

    # 5. Calculate AUC scores with the shared helper instead of repeating
    #    the predict_proba / roc_auc_score steps inline.
    rf_auc, xgb_auc = model_comparison(rf_model, xgb_model, X_test, y_test)

    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")

    # 6. Save results
    save_auc_scores(rf_auc, xgb_auc)

  df_result = df_result.fillna(method='bfill')


Random Forest AUC: 0.9758
XGBoost AUC: 0.9956
