In [1]:
# Install necessary packages
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



# Part 2: Time Series Features & Tree-Based Models

**Objective:** Extract basic time-series features from heart rate data, train Random Forest and XGBoost models, and compare their performance.

## 1. Setup

Import necessary libraries.


In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
import json



## 2. Data Loading

Load the dataset.


In [3]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data with timestamp parsed as datetime
    """
    return pd.read_csv(file_path, parse_dates=True)
df = load_data('data/synthetic_health_data.csv')
df

Unnamed: 0,patient_id,timestamp,age,systolic_bp,diastolic_bp,glucose_level,bmi,smoker_status,heart_rate,disease_outcome
0,1,2023-01-29 00:00:00.000000,57,113.063416,84.069561,117.475210,25.085796,no,62.719587,0
1,1,2023-01-31 07:33:55.507789,57,121.598849,89.672279,85.120875,24.120608,no,76.314434,0
2,1,2023-02-02 00:15:11.379377,57,126.623222,87.619685,,24.819332,no,62.427785,0
3,1,2023-02-04 09:37:12.589164,57,136.999366,89.199774,118.755648,25.039598,no,61.612981,0
4,1,2023-02-04 20:56:52.838198,57,127.546919,92.644673,98.882007,24.895024,no,77.649615,0
...,...,...,...,...,...,...,...,...,...,...
7321,150,2023-03-18 09:08:49.029823,54,115.038254,79.241741,84.586944,29.968156,no,73.599447,0
7322,150,2023-03-20 14:38:22.129593,54,116.389186,70.464818,91.476621,29.519510,no,64.162701,0
7323,150,2023-03-23 09:26:04.210673,54,123.419606,88.213054,96.985434,29.786678,no,71.641423,0
7324,150,2023-03-27 14:17:19.255961,54,,69.539940,85.670800,29.188655,no,72.781243,0



## 3. Feature Engineering

Implement `extract_rolling_features` to calculate rolling mean and standard deviation for the `heart_rate`.


In [4]:
def extract_rolling_features(df, window_size_seconds):
    """
    Calculate rolling mean and standard deviation for heart rate.
    
    Args:
        df: DataFrame with timestamp and heart_rate columns
        window_size_seconds: Size of the rolling window in seconds
        
    Returns:
        DataFrame with added hr_rolling_mean and hr_rolling_std columns
    """
    # 1. Sort data by timestamp
    df_sorted = df.sort_values('timestamp')
    
    # 2. Set timestamp as index (this allows time-based operations)
    df_indexed = df_sorted.set_index('timestamp')
    
    # 3. Calculate rolling mean and standard deviation
    #    - First, create a rolling window object based on time:
    rolling_window = df_indexed['heart_rate'].rolling(window=window_size_seconds)
    #    - Then calculate statistics on this window:
    hr_mean = rolling_window.mean()
    hr_std = rolling_window.std()
    
    # 4. Add the new columns back to the dataframe
    df_indexed['hr_rolling_mean'] = hr_mean
    df_indexed['hr_rolling_std'] = hr_std
    
    # 5. Reset index to bring timestamp back as a column
    df_result = df_indexed.reset_index()
    
    # 6. Handle any NaN values (rolling calculations create NaNs at the beginning)
    #    - You can use fillna, dropna, or other methods depending on your strategy
    df_result = df_result.bfill()
    
    return df_result

extract_rolling_features(df, 300)

Unnamed: 0,timestamp,patient_id,age,systolic_bp,diastolic_bp,glucose_level,bmi,smoker_status,heart_rate,disease_outcome,hr_rolling_mean,hr_rolling_std
0,2023-01-01 00:00:00.000000,9,42,109.271202,84.872277,112.205979,28.723817,former,65.241827,0,71.068248,10.438049
1,2023-01-01 00:00:00.000000,72,73,117.499246,80.624776,113.018091,22.543094,no,75.405997,0,71.068248,10.438049
2,2023-01-01 00:00:00.000000,131,26,125.396859,75.090352,108.999820,23.975127,yes,71.244470,0,71.068248,10.438049
3,2023-01-01 00:00:00.000000,116,54,124.532627,78.492678,78.559751,25.783712,no,76.660696,0,71.068248,10.438049
4,2023-01-02 00:00:00.000000,56,63,111.304514,67.796714,97.859274,27.611658,no,63.534376,0,71.068248,10.438049
...,...,...,...,...,...,...,...,...,...,...,...,...
7321,2023-07-20 21:01:11.701224,109,53,130.694575,74.261794,107.862114,21.732533,no,71.660907,0,72.141827,9.649297
7322,2023-07-23 01:45:47.772214,5,46,108.477241,88.742732,99.719007,33.010116,no,65.398427,0,72.168801,9.618948
7323,2023-07-23 06:55:40.341291,16,41,124.742455,83.196582,117.215513,27.210364,no,73.129839,0,72.154401,9.614249
7324,2023-08-03 10:54:11.976978,109,53,130.912449,69.467888,118.815102,22.172972,no,67.511809,0,72.155979,9.613445



## 4. Data Preparation

Implement `prepare_data_part2` using the newly engineered features.


In [5]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Prepare data for modeling with time-series features.
    
    Args:
        df_with_features: DataFrame with original and rolling features
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # YOUR CODE HERE
    # 1. Select relevant features including the rolling features
    X = df_with_features[['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi', 'heart_rate', 'hr_rolling_mean', 'hr_rolling_std']]
    # 2. Select target variable (disease_outcome)
    y = df_with_features[['disease_outcome']]
    # 3. Split data into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # 4. Handle missing values using 
    X_train = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_train)
    X_test = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_test)
    Y_train = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(Y_train)
    Y_test = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(Y_test)
    
    return X_train, X_test, Y_train, Y_test




## 5. Random Forest Model

Implement `train_random_forest`.


In [6]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Train a Random Forest classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of trees in the forest
        max_depth: Maximum depth of the trees
        random_state: Random seed for reproducibility
        
    Returns:
        Trained Random Forest model
    """
    return RandomForestClassifier(n_estimators=n_estimators, random_state=random_state).fit(X_train, y_train)


## 6. XGBoost Model

Implement `train_xgboost`.


In [7]:
def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    """
    Train an XGBoost classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of boosting rounds
        learning_rate: Boosting learning rate
        max_depth: Maximum depth of a tree
        random_state: Random seed for reproducibility
        
    Returns:
        Trained XGBoost model
    """
    
    return xgb.XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state).fit(X_train, y_train)


## 8. Save Results

Save the AUC scores to a text file.


In [8]:
def save_result(result):
    if not os.path.exists('results'):
        os.mkdir('results')
    with open('results/results_part2.txt', 'w') as f:
        json.dump(result, f)



## 9. Main Execution

Run the complete workflow.


In [9]:
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    
    # 5. Calculate AUC scores
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    
    # 6. Save results
    save_result({
        'Random Forest AUC': rf_auc,
        "XGBoost AUC": xgb_auc
    })

  return fit_method(estimator, *args, **kwargs)


Random Forest AUC: 0.9814
XGBoost AUC: 0.9967
