# Install necessary packages

In [19]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Part 2: Time Series Features & Tree-Based Models

**Objective:** Extract basic time-series features from heart rate data, train Random Forest and XGBoost models, and compare their performance.

## 1. Setup

Import necessary libraries.

In [20]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

## 2. Data Loading

Load the dataset.

In [21]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data with timestamp parsed as datetime
    """
    # YOUR CODE HERE
    # Load the CSV file using pandas
    # Make sure to parse the timestamp column as datetime
    data = pd.read_csv(file_path, parse_dates=["timestamp"])

    return data

In [22]:
df = load_data('data/synthetic_health_data.csv')
print(df)

      patient_id                  timestamp  age  systolic_bp  diastolic_bp  \
0              1 2023-01-29 00:00:00.000000   57   113.063416     84.069561   
1              1 2023-01-31 07:33:55.507789   57   121.598849     89.672279   
2              1 2023-02-02 00:15:11.379377   57   126.623222     87.619685   
3              1 2023-02-04 09:37:12.589164   57   136.999366     89.199774   
4              1 2023-02-04 20:56:52.838198   57   127.546919     92.644673   
...          ...                        ...  ...          ...           ...   
7321         150 2023-03-18 09:08:49.029823   54   115.038254     79.241741   
7322         150 2023-03-20 14:38:22.129593   54   116.389186     70.464818   
7323         150 2023-03-23 09:26:04.210673   54   123.419606     88.213054   
7324         150 2023-03-27 14:17:19.255961   54          NaN     69.539940   
7325         150 2023-04-12 04:12:38.529880   54          NaN     76.992331   

      glucose_level        bmi smoker_status  heart

## 3. Feature Engineering

Implement `extract_rolling_features` to calculate rolling mean and standard deviation for the `heart_rate`.

In [23]:
def extract_rolling_features(df, window_size_seconds):
    """
    Calculate rolling mean and standard deviation for heart rate.
    
    Args:
        df: DataFrame with timestamp and heart_rate columns
        window_size_seconds: Size of the rolling window in seconds
        
    Returns:
        DataFrame with added hr_rolling_mean and hr_rolling_std columns
    """
    # YOUR CODE HERE
    # 1. Sort data by timestamp
    df_sorted = df.sort_values('timestamp')
    
    # 2. Set timestamp as index (this allows time-based operations)
    df_indexed = df_sorted.set_index('timestamp')
    
    # 3. Calculate rolling mean and standard deviation
    #    - First, create a rolling window object based on time:
    rolling_window = df_indexed['heart_rate'].rolling(window=f'{window_size_seconds}s')
    #    - Then calculate statistics on this window:
    hr_mean = rolling_window.mean()
    hr_std = rolling_window.std()
    
    # 4. Add the new columns back to the dataframe
    df_indexed['hr_rolling_mean'] = hr_mean
    df_indexed['hr_rolling_std'] = hr_std
    
    # 5. Reset index to bring timestamp back as a column
    df_result = df_indexed.reset_index()
    
    # 6. Handle any NaN values (rolling calculations create NaNs at the beginning)
    #    - You can use fillna, dropna, or other methods depending on your strategy
    df_result = df_result.bfill()
    
    # Placeholder return - replace with your implementation
    return df_result

In [24]:
rolling_df = extract_rolling_features(df, 1)
print(rolling_df)

                      timestamp  patient_id  age  systolic_bp  diastolic_bp  \
0    2023-01-01 00:00:00.000000           9   42   109.271202     84.872277   
1    2023-01-01 00:00:00.000000          72   73   117.499246     80.624776   
2    2023-01-01 00:00:00.000000         131   26   125.396859     75.090352   
3    2023-01-01 00:00:00.000000         116   54   124.532627     78.492678   
4    2023-01-02 00:00:00.000000          56   63   111.304514     67.796714   
...                         ...         ...  ...          ...           ...   
7321 2023-07-20 21:01:11.701224         109   53   130.694575     74.261794   
7322 2023-07-23 01:45:47.772214           5   46   108.477241     88.742732   
7323 2023-07-23 06:55:40.341291          16   41   124.742455     83.196582   
7324 2023-08-03 10:54:11.976978         109   53   130.912449     69.467888   
7325 2023-08-05 13:16:36.634428          90   57   133.101666     82.658927   

      glucose_level        bmi smoker_status  heart

## 4. Data Preparation

Implement `prepare_data_part2` using the newly engineered features.

In [25]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Prepare data for modeling with time-series features.
    
    Args:
        df_with_features: DataFrame with original and rolling features
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # YOUR CODE HERE
    # 1. Select relevant features including rolling features
    feature_cols = ['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi', 'heart_rate', 'hr_rolling_mean', 'hr_rolling_std']
    
    # 2. Select target variable
    target_col = 'disease_outcome'
    
    # 3. Drop rows with missing values in the selected columns
    df_cleaned = df_with_features.dropna(subset=feature_cols + [target_col])
    
    # 4. Extract features and target
    X = df_cleaned[feature_cols]
    y = df_cleaned[target_col]
    
    # 5. Split data into train/test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True
    )

    # Placeholder return - replace with your implementation
    return X_train, X_test, y_train, y_test

## 5. Random Forest Model

Implement `train_random_forest`.

In [26]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Train a Random Forest classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of trees in the forest
        max_depth: Maximum depth of the trees
        random_state: Random seed for reproducibility
        
    Returns:
        Trained Random Forest model
    """
    # YOUR CODE HERE
    # Initialize and train a RandomForestClassifier
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    
    # Train the model
    rf_model_ = rf_model.fit(X_train, y_train)
    
    return rf_model_  # Replace with actual implementation

## 6. XGBoost Model

Implement `train_xgboost`.

In [27]:
def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    """
    Train an XGBoost classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of boosting rounds
        learning_rate: Boosting learning rate
        max_depth: Maximum depth of a tree
        random_state: Random seed for reproducibility
        
    Returns:
        Trained XGBoost model
    """
    # YOUR CODE HERE
    # Initialize and train an XGBClassifier
    xgb_model = xgb.XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state, eval_metric='logloss')
    
    # Train the model
    xgb_model_ = xgb_model.fit(X_train, y_train)

    return xgb_model_  # Replace with actual implementation

## 7. Model Comparison

Calculate and compare AUC scores for both models.

In [28]:
# YOUR CODE HERE
# 1. Generate probability predictions for both models
# 2. Calculate AUC scores
# 3. Compare the performance

def compare_models(rf_model, xgb_model, X_test, y_test):
    # 1. Generate probability predictions for both models
    rf_probs = rf_model.predict_proba(X_test)[:, 1]  # Probability of class 1
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]

    # 2. Calculate AUC scores
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)

    # 3. Compare the performance
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    if rf_auc > xgb_auc:
        print("Random Forest performed better than XGBoost model.")
    elif xgb_auc > rf_auc:
        print("→ XGBoost performed better than Random Forest model.")
    else:
        print("→ Both models performed equally.")

    return rf_auc, xgb_auc


## 8. Save Results

Save the AUC scores to a text file.

In [29]:
# YOUR CODE HERE
# 1. Create 'results' directory if it doesn't exist
# 2. Format AUC scores as strings
# 3. Write scores to 'results/results_part2.txt'

def save_auc_scores(rf_auc, xgb_auc):
    # 1. Create 'results' directory if it doesn't exist
    os.makedirs('results', exist_ok=True)

    # 2. Format AUC scores as strings
    # 3. Write scores to 'results/results_part2.txt'
    with open('results/results_part2.txt', 'w') as f:
        f.write("# AUC Scores - Part 2\n\n")
        f.write(f"Random Forest AUC: {rf_auc:.4f}" + "\n")
        f.write(f"XGBoost AUC: {xgb_auc:.4f}" + "\n")

## 9. Main Execution

Run the complete workflow.

In [30]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    
    # 5. Calculate AUC scores
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")

    # 6. Save results
    # (Your code for saving results)
    save_auc_scores(rf_auc, xgb_auc)

Random Forest AUC: 0.9758
XGBoost AUC: 0.9956
