# Install necessary packages

In [1]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Part 2: Time Series Features & Tree-Based Models

**Objective:** Extract basic time-series features from heart rate data, train Random Forest and XGBoost models, and compare their performance.

## 1. Setup

Import necessary libraries.

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

## 2. Data Loading

Load the dataset.

In [3]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data with timestamp parsed as datetime
    """
    df = pd.read_csv(file_path, parse_dates=['timestamp'])
    # Load the CSV file using pandas
    # Make sure to parse the timestamp column as datetime
    
    return df  # Replace with actual implementation

## 3. Feature Engineering

Implement `extract_rolling_features` to calculate rolling mean and standard deviation for the `heart_rate`.

In [4]:
def extract_rolling_features(df, window_size_seconds):
    """
    Calculate rolling mean and standard deviation for heart rate.
    """
    df_sorted = df.sort_values('timestamp')
    df_indexed = df_sorted.set_index('timestamp')

    rolling_window = df_indexed['heart_rate'].rolling(f'{window_size_seconds}s')
    df_indexed['hr_rolling_mean'] = rolling_window.mean()
    df_indexed['hr_rolling_std'] = rolling_window.std()

    df_result = df_indexed.reset_index()
    df_result = df_result.fillna(method='bfill')  # Backward fill for NaNs

    return df_result

## 4. Data Preparation

Implement `prepare_data_part2` using the newly engineered features.

In [5]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Prepare data for modeling with time-series features.
    """
    features = ['heart_rate', 'hr_rolling_mean', 'hr_rolling_std']
    df_model = df_with_features.dropna(subset=features + ['disease_outcome'])  # Ensure no NaNs
    
    X = df_model[features]
    y = df_model['disease_outcome']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

## 5. Random Forest Model

Implement `train_random_forest`.

In [6]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Train a Random Forest classifier.
    """
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf.fit(X_train, y_train)
    return rf

## 6. XGBoost Model

Implement `train_xgboost`.

In [12]:
import xgboost as xgb

def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    model = xgb.XGBClassifier(n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              max_depth=max_depth,
                              use_label_encoder=False,
                              eval_metric='logloss',
                              random_state=random_state)
    model.fit(X_train, y_train)
    return model


## 7. Model Comparison

Calculate and compare AUC scores for both models.

In [13]:
rf_model = train_random_forest(X_train, y_train)
xgb_model = train_xgboost(X_train, y_train)

# 1. Generate probability predictions for both models
rf_probs = rf_model.predict_proba(X_test)[:, 1]
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]

# 2. Calculate AUC scores
rf_auc = roc_auc_score(y_test, rf_probs)
xgb_auc = roc_auc_score(y_test, xgb_probs)

# 3. Compare the performance
print(f"Random Forest AUC: {rf_auc:.4f}")
print(f"XGBoost AUC: {xgb_auc:.4f}")

Random Forest AUC: 0.7745
XGBoost AUC: 0.7645


Parameters: { "use_label_encoder" } are not used.



## 8. Save Results

Save the AUC scores to a text file.

In [14]:

# 1. Create 'results' directory if it doesn't exist
os.makedirs('results', exist_ok=True)

# 2. Format AUC scores as strings
rf_auc_str = f"Random Forest AUC: {rf_auc:.4f}"
xgb_auc_str = f"XGBoost AUC: {xgb_auc:.4f}"

# 3. Write scores to 'results/results_part2.txt'
with open('results/results_part2.txt', 'w') as f:
    f.write(rf_auc_str + '\n')
    f.write(xgb_auc_str + '\n')


## 9. Main Execution

Run the complete workflow.

In [15]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    
    # 5. Calculate AUC scores
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    
    # 6. Save results
    # (Your code for saving results)

  df_result = df_result.fillna(method='bfill')  # Backward fill for NaNs


Random Forest AUC: 0.7745
XGBoost AUC: 0.7645


Parameters: { "use_label_encoder" } are not used.

