In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

In [2]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.

    The ``timestamp`` column is converted to a datetime dtype while the file
    is read, so later time-based operations can use it directly.

    Args:
        file_path: Path to the CSV file. The file is expected to contain a
            ``timestamp`` column.

    Returns:
        DataFrame containing the data, with ``timestamp`` parsed as datetime.
    """
    # `infer_datetime_format` is deprecated since pandas 2.0 (a consistent
    # format is inferred by default), so only `parse_dates` is passed. This
    # removes the deprecation warning the old call emitted.
    return pd.read_csv(file_path, parse_dates=["timestamp"])

In [3]:
def extract_rolling_features(df, window_size_seconds):
    """
    Calculate rolling mean and standard deviation for heart rate.

    Rows are sorted by ``timestamp`` and a time-based rolling window of
    ``window_size_seconds`` seconds is applied to ``heart_rate``. The input
    frame is not modified; a new frame is returned.

    Args:
        df: DataFrame with a datetime ``timestamp`` column and a
            ``heart_rate`` column
        window_size_seconds: Size of the rolling window in seconds

    Returns:
        DataFrame sorted by timestamp (with a fresh 0..n-1 index) that has
        the added ``hr_rolling_mean`` and ``hr_rolling_std`` columns. Missing
        values in *every* column are backward-filled; a NaN with no later
        valid value in its column is left as NaN.
    """
    # Time-based windows require a sorted datetime index. sort_values and
    # set_index both return new frames, so the caller's `df` stays untouched.
    indexed = df.sort_values('timestamp').set_index('timestamp')

    # Offset window such as '300s': each row aggregates the observations in
    # the `window_size_seconds` seconds ending at that row's timestamp.
    hr_window = indexed['heart_rate'].rolling(window=f'{window_size_seconds}s')
    indexed = indexed.assign(
        hr_rolling_mean=hr_window.mean(),
        hr_rolling_std=hr_window.std(),
    )

    # The std is NaN wherever the window holds fewer than two observations
    # (always the case for the first row), so gaps are backward-filled.
    # DataFrame.bfill() replaces the deprecated fillna(method='bfill') and
    # behaves the same way.
    return indexed.reset_index().bfill()

In [13]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Build train/test feature matrices and targets for the time-series model.

    Every numeric column except ``timestamp`` and ``disease_outcome`` is used
    as a feature. The split is positional, not random: the first
    ``1 - test_size`` share of the rows becomes the training set and the
    remaining rows the test set, so the caller controls the ordering.
    Missing feature values are replaced by the per-column median learned on
    the training rows only.

    Args:
        df_with_features: DataFrame with original and rolling features plus
            the ``disease_outcome`` target column
        test_size: Proportion of rows (taken from the end) used for testing
        random_state: Accepted for interface compatibility; it is not used
            because the split involves no randomness

    Returns:
        X_train, X_test, y_train, y_test
    """
    # Keep numeric predictors only; the time column and the label are excluded.
    feature_cols = []
    for name in df_with_features.columns:
        if name in ("timestamp", "disease_outcome"):
            continue
        if pd.api.types.is_numeric_dtype(df_with_features[name]):
            feature_cols.append(name)

    features = df_with_features[feature_cols]
    target = df_with_features["disease_outcome"]

    # Positional cut point: rows before it train, rows from it onward test.
    cutoff = int(len(df_with_features) * (1 - test_size))
    train_part = features.iloc[:cutoff]
    test_part = features.iloc[cutoff:]
    y_train = target.iloc[:cutoff]
    y_test = target.iloc[cutoff:]

    # Fit the median imputer on the training rows only, then reuse the
    # learned medians for the test rows.
    median_imputer = SimpleImputer(strategy="median")
    train_filled = median_imputer.fit_transform(train_part)
    test_filled = median_imputer.transform(test_part)

    # The imputer returns plain arrays; restore the row index and column names.
    X_train = pd.DataFrame(train_filled,
                           index=train_part.index, columns=train_part.columns)
    X_test = pd.DataFrame(test_filled,
                          index=test_part.index, columns=test_part.columns)

    return X_train, X_test, y_train, y_test

In [5]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Fit a Random Forest classifier on the training data.

    The forest is built with ``class_weight="balanced"`` and ``n_jobs=-1``
    in addition to the arguments supplied by the caller.

    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of trees in the forest
        max_depth: Maximum depth of the trees
        random_state: Random seed for reproducibility

    Returns:
        Trained Random Forest model
    """
    forest_params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "n_jobs": -1,                 # use every available CPU core
        "random_state": random_state,
        "class_weight": "balanced",   # reweight classes by inverse frequency
    }

    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_train, y_train)
    return forest

In [15]:
def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    """
    Fit an XGBoost classifier on the training data.

    Besides the caller-supplied arguments, the model is configured with row
    and column subsampling of 0.8, the ``binary:logistic`` objective, the
    ``logloss`` evaluation metric, L2 regularisation of 1.0 and the
    histogram tree method.

    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of boosting rounds
        learning_rate: Boosting learning rate
        max_depth: Maximum depth of a tree
        random_state: Random seed for reproducibility

    Returns:
        Trained XGBoost model
    """
    # Values chosen by the caller.
    tunable = {
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "random_state": random_state,
    }
    # Fixed configuration of this pipeline.
    fixed = {
        "subsample": 0.8,                 # row subsampling per boosting round
        "colsample_bytree": 0.8,          # feature subsampling per tree
        "objective": "binary:logistic",   # binary target; adjust if multi-class
        "eval_metric": "logloss",
        "n_jobs": -1,                     # parallel training
        "reg_lambda": 1.0,                # L2 regularisation strength
        "tree_method": "hist",            # histogram-based tree construction
    }

    booster = xgb.XGBClassifier(**tunable, **fixed)
    booster.fit(X_train, y_train)
    return booster

In [7]:
# YOUR CODE HERE
# 1. Generate probability predictions for both models
# 2. Calculate AUC scores
# 3. Compare the performance
def evaluate_models(models: dict, X_test, y_test):
    """
    Print a ROC-AUC comparison table for several fitted classifiers.

    Args:
        models: Mapping from a display name to a fitted model that exposes
            ``predict_proba``
        X_test: Test features
        y_test: Test target

    Returns:
        None. The scores are only printed, one line per model.
    """
    print("---- ROC-AUC comparison ----")
    for label in models:
        # The second predict_proba column is used as the ranking score.
        scores = models[label].predict_proba(X_test)[:, 1]
        print(f"{label:14s}: {roc_auc_score(y_test, scores):.4f}")
    print("----------------------------")

In [18]:
# YOUR CODE HERE
# 1. Create 'results' directory if it doesn't exist
# 2. Format AUC scores as strings
# 3. Write scores to 'results/results_part2.txt'
def save_to_file(rf_auc, xgb_auc, output_path="results/results_part2.txt"):
    """
    Write the two ROC-AUC scores to a text file, one per line.

    Args:
        rf_auc: ROC-AUC of the Random Forest model
        xgb_auc: ROC-AUC of the XGBoost model
        output_path: Destination file. Defaults to the location that was
            previously hard-coded, so existing two-argument calls behave
            exactly as before. Missing parent directories are created.

    Returns:
        None. The file is overwritten if it already exists.
    """
    # Create the parent directory when the path has one; a bare file name
    # lands in the current working directory and needs no makedirs call.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Scores are formatted to four decimals; the labels are padded so the
    # numbers line up in the file.
    report_lines = [
        f"Random Forest ROC-AUC: {rf_auc:.4f}",
        f"XGBoost ROC-AUC:      {xgb_auc:.4f}",
    ]
    with open(output_path, "w", encoding="utf-8") as f:
        for line in report_lines:
            f.write(line + "\n")

    print(f"Saved AUC scores to {output_path}")

In [19]:
# Main execution
# End-to-end pipeline: load the CSV, add rolling heart-rate features, split
# and impute, train both classifiers, score them with ROC-AUC and persist the
# scores. The names assigned below become notebook-level globals.
if __name__ == "__main__":
    # 1. Load data
    # Relative path, resolved against the current working directory.
    # NOTE(review): the doubled separator looks like a typo; confirm it is
    # harmless on the platforms this notebook runs on.
    data_file = 'data//synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    # Called with the defaults (test_size=0.2): the last 20% of the rows, in
    # the order returned by the previous step, form the test set.
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    # Both models are trained on the same training split with their default
    # hyper-parameters.
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    
    # 5. Calculate AUC scores
    # The second predict_proba column is used as the score passed to
    # roc_auc_score.
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    
    # 6. Save results
    # Writes both scores to results/results_part2.txt.
    save_to_file(rf_auc,xgb_auc)

  df = pd.read_csv(
  df_result = df_result.fillna(method='bfill')  # Example: backward fill


Random Forest AUC: 0.9847
XGBoost AUC: 0.9998
Saved AUC scores to results/results_part2.txt
