## 1. Setup

In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

## 2. Data Loading

In [8]:
def load_data(file_path):
    
    return pd.read_csv(file_path, parse_dates=['timestamp'])

## 3. Feature Engineering

In [9]:
def extract_rolling_features(df, window_size_seconds):

    df_sorted = df.sort_values('timestamp')
    df_indexed = df_sorted.set_index('timestamp')
    
    # 3. Calculate rolling mean and standard deviation
    #    - First, create a rolling window object based on time:
    rolling_window = df_indexed['heart_rate'].rolling(window=f'{window_size_seconds}s')
    #    - Then calculate statistics on this window:
    hr_mean = rolling_window.mean()
    hr_std = rolling_window.std()
    
    # 4. Add the new columns back to the dataframe
    df_indexed['hr_rolling_mean'] = hr_mean
    df_indexed['hr_rolling_std'] = hr_std
    
    # 5. Reset index to bring timestamp back as a column
    df_result = df_indexed.reset_index()
    
    # 6. Handle any NaN values (rolling calculations create NaNs at the beginning)
    #    - You can use fillna, dropna, or other methods depending on your strategy
    df_result = df_result.fillna(method='bfill')  # Example: backward fill
    
    # Placeholder return - replace with your implementation
    return df_result

## 4. Data Preparation

In [10]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    features = ['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi',
                'hr_rolling_mean', 'hr_rolling_std']
    target = 'disease_outcome'

    X = df_with_features[features]
    y = df_with_features[target]

    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

## 5. Random Forest Model

In [11]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    model_rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    model_rf.fit(X_train, y_train)
    return model_rf

## 6. XGBoost Model

In [12]:
def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    model_xgb = xgb.XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                              max_depth=max_depth, random_state=random_state, use_label_encoder=False, eval_metric='logloss')
    model_xgb.fit(X_train, y_train)
    return model_xgb

## 7. Model Comparison

## 8. Save Results

In [13]:
def save1(rf_auc, xgb_auc):
    os.makedirs('results', exist_ok=True)
    with open('results/results_part2.txt', 'w') as f:
        f.write(f"Random Forest AUC: {rf_auc:.4f}\n")
        f.write(f"XGBoost AUC: {xgb_auc:.4f}\n")

## 9. Main Execution

In [14]:
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    
    # 5. Calculate AUC scores
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    
    # 6. Save results
    save1(rf_auc, xgb_auc)

  df_result = df_result.fillna(method='bfill')  # Example: backward fill


Random Forest AUC: 0.9735
XGBoost AUC: 0.9953


Parameters: { "use_label_encoder" } are not used.

