# Install necessary packages

In [None]:
%pip install -r requirements.txt

# Part 2: Time Series Features & Tree-Based Models

**Objective:** Extract basic time-series features from heart rate data, train Random Forest and XGBoost models, and compare their performance.

## 1. Setup

Import necessary libraries.

In [59]:
import pandas as pd
import numpy as np
import os
import sys
print(sys.maxsize > 2*32)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

True


## 2. Data Loading

Load the dataset.

In [9]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data with timestamp parsed as datetime
    """
    # YOUR CODE HERE
    # Load the CSV file using pandas
    # Make sure to parse the timestamp column as datetime
    
    return pd.read_csv(file_path)  # Replace with actual implementation

In [12]:
data = load_data('data/synthetic_health_data.csv')

## 3. Feature Engineering

Implement `extract_rolling_features` to calculate rolling mean and standard deviation for the `heart_rate`.

In [71]:
def extract_rolling_features(df, window_size_seconds):
    """
    Calculate rolling mean and standard deviation for heart rate.
    
    Args:
        df: DataFrame with timestamp and heart_rate columns
        window_size_seconds: Size of the rolling window in seconds
        
    Returns:
        DataFrame with added hr_rolling_mean and hr_rolling_std columns
    """
    # YOUR CODE HERE
    # 1. Sort data by timestamp
    df['smoker_status'] = LabelEncoder().fit_transform(df['smoker_status'])
    df_sorted = df.sort_values('timestamp').copy()
    df_sorted['timestamp'] = pd.to_datetime(df['timestamp'])
    
    # 2. Set timestamp as index (this allows time-based operations)
    df_indexed = df_sorted.set_index('timestamp')
    
    # 3. Calculate rolling mean and standard deviation
    #    - First, create a rolling window object based on time:
    rolling_window = df_indexed['heart_rate'].rolling(window=f'{window_size_seconds}s', min_periods=1)
    #    - Then calculate statistics on this window:
    hr_mean = rolling_window.mean()
    hr_std = rolling_window.std()
    
    # 4. Add the new columns back to the dataframe
    df_indexed['hr_rolling_mean'] = hr_mean
    df_indexed['hr_rolling_std'] = hr_std
    
    # 5. Reset index to bring timestamp back as a column
    df_result = df_indexed.reset_index()
    
    # 6. Handle any NaN values (rolling calculations create NaNs at the beginning)
    #    - You can use fillna, dropna, or other methods depending on your strategy
    df_result = df_result.bfill()  # Example: backward fill
    
    # Placeholder return - replace with your implementation

    return df_result.copy()

In [53]:
df = extract_rolling_features(data, window_size_seconds=10)
df['smoker_status'] = LabelEncoder().fit_transform(df['smoker_status'])
df

Unnamed: 0,timestamp,patient_id,age,systolic_bp,diastolic_bp,glucose_level,bmi,smoker_status,heart_rate,disease_outcome,hr_rolling_mean,hr_rolling_std
0,2023-01-01 00:00:00.000000,9,42,109.271202,84.872277,112.205979,28.723817,0,65.241827,0,65.241827,7.187153
1,2023-01-01 00:00:00.000000,72,73,117.499246,80.624776,113.018091,22.543094,1,75.405997,0,70.323912,7.187153
2,2023-01-01 00:00:00.000000,131,26,125.396859,75.090352,108.999820,23.975127,2,71.244470,0,70.630764,5.109800
3,2023-01-01 00:00:00.000000,116,54,124.532627,78.492678,78.559751,25.783712,1,76.660696,0,72.138247,5.147497
4,2023-01-02 00:00:00.000000,56,63,111.304514,67.796714,97.859274,27.611658,1,63.534376,0,63.534376,2.716776
...,...,...,...,...,...,...,...,...,...,...,...,...
7321,2023-07-20 21:01:11.701224,109,53,130.694575,74.261794,107.862114,21.732533,1,71.660907,0,71.660907,
7322,2023-07-23 01:45:47.772214,5,46,108.477241,88.742732,99.719007,33.010116,1,65.398427,0,65.398427,
7323,2023-07-23 06:55:40.341291,16,41,124.742455,83.196582,117.215513,27.210364,1,73.129839,0,73.129839,
7324,2023-08-03 10:54:11.976978,109,53,130.912449,69.467888,118.815102,22.172972,1,67.511809,0,67.511809,


## 4. Data Preparation

Implement `prepare_data_part2` using the newly engineered features.

In [54]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Prepare data for modeling with time-series features.
    
    Args:
        df_with_features: DataFrame with original and rolling features
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # YOUR CODE HERE
    # 1. Select relevant features including the rolling features
    # 2. Select target variable (disease_outcome)
    # 3. Split data into training and testing sets
    # 4. Handle missing values
    X = df_with_features.drop(columns=['timestamp', 'disease_outcome', 'patient_id'])
    numeric_cols = X.select_dtypes(include=np.number).columns
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X[numeric_cols])
    X[numeric_cols] = X_imputed
    X = X.dropna()

    y = df_with_features.loc[X.index, 'disease_outcome']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    # Placeholder return - replace with your implementation
    return  X_train, X_test, y_train, y_test

In [55]:
X_train, X_test, y_train, y_test = prepare_data_part2(df)

## 5. Random Forest Model

Implement `train_random_forest`.

In [None]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Train a Random Forest classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of trees in the forest
        max_depth: Maximum depth of the trees
        random_state: Random seed for reproducibility
        
    Returns:
        Trained Random Forest model
    """
    # Initialize and train a RandomForestClassifier
    model = RandomForestClassifier(
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        random_state=random_state)
    model.fit(X_train, y_train)

    return model  # Replace with actual implementation

## 6. XGBoost Model

Implement `train_xgboost`.

In [73]:
def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    """
    Train an XGBoost classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of boosting rounds
        learning_rate: Boosting learning rate
        max_depth: Maximum depth of a tree
        random_state: Random seed for reproducibility
        
    Returns:
        Trained XGBoost model
    """
    # YOUR CODE HERE
    # Initialize and train an XGBClassifier
    model = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
        eval_metric='auc'
    )
    model.fit(X_train, y_train)
    return model  # Replace with actual implementation

## 7. Model Comparison

Calculate and compare AUC scores for both models.

In [75]:
# YOUR CODE HERE
# 1. Generate probability predictions for both models
# 2. Calculate AUC scores
# 3. Compare the performance
rf_model = train_random_forest(X_train, y_train)
xgb_model = train_xgboost(X_train, y_train)
rf_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
xgb_auc = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

## 8. Save Results

Save the AUC scores to a text file.

In [79]:
# YOUR CODE HERE
# 1. Create 'results' directory if it doesn't exist
# 2. Format AUC scores as strings
# 3. Write scores to 'results/results_part2.txt'
os.makedirs('results', exist_ok=True)
results_dir2 = os.path.join('results', 'results_part2.txt')
with open(results_dir2, 'w') as f:
    f.write(f"Random Forest AUC: {rf_auc:.4f}\n")
    f.write(f"XGBoost AUC: {xgb_auc:.4f}\n")

## 9. Main Execution

Run the complete workflow.

In [77]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    
    # 5. Calculate AUC scores
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    
    print(f"Random Forest AUC: {rf_auc:.4f}")
    print(f"XGBoost AUC: {xgb_auc:.4f}")
    
    # 6. Save results
    # (Your code for saving results)

Random Forest AUC: 0.9774
XGBoost AUC: 0.9992
