# Part 2 - Feature Engineering

### Step 1 - Setup

In [50]:
%pip install -r requirements.txt

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [24]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data with timestamp parsed as datetime
    """
    data = pd.read_csv(file_path)

    data['timestamp'] = pd.to_datetime(data['timestamp'])
    # YOUR CODE HERE
    # Load the CSV file using pandas
    # Make sure to parse the timestamp column as datetime
    
    return data

data = load_data('data/synthetic_health_data.csv')

data

Unnamed: 0,patient_id,timestamp,age,systolic_bp,diastolic_bp,glucose_level,bmi,smoker_status,heart_rate,disease_outcome
0,1,2023-01-29 00:00:00.000000,57,113.063416,84.069561,117.475210,25.085796,no,62.719587,0
1,1,2023-01-31 07:33:55.507789,57,121.598849,89.672279,85.120875,24.120608,no,76.314434,0
2,1,2023-02-02 00:15:11.379377,57,126.623222,87.619685,,24.819332,no,62.427785,0
3,1,2023-02-04 09:37:12.589164,57,136.999366,89.199774,118.755648,25.039598,no,61.612981,0
4,1,2023-02-04 20:56:52.838198,57,127.546919,92.644673,98.882007,24.895024,no,77.649615,0
...,...,...,...,...,...,...,...,...,...,...
7321,150,2023-03-18 09:08:49.029823,54,115.038254,79.241741,84.586944,29.968156,no,73.599447,0
7322,150,2023-03-20 14:38:22.129593,54,116.389186,70.464818,91.476621,29.519510,no,64.162701,0
7323,150,2023-03-23 09:26:04.210673,54,123.419606,88.213054,96.985434,29.786678,no,71.641423,0
7324,150,2023-03-27 14:17:19.255961,54,,69.539940,85.670800,29.188655,no,72.781243,0


### 3. Feature Engineering

In [44]:
def extract_rolling_features(df, window_size_seconds = 300):
    """
    Calculate rolling mean and standard deviation for heart rate.
    
    Args:
        df: DataFrame with timestamp and heart_rate columns
        window_size_seconds: Size of the rolling window in seconds
        
    Returns:
        DataFrame with added hr_rolling_mean and hr_rolling_std columns
    """

    df = data
    # 1. Sort data by timestamp
    df_sorted = df.sort_values('timestamp')
    
    # 2. Set timestamp as index (this allows time-based operations)
    df_indexed = df_sorted.set_index('timestamp')
    
    # 3. Calculate rolling mean and standard deviation
    #    - First, create a rolling window object based on time:
    rolling_window = df_indexed['heart_rate'].rolling(window=f'{window_size_seconds}s')
    #    - Then calculate statistics on this window:
    hr_mean = rolling_window.mean()
    hr_std = rolling_window.std()
    
    # 4. Add the new columns back to the dataframe
    df_indexed['hr_rolling_mean'] = hr_mean
    df_indexed['hr_rolling_std'] = hr_std

    # 5. Reset index to bring timestamp back as a column
    df_result = df_indexed.reset_index()

    # 6. Handle any NaN values (rolling calculations create NaNs at the beginning)
    #    - You can use fillna, dropna, or other methods depending on your strategy
    df_result = df_result.fillna(method='ffill').dropna()  #ffill plus dropna() to remove the last NaN

    # Placeholder return - replace with your implementation
    return df_result

df_with_features = extract_rolling_features(data, window_size_seconds = 10)


  df_result = df_result.fillna(method='ffill').dropna()  #ffill plus dropna() to remove the last NaN


### 4. Data Preparation

In [45]:
def prepare_data_part2(df_with_features, test_size=0.2, random_state=42):
    """
    Prepare data for modeling with time-series features.
    
    Args:
        df_with_features: DataFrame with original and rolling features
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    features = ['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi', 'hr_rolling_mean', 'hr_rolling_std']
    target = ['disease_outcome']

    X = df_with_features[features]
    y = df_with_features[target]

    train_test_split(X, y, test_size = test_size, random_state = random_state)
    
    return train_test_split(X, y, test_size = test_size, random_state = random_state)

X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features, test_size=0.2, random_state=42)

### 5. Random Forest Model Training

In [46]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42):
    """
    Train a Random Forest classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of trees in the forest
        max_depth: Maximum depth of the trees
        random_state: Random seed for reproducibility
        
    Returns:
        Trained Random Forest model
    """
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    
    return rf

rf = train_random_forest(X_train, y_train, n_estimators=100, max_depth=10, random_state=42)

  return fit_method(estimator, *args, **kwargs)


### 6. XGBoost Model 

In [47]:
def train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42):
    """
    Train an XGBoost classifier.
    
    Args:
        X_train: Training features
        y_train: Training target
        n_estimators: Number of boosting rounds
        learning_rate: Boosting learning rate
        max_depth: Maximum depth of a tree
        random_state: Random seed for reproducibility
        
    Returns:
        Trained XGBoost model
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': learning_rate,
        'max_depth': max_depth
    }

    bst = xgb.train(params, dtrain, n_estimators)

    return bst

bst = train_xgboost(X_train, y_train, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)


### 7. Model Comparison

In [48]:
dtest = xgb.DMatrix(X_test, label=y_test)
xgb_pred = bst.predict(dtest)

rf_pred = rf.predict(X_test)

AUC_dict = {'XGBoost': float(roc_auc_score(y_test, xgb_pred)),
            'RandomForest':  float(roc_auc_score(y_test, rf_pred))
}

### 8. Storing Values

In [49]:
directory = 'results'
filename = 'results_part2.txt'
filepath = os.path.join(directory, filename)

with open(filepath, "w") as file:
    for model, auc in AUC_dict.items():
        file.write(f'{model}: {auc:.3f}\n')
    file.close()

print(f"AUC values have been written to '{filepath}'.")


AUC values have been written to 'results/results_part2.txt'.


### 9. Main Execution

In [52]:
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Extract rolling features
    window_size = 300  # 5 minutes in seconds
    df_with_features = extract_rolling_features(df, window_size)
    
    # 3. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part2(df_with_features)
    
    # 4. Train models
    rf_model = train_random_forest(X_train, y_train)

    def train_xgboost(X_train, y_train):
        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
        model.fit(X_train, y_train)
        return model
    
    xgb_model = train_xgboost(X_train, y_train)
    # 5. Calculate AUC scores
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    
    rf_auc = roc_auc_score(y_test, rf_probs)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    
    AUC_dict = {'XGBoost': float(rf_auc),
            'RandomForest':  float(xgb_auc)
            }
    
    #6. Saving Results
    directory = 'results'
    filename = 'results_part2.txt'
    filepath = os.path.join(directory, filename)

    with open(filepath, "w") as file:
        for model, auc in AUC_dict.items():
            file.write(f'{model}: {auc:.3f}\n')
        file.close()

    print(f"AUC values have been written to '{filepath}'.")

  df_result = df_result.fillna(method='ffill').dropna()  #ffill plus dropna() to remove the last NaN
  return fit_method(estimator, *args, **kwargs)


AUC values have been written to 'results/results_part2.txt'.


Parameters: { "use_label_encoder" } are not used.

