# Install necessary packages

In [None]:
%pip install -r requirements.txt

# Part 3: Practical Data Preparation

**Objective:** Handle categorical features using One-Hot Encoding and address class imbalance using SMOTE.

## 1. Setup

Import necessary libraries.

In [143]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

## 2. Data Loading

Load the dataset.

In [144]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    # YOUR CODE HERE
    # Load the CSV file using pandas
    
    return pd.read_csv(file_path)  # Replace with actual implementation

In [145]:
data = load_data('data/synthetic_health_data.csv')
data

Unnamed: 0,patient_id,timestamp,age,systolic_bp,diastolic_bp,glucose_level,bmi,smoker_status,heart_rate,disease_outcome
0,1,2023-01-29 00:00:00.000000,57,113.063416,84.069561,117.475210,25.085796,no,62.719587,0
1,1,2023-01-31 07:33:55.507789,57,121.598849,89.672279,85.120875,24.120608,no,76.314434,0
2,1,2023-02-02 00:15:11.379377,57,126.623222,87.619685,,24.819332,no,62.427785,0
3,1,2023-02-04 09:37:12.589164,57,136.999366,89.199774,118.755648,25.039598,no,61.612981,0
4,1,2023-02-04 20:56:52.838198,57,127.546919,92.644673,98.882007,24.895024,no,77.649615,0
...,...,...,...,...,...,...,...,...,...,...
7321,150,2023-03-18 09:08:49.029823,54,115.038254,79.241741,84.586944,29.968156,no,73.599447,0
7322,150,2023-03-20 14:38:22.129593,54,116.389186,70.464818,91.476621,29.519510,no,64.162701,0
7323,150,2023-03-23 09:26:04.210673,54,123.419606,88.213054,96.985434,29.786678,no,71.641423,0
7324,150,2023-03-27 14:17:19.255961,54,,69.539940,85.670800,29.188655,no,72.781243,0


## 3. Categorical Feature Encoding

Implement `encode_categorical_features` using `OneHotEncoder`.

In [146]:
def encode_categorical_features(df, column_to_encode='smoker_status'):
    """
    Encode a categorical column using OneHotEncoder.
    
    Args:
        df: Input DataFrame
        column_to_encode: Name of the categorical column to encode
        
    Returns:
        DataFrame with the categorical column replaced by one-hot encoded columns
    """
    # YOUR CODE HERE
    # 1. Extract the categorical column
    # 2. Apply OneHotEncoder
    # 3. Create new column names
    # 4. Replace the original categorical column with the encoded columns
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_data = encoder.fit_transform(df[[column_to_encode]])
    categories = encoder.categories_[0]
    encoded_column_names = [f"{column_to_encode}_{category}" for category in categories]
    encoded_df = pd.DataFrame(encoded_data, columns=encoded_column_names, index=df.index)

    # Placeholder return - replace with your implementation
    return pd.concat([df.drop(column_to_encode, axis=1), encoded_df], axis=1)

In [147]:
encoded_data = encode_categorical_features(data)
encoded_data.head()

Unnamed: 0,patient_id,timestamp,age,systolic_bp,diastolic_bp,glucose_level,bmi,heart_rate,disease_outcome,smoker_status_former,smoker_status_no,smoker_status_yes
0,1,2023-01-29 00:00:00.000000,57,113.063416,84.069561,117.47521,25.085796,62.719587,0,0.0,1.0,0.0
1,1,2023-01-31 07:33:55.507789,57,121.598849,89.672279,85.120875,24.120608,76.314434,0,0.0,1.0,0.0
2,1,2023-02-02 00:15:11.379377,57,126.623222,87.619685,,24.819332,62.427785,0,0.0,1.0,0.0
3,1,2023-02-04 09:37:12.589164,57,136.999366,89.199774,118.755648,25.039598,61.612981,0,0.0,1.0,0.0
4,1,2023-02-04 20:56:52.838198,57,127.546919,92.644673,98.882007,24.895024,77.649615,0,0.0,1.0,0.0


## 4. Data Preparation

Implement `prepare_data_part3` to handle the train/test split correctly.

In [148]:
def prepare_data_part3(df, test_size=0.2, random_state=42):
    """
    Prepare data with categorical encoding.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # YOUR CODE HERE
    # 1. Encode categorical features using the encode_categorical_features function
    # 2. Select relevant features (including the one-hot encoded ones) and the target
    # 3. Split data into training and testing sets
    # 4. Handle missing values
    df = encode_categorical_features(df, 'smoker_status')

    X = df.drop(columns=['disease_outcome', 'timestamp', 'patient_id'])
    y = df[['disease_outcome']]
    imputer = SimpleImputer(strategy='mean')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

    # Placeholder return - replace with your implementation
    return train_test_split(X_imputed, y, test_size=test_size, random_state=random_state, stratify=y)

In [149]:
X_train, X_test, y_train, y_test = prepare_data_part3(data)

## 5. Handling Imbalanced Data

Implement `apply_smote` to oversample the minority class.

In [157]:
def apply_smote(X_train, y_train, random_state=42):
    """
    Apply SMOTE to oversample the minority class.
    
    Args:
        X_train: Training features
        y_train: Training target
        random_state: Random seed for reproducibility
        
    Returns:
        Resampled X_train and y_train with balanced classes
    """
    # YOUR CODE HERE
    # Apply SMOTE to balance the classes
    smote = SMOTE(random_state=random_state, sampling_strategy=0.5)

    # Placeholder return - replace with your implementation
    return smote.fit_resample(X_train, y_train)

In [158]:
X_train, y_train = apply_smote(X_train, y_train)

## 6. Model Training and Evaluation

Train a model on the SMOTE-resampled data and evaluate it.

In [159]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """
    # YOUR CODE HERE
    y_train = np.ravel(y_train)
    # Initialize and train a LogisticRegression model
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    return model  # Replace with actual implementation

def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """
    # YOUR CODE HERE
    # 1. Generate predictions
    # 2. Calculate metrics: accuracy, precision, recall, f1, auc
    # 3. Create confusion matrix
    # 4. Return metrics in a dictionary
    preds = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    metrics = dict()

    metrics['accuracy'] = accuracy_score(y_test, preds)
    metrics['precision'] = precision_score(y_test, preds, zero_division=0)
    metrics['recall'] = recall_score(y_test, preds, zero_division=0)
    metrics['f1'] = f1_score(y_test, preds, zero_division=0)
    metrics['auc'] = roc_auc_score(y_test, y_proba)
    metrics['confusion_matrix'] = confusion_matrix(y_test, preds).tolist()

    # Placeholder return - replace with your implementation
    return metrics

In [160]:
model = train_logistic_regression(X_train, y_train)
metrics = calculate_evaluation_metrics(model, X_test, y_test)
metrics

{'accuracy': 0.9051841746248295,
 'precision': np.float64(0.5073170731707317),
 'recall': np.float64(0.7323943661971831),
 'f1': np.float64(0.5994236311239193),
 'auc': np.float64(0.9178226458448576),
 'confusion_matrix': [[1223, 101], [38, 104]]}

## 7. Save Results

Save the evaluation metrics to a text file.

In [161]:
# YOUR CODE HERE
# 1. Create 'results' directory if it doesn't exist
# 2. Format metrics as strings
# 3. Write metrics to 'results/results_part3.txt'

os.makedirs('results', exist_ok=True)
results_dir3 = os.path.join('results', 'results_part3.txt')
with open(results_dir3, 'w') as f:
    json.dump(metrics, f, indent=4)
    # f.write(str(metrics))

## 9. Compare Results

Implement a function to compare model performance between balanced and imbalanced data.

In [163]:
def compare_models(part1_metrics, part3_metrics):
    """
    Calculate percentage improvement between models trained on imbalanced vs. balanced data.
    
    Args:
        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)
        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)
        
    Returns:
        Dictionary with metric names as keys and improvement percentages as values
    """
    # YOUR CODE HERE
    # 1. Calculate percentage improvement for each metric
    # 2. Handle metrics where higher is better (most metrics) and where lower is better
    # 3. Return a dictionary with metric names and improvement percentages
    
    # Placeholder return - replace with your implementation
    improvements = dict()
    for metric in list(metrics)[:-1]:
        p1 = part1_metrics.get(metric, 0)
        p3 = part3_metrics.get(metric, 0)
        improvements[metric] = ((p3 - p1) / p1) * 100 if p1 != 0 else 0

    return improvements

## 8. Main Execution

Run the complete workflow.

In [162]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data with categorical encoding
    X_train, X_test, y_train, y_test = prepare_data_part3(df)

    # 3. Apply SMOTE to balance the training data
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)
    
    # 4. Train model on resampled data
    model = train_logistic_regression(X_train_resampled, y_train_resampled)
    
    # 5. Evaluate on original test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)

    # 6. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 7. Save results
    # (Your code for saving results)

    # 8. Load Part 1 results for comparison
    import json
    try:
        with open('results/results_part1.txt', 'r') as f:
            part1_metrics = json.load(f)
        # 9. Compare models
        comparison = compare_models(part1_metrics, metrics)
        print("\nModel Comparison (improvement percentages):")
        for metric, improvement in comparison.items():
            print(f"{metric}: {improvement:.2f}%")
    except FileNotFoundError:
        print("Part 1 results not found. Run part1_introduction.ipynb first.")

accuracy: 0.9052
precision: 0.5073
recall: 0.7324
f1: 0.5994
auc: 0.9178

Model Comparison (improvement percentages):
accuracy: -1.26%
precision: -23.31%
recall: 143.56%
f1: 44.98%
auc: 1.04%
