# Part 3: Practical Data Preparation

**Objective:** Handle categorical features using One-Hot Encoding and address class imbalance using SMOTE.

## 1. Setup

Import necessary libraries.

In [108]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

## 2. Data Loading

Load the dataset.

In [109]:
def load_data(file_path):
    """
    Read the synthetic health dataset from a CSV source.

    Args:
        file_path: Location of the CSV file (anything `pd.read_csv` accepts)

    Returns:
        DataFrame holding the parsed contents of the CSV
    """
    return pd.read_csv(file_path)

## 3. Categorical Feature Encoding

Implement `encode_categorical_features` using `OneHotEncoder`.

In [110]:
def encode_categorical_features(df, column_to_encode='smoker_status'):
    """
    Encode a categorical column using OneHotEncoder.

    The encoder is fitted on the given column and the column is replaced by
    one indicator column per category. The new columns are appended after
    the remaining original columns and are named by the encoder
    (`<column>_<category>`).

    Args:
        df: Input DataFrame (not modified)
        column_to_encode: Name of the categorical column to encode

    Returns:
        DataFrame with the categorical column replaced by one-hot encoded
        columns, carrying the same row index as `df`
    """
    # Double brackets keep the selection two-dimensional, which is the
    # shape OneHotEncoder expects.
    cat_col = df[[column_to_encode]]

    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(cat_col)

    # Reuse the input's index: concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows (and add NaN rows) whenever
    # `df` has been filtered, shuffled or otherwise re-indexed.
    encoded_df = pd.DataFrame(
        encoded.toarray(),
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )

    return pd.concat([df.drop(columns=column_to_encode), encoded_df], axis=1)

## 4. Data Preparation

Implement `prepare_data_part3` to handle the train/test split correctly.

In [111]:
def prepare_data_part3(df, test_size=0.2, random_state=42):
    """
    One-hot encode the data and split it into train and test partitions.

    The feature matrix consists of the three one-hot smoker-status columns;
    the target is the `disease_outcome` column.

    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility

    Returns:
        X_train, X_test, y_train, y_test
    """
    # Replace the categorical column by its indicator columns.
    encoded_df = encode_categorical_features(df)

    # Pick out the model inputs and the label.
    feature_columns = ['smoker_status_former', 'smoker_status_no', 'smoker_status_yes']
    features = encoded_df[feature_columns]
    target = encoded_df['disease_outcome']

    # Shuffled, seeded hold-out split.
    features_train, features_test, target_train, target_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )

    return features_train, features_test, target_train, target_test

## 5. Handling Imbalanced Data

Implement `apply_smote` to oversample the minority class.

In [None]:
def apply_smote(X_train, y_train, random_state=42):
    """
    Rebalance the training set by oversampling with SMOTE.

    Args:
        X_train: Training features
        y_train: Training target
        random_state: Random seed for reproducibility

    Returns:
        The resampled training features and training target
    """
    # Only the training partition is passed in, so the resampling never
    # touches held-out data.
    sampler = SMOTE(random_state=random_state)
    X_balanced, y_balanced = sampler.fit_resample(X_train, y_train)

    return X_balanced, y_balanced

## 6. Model Training and Evaluation

Train a model on the SMOTE-resampled data and evaluate it.

In [None]:
def train_logistic_regression(X_train, y_train):
    """
    Fit a logistic regression classifier with default settings.

    Args:
        X_train: Training features
        y_train: Training target

    Returns:
        The fitted LogisticRegression estimator
    """
    # `fit` returns the estimator itself, so it can be returned directly.
    return LogisticRegression().fit(X_train, y_train)

def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.

    Args:
        model: Trained model exposing `predict` (and ideally `predict_proba`
            or `decision_function`)
        X_test: Test features
        y_test: Test target

    Returns:
        Dictionary with the keys 'accuracy', 'precision', 'recall', 'f1',
        'auc' and 'confusion_matrix'. The short keys 'acc', 'prec' and 'rec'
        are also present as aliases of 'accuracy', 'precision' and 'recall'
        for callers written against the earlier key names.
    """
    # Hard class predictions drive the threshold-based metrics.
    y_predict = model.predict(X_test)

    # ROC AUC measures how well the model *ranks* samples, so it needs a
    # continuous score. Feeding it 0/1 predictions collapses the curve to a
    # single operating point and misreports the AUC.
    if hasattr(model, 'predict_proba'):
        # Column 1 is the probability of the second (positive) class.
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        # Last resort for models that expose neither kind of score.
        y_score = y_predict

    acc = accuracy_score(y_test, y_predict)
    prec = precision_score(y_test, y_predict)
    rec = recall_score(y_test, y_predict)
    f1 = f1_score(y_test, y_predict)
    auc = roc_auc_score(y_test, y_score)

    conf_matrix = confusion_matrix(y_test, y_predict)

    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'auc': auc,
        'confusion_matrix': conf_matrix,
        # Short aliases kept for backward compatibility.
        'acc': acc,
        'prec': prec,
        'rec': rec,
    }

## 7. Save Results

Save the evaluation metrics to a text file.

## 9. Compare Results

Implement a function to compare model performance between balanced and imbalanced data.

In [None]:
def compare_models(part1_metrics, part3_metrics):
    """
    Calculate percentage improvement between models trained on imbalanced vs. balanced data.

    For each metric the result is `(part3 - part1) / |part1| * 100`. All five
    metrics are "higher is better", so a positive value means the balanced
    model improved on the imbalanced one.

    Args:
        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)
        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)
            Both dictionaries may use either the long metric names
            ('accuracy', 'precision', 'recall') or the short ones
            ('acc', 'prec', 'rec'); extra keys are ignored.

    Returns:
        Dictionary with the keys 'accuracy', 'precision', 'recall', 'f1' and
        'auc' mapped to improvement percentages. When the Part 1 value is 0
        the percentage is undefined: 0.0 is returned if the Part 3 value is
        also 0, otherwise +/-infinity according to the direction of change.

    Raises:
        KeyError: If a metric is missing from either dictionary.
    """
    # Canonical metric name -> spellings accepted in the input dictionaries.
    accepted_keys = {
        'accuracy': ('accuracy', 'acc'),
        'precision': ('precision', 'prec'),
        'recall': ('recall', 'rec'),
        'f1': ('f1',),
        'auc': ('auc',),
    }

    def lookup(metrics, names):
        # Return the first spelling present, as a plain float.
        for key in names:
            if key in metrics:
                return float(metrics[key])
        raise KeyError(f"metric {names[0]!r} not found in metrics dictionary")

    improvements = {}
    for metric, names in accepted_keys.items():
        baseline = lookup(part1_metrics, names)
        balanced = lookup(part3_metrics, names)
        change = balanced - baseline

        if baseline != 0:
            improvements[metric] = change / abs(baseline) * 100.0
        elif change == 0:
            improvements[metric] = 0.0
        else:
            # Relative change from a zero baseline is unbounded.
            improvements[metric] = float('inf') if change > 0 else float('-inf')

    return improvements

## 8. Main Execution

Run the complete workflow.

In [None]:
# Read the raw dataset, then encode it and split it into train/test partitions.
data_path = 'data/synthetic_health_data.csv'
df = load_data(data_path)
x_train, x_test, y_train, y_test = prepare_data_part3(df)

np.int64(0)