# Part 3: Practical Data Preparation
Handle categorical features using One-Hot Encoding and address class imbalance using SMOTE.

In [1]:
%pip install -r requirements.txt






[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.3.6/libexec/bin/python -m pip install --upgrade pip[0m


Note: you may need to restart the kernel to use updated packages.


## 1. Setup
Import necessary libraries.

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

## 2. Data Loading
Load the dataset.

In [3]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    # Load the CSV file using pandas
    df = pd.read_csv(file_path)
    
    return df

In [4]:
#test_df = load_data('data/synthetic_health_data.csv')
#test_df

## 3. Categorical Feature Encoding
Implement `encode_categorical_features` using `OneHotEncoder`.

In [5]:
def encode_categorical_features(df, column_to_encode='smoker_status'):
    """
    Encode a categorical column using OneHotEncoder.
    
    Args:
        df: Input DataFrame
        column_to_encode: Name of the categorical column to encode
        
    Returns:
        DataFrame with the categorical column replaced by one-hot encoded columns
    """
    # 1. Extract the categorical column
    categorical_col = df[[column_to_encode]]

    # 2. Apply OneHotEncoder
    encoder = OneHotEncoder(sparse_output = False, drop='first', handle_unknown='ignore')
    encoded_array = encoder.fit_transform(categorical_col)

    # 3. Create new column names
    new_cols = encoder.get_feature_names_out([column_to_encode])
    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out())

    # 4. Replace the original categorical column with the encoded columns
    df_encoded = df.drop(columns=[column_to_encode])
    df_encoded = pd.concat([df_encoded, encoded_df], axis=1)
    
    return df_encoded

In [6]:
#test_encode = encode_categorical_features(test_df)
#test_encode

## 4. Data Preparation
Implement `prepare_data_part3` to handle the train/test split correctly.

In [7]:
def prepare_data_part3(df, test_size=0.2, random_state=42):
    """
    Prepare data with categorical encoding.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # 1. Encode categorical features using the encode_categorical_features function
    df = encode_categorical_features(df)

    # 2. Select relevant features (including the one-hot encoded ones) and the target
    features = [col for col in df.columns if col not in ['disease_outcome', 'timestamp']]
    X = df[features]
    y = df['disease_outcome']

    # 3. Handle missing values
    imputer = SimpleImputer(strategy='mean') 
    imputed_df = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    # 4. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        imputed_df, y, test_size=test_size, random_state=random_state
    )
    
    return X_train, X_test, y_train, y_test

In [8]:
#test_Xtrain, test_Xtest, test_ytrain, test_ytest = prepare_data_part3(test_df)
#test_ytest

## 5. Handling Imbalanced Data
Implement `apply_smote` to oversample the minority class.

In [9]:
def apply_smote(X_train, y_train, random_state=42):
    """
    Apply SMOTE to oversample the minority class.
    
    Args:
        X_train: Training features
        y_train: Training target
        random_state: Random seed for reproducibility
        
    Returns:
        Resampled X_train and y_train with balanced classes
    """
    # Apply SMOTE to balance the classes
    smote = SMOTE(random_state=random_state)
    X_resample, y_resample = smote.fit_resample(X_train, y_train)
    
    return X_resample, y_resample

In [10]:
#test_Xresample, test_yresample = apply_smote(test_Xtrain, test_ytrain)
#test_yresample

## 6. Model Training and Evaluation
Train a model on the SMOTE-resampled data and evaluate it.

In [11]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """
    # Initialize and train a LogisticRegression model
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    
    return model

def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """
    # 1. Generate predictions
    y_pred = model.predict(X_test)

    # 2. Calculate metrics: accuracy, precision, recall, f1, auc
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)

    # 3. Create confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # 4. Return metrics in a dictionary
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc,
        'confusion_matrix': conf_matrix
    }

In [12]:
#test_model = train_logistic_regression(test_Xtrain, test_ytrain)
#test_model

#test_metrics = calculate_evaluation_metrics(test_model, test_Xtest, test_ytest)

## 7. Save Results
Save the evaluation metrics to a text file.

In [13]:
def save_eval_metrics(metrics):
    """
    Save the evaluation metric to a text file.
    
    Args:
        metrics: Evaluation metrics dictionary
        xgb_auc: XGBoost AUC
        
    Returns:
        Text file of evaluation metrics
    """
    # 1. Create 'results' directory if it doesn't exist
    os.makedirs('results', exist_ok=True)

    # 2. Format metrics as strings
    # 3. Write metrics to 'results/results_part3.txt'
    filepath = 'results/results_part3.txt'
    with open(filepath, 'w') as file:
        for name, value in metrics.items():
            file.write(f"{name}: {value} \n")

In [14]:
#save_eval_metrics(test_metrics)

## 8. Compare Results
Implement a function to compare model performance between balanced and imbalanced data.

In [15]:
def compare_models(part1_metrics, part3_metrics):
    """
    Calculate percentage improvement between models trained on imbalanced vs. balanced data.
    
    Args:
        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)
        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)
        
    Returns:
        Dictionary with metric names as keys and improvement percentages as values
    """
    # 1. Calculate percentage improvement for each metric
    percentages = {}

    # 2. Handle metrics where higher is better (most metrics) and where lower is better
    for metric in ['accuracy', 'precision', 'recall', 'f1', 'auc']:
        imbalanced = part1_metrics.get(metric, 0)
        balanced = part3_metrics.get(metric, 0)
        
        if imbalanced > 0:
            percentages[metric] = ((balanced - imbalanced) / imbalanced) * 100
        else:
            percentages[metric] = float('inf') if balanced > 0 else 0.0

    # 3. Return a dictionary with metric names and improvement percentages
    return percentages

## 9. Main Execution
Run the complete workflow.


In [16]:
def parse_metrics_from_txt(filename):
    """
    Get metrics from text file (implemented this function instead because the main execution tries to open it as JSON)
    
    Args:
        filename: File path
        
    Returns:
        Parsed metrics
    """
    metrics = {}
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()  
            if ':' in line and line:  
                key, value = line.split(':', 1)
                key = key.strip()
                value = value.strip()
                
                if value and value != '':
                    try:
                        metrics[key] = float(value)
                    except ValueError:
                        print(f"Skipping non-numeric value for '{key}': '{value}'")
                        continue
    return metrics

In [17]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data with categorical encoding
    X_train, X_test, y_train, y_test = prepare_data_part3(df)
    
    # 3. Apply SMOTE to balance the training data
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)
    
    # 4. Train model on resampled data
    model = train_logistic_regression(X_train_resampled, y_train_resampled)
    
    # 5. Evaluate on original test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 6. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 7. Save results
    save_eval_metrics(metrics)
    
    # 8. Load Part 1 results for comparison
    try:
        part1_metrics = parse_metrics_from_txt('results/results_part1.txt')
        
        # 9. Compare models
        comparison = compare_models(part1_metrics, metrics)
        print("\nModel Comparison (improvement percentages):")
        for metric, improvement in comparison.items():
            print(f"{metric}: {improvement:.2f}%")
    except FileNotFoundError:
        print("Part 1 results not found. Run part1_introduction.ipynb first.")

accuracy: 0.8547
precision: 0.3885
recall: 0.8531
f1_score: 0.5339
auc: 0.8540

Model Comparison (improvement percentages):
accuracy: -6.77%
precision: -41.26%
recall: 183.72%
f1: -100.00%
auc: -5.99%
