In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [10]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    # YOUR CODE HERE
    # Load the CSV file using pandas
    df = pd.read_csv(file_path)
    
    return df  # Replace with actual implementation

In [11]:
def encode_categorical_features(df, column_to_encode='smoker_status'):
    """
    Encode a categorical column using OneHotEncoder.
    
    Args:
        df: Input DataFrame
        column_to_encode: Name of the categorical column to encode
        
    Returns:
        DataFrame with the categorical column replaced by one-hot encoded columns
    """
    # YOUR CODE HERE
    # 1. Extract the categorical column
    # 2. Apply OneHotEncoder
    # 3. Create new column names
    # 4. Replace the original categorical column with the encoded columns
    
    cat_data = df[[column_to_encode]]

    encoder = OneHotEncoder(sparse=False, drop=None)
    encoded_array = encoder.fit_transform(cat_data)

    encoded_cols = encoder.get_feature_names_out([column_to_encode])
    encoded_df = pd.DataFrame(encoded_array, columns=encoded_cols, index=df.index)

    df_encoded = pd.concat([df.drop(columns=[column_to_encode]), encoded_df], axis=1)

    
    # Placeholder return - replace with your implementation
    return df_encoded

In [12]:
def prepare_data_part3(df, test_size=0.2, random_state=42):
    """
    Prepare data with categorical encoding.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # YOUR CODE HERE
    # 1. Encode categorical features using the encode_categorical_features function
    # 2. Select relevant features (including the one-hot encoded ones) and the target
    # 3. Split data into training and testing sets
    # 4. Handle missing values
    
    df_encoded = encode_categorical_features(df, column_to_encode='smoker_status')

    feature_cols = [
        'age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi', 'heart_rate',
        'smoker_status_former', 'smoker_status_no', 'smoker_status_yes'
    ]

    encoded_cols = [col for col in df_encoded.columns if col.startswith('smoker_status_')]
    features = feature_cols + encoded_cols

    X = df_encoded[features]
    y = df_encoded['disease_outcome']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    return X_train, X_test, y_train, y_test

In [13]:
def apply_smote(X_train, y_train, random_state=42):
    """
    Apply SMOTE to oversample the minority class.
    
    Args:
        X_train: Training features
        y_train: Training target
        random_state: Random seed for reproducibility
        
    Returns:
        Resampled X_train and y_train with balanced classes
    """
    # YOUR CODE HERE
    # Apply SMOTE to balance the classes
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    return X_resampled, y_resampled

In [14]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """
    # YOUR CODE HERE
    # Initialize and train a LogisticRegression model
    
    model = LogisticRegression().fit(X_train, y_train)
    
    return model  # Replace with actual implementation

def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """
    # YOUR CODE HERE
    # 1. Generate predictions
    # 2. Calculate metrics: accuracy, precision, recall, f1, auc
    # 3. Create confusion matrix
    # 4. Return metrics in a dictionary
    
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred) 
    cm = confusion_matrix(y_test, y_pred)
    
    results = {'accuracy': acc, 'precision': prec, 'recall': recall, 'f1': f1, 'auc': auc, 'confusion_matrix': cm}
    
    return results

In [15]:
def compare_models(part1_metrics, part3_metrics):
    """
    Calculate percentage improvement between models trained on imbalanced vs. balanced data.
    
    Args:
        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)
        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)
        
    Returns:
        Dictionary with metric names as keys and improvement percentages as values
    """
    # YOUR CODE HERE
    # 1. Calculate percentage improvement for each metric
    # 2. Handle metrics where higher is better (most metrics) and where lower is better
    # 3. Return a dictionary with metric names and improvement percentages
    
    improvements = {}
    for key in part1_metrics:
        old = part1_metrics[key]
        new = part3_metrics[key]
        if old == 0:
            # Avoid division by zero; define 100% improvement if new is positive
            improvements[key] = float('inf') if new > 0 else 0.0
        else:
            improvements[key] = ((new - old) / abs(old)) * 100

    return improvements


In [16]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data with categorical encoding
    X_train, X_test, y_train, y_test = prepare_data_part3(df)
    
    # 3. Apply SMOTE to balance the training data
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)

    # 4. Train model on resampled data
    model = train_logistic_regression(X_train_resampled, y_train_resampled)
    
    # 5. Evaluate on original test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 6. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 7. Save results
    # (Your code for saving results)
    os.makedirs('results', exist_ok=True) # create dir if does not exist

    output_file = 'results/results_part3.txt'

    with open(output_file, "w") as file: # open file for writing
        file.write(f"Accuracy: {metrics['accuracy']}\n") # write results to output file
        file.write(f"Precision: {metrics['precision']}\n")
        file.write(f"Recall: {metrics['recall']}\n")
        file.write(f"F1 Score: {metrics['f1']}\n")
        file.write(f"Confusion Matrix: {metrics['confusion_matrix']}\n")

    file.close()

    print(f"\nSynthetic data saved to {output_file}")

    
    # 8. Load Part 1 results for comparison
    import json
    try:
        with open('results/results_part1.txt', 'r') as f:
            part1_metrics = json.load(f)
        
        # 9. Compare models
        comparison = compare_models(part1_metrics, metrics)
        print("\nModel Comparison (improvement percentages):")
        for metric, improvement in comparison.items():
            print(f"{metric}: {improvement:.2f}%")
    except FileNotFoundError:
        print("Part 1 results not found. Run part1_introduction.ipynb first.")

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'