In [58]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [59]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    df = pd.read_csv(file_path)
    return df

In [60]:
def encode_categorical_features(df, column_to_encode='smoker_status'):
    """
    Encode a categorical column using OneHotEncoder.
    
    Args:
        df: Input DataFrame
        column_to_encode: Name of the categorical column to encode
        
    Returns:
        DataFrame with the categorical column replaced by one-hot encoded columns
    """
    
    # 1. Extract the categorical column
    categorical_column = df[[column_to_encode]]
    
    # 2. Apply OneHotEncoder
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_values = encoder.fit_transform(categorical_column)
    
    # 3. Create new column names
    encoded_column_names = encoder.get_feature_names_out([column_to_encode])
    
    # 4. Replace the original categorical column with the encoded columns
    encoded_df = pd.DataFrame(encoded_values, columns=encoded_column_names, index=df.index)
    df_encoded = pd.concat([df, encoded_df], axis=1)
    df_encoded = df_encoded.drop(columns=[column_to_encode])
    
    # Placeholder return - replace with your implementation
    return df_encoded

In [61]:
def prepare_data_part3(df, test_size=0.2, random_state=42):
    
    # 1. Encode categorical features using the encode_categorical_features function
    df_encoded = encode_categorical_features(df)
    
    # 4. Handle missing values
    # Drop non-numeric or irrelevant columns
    df_encoded = df_encoded.drop(columns=['timestamp'])
    df_encoded = df_encoded.dropna()

    # 2. Select relevant features (all columns except 'disease_outcome') and target (disease_outcome)
    y = df_encoded['disease_outcome']
    X = df_encoded.drop(columns=['disease_outcome'])

    # 3. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    return X_train, X_test, y_train, y_test

In [62]:
def apply_smote(X_train, y_train, random_state=42):
    """
    Apply SMOTE to oversample the minority class.
    
    Args:
        X_train: Training features
        y_train: Training target
        random_state: Random seed for reproducibility
        
    Returns:
        Resampled X_train and y_train with balanced classes
    """
    
    # Apply SMOTE to balance the classes
    smote = SMOTE(random_state=random_state)
    
    # Apply SMOTE to the training data
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    return X_train_resampled, y_train_resampled

In [63]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """

    # Initialize and train a LogisticRegression model
    model = LogisticRegression(random_state=42, max_iter=5000)
    model.fit(X_train, y_train)
    
    return model

In [64]:
def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """

    # 1. Generate predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # 2. Calculate metrics: accuracy, precision, recall, f1, auc
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    
    # 3. Create confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    # 4. Return metrics in a dictionary
    
    # Placeholder return - replace with your implementation
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'confusion_matrix': cm
    }

In [65]:
def save_results(metrics, results_dir='results', text_file='results_part3.txt', json_file='results_part3.json'):
    os.makedirs(results_dir, exist_ok=True)

    # 1. Save text file
    formatted_metrics = ""
    for metric, value in metrics.items():
        if metric == 'confusion_matrix':
            formatted_metrics += f"Confusion Matrix:\n{value}\n\n"
        else:
            formatted_metrics += f"{metric.capitalize()}: {value:.4f}\n"
    with open(os.path.join(results_dir, text_file), 'w') as f:
        f.write(formatted_metrics)

    # 2. Save as JSON
    with open(os.path.join(results_dir, json_file), 'w') as f:
        json.dump(metrics, f, indent=4)

In [66]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data with categorical encoding
    X_train, X_test, y_train, y_test = prepare_data_part3(df)
    
    # 3. Apply SMOTE to balance the training data
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)
    
    # 4. Train model on resampled data
    model = train_logistic_regression(X_train_resampled, y_train_resampled)
    
    # 5. Evaluate on original test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 6. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 7. Save results
    save_results(metrics=metrics)

accuracy: 0.8579
precision: 0.3964
recall: 0.8538
f1: 0.5415
auc: 0.9387


TypeError: Object of type ndarray is not JSON serializable

In [None]:
def compare_models(part3_metrics):
    """
    Calculate percentage improvement between models trained on imbalanced vs. balanced data.
    
    Args:
        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)
        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)
        
    Returns:
        Dictionary with metric names as keys and improvement percentages as values
    """
    if not part3_metrics:    
        try:
            with open('results/results_part3.json', 'r') as f:
                part3_metrics = json.load(f)
        except FileNotFoundError:
            print("Part 3 results not found. Run part3_data_preparation.ipynb first.")
            return {}

    try:
        with open('results/results_part1.json', 'r') as f:
            part1_metrics = json.load(f)
    except FileNotFoundError:
        print("Part 1 results not found. Run part1_introduction.ipynb first.")
        return {}

    # Calculate percentage improvement
    improvement = {}
    for metric in part1_metrics:
        if metric == 'confusion_matrix':
            continue
        imbalanced = part1_metrics[metric]
        balanced = part3_metrics[metric]
        if imbalanced != 0:
            improvement[metric] = ((balanced - imbalanced) / imbalanced) * 100
        else:
            improvement[metric] = 0

    return improvement

In [None]:
# 8. Load Part 1 results for comparison
metrics = None
comparison = compare_models(metrics)
for k,v in comparison.items():
    print(k," : ", v)

JSONDecodeError: Expecting value: line 7 column 25 (char 193)