In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [1]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    # YOUR CODE HERE
    # Load the CSV file using pandas
    path = file_path
    df = pd.read_csv(
            file_path,
            parse_dates=["timestamp"],   # parses the column to datetime dtype
            infer_datetime_format=True)
    return df

In [12]:
def encode_categorical_features(df, column_to_encode='smoker_status'):
    """
    Encode a categorical column using OneHotEncoder.
    
    Args:
        df: Input DataFrame
        column_to_encode: Name of the categorical column to encode
        
    Returns:
        DataFrame with the categorical column replaced by one-hot encoded columns
    """
    # YOUR CODE HERE
    # 1. Extract the categorical column
    # 2. Apply OneHotEncoder
    # 3. Create new column names
    # 4. Replace the original categorical column with the encoded columns
    X_cat = df[[column_to_encode]]

    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=int)
    X_ohe = ohe.fit_transform(X_cat)

    new_cols = [f"{column_to_encode}_{cat}" for cat in ohe.categories_[0]]

    df_encoded = pd.DataFrame(X_ohe, columns=new_cols, index=df.index)
    df_out = pd.concat([df.drop(columns=[column_to_encode]), df_encoded], axis=1)

    return df_out

In [28]:
def prepare_data_part3(df, test_size=0.2, random_state=42):
    """
    Prepare data with categorical encoding.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # YOUR CODE HERE
    # 1. Encode categorical features using the encode_categorical_features function
    # 2. Select relevant features (including the one-hot encoded ones) and the target
    # 3. Split data into training and testing sets
    # 4. Handle missing values
    df_enc = encode_categorical_features(df, column_to_encode="smoker_status")

    target_col = "disease_outcome"
    y = df_enc[target_col]
    X = df_enc.drop(columns=[target_col]).copy()

    # --- NEW: make every column numeric ---------------------------------
    for col in X.select_dtypes(include="datetime64[ns]").columns:
        # Option A -- keep the information (nanoseconds since 1970-01-01)
        X[col] = X[col].view("int64")
        # Option B -- if you don’t need it, just drop:
        # X = X.drop(columns=[col])
    # --------------------------------------------------------------------

    split_idx = int(len(X) * (1 - test_size))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    imputer = SimpleImputer(strategy="median")
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_test = pd.DataFrame(
        imputer.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )

    return X_train, X_test, y_train, y_test

In [6]:
def apply_smote(X_train, y_train, random_state=42):
    """
    Apply SMOTE to oversample the minority class.
    
    Args:
        X_train: Training features
        y_train: Training target
        random_state: Random seed for reproducibility
        
    Returns:
        Resampled X_train and y_train with balanced classes
    """
    # YOUR CODE HERE
    sm = SMOTE(random_state=random_state)
    X_res_arr, y_res_arr = sm.fit_resample(X_train, y_train)

    # Convert back to DataFrame/Series to preserve column names and index
    X_res = pd.DataFrame(X_res_arr, columns=X_train.columns, index=None)
    y_res = pd.Series(y_res_arr, name=y_train.name, index=None)

    return X_res, y_res

In [9]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """
    # YOUR CODE HERE
    # Initialize and train a LogisticRegression model
    clf = LogisticRegression(
        penalty="l2",
        solver="liblinear",      # good for small/medium-sized data
        max_iter=1000,
        class_weight=None        # set to 'balanced' if classes are still skewed
    )
    clf.fit(X_train, y_train)
    return clf

def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """
    # YOUR CODE HERE
    # 1. Generate predictions
    # 2. Calculate metrics: accuracy, precision, recall, f1, auc
    # 3. Create confusion matrix
    # 4. Return metrics in a dictionary
    y_pred = model.predict(X_test)
    y_prob = getattr(model, "predict_proba", None)
    y_score = y_prob(X_test)[:, 1] if y_prob is not None else y_pred

    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    auc  = roc_auc_score(y_test, y_score)

    cm = confusion_matrix(y_test, y_pred)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc": auc,
        "confusion_matrix": cm,
    }

In [10]:
# YOUR CODE HERE
# 1. Create 'results' directory if it doesn't exist
# 2. Format metrics as strings
# 3. Write metrics to 'results/results_part3.txt'
def save_to_file(metrics):
    results_dir = "results"
    os.makedirs(results_dir, exist_ok=True)
    outfile = os.path.join(results_dir, "results_part3.txt")
    with open(outfile, "w") as f:
        for k, v in metrics.items():
            if isinstance(v, (float, np.floating)):
                f.write(f"{k}: {v:.4f}\n")
            elif k == "confusion_matrix":
                # Flatten matrix for compact storage (TN FP FN TP)
                cm_str = " ".join(map(str, v.ravel()))
                f.write(f"{k}: {cm_str}\n")
            else:  # integers or anything else
                f.write(f"{k}: {v}\n")

In [34]:
def compare_models(part1_metrics, part3_metrics):
    """
    Calculate percentage improvement between models trained on imbalanced vs. balanced data.
    
    Args:
        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)
        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)
        
    Returns:
        Dictionary with metric names as keys and improvement percentages as values
    """
    # YOUR CODE HERE
    # 1. Calculate percentage improvement for each metric
    # 2. Handle metrics where higher is better (most metrics) and where lower is better
    # 3. Return a dictionary with metric names and improvement percentages
    
    # Placeholder return - replace with your implementation
    lower_is_better = {"log_loss"}
    improvements = {}

    for metric, p1_value in part1_metrics.items():
        # Skip arrays / lists / matrices
        if np.asarray(p1_value).ndim != 0:
            continue

        p3_value = part3_metrics.get(metric)
        if p3_value is None or np.asarray(p3_value).ndim != 0:
            continue

        if p1_value == 0:
            improvements[metric] = np.nan
            continue

        direction = -1 if metric in lower_is_better else 1
        pct_change = direction * (p3_value - p1_value) / abs(p1_value) * 100
        improvements[metric] = pct_change

    return improvements

In [None]:
def read_metrics_txt(path):
    """
    Parse a metrics text file that looks like:
        accuracy: 0.9195
        ...
        confusion_matrix:
        1302 22
        96   46

    Returns
    -------
    dict   # {'accuracy': 0.9195, ..., 'confusion_matrix': np.ndarray}
    """
    metrics = {}
    with open(path, "r") as f:
        lines = [ln.strip() for ln in f if ln.strip()]

    i = 0
    while i < len(lines):
        line = lines[i]
        if line.startswith("confusion_matrix"):
            # next two lines contain the 2×2 matrix
            row1 = list(map(int, lines[i + 1].split()))
            row2 = list(map(int, lines[i + 2].split()))
            metrics["confusion_matrix"] = np.array([row1, row2])
            i += 3
        else:
            k, v = line.split(":", 1)
            metrics[k.strip()] = float(v)
            i += 1
    return metrics

In [35]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)

    # 2. Prepare data with categorical encoding
    X_train, X_test, y_train, y_test = prepare_data_part3(df)

    # 3. Apply SMOTE to balance the training data
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)

    # 4. Train model on resampled data
    model = train_logistic_regression(X_train_resampled, y_train_resampled)

    # 5. Evaluate on original test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)

    # 6. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")

    # 7. Save results
    save_to_file(metrics)

    # 8. Load Part 1 results for comparison
    import json

    try:
        part1_metrics = read_metrics_txt("results/results_part1.txt")

        # 9. Compare models
        comparison = compare_models(part1_metrics, metrics)
        print("\nModel Comparison (improvement percentages):")
        for metric, improvement in comparison.items():
            print(f"{metric}: {improvement:.2f}%")
    except FileNotFoundError:
        print("Part 1 results not found. Run part1_introduction.ipynb first.")

accuracy: 0.0682
precision: 0.0682
recall: 1.0000
f1: 0.1277
auc: 0.4925

Model Comparison (improvement percentages):
accuracy: -92.58%
precision: -89.92%
recall: 208.74%
f1: -70.85%
auc: -44.37%


  df = pd.read_csv(
  X[col] = X[col].view("int64")
