In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

In [2]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.

    The "timestamp" column is parsed to a datetime dtype while reading.
    The deprecated ``infer_datetime_format`` argument is intentionally not
    passed: pandas >= 2.0 infers a consistent datetime format by default and
    emits a deprecation warning when the argument is supplied.

    Args:
        file_path: Path to the CSV file

    Returns:
        DataFrame containing the data
    """
    return pd.read_csv(
        file_path,
        parse_dates=["timestamp"],   # parses the column to datetime dtype
    )

In [3]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    """
    Build model-ready train/test sets from the raw health DataFrame.

    Steps: pick the five numeric predictors, pull out the target column,
    perform a stratified train/test split, then fill missing predictor values
    with per-column medians learned from the training portion only.

    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility

    Returns:
        X_train, X_test, y_train, y_test (features as DataFrames that keep
        the original row index and column names)
    """
    predictors = ["age", "systolic_bp", "diastolic_bp",
                  "glucose_level", "bmi"]
    features, target = df[predictors], df["disease_outcome"]

    # Stratify on the target so both partitions keep the same class ratio.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )

    def _to_frame(values, like):
        # The imputer returns a bare ndarray; restore labels and row index.
        return pd.DataFrame(values, columns=predictors, index=like.index)

    # Medians are fitted on the training rows only, then reused for the test
    # rows so no test-set statistics influence the imputation.
    median_imputer = SimpleImputer(strategy="median")
    X_train = _to_frame(median_imputer.fit_transform(X_train_raw), X_train_raw)
    X_test = _to_frame(median_imputer.transform(X_test_raw), X_test_raw)

    return X_train, X_test, y_train, y_test

In [4]:
def train_logistic_regression(X_train, y_train):
    """
    Fit a logistic regression classifier on the training data.

    Args:
        X_train: Training features
        y_train: Training target

    Returns:
        Trained logistic regression model
    """
    # max_iter is raised to 1000 and the seed is fixed at 42;
    # fit() returns the estimator itself, so it can be returned directly.
    return LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

In [5]:
def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Score a fitted classifier on held-out data.

    Precision, recall and F1 use "binary" averaging when the test target has
    exactly two distinct values and "weighted" averaging otherwise. AUC is
    computed only when the model exposes ``predict_proba`` and the test target
    has at least two distinct values; otherwise it is reported as None.

    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target

    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """
    predicted = model.predict(X_test)

    results = {"accuracy": accuracy_score(y_test, predicted)}

    # The averaging mode depends only on y_test, so work it out once.
    n_classes = len(np.unique(y_test))
    averaging = "binary" if n_classes == 2 else "weighted"
    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in scorers:
        results[key] = scorer(y_test, predicted, zero_division=0, average=averaging)

    # AUC needs class probabilities and more than one class in y_test.
    results["auc"] = None
    if hasattr(model, "predict_proba") and n_classes >= 2:
        class_probs = model.predict_proba(X_test)
        if class_probs.shape[1] == 2:
            # Binary: score with the probability of the second class column.
            results["auc"] = roc_auc_score(y_test, class_probs[:, 1])
        else:
            # Multiclass: one-vs-rest, weighted average.
            results["auc"] = roc_auc_score(
                y_test, class_probs, multi_class="ovr", average="weighted"
            )

    results["confusion_matrix"] = confusion_matrix(y_test, predicted)
    return results

In [9]:
# Create results directory and save metrics
# YOUR CODE HERE
def save_metrics_to_file(metrics,
                         results_dir: str = "results",
                         file_name: str = "results_part1.txt",
                         float_fmt: str = "{:.4f}"):
    """
    Write evaluation metrics to a text file, one "name: value" entry per metric.

    Floats (Python or numpy) are rendered with ``float_fmt``. Numpy arrays are
    written on the lines following the metric name, one row per line with
    space-separated cells; 0-D and 1-D arrays are written as a single row.
    Any other value (e.g. None) is written via ``str()``.

    Args:
        metrics: Dictionary mapping metric names to values
        results_dir: Directory to write into; created if missing. An empty
            string means the current working directory.
        file_name: Name of the output file inside ``results_dir``
        float_fmt: ``str.format`` template applied to float values

    Returns:
        Path of the file that was written
    """
    # 1. Ensure the directory exists (os.makedirs("") raises, so skip it then)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    file_path = os.path.join(results_dir, file_name)

    # 2. Helper to convert each value to a readable string
    def _to_string(val):
        # np.floating also covers numpy floats that do not subclass the
        # built-in float (e.g. float32), so they are formatted consistently.
        if isinstance(val, (float, np.floating)):
            return float_fmt.format(val)
        if isinstance(val, np.ndarray):
            # atleast_2d lets 0-D / 1-D arrays be iterated row by row too.
            rows = np.atleast_2d(val)
            return "\n" + "\n".join(" ".join(map(str, row)) for row in rows)
        return str(val)

    # 3. Write out the metrics (explicit encoding keeps output platform-independent)
    with open(file_path, "w", encoding="utf-8") as f:
        for k, v in metrics.items():
            f.write(f"{k}: {_to_string(v)}\n")

    return file_path

In [8]:
def interpret_results(metrics):
    """
    Summarise how the model fared on imbalanced data.

    Only the scalar metrics (accuracy, precision, recall, f1, auc) that are
    present and not None are considered. The imbalance score is the mean of
    the absolute gaps accuracy-vs-f1 and accuracy-vs-recall, rounded to four
    decimals.

    Args:
        metrics: Dictionary containing evaluation metrics

    Returns:
        Dictionary with keys:
        - 'best_metric': Name of the metric that performed best
        - 'worst_metric': Name of the metric that performed worst
        - 'imbalance_impact_score': A score from 0-1 indicating how much
          the class imbalance affected results (0=no impact, 1=severe impact)

    Raises:
        ValueError: If none of the scalar metrics is available.
    """
    tracked = ("accuracy", "precision", "recall", "f1", "auc")

    usable = {}
    for name, score in metrics.items():
        if name in tracked and score is not None:
            usable[name] = score

    if len(usable) == 0:
        raise ValueError("No scalar metrics found in the input dictionary.")

    # On ties, max/min keep the first name in the input's iteration order.
    highest = max(usable, key=lambda name: usable[name])
    lowest = min(usable, key=lambda name: usable[name])

    # Missing accuracy counts as 0.0; missing f1/recall fall back to accuracy
    # so that their gap contributes nothing to the score.
    accuracy = usable["accuracy"] if "accuracy" in usable else 0.0
    f1_value = usable["f1"] if "f1" in usable else accuracy
    recall_value = usable["recall"] if "recall" in usable else accuracy

    f1_gap = abs(accuracy - f1_value)
    recall_gap = abs(accuracy - recall_value)
    impact = (f1_gap + recall_gap) / 2.0

    summary = {}
    summary["best_metric"] = highest
    summary["worst_metric"] = lowest
    summary["imbalance_impact_score"] = float(round(impact, 4))
    return summary

In [12]:
# Main execution
# End-to-end run: load -> split/impute -> train -> evaluate -> report -> save.
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data//synthetic_health_data.csv'
    df = load_data(data_file)

    # 2. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part1(df)

    # 3. Train model
    model = train_logistic_regression(X_train, y_train)

    # 4. Evaluate model
    metrics = calculate_evaluation_metrics(model, X_test, y_test)

    # 5. Print metrics
    for metric, value in metrics.items():
        if metric == 'confusion_matrix':
            continue
        if value is None:
            # calculate_evaluation_metrics reports auc as None when it cannot
            # be computed; the ".4f" format spec would raise TypeError on None.
            print(f"{metric}: N/A")
        else:
            print(f"{metric}: {value:.4f}")

    # 6. Save results
    save_metrics_to_file(metrics)

    # 7. Interpret results
    interpretation = interpret_results(metrics)
    print("\nResults Interpretation:")
    for key, value in interpretation.items():
        print(f"{key}: {value}")

accuracy: 0.9195
precision: 0.6765
recall: 0.3239
f1: 0.4381
auc: 0.8852

Results Interpretation:
best_metric: accuracy
worst_metric: recall
imbalance_impact_score: 0.5385


  df = pd.read_csv(
