# Install necessary packages

In [26]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Part 1: Introduction to Classification & Evaluation

**Objective:** Load the synthetic health data, train a Logistic Regression model, and evaluate its performance.

## 1. Setup

Import necessary libraries.

In [27]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

## 2. Data Loading

Implement the `load_data` function to read the dataset.

In [28]:
def load_data(file_path):
    """
    Read the synthetic health dataset into a DataFrame.

    The ``timestamp`` column is handed to pandas for date parsing while the
    file is read.

    Args:
        file_path: Location of the CSV file (anything ``pd.read_csv`` accepts)

    Returns:
        DataFrame holding the file's contents
    """
    return pd.read_csv(file_path, parse_dates=["timestamp"])

## 3. Data Preparation

Implement `prepare_data_part1` to select features, split data, and handle missing values.

In [29]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    """
    Build imputed train/test feature sets and matching targets.

    Rows with a missing ``disease_outcome`` are discarded, the five predictor
    columns are split with stratification on the target, and remaining gaps in
    the predictors are filled with column means learned from the training
    split.

    Args:
        df: Input DataFrame containing the predictor columns and ``disease_outcome``
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility

    Returns:
        X_train, X_test, y_train, y_test
    """
    labelled = df.dropna(subset=["disease_outcome"])
    X = labelled[["age", "systolic_bp", "diastolic_bp", "glucose_level", "bmi"]]
    y = labelled["disease_outcome"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # The imputer is fitted on the training split only and then reused on the
    # test split, so test-set values never influence the fill values.
    mean_imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(
        mean_imputer.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        mean_imputer.transform(X_test), columns=X.columns, index=X_test.index
    )

    return X_train, X_test, y_train, y_test

## 4. Model Training

Implement `train_logistic_regression`.

In [30]:
def train_logistic_regression(X_train, y_train):
    """
    Fit a logistic regression classifier with the library's default settings.

    Args:
        X_train: Training features
        y_train: Training target

    Returns:
        The fitted LogisticRegression estimator
    """
    # fit() hands back the estimator itself, so construction and training chain.
    return LogisticRegression().fit(X_train, y_train)

## 5. Model Evaluation

Implement `evaluate_model` to assess the model's performance.

In [31]:
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``
        X_test: Test features
        y_test: True test labels

    Returns:
        Dictionary with the keys "accuracy", "precision", "recall", "f1",
        "auc" and "confusion_matrix"
    """
    predicted_labels = model.predict(X_test)
    # The second probability column (index 1) is the score fed to the AUC.
    predicted_scores = model.predict_proba(X_test)[:, 1]

    # Metrics computed from hard label predictions, in the order they are reported.
    label_based = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    scores = {
        name: metric_fn(y_test, predicted_labels)
        for name, metric_fn in label_based.items()
    }
    scores["auc"] = roc_auc_score(y_test, predicted_scores)
    scores["confusion_matrix"] = confusion_matrix(y_test, predicted_labels)
    return scores

## 6. Save Results

Save the calculated metrics to a text file.

In [32]:
def export_results(results_dict, filepath="results/results_part1.txt"):
    """
    Write the scalar metrics to a text file, one ``name: value`` line each.

    Values are formatted to four decimal places. The "confusion_matrix" entry
    is skipped because it is not a single number.

    Args:
        results_dict: Mapping of metric name to numeric value
        filepath: Destination file; missing parent directories are created

    Returns:
        None
    """
    parent_dir = os.path.dirname(filepath)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when one is named.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as file:
        for k, v in results_dict.items():
            if k != "confusion_matrix":
                file.write(f"{k}: {v:.4f}\n")

In [None]:
# Result interpretation
def analyze_performance(metrics):
    """
    Summarise the highest and lowest headline metric and the accuracy/F1 gap.

    Args:
        metrics: Mapping with numeric "accuracy", "precision", "recall",
            "f1" and "auc" entries

    Returns:
        Dictionary with "strongest_metric" and "weakest_metric" (metric names;
        on ties the one listed first wins) and "class_imbalance_impact"
        (absolute difference between accuracy and f1, rounded to 3 decimals)
    """
    headline = {
        name: metrics[name]
        for name in ("accuracy", "precision", "recall", "f1", "auc")
    }
    accuracy_f1_gap = abs(metrics["accuracy"] - metrics["f1"])

    return {
        "strongest_metric": max(headline, key=headline.get),
        "weakest_metric": min(headline, key=headline.get),
        "class_imbalance_impact": round(accuracy_f1_gap, 3),
    }

## 7. Main Execution

Run the complete workflow.

In [33]:
if __name__ == "__main__":
    # End-to-end run of the workflow: load -> prepare -> train -> evaluate -> report.
    # The path is relative, so it resolves against the current working directory.
    dataset = load_data("data/synthetic_health_data.csv")
    X_train, X_test, y_train, y_test = prepare_data_part1(dataset)
    model = train_logistic_regression(X_train, y_train)
    results = evaluate_model(model, X_test, y_test)

    # Display metrics
    # The confusion matrix entry is skipped: the ".4f" format below is applied
    # to the scalar scores only.
    for metric, val in results.items():
        if metric != "confusion_matrix":
            print(f"{metric}: {val:.4f}")

    # Uses export_results' default destination, results/results_part1.txt.
    export_results(results)

    # Print the strongest/weakest metric names and the accuracy-vs-F1 gap.
    summary = analyze_performance(results)
    print("\nPerformance Summary:")
    for item, val in summary.items():
        print(f"{item}: {val}")


accuracy: 0.9195
precision: 0.6765
recall: 0.3239
f1: 0.4381
auc: 0.8853

Performance Summary:
strongest_metric: accuracy
weakest_metric: recall
class_imbalance_impact: 0.481


## 8. Interpret Results

Implement `analyze_performance` to analyze the model's performance on imbalanced data.

In [34]:
# Result interpretation
def analyze_performance(metrics):
    """
    Identify the best and worst headline metric and measure the accuracy/F1 gap.

    Note: this notebook also defines a function with this name in an earlier
    cell; running this cell rebinds the name.

    Args:
        metrics: Mapping with numeric "accuracy", "precision", "recall",
            "f1" and "auc" entries

    Returns:
        Dictionary with "strongest_metric", "weakest_metric" and
        "class_imbalance_impact" (|accuracy - f1| rounded to 3 decimals)
    """
    candidates = ["accuracy", "precision", "recall", "f1", "auc"]

    def score_of(name):
        return metrics[name]

    # max/min return the first candidate on ties, so list order breaks ties.
    best_name = max(candidates, key=score_of)
    worst_name = min(candidates, key=score_of)
    gap = abs(score_of("accuracy") - score_of("f1"))

    summary = {}
    summary["strongest_metric"] = best_name
    summary["weakest_metric"] = worst_name
    summary["class_imbalance_impact"] = round(gap, 3)
    return summary