# Install necessary packages

In [1]:
# Install the notebook's dependencies from requirements.txt (path is relative
# to the kernel's working directory). %pip, unlike !pip, targets the
# environment of the running kernel.
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Part 1: Introduction to Classification & Evaluation

**Objective:** Load the synthetic health data, train a Logistic Regression model, and evaluate its performance.

## 1. Setup

Import necessary libraries.

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

## 2. Data Loading

Implement the `load_data` function to read the dataset.

In [3]:
def load_data(file_path):
    """
    Read the synthetic health dataset into memory.

    Args:
        file_path: Location of the CSV file to read

    Returns:
        pandas DataFrame holding the contents of the file
    """
    # pandas does all the work; the file is parsed with read_csv defaults.
    return pd.read_csv(file_path)

In [4]:
# Smoke test: load the dataset once and display it as the cell output.
# The resulting `data` frame is the input to the data-preparation cell below.

data = load_data("data/synthetic_health_data.csv")
data

Unnamed: 0,patient_id,timestamp,age,systolic_bp,diastolic_bp,glucose_level,bmi,smoker_status,heart_rate,disease_outcome
0,1,2023-01-29 00:00:00.000000,57,113.063416,84.069561,117.475210,25.085796,no,62.719587,0
1,1,2023-01-31 07:33:55.507789,57,121.598849,89.672279,85.120875,24.120608,no,76.314434,0
2,1,2023-02-02 00:15:11.379377,57,126.623222,87.619685,,24.819332,no,62.427785,0
3,1,2023-02-04 09:37:12.589164,57,136.999366,89.199774,118.755648,25.039598,no,61.612981,0
4,1,2023-02-04 20:56:52.838198,57,127.546919,92.644673,98.882007,24.895024,no,77.649615,0
...,...,...,...,...,...,...,...,...,...,...
7321,150,2023-03-18 09:08:49.029823,54,115.038254,79.241741,84.586944,29.968156,no,73.599447,0
7322,150,2023-03-20 14:38:22.129593,54,116.389186,70.464818,91.476621,29.519510,no,64.162701,0
7323,150,2023-03-23 09:26:04.210673,54,123.419606,88.213054,96.985434,29.786678,no,71.641423,0
7324,150,2023-03-27 14:17:19.255961,54,,69.539940,85.670800,29.188655,no,72.781243,0


## 3. Data Preparation

Implement `prepare_data_part1` to select features, split data, and handle missing values.

In [5]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    """
    Build train/test splits for modeling and fill in missing feature values.

    The features are age, systolic_bp, diastolic_bp, glucose_level and bmi;
    the target is disease_outcome.

    Args:
        df: Input DataFrame containing the feature and target columns
        test_size: Proportion of rows held out for testing
        random_state: Random seed passed to the splitter for reproducibility

    Returns:
        X_train, X_test, y_train, y_test
        X_train / X_test are the imputed feature matrices as returned by
        SimpleImputer; y_train / y_test are single-column DataFrames.
    """
    feature_columns = ["age", "systolic_bp", "diastolic_bp", "glucose_level", "bmi"]
    target_columns = ["disease_outcome"]

    features = df[feature_columns]
    target = df[target_columns]

    features_train, features_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    # Column means are learned from the training rows only and then reused
    # for the test rows, so no test-set statistics leak into the imputation.
    mean_imputer = SimpleImputer(strategy="mean")
    X_train = mean_imputer.fit_transform(features_train)
    X_test = mean_imputer.transform(features_test)

    return X_train, X_test, y_train, y_test

In [6]:
# Split and impute with the function defaults (test_size=0.2, random_state=42).
X_train, X_test, y_train, y_test = prepare_data_part1(data)

## 4. Model Training

Implement `train_logistic_regression`.

In [7]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.

    Args:
        X_train: Training features
        y_train: Training target. May be 1-D, or a 2-D structure with a
            single column (e.g. a one-column DataFrame); a single column is
            flattened to 1-D before fitting.

    Returns:
        Trained logistic regression model
    """
    # scikit-learn expects y with shape (n_samples,). A one-column DataFrame
    # or an (n, 1) array makes every fit emit a DataConversionWarning
    # ("y = column_or_1d(y, warn=True)"), so flatten exactly that case here.
    # Any other shape is left as it is so scikit-learn can validate it.
    target = np.asarray(y_train)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    model = LogisticRegression()
    model.fit(X_train, target)
    return model

In [8]:
# Fit the Part 1 model on the training split produced by the preparation cell.
model1 = train_logistic_regression(X_train, y_train)

  y = column_or_1d(y, warn=True)


## 5. Model Evaluation

Implement `calculate_evaluation_metrics` to assess the model's performance.

In [9]:
def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Score a fitted classifier on held-out data.

    Args:
        model: Trained model exposing predict and predict_proba
        X_test: Test features
        y_test: Test target

    Returns:
        Dictionary with the keys accuracy, precision, recall, f1, auc and
        confusion_matrix (in that order)
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics computed from hard label predictions, in reporting order.
    label_scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    results = {
        name: scorer(y_test, predicted_labels)
        for name, scorer in label_scorers.items()
    }

    # AUC is the only metric that needs scores rather than labels.
    results["auc"] = roc_auc_score(y_test, positive_scores)
    results["confusion_matrix"] = confusion_matrix(y_test, predicted_labels)
    return results

## 6. Save Results

Save the calculated metrics to a text file.

In [10]:
# Create results directory and save metrics
# YOUR CODE HERE
# 1. Create 'results' directory if it doesn't exist
# 2. Format metrics as strings
# 3. Write metrics to 'results/results_part1.txt'

def save_metrics_to_file(metrics, file_path="results/results_part1.txt"):
    """
    Write evaluation metrics to a text file, one entry per metric.

    Args:
        metrics: Dictionary mapping metric name to value. The
            "confusion_matrix" entry is written one row per line; every
            other value is formatted as a float with 4 decimal places.
        file_path: Destination file. Missing parent directories are created.

    Returns:
        None
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises, so only
    # create the directory when the path actually has one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Format exactly the metrics that were passed in; nothing is recomputed
    # from notebook globals.
    lines = []
    for name, value in metrics.items():
        if name == "confusion_matrix":
            # One line per matrix row, whatever the number of classes.
            rows = "\n".join(str(row) for row in value)
            lines.append(f"{name}:\n{rows}")
        else:
            lines.append(f"{name}: {value:.4f}")

    # Write to the requested destination rather than a fixed path.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

## 7. Main Execution

Run the complete workflow.

In [13]:
# Main execution: runs the complete Part 1 workflow end to end.
# NOTE(review): this cell calls interpret_results(), whose definition lives in
# section 8, *after* this cell. On a fresh kernel with "Run All" the call
# raises NameError unless that definition has been executed first.
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data (default test_size and random_state)
    X_train, X_test, y_train, y_test = prepare_data_part1(df)
    
    # 3. Train model
    model = train_logistic_regression(X_train, y_train)
    
    # 4. Evaluate model on the held-out split
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 5. Print the scalar metrics; the confusion matrix is skipped because
    #    the .4f float format only applies to single numbers.
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 6. Save results (no path given, so the function's default file_path is used)
    save_metrics_to_file(metrics)
    
    # 7. Interpret results (requires interpret_results to be defined already)
    interpretation = interpret_results(metrics)
    print("\nResults Interpretation:")
    for key, value in interpretation.items():
        print(f"{key}: {value}")

accuracy: 0.9168
precision: 0.6615
recall: 0.3007
f1: 0.4135
auc: 0.9084

Results Interpretation:
best_metric: accuracy
worst_metric: recall
imbalance_impact_score: 0.5597


  y = column_or_1d(y, warn=True)


## 8. Interpret Results

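Implement a function to analyze the model's performance on imbalanced data. Note that the main execution cell in Section 7 calls `interpret_results`, so this cell must be run before that one.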

In [12]:
def interpret_results(metrics):
    """
    Summarize how a model behaved on imbalanced data.

    Args:
        metrics: Dictionary containing evaluation metrics; the
            'confusion_matrix' entry, if present, is ignored when ranking

    Returns:
        Dictionary with keys:
        - 'best_metric': Name of the metric with the highest value
        - 'worst_metric': Name of the metric with the lowest value
        - 'imbalance_impact_score': Absolute gap between accuracy and the
          mean of recall and F1, rounded to 4 decimals
          (0=no impact, 1=severe impact)
    """
    # Only scalar scores can be ranked against each other.
    scalar_scores = {
        name: score
        for name, score in metrics.items()
        if name != "confusion_matrix"
    }

    # On ties, max/min keep the first name in dictionary order.
    top_name = max(scalar_scores, key=scalar_scores.get)
    bottom_name = min(scalar_scores, key=scalar_scores.get)

    # Accuracy is compared with the imbalance-sensitive metrics; a missing
    # metric counts as 0.
    sensitive_mean = (metrics.get("recall", 0) + metrics.get("f1", 0)) / 2
    gap = abs(metrics.get("accuracy", 0) - sensitive_mean)

    return {
        'best_metric': top_name,
        'worst_metric': bottom_name,
        'imbalance_impact_score': round(gap, 4),
    }