## Install necessary packages

In [1]:
%pip install -r requirements.txt

Collecting scikit-learn<2.0,>=1.0 (from -r requirements.txt (line 4))
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting xgboost<3.0,>=1.5 (from -r requirements.txt (line 5))
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting imbalanced-learn<1.0,>=0.9 (from -r requirements.txt (line 6))
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting matplotlib<4.0,>=3.5 (from -r requirements.txt (line 7))
  Downloading matplotlib-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting seaborn<1.0,>=0.11 (from -r requirements.txt (line 8))
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn<2.0,>=1.0->-r requirements.txt (line 4))
  Downloading scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scik

# Part 1: Introduction to Classification & Evaluation

**Objective:** Load the synthetic health data, train a Logistic Regression model, and evaluate its performance.

## 1. Setup

Import necessary libraries.

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

## 2. Data Loading

Implement the `load_data` function to read the dataset.

In [3]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    # YOUR CODE HERE
    # Load the CSV file using pandas

    df = pd.read_csv(file_path)
    
    return df  # Replace with actual implementation

## 3. Data Preparation

Implement `prepare_data_part1` to select features, split data, and handle missing values.

In [4]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    """
    Prepare data for modeling: select features, split into train/test sets, handle missing values.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # YOUR CODE HERE
    # 1. Select relevant features (age, systolic_bp, diastolic_bp, glucose_level, bmi)
    # 2. Select target variable (disease_outcome)
    # 3. Split data into training and testing sets
    # 4. Handle missing values using SimpleImputer

    # splitting data into train and test sets
    X = df[['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi']]
    Y = df[['disease_outcome']]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

    # handle missing data
    imputer = SimpleImputer(strategy = 'mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    
    # Placeholder return - replace with your implementation
    return X_train, X_test, y_train, y_test

## 4. Model Training

Implement `train_logistic_regression`.

In [5]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """
    # YOUR CODE HERE
    # Initialize and train a LogisticRegression model

    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    return model  # Replace with actual implementation

## 5. Model Evaluation

Implement `calculate_evaluation_metrics` to assess the model's performance.


In [6]:
def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """
    # YOUR CODE HERE
    # 1. Generate predictions
    # 2. Calculate metrics: accuracy, precision, recall, f1, auc
    # 3. Create confusion matrix
    # 4. Return metrics in a dictionary

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'confusion_matrix': conf_matrix
    }
    
    # Placeholder return - replace with your implementation
    return metrics

## 6. Save Results

Save the calculated metrics to a text file.

In [8]:
# Create results directory and save metrics
# YOUR CODE HERE
# 1. Create 'results' directory if it doesn't exist
# 2. Format metrics as strings
# 3. Write metrics to 'results/results_part1.txt'

# results_dir = 'results'
# if not os.path.exists(results_dir):
#     os.makedirs(results_dir)

# metrics_str = '\n'.join(f'{key}: {value:.4f}' for key, value in metrics.items())

# with open(os.path.join(results_dir, 'results_part1.txt'), 'w') as f:
#     f.write(metrics_str)

## 7. Main Execution

Run the complete workflow.

In [22]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part1(df)
    
    # 3. Train model
    model = train_logistic_regression(X_train, y_train)
    
    # 4. Evaluate model
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 5. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 6. Save results
    # (Your code for saving results)
    results_dir = 'results'
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    metrics_str = '\n'.join(
        f'{key}: {value:.4f}' if np.isscalar(value) else f'{key}: {np.array2string(value, precision=4)}'
        for key, value in metrics.items()
)
    with open(os.path.join(results_dir, 'results_part1.txt'), 'w') as f:
        f.write(metrics_str)
    
    # 7. Interpret results
    interpretation = interpret_results(metrics)
    print("\nResults Interpretation:")
    for key, value in interpretation.items():
        print(f"{key}: {value}")

accuracy: 0.9168
precision: 0.6615
recall: 0.3007
f1: 0.4135
auc: 0.6420

Results Interpretation:
best_metric: accuracy
worst_metric: recall
imbalance_impact_score: 0.6160810540073842


  y = column_or_1d(y, warn=True)


## 8. Interpret Results

Implement a function to analyze the model performance on imbalanced data.

In [None]:
def interpret_results(metrics):
    """
    Analyze model performance on imbalanced data.
    
    Args:
        metrics: Dictionary containing evaluation metrics
        
    Returns:
        Dictionary with keys:
        - 'best_metric': Name of the metric that performed best
        - 'worst_metric': Name of the metric that performed worst
        - 'imbalance_impact_score': A score from 0-1 indicating how much
          the class imbalance affected results (0=no impact, 1=severe impact)
    """
    # YOUR CODE HERE
    # 1. Determine which metric performed best and worst
    # 2. Calculate an imbalance impact score based on the difference
    #    between accuracy and more imbalance-sensitive metrics like F1 or recall
    # 3. Return the results as a dictionary

    for key, value in metrics.items():
        if isinstance(value, (list, np.ndarray)):
            metrics[key] = np.mean(value)  # Take the mean of the array if needed
    
    best_metric = None
    worst_metric = None

    # skip confusion matrix
    for key, value in metrics.items():
        if key != 'confusion_matrix' and value is not None:
            if best_metric is None or value > metrics[best_metric]:
                best_metric = key
            if worst_metric is None or value < metrics[worst_metric]:
                worst_metric = key

    # calculate imbalance impact score with accuracy and recall
    accuracy = metrics.get('accuracy', None)
    recall = metrics.get('recall', None)
    imbalance_impact_score = max(0, min(1, accuracy - recall))

    
    # Placeholder return - replace with your implementation
    return {
        'best_metric': best_metric,
        'worst_metric': worst_metric,
        'imbalance_impact_score': imbalance_impact_score
    }