# Install necessary packages

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

# Part 1: Introduction to Classification & Evaluation

**Objective:** Load the synthetic health data, train a Logistic Regression model, and evaluate its performance.

## 1. Setup

Import necessary libraries.

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

## 2. Data Loading

Implement the `load_data` function to read the dataset.

In [3]:
def load_data(file_path):
    """
    Read the synthetic health dataset into memory.

    Args:
        file_path: Location of the CSV file; handed directly to
            ``pandas.read_csv``.

    Returns:
        DataFrame with the parsed contents of the file
    """
    return pd.read_csv(file_path)

In [4]:
# Load the dataset and print its first rows as a quick sanity check of the columns.
df = load_data("data/synthetic_health_data.csv")
print(df.head())

   patient_id                   timestamp  age  systolic_bp  diastolic_bp  \
0           1  2023-01-29 00:00:00.000000   57   113.063416     84.069561   
1           1  2023-01-31 07:33:55.507789   57   121.598849     89.672279   
2           1  2023-02-02 00:15:11.379377   57   126.623222     87.619685   
3           1  2023-02-04 09:37:12.589164   57   136.999366     89.199774   
4           1  2023-02-04 20:56:52.838198   57   127.546919     92.644673   

   glucose_level        bmi smoker_status  heart_rate  disease_outcome  
0     117.475210  25.085796            no   62.719587                0  
1      85.120875  24.120608            no   76.314434                0  
2            NaN  24.819332            no   62.427785                0  
3     118.755648  25.039598            no   61.612981                0  
4      98.882007  24.895024            no   77.649615                0  


## 3. Data Preparation

Implement `prepare_data_part1` to select features, split data, and handle missing values.

In [5]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    """
    Build the train/test feature sets and targets used by the Part 1 model.

    Args:
        df: Input DataFrame; it must provide the five feature columns listed
            in the body as well as a ``disease_outcome`` column.
        test_size: Proportion of rows held out for testing
        random_state: Seed forwarded to ``train_test_split`` for reproducibility

    Returns:
        X_train, X_test, y_train, y_test. The feature sets are the output of
        ``SimpleImputer`` (mean-imputed); the targets are the matching slices
        of the ``disease_outcome`` column.
    """
    feature_columns = ['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi']
    target_column = 'disease_outcome'

    # Split before imputing so the held-out rows never influence the fill values.
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_columns],
        df[target_column],
        test_size=test_size,
        random_state=random_state,
    )

    # Column means are learned on the training split only and then reused
    # unchanged for the test split.
    mean_imputer = SimpleImputer(strategy='mean')
    X_train = mean_imputer.fit_transform(X_train)
    X_test = mean_imputer.transform(X_test)

    return X_train, X_test, y_train, y_test

In [6]:
# Reload the data, run the preparation step, and print the first five
# prepared training rows to confirm the missing values were filled.
df = load_data("data/synthetic_health_data.csv")
X_train, X_test, y_train, y_test = prepare_data_part1(df)
print(X_train[:5])

[[ 63.         118.90240003  67.32984448 112.35689001  26.61451981]
 [ 21.         124.9311563   95.4349238   76.18956617  28.22289055]
 [ 45.         116.31968843  84.72964664 113.45920276  27.06545629]
 [ 55.         112.96963737  81.77095534 107.21317944  34.37919599]
 [ 41.         109.63797157  78.96557225 103.63097698  27.24452612]]


## 4. Model Training

Implement `train_logistic_regression`.

In [8]:
def train_logistic_regression(X_train, y_train):
    """
    Fit a logistic regression classifier on the training split.

    Args:
        X_train: Training features
        y_train: Training target

    Returns:
        The fitted ``LogisticRegression`` estimator
    """
    # max_iter is set explicitly to give the lbfgs solver more room to
    # converge before it stops.
    classifier = LogisticRegression(solver='lbfgs', max_iter=1000)

    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return classifier.fit(X_train, y_train)
    

In [9]:
# Requires X_train and y_train from the data preparation cell above.
model = train_logistic_regression(X_train, y_train)

# Print the learned parameters: one coefficient per feature column, plus the intercept.
print("Model coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Model coefficients: [[ 0.01073173  0.03721063  0.18304174  0.06343447 -0.03578521]]
Intercept: [-28.7998851]


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept


## 5. Model Evaluation

Implement `calculate_evaluation_metrics` to assess the model's performance.

In [10]:
def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Score a fitted classifier on the held-out split.

    Args:
        model: Trained model exposing ``predict`` and ``predict_proba``
        X_test: Test features
        y_test: Test target

    Returns:
        Dictionary with the keys 'accuracy', 'precision', 'recall',
        'f1_score', 'auc' and 'confusion_matrix'
    """
    predicted_labels = model.predict(X_test)
    # AUC is computed from scores rather than hard labels; column 1 of
    # predict_proba holds the probability of the positive class.
    positive_scores = model.predict_proba(X_test)[:, 1]

    results = {"accuracy": accuracy_score(y_test, predicted_labels)}

    # zero_division=0 makes these ratio metrics evaluate to 0 when their
    # denominator is empty.
    ratio_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for metric_name, scorer in ratio_scorers:
        results[metric_name] = scorer(y_test, predicted_labels, zero_division=0)

    results["auc"] = roc_auc_score(y_test, positive_scores)
    results["confusion_matrix"] = confusion_matrix(y_test, predicted_labels)

    return results

In [11]:
# Evaluate the trained model on the test split and print every metric,
# each value on its own line below the metric name.
metrics = calculate_evaluation_metrics(model, X_test, y_test)

for key, value in metrics.items():
    print(f"{key}: \n{value}\n")

accuracy: 
0.9167803547066848

precision: 
0.6615384615384615

recall: 
0.3006993006993007

f1_score: 
0.41346153846153844

auc: 
0.9083561940704798

confusion_matrix: 
[[1301   22]
 [ 100   43]]



  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


## 6. Save Results

Save the calculated metrics to a text file.

In [12]:
def save_metrics_to_file(metrics, filename="results/results_part1.txt"):
    """
    Save evaluation metrics to a text file.

    Every entry except ``confusion_matrix`` is written as ``name: value``
    with four decimal places. The ``confusion_matrix`` entry is written
    under its name using its default string form.

    Args:
        metrics: Dictionary of evaluation metrics
        filename: Path to the output text file. Missing parent directories
            are created. A bare file name (no directory part) is written to
            the current working directory.
    """
    # 1. Create the output directory if it doesn't exist.
    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # 2. Format metrics as strings
    lines = []
    for key, value in metrics.items():
        if key == "confusion_matrix":
            # The matrix is not a scalar, so it cannot take the .4f format.
            lines.append(f"{key}:\n{value}\n")
        else:
            lines.append(f"{key}: {value:.4f}")

    # 3. Write to file
    with open(filename, "w") as f:
        f.write("\n".join(lines))

In [13]:
# Recompute the metrics and write them to the default results file
# (results/results_part1.txt).
metrics = calculate_evaluation_metrics(model, X_test, y_test)
save_metrics_to_file(metrics)

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


## 7. Main Execution

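Run the complete workflow. This cell calls `interpret_results`, which is defined in Section 8 below, so run that cell first; on a fresh kernel this cell otherwise fails with a `NameError`.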

In [16]:
# Main execution: runs the whole Part 1 pipeline end to end.
# NOTE: this cell calls interpret_results, which is defined in a later cell
# of this notebook; that definition must have been executed first or the
# call below raises NameError.
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part1(df)
    
    # 3. Train model
    model = train_logistic_regression(X_train, y_train)
    
    # 4. Evaluate model
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 5. Print metrics (the confusion matrix is skipped because it is not a
    #    scalar and cannot take the .4f format)
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 6. Save results to the default output file
    save_metrics_to_file(metrics)
    
    # 7. Interpret results
    interpretation = interpret_results(metrics)
    print("\nResults Interpretation:")
    for key, value in interpretation.items():
        print(f"{key}: {value}")

accuracy: 0.9168
precision: 0.6615
recall: 0.3007
f1_score: 0.4135
auc: 0.9084

Results Interpretation:
best_metric: accuracy
worst_metric: recall
imbalance_impact_score: 0.5597


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


## 8. Interpret Results

Implement a function to analyze the model performance on imbalanced data.

In [15]:
def interpret_results(metrics):
    """
    Summarize how the evaluation scores relate to class imbalance.

    Args:
        metrics: Dictionary containing evaluation metrics; a
            'confusion_matrix' entry, if present, is ignored.

    Returns:
        Dictionary with keys:
        - 'best_metric': Name of the metric with the highest value
        - 'worst_metric': Name of the metric with the lowest value
        - 'imbalance_impact_score': Mean absolute gap between accuracy and
          the f1/recall scores, rounded to 4 decimals and capped at 1.0
          (0=no impact, 1=severe impact)
    """
    # Only scalar scores take part in the comparison.
    scores = {name: val for name, val in metrics.items() if name != 'confusion_matrix'}

    def by_value(item):
        return item[1]

    # On ties, max/min keep the first entry in dictionary order.
    top_name, _ = max(scores.items(), key=by_value)
    bottom_name, _ = min(scores.items(), key=by_value)

    # Accuracy can look good on imbalanced data while f1 and recall, which
    # track the minority class, stay low. The further they fall below
    # accuracy, the larger the impact. Missing metrics count as 0.
    acc = scores.get("accuracy", 0)
    f1_gap = abs(acc - scores.get("f1_score", 0))
    recall_gap = abs(acc - scores.get("recall", 0))

    mean_gap = (f1_gap + recall_gap) / 2
    impact = min(1.0, round(mean_gap, 4))

    return {
        'best_metric': top_name,
        'worst_metric': bottom_name,
        'imbalance_impact_score': impact
    }


In [17]:
# Summarize the most recently computed metrics and print the resulting dictionary.
interpretation = interpret_results(metrics)
print(interpretation)

{'best_metric': 'accuracy', 'worst_metric': 'recall', 'imbalance_impact_score': 0.5597}
