# Install necessary packages

In [None]:
%pip install -r requirements.txt

# Part 1: Introduction to Classification & Evaluation

**Objective:** Load the synthetic health data, train a Logistic Regression model, and evaluate its performance.

## 1. Setup

Import necessary libraries.

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

## 2. Data Loading

Implement the `load_data` function to read the dataset.

In [2]:
def load_data(file_path):
    """Read the health dataset from a CSV file and print a short summary.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame holding the contents of the file.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        KeyError: If the file has no ``patient_id`` or ``disease_outcome``
            column (both are read for the summary).
    """
    # Guard clause: fail early with a hint about how to create the file.
    file_is_present = os.path.exists(file_path)
    if not file_is_present:
        raise FileNotFoundError(f"The file {file_path} does not exist. Please run generate_data.py first.")

    records = pd.read_csv(file_path)

    # Summary figures, gathered up front and reported below.
    n_rows, n_cols = records.shape
    column_list = ', '.join(records.columns)
    patient_count = records['patient_id'].nunique()
    outcome_share = records['disease_outcome'].value_counts(normalize=True).rename('proportion')

    print(f"Loaded {n_rows} records with {n_cols} features.")
    print(f"Features: {column_list}")
    print(f"Number of unique patients: {patient_count}")
    print(f"Target distribution:\n{outcome_share}")

    return records

## 3. Data Preparation

Implement `prepare_data_part1` to select features, split data, and handle missing values.

In [4]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    """Select features, make a stratified train/test split, and mean-impute gaps.

    Args:
        df: DataFrame containing the columns ``age``, ``systolic_bp``,
            ``diastolic_bp``, ``glucose_level``, ``bmi`` and ``disease_outcome``.
        test_size: Passed to ``train_test_split`` as the test-set size.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The two feature frames
        are imputed DataFrames that keep the column names and row index of
        their respective split.
    """
    # select relevant features
    selected_features = ['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi']
    X = df[selected_features]

    # select target variable
    y = df['disease_outcome']

    # split data into training and testing sets (stratified on the target)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Testing set: {X_test.shape[0]} samples")

    # Count the gaps now, directly on the split frames. Looking the rows up
    # again in `df` with .iloc after imputation treats index *labels* as
    # positions, which is only right when `df` has a default RangeIndex.
    train_missing_before = X_train.isna().sum().sum()
    test_missing_before = X_test.isna().sum().sum()

    # handle missing values using SimpleImputer; the means are learned from
    # the training split only and then reused for the test split
    imputer = SimpleImputer(strategy='mean')
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )

    X_test = pd.DataFrame(
        imputer.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )

    print("Missing values before imputation:")
    print(f"  Training set: {train_missing_before}")
    print(f"  Testing set: {test_missing_before}")
    print("Missing values after imputation:")
    print(f"  Training set: {X_train.isna().sum().sum()}")
    print(f"  Testing set: {X_test.isna().sum().sum()}")

    return X_train, X_test, y_train, y_test

## 4. Model Training

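Implement `train_logistic_regression` to fit a class-weighted Logistic Regression model on the training data and print its coefficients and intercept.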

In [5]:
def train_logistic_regression(X_train, y_train):
    """Fit a logistic regression classifier and print its learned parameters.

    Args:
        X_train: Training features; must expose ``.columns`` (the names are
            paired with the coefficients when printing).
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # Configure and fit in a single expression; fit() hands back the estimator.
    classifier = LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        random_state=42,
        max_iter=1000,
    ).fit(X_train, y_train)

    print("Logistic Regression Model Trained Successfully")
    print("\nModel Coefficients:")

    # First row of coef_ is reported, one weight per feature column.
    weights = classifier.coef_[0]
    for column_name, weight in zip(X_train.columns, weights):
        print(f"{column_name}: {weight:.4f}")

    bias = classifier.intercept_[0]
    print(f"\nIntercept: {bias:.4f}")

    return classifier

## 5. Model Evaluation

Implement `calculate_evaluation_metrics` to assess the model's performance.

In [6]:
def calculate_evaluation_metrics(model, X_test, y_test):
    """Score a fitted classifier on held-out data and print the results.

    Args:
        model: Fitted estimator providing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``auc`` and ``confusion_matrix``.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class probability.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Build the result in one go; entries are evaluated top to bottom.
    metrics = {
        'accuracy': accuracy_score(y_test, predicted_labels),
        'precision': precision_score(y_test, predicted_labels),
        'recall': recall_score(y_test, predicted_labels),
        'f1_score': f1_score(y_test, predicted_labels),
        'auc': roc_auc_score(y_test, positive_scores),
        'confusion_matrix': confusion_matrix(y_test, predicted_labels),
    }

    # Rows are true classes, columns are predicted classes.
    cm = metrics['confusion_matrix']
    print("\nConfusion Matrix:")
    print(f"TN: {cm[0][0]}, FP: {cm[0][1]}")
    print(f"FN: {cm[1][0]}, TP: {cm[1][1]}")

    print("\nModel Evaluation Metrics:")
    for name, score in metrics.items():
        # The matrix was already shown above and is not a scalar.
        if name == 'confusion_matrix':
            continue
        print(f"{name}: {score:.4f}")

    return metrics

## 6. Save Results

Save the calculated metrics to a text file.

In [7]:
# Create results directory and save metrics
def save_results(metrics, file_path='results/results_part1.txt'):
    """Write the evaluation metrics to a plain-text report.

    Args:
        metrics: Mapping with scalar entries ``accuracy``, ``precision``,
            ``recall``, ``f1_score``, ``auc`` and a 2x2 ``confusion_matrix``
            indexable as ``[row][col]``.
        file_path: Destination file. Missing parent directories are created;
            a bare file name is written to the current working directory.

    Returns:
        None. The report is written to ``file_path`` (overwriting any
        existing file) and a confirmation line is printed.
    """
    # Only create a directory when the path actually names one:
    # os.path.dirname('report.txt') is '' and os.makedirs('') raises
    # FileNotFoundError.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # format metrics as strings
    lines = [
        "LOGISTIC REGRESSION MODEL EVALUATION",
        "=" * 40,
        f"Accuracy:  {metrics['accuracy']:.4f}",
        f"Precision: {metrics['precision']:.4f}",
        f"Recall:    {metrics['recall']:.4f}",
        f"F1 Score:  {metrics['f1_score']:.4f}",
        f"AUC:       {metrics['auc']:.4f}",
        "\nCONFUSION MATRIX",
        "=" * 40,
        f"TN: {metrics['confusion_matrix'][0][0]}, FP: {metrics['confusion_matrix'][0][1]}",
        f"FN: {metrics['confusion_matrix'][1][0]}, TP: {metrics['confusion_matrix'][1][1]}",
        "\nEvaluation completed on: " + pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
    ]

    # write metrics to file
    with open(file_path, 'w') as f:
        f.write('\n'.join(lines))

    print(f"\nResults saved to {file_path}")

## 7. Main Execution

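Run the complete workflow. This cell calls `interpret_results`, which is defined in section 8 below, so run that cell first; otherwise a fresh kernel raises `NameError`.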

In [11]:
# End-to-end pipeline: load -> prepare -> train -> evaluate -> save -> interpret.
# NOTE(review): interpret_results is defined in a later cell (section 8), so on
# a fresh kernel this cell raises NameError unless that cell has been run first.
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data (default test_size and random_state)
    X_train, X_test, y_train, y_test = prepare_data_part1(df)
    
    # 3. Train model
    model = train_logistic_regression(X_train, y_train)
    
    # 4. Evaluate model on the held-out test split
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 5. Save results (to the default path of save_results)
    save_results(metrics)
    
    # 6. Interpret results and print each entry of the returned dict
    interpretation = interpret_results(metrics)
    print("\nResults Interpretation:")
    for key, value in interpretation.items():
        print(f"{key}: {value}")

Loaded 7326 records with 10 features.
Features: patient_id, timestamp, age, systolic_bp, diastolic_bp, glucose_level, bmi, smoker_status, heart_rate, disease_outcome
Number of unique patients: 150
Target distribution:
disease_outcome
0    0.902812
1    0.097188
Name: proportion, dtype: float64
Training set: 5860 samples
Testing set: 1466 samples
Missing values before imputation:
  Training set: 600
  Testing set: 135
Missing values after imputation:
  Training set: 0
  Testing set: 0
Logistic Regression Model Trained Successfully

Model Coefficients:
age: 0.0105
systolic_bp: 0.0194
diastolic_bp: 0.1538
glucose_level: 0.0525
bmi: -0.1352

Intercept: -18.1845

Confusion Matrix:
TN: 1073, FP: 251
FN: 27, TP: 115

Model Evaluation Metrics:
accuracy: 0.8104
precision: 0.3142
recall: 0.8099
f1_score: 0.4528
auc: 0.8853

Results saved to results/results_part1.txt

Results Interpretation:
best_metric: auc
worst_metric: precision
imbalance_impact_score: 0.4413
imbalance_assessment: Moderate imp

## 8. Interpret Results

Implement a function to analyze the model performance on imbalanced data.

In [9]:
def interpret_results(metrics):
    """Summarise evaluation metrics with a focus on class-imbalance effects.

    Args:
        metrics: Mapping containing ``accuracy``, ``precision``, ``recall``,
            ``f1_score`` and ``auc`` (other keys are ignored).

    Returns:
        dict with, in order: ``best_metric`` and ``worst_metric`` (names of
        the highest and lowest scalar metric), ``imbalance_impact_score``
        (``1 - min(recall, f1) / accuracy`` clamped to [0, 1] and rounded to
        4 places; 1.0 when accuracy is not positive),
        ``imbalance_assessment`` and ``suggestion`` (explanatory text).
    """
    # Scalar metrics only; the confusion matrix cannot be ranked.
    scalar_names = ('accuracy', 'precision', 'recall', 'f1_score', 'auc')
    scores = {name: metrics[name] for name in scalar_names}

    top_name = max(scores, key=scores.get)
    bottom_name = min(scores, key=scores.get)

    # Impact score: how far the weaker of recall/F1 falls below accuracy.
    weaker_of_two = min(metrics['recall'], metrics['f1_score'])
    if not metrics['accuracy'] > 0:
        # Nothing to divide by, so report the maximum impact.
        impact = 1.0
    else:
        raw_impact = 1 - (weaker_of_two / metrics['accuracy'])
        impact = min(1.0, max(0.0, raw_impact))  # clamp to [0, 1]

    # Verbal rating for the score.
    if impact < 0.2:
        assessment = "Low impact from class imbalance"
    elif impact < 0.5:
        assessment = "Moderate impact from class imbalance"
    else:
        assessment = "Severe impact from class imbalance"

    # Recall takes priority over precision when choosing advice.
    if metrics['recall'] < 0.7:
        advice = "Consider techniques to improve recall, such as adjusting the classification threshold or using more advanced resampling methods"
    elif metrics['precision'] < 0.7:
        advice = "Consider techniques to improve precision, such as feature engineering or ensemble methods"
    else:
        advice = "Model performance is good across metrics"

    return {
        'best_metric': top_name,
        'worst_metric': bottom_name,
        'imbalance_impact_score': round(impact, 4),
        'imbalance_assessment': assessment,
        'suggestion': advice,
    }