In [1]:
%pip install -r requirements.txt

Collecting xgboost<3.0,>=1.5 (from -r requirements.txt (line 5))
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting imbalanced-learn<1.0,>=0.9 (from -r requirements.txt (line 6))
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn<1.0,>=0.9->-r requirements.txt (line 6))
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ----- ---------------------------------- 18.6/124.9 MB 91.5 MB/s eta 0:00:02
   ------------ -------------------------- 41.4/124.9 MB 100.0 MB/s eta 0:00:01
   -------------------- ------------------ 65.0/124.9 MB 104.0 MB/s eta 0:00:01
   --------------------------- ----------- 88.9/124.9 MB 105.4 MB/s eta 0:00:01
   --------------------------------- ---- 111.4/124.9 MB 105.6 MB/s eta 0:00:01
   ------------------


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## 1. Setup

In [32]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

## 2. Data Loading

In [41]:
def load_data(file_path):
    
    return pd.read_csv(file_path)
df = load_data('data/synthetic_health_data.csv')

## 3. Data Preparation

In [49]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    
    x = df[['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi']]
    y  = df['disease_outcome']
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(x)

    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

## 4. Model Training

In [50]:
def train_logistic_regression(X_train, y_train):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model

## 5. Model Evaluation

In [51]:
def calculate_evaluation_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    metric = dict()
    metric['accuracy']  = accuracy_score(y_test, y_pred)
    metric['precision'] = precision_score(y_test, y_pred)
    metric['recall'] = recall_score(y_test, y_pred)
    metric['f1'] = f1_score(y_test, y_pred)
    metric['auc'] = roc_auc_score(y_test, y_proba)
    metric['confusion_matrix'] = confusion_matrix(y_test, y_pred)
    return metric

In [52]:
X_train, X_test, y_train, y_test = prepare_data_part1(df)
model = train_logistic_regression(X_train, y_train)
metrics = calculate_evaluation_metrics(model, X_test, y_test)
metrics

{'accuracy': 0.9167803547066848,
 'precision': np.float64(0.6615384615384615),
 'recall': np.float64(0.3006993006993007),
 'f1': np.float64(0.41346153846153844),
 'auc': np.float64(0.9083614797900512),
 'confusion_matrix': array([[1301,   22],
        [ 100,   43]])}

## 6. Save Results

In [53]:
def save_metrics_to_file(metrics):
    os.makedirs('results', exist_ok=True)
    filename = 'results.txt'
    with open(filename, 'w') as f:
        for k, v in metrics.items():
            if k == 'confusion_matrix':
                f.write(f"{k}:\n{v}\n")
            else:
                f.write(f"{k}: {v:.4f}\n")

## 7. Interpret Results

In [54]:
def interpret_results(metrics):
    best = max({k: v for k, v in metrics.items() if k != 'confusion_matrix'}, key=metrics.get)
    worst = min({k: v for k, v in metrics.items() if k != 'confusion_matrix'}, key=metrics.get)

    
    imbalance = abs(metrics['accuracy'] - metrics['recall'] + metrics['accuracy'] - metrics['f1']) / 2
    imbalance = min(1.0, round(imbalance, 3)) 

    return {
        'best_metric': best,
        'worst_metric': worst,
        'imbalance_impact_score': imbalance
    }

## 8. Main Execution

In [55]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part1(df)
    
    # 3. Train model
    model = train_logistic_regression(X_train, y_train)
    
    # 4. Evaluate model
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 5. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 6. Save results
    save_metrics_to_file(metrics)
    
    # 7. Interpret results
    interpretation = interpret_results(metrics)
    print("\nResults Interpretation:")
    for key, value in interpretation.items():
        print(f"{key}: {value}")


accuracy: 0.9168
precision: 0.6615
recall: 0.3007
f1: 0.4135
auc: 0.9084

Results Interpretation:
best_metric: accuracy
worst_metric: recall
imbalance_impact_score: 0.56
