## 1. Setup

In [21]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

## 2. Data Loading

In [22]:
def load_data(file_path):
    return pd.read_csv(file_path)

## 3. Categorical Feature Encoding

In [23]:
def encode_categorical_features(df, column_to_encode='smoker_status'):

    encoder = OneHotEncoder(sparse_output=False)  
    cat_column = df[[column_to_encode]]
    encoded = encoder.fit_transform(cat_column)

    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([column_to_encode]))

    df = df.drop(columns=[column_to_encode]).reset_index(drop=True)
    df_encoded = pd.concat([df, encoded_df], axis=1)

    return df_encoded

## 4. Data Preparation

In [24]:
def prepare_data_part3(df, test_size=0.2, random_state=42):
    df_cat = encode_categorical_features(df)
    if 'timestamp' in df_cat.columns:
        df_cat = df_cat.drop(columns=['timestamp'])

    y = df_cat['disease_outcome']
    x = df_cat.drop(columns=['disease_outcome'])

    imputer = SimpleImputer(strategy='mean') 
    x_imputed = imputer.fit_transform(x)

    X_train, X_test, y_train, y_test = train_test_split(
        x_imputed, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

## 5. Handling Imbalanced Data

In [25]:
def apply_smote(X_train, y_train, random_state=42):
    smote = SMOTE(random_state=random_state)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    return X_train, y_train

## 6. Model Training and Evaluation

In [26]:
def train_logistic_regression(X_train, y_train):
    log_model = LogisticRegression()
    log_model.fit(X_train, y_train)
    return log_model

def calculate_evaluation_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_proba),
        'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()  
    }



## 7. Save Results

In [27]:
def save_metrics_to_file(metrics, filename='results/results_part3.txt'):
    os.makedirs('results', exist_ok=True)
    with open(filename, 'w') as f:
        for k, v in metrics.items():
            if k == 'confusion_matrix':
                f.write(f"{k}:\n{np.array(v)}\n")
            else:
                f.write(f"{k}: {v:.4f}\n")

In [28]:
def compare_models(part1_metrics, part3_metrics):
    improvements = {}
    for metric in part1_metrics:
        if metric != 'confusion_matrix':
            old = part1_metrics[metric]
            new = part3_metrics[metric]
            if old != 0:
                improvements[metric] = round((new - old) / old * 100, 2)
            else:
                improvements[metric] = float('inf')
    return improvements

In [29]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data with categorical encoding
    X_train, X_test, y_train, y_test = prepare_data_part3(df)
    
    # 3. Apply SMOTE to balance the training data
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)
    
    # 4. Train model on resampled data
    model = train_logistic_regression(X_train_resampled, y_train_resampled)
    
    # 5. Evaluate on original test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 6. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 7. Save results
    save_metrics_to_file(metrics)
    
    # 8. Load Part 1 results for comparison
    import json
    try:
        with open('results/results_part1.txt', 'r') as f:
            part1_metrics = json.load(f)
        
        # 9. Compare models
        comparison = compare_models(part1_metrics, metrics)
        print("\nModel Comparison (improvement percentages):")
        for metric, improvement in comparison.items():
            print(f"{metric}: {improvement:.2f}%")
    except FileNotFoundError:
        print("Part 1 results not found. Run part1_introduction.ipynb first.")

accuracy: 0.7879
precision: 0.2961
recall: 0.8531
f1: 0.4396
auc: 0.8784

Model Comparison (improvement percentages):
accuracy: -14.06%
precision: -55.24%
recall: 183.72%
f1: 6.33%
auc: -3.30%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
