# Install necessary packages

In [None]:
%pip install -r requirements.txt

# Part 3: Practical Data Preparation

**Objective:** Handle categorical features using One-Hot Encoding and address class imbalance using SMOTE.

## 1. Setup

Import necessary libraries.

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

## 2. Data Loading

Load the dataset.

In [6]:
def load_data(file_path):
    """
    Read the synthetic health dataset from disk.

    Args:
        file_path: Location of the CSV file to read

    Returns:
        DataFrame holding the parsed CSV contents
    """
    # Parsing is delegated entirely to pandas; no cleaning happens here.
    return pd.read_csv(file_path)

In [9]:
# Load through the load_data helper defined above so this cell and the main
# workflow read the dataset the same way.
df = load_data("data/synthetic_health_data.csv")

## 3. Categorical Feature Encoding

Implement `encode_categorical_features` using `OneHotEncoder`.

In [12]:
def encode_categorical_features(df, column_to_encode='smoker_status'):
    """
    Swap one categorical column for its one-hot indicator columns.

    Args:
        df: Input DataFrame (not modified; a new frame is returned)
        column_to_encode: Name of the categorical column to encode

    Returns:
        DataFrame in which the categorical column has been removed and the
        one-hot indicator columns have been appended on the right
    """
    # The encoder needs 2D input, hence the double brackets.
    categorical_frame = df[[column_to_encode]]

    # drop='first' leaves out the indicator of the first category to avoid
    # multicollinearity; sparse_output=False requests a dense array.
    one_hot = OneHotEncoder(sparse_output=False, drop='first')
    indicator_values = one_hot.fit_transform(categorical_frame)

    # Reuse the input index so the indicator rows line up when concatenated.
    indicator_frame = pd.DataFrame(
        indicator_values,
        columns=one_hot.get_feature_names_out([column_to_encode]),
        index=df.index,
    )

    remaining_columns = df.drop(columns=[column_to_encode])
    return pd.concat([remaining_columns, indicator_frame], axis=1)

In [13]:
# One-hot encode the smoker status column, then preview the first five rows.
df_encoded = encode_categorical_features(df, 'smoker_status')
print(df_encoded.head(5))

   patient_id                   timestamp  age  systolic_bp  diastolic_bp  \
0           1  2023-01-29 00:00:00.000000   57   113.063416     84.069561   
1           1  2023-01-31 07:33:55.507789   57   121.598849     89.672279   
2           1  2023-02-02 00:15:11.379377   57   126.623222     87.619685   
3           1  2023-02-04 09:37:12.589164   57   136.999366     89.199774   
4           1  2023-02-04 20:56:52.838198   57   127.546919     92.644673   

   glucose_level        bmi  heart_rate  disease_outcome  smoker_status_no  \
0     117.475210  25.085796   62.719587                0               1.0   
1      85.120875  24.120608   76.314434                0               1.0   
2            NaN  24.819332   62.427785                0               1.0   
3     118.755648  25.039598   61.612981                0               1.0   
4      98.882007  24.895024   77.649615                0               1.0   

   smoker_status_yes  
0                0.0  
1                0.0  

## 4. Data Preparation

Implement `prepare_data_part3` to handle the train/test split correctly.

In [16]:
def prepare_data_part3(df, test_size=0.2, random_state=42,
                       categorical_column='smoker_status',
                       target='disease_outcome'):
    """
    Prepare data with categorical encoding.

    One-hot encodes ``categorical_column``, uses every remaining numeric
    column as a feature, splits into train and test sets, and mean-imputes
    missing values with statistics fitted on the training split only.

    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        categorical_column: Name of the categorical column to one-hot encode
        target: Name of the column holding the label to predict

    Returns:
        X_train, X_test, y_train, y_test. The feature sets are whatever
        SimpleImputer returns (NumPy arrays by default); the targets are the
        corresponding slices of ``df[target]``.

    Raises:
        KeyError: If ``categorical_column`` or ``target`` is not a column of df
    """
    # Fail early with a readable message instead of deep inside pandas.
    missing = [name for name in (categorical_column, target) if name not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    # 1. Encode the categorical feature
    df_encoded = encode_categorical_features(df, column_to_encode=categorical_column)

    # 2. Select features and target.
    # Drop the target, then keep only numeric columns for modeling.
    # NOTE(review): this keeps every numeric column, which appears to include
    # identifier-like columns such as patient_id -- confirm that is intended.
    X = df_encoded.drop(columns=[target]).select_dtypes(include='number')
    y = df_encoded[target]

    # 3. Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 4. Handle missing values. The imputer is fitted on the training split
    # only, so test-set statistics do not leak into the fill values.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    return X_train, X_test, y_train, y_test

In [17]:
# Build the train/test splits and report the size of the training features.
X_train, X_test, y_train, y_test = prepare_data_part3(df)
print(
    "✅ Data prepared with categorical encoding.",
    f"X_train shape: {X_train.shape}",
    sep="\n",
)

✅ Data prepared with categorical encoding.
X_train shape: (5860, 9)


## 5. Handling Imbalanced Data

Implement `apply_smote` to oversample the minority class.

In [31]:
def apply_smote(X_train, y_train, random_state=42):
    """
    Oversample the training data with SMOTE.

    Args:
        X_train: Training features
        y_train: Training target
        random_state: Random seed for reproducibility

    Returns:
        The resampled training features and the resampled training target
    """
    sampler = SMOTE(random_state=random_state)

    # Only the data passed in is resampled; callers are expected to pass the
    # training split and leave the test split untouched.
    balanced_features, balanced_target = sampler.fit_resample(X_train, y_train)
    return balanced_features, balanced_target

In [32]:
# Resample the training split and show the class counts before and after.
X_train_balanced, y_train_balanced = apply_smote(X_train, y_train)

print(
    "✅ SMOTE applied.",
    f"Before SMOTE: {y_train.value_counts().to_dict()}",
    f"After SMOTE:  {pd.Series(y_train_balanced).value_counts().to_dict()}",
    sep="\n",
)

✅ SMOTE applied.
Before SMOTE: {0: 5291, 1: 569}
After SMOTE:  {0: 5291, 1: 5291}


## 6. Model Training and Evaluation

Train a model on the SMOTE-resampled data and evaluate it.

In [20]:
def train_logistic_regression(X_train, y_train):
    """
    Fit a logistic regression classifier.

    Args:
        X_train: Training features
        y_train: Training target

    Returns:
        The fitted LogisticRegression estimator
    """
    classifier = LogisticRegression(max_iter=1000, solver='lbfgs')
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return classifier.fit(X_train, y_train)

def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Score a fitted classifier on held-out data.

    Args:
        model: Trained model exposing predict and predict_proba
        X_test: Test features
        y_test: Test target

    Returns:
        Dictionary with the keys accuracy, precision, recall, f1_score, auc,
        and confusion_matrix
    """
    predicted_labels = model.predict(X_test)
    # AUC is computed from the scores in the second predict_proba column.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # zero_division=0 makes precision/recall/F1 report 0 rather than warn
    # when their denominator is empty.
    return {
        "accuracy": accuracy_score(y_test, predicted_labels),
        "precision": precision_score(y_test, predicted_labels, zero_division=0),
        "recall": recall_score(y_test, predicted_labels, zero_division=0),
        "f1_score": f1_score(y_test, predicted_labels, zero_division=0),
        "auc": roc_auc_score(y_test, positive_scores),
        "confusion_matrix": confusion_matrix(y_test, predicted_labels),
    }

In [21]:
# Balance the training split, fit the classifier on the balanced data, and
# score it on the original (unresampled) test split.
X_train_bal, y_train_bal = apply_smote(X_train, y_train)
logreg_model = train_logistic_regression(X_train_bal, y_train_bal)
metrics = calculate_evaluation_metrics(logreg_model, X_test, y_test)

# The confusion matrix is an array and gets its own multi-line layout;
# every other metric is printed to four decimals.
print("✅ Evaluation on original test set (after SMOTE training):")
for key, value in metrics.items():
    if key == "confusion_matrix":
        print(f"{key}:\n{value}\n")
    else:
        print(f"{key}: {value:.4f}")

✅ Evaluation on original test set (after SMOTE training):
accuracy: 0.8547
precision: 0.3885
recall: 0.8531
f1_score: 0.5339
auc: 0.9267
confusion_matrix:
[[1131  192]
 [  21  122]]



  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


## 7. Save Results

Save the evaluation metrics to a text file.

In [22]:
def save_results_part3(metrics, filename="results/results_part3.txt"):
    """
    Save evaluation metrics from the SMOTE-trained model to a text file.

    Scalar metrics are written as ``name: value`` with four decimals; the
    ``confusion_matrix`` entry is written as its string form on the lines
    after its name. An existing file at ``filename`` is overwritten.

    Args:
        metrics: Dictionary of evaluation metrics
        filename: Path to output file. Missing parent directories are
            created; a bare file name writes to the current directory.
    """
    # 1. Create the parent directory if there is one. os.path.dirname()
    # returns '' for a bare file name, and os.makedirs('') would raise
    # FileNotFoundError, so that case is skipped.
    output_dir = os.path.dirname(filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # 2. Format metrics
    lines = [
        f"{key}:\n{value}\n" if key == "confusion_matrix" else f"{key}: {value:.4f}"
        for key, value in metrics.items()
    ]

    # 3. Write to file
    with open(filename, "w") as f:
        f.write("\n".join(lines))

    print(f"✅ Metrics saved to {filename}")

In [23]:

# Persist the metrics from the evaluation cell to the default results file.
save_results_part3(metrics=metrics)

✅ Metrics saved to results/results_part3.txt


## 8. Main Execution

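Run the complete workflow. This cell calls `compare_models`, which is defined in Section 9 below, so run that cell first; otherwise the comparison step fails with a `NameError` on a fresh kernel.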

In [40]:
# Main execution: run the Part 3 workflow end to end, then compare its
# metrics with the ones saved by Part 1.
# NOTE(review): compare_models is defined in a later cell ("9. Compare
# Results"); on a fresh kernel that cell has to run before this one.
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)

    # 2. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part3(df)

    # 3. Apply SMOTE (training split only; the test split stays untouched)
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)

    # 4. Train logistic regression
    model = train_logistic_regression(X_train_resampled, y_train_resampled)

    # 5. Evaluate on test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)

    # 6. Print metrics
    print("✅ Evaluation on original test set (after SMOTE training):")
    for k, v in metrics.items():
        print(f"{k}:\n{v}\n" if k == "confusion_matrix" else f"{k}: {v:.4f}")

    # 7. Save Part 3 results
    save_results_part3(metrics)

    # 8. Load Part 1 results (plain text). Only "name: number" lines are kept;
    # the confusion matrix header and anything non-numeric are skipped.
    part1_metrics = {}
    try:
        with open('results/results_part1.txt', 'r') as f:
            for line in f:
                if ':' in line and 'confusion_matrix' not in line:
                    key, value = line.strip().split(':', 1)
                    try:
                        part1_metrics[key.strip()] = float(value.strip())
                    except ValueError:
                        continue
    except FileNotFoundError:
        print("❗ Part 1 results not found. Please run part1_introduction.ipynb first.")
    else:
        # 9. Compare Part 1 vs Part 3. This runs outside the try block so that
        # only a missing Part 1 file triggers the message above.
        comparison = compare_models(part1_metrics, metrics)
        print("\n📊 Model Comparison (Improvement % from Part 1 to Part 3):")
        for metric, improvement in comparison.items():
            # compare_models reports the string "N/A" for metrics it cannot
            # compare; a float format spec would raise ValueError on it.
            if isinstance(improvement, str):
                print(f"{metric}: {improvement}")
            else:
                print(f"{metric}: {improvement:.2f}%")


✅ Evaluation on original test set (after SMOTE training):
accuracy: 0.8547
precision: 0.3885
recall: 0.8531
f1_score: 0.5339
auc: 0.9267
confusion_matrix:
[[1131  192]
 [  21  122]]

✅ Metrics saved to results/results_part3.txt

📊 Model Comparison (Improvement % from Part 1 to Part 3):
accuracy: -6.77%
precision: -41.26%
recall: 183.72%
f1_score: 29.12%
auc: 2.01%


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


## 9. Compare Results

Implement a function that compares the Part 1 model (trained on the original, imbalanced data) with the Part 3 model (trained on SMOTE-balanced data).

In [39]:
def compare_models(part1_metrics, part3_metrics):
    """
    Compute the relative change of each metric from Part 1 to Part 3.

    Args:
        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)
        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)

    Returns:
        Dictionary keyed by accuracy, precision, recall, f1_score and auc.
        Each value is the percentage change relative to Part 1, or the string
        "N/A" when the metric is missing from either input or the change
        cannot be computed.
    """
    def _percent_change(baseline, candidate):
        # A zero baseline has no finite relative change: report infinity for
        # any gain and 0.0 otherwise.
        if baseline == 0:
            return float('inf') if candidate > 0 else 0.0
        return ((candidate - baseline) / abs(baseline)) * 100

    comparison = {}

    # All of these are metrics where higher is better.
    for metric in ('accuracy', 'precision', 'recall', 'f1_score', 'auc'):
        baseline = part1_metrics.get(metric)
        candidate = part3_metrics.get(metric)

        if baseline is None or candidate is None:
            comparison[metric] = "N/A"
            continue

        try:
            comparison[metric] = _percent_change(baseline, candidate)
        except Exception:
            # Values that cannot be compared arithmetically are reported as
            # unavailable rather than aborting the whole comparison.
            comparison[metric] = "N/A"

    return comparison