# Install necessary packages

In [None]:
%pip install -r requirements.txt

# Part 3: Practical Data Preparation

**Objective:** Handle categorical features using One-Hot Encoding and address class imbalance using SMOTE.

## 1. Setup

Import necessary libraries.

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

## 2. Data Loading

Load the dataset.

In [10]:
def load_data(file_path):
    """Load the health dataset from a CSV file and print a short summary.

    The summary lines for 'patient_id', 'smoker_status' and 'disease_outcome'
    are each printed only when that column is present, so a file lacking one
    of them can still be loaded and inspected.

    Args:
        file_path: Path to the CSV file.

    Returns:
        pandas.DataFrame: The contents of the CSV file, unmodified.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # check if the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist. Please run generate_data.py first.")

    # load the CSV file using pandas
    df = pd.read_csv(file_path)

    # display basic information about the loaded data
    print(f"Loaded {df.shape[0]} records with {df.shape[1]} features.")
    # str() keeps the join safe if a header was parsed as a non-string label
    print(f"Features: {', '.join(map(str, df.columns))}")
    if 'patient_id' in df.columns:
        print(f"Number of unique patients: {df['patient_id'].nunique()}")

    # display information about categorical variables
    if 'smoker_status' in df.columns:
        print(f"\nSmoker status categories: {df['smoker_status'].unique()}")
        print(f"Smoker status distribution:\n{df['smoker_status'].value_counts(normalize=True)}")

    # display target variable distribution
    if 'disease_outcome' in df.columns:
        print("\nTarget distribution ('disease_outcome'):")
        print(df['disease_outcome'].value_counts(normalize=True).rename('proportion'))
    else:
        print("\nWarning: Column 'disease_outcome' not found; target distribution not shown.")

    return df

## 3. Categorical Feature Encoding

Implement `encode_categorical_features` using `OneHotEncoder`.

In [11]:
def encode_categorical_features(df, column_to_encode='smoker_status'):
    """One-hot encode one categorical column, dropping the first category.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: Input pandas.DataFrame.
        column_to_encode: Name of the categorical column to encode.

    Returns:
        pandas.DataFrame: A copy of ``df`` in which ``column_to_encode`` is
        replaced by indicator columns named ``<column>_<category>``, one per
        category except the first (reference) category. If the column is
        absent, an unchanged copy of ``df`` is returned and a warning printed.
    """
    # make a copy of the dataframe to avoid modifying the original
    df_encoded = df.copy()

    # check if the column exists
    if column_to_encode not in df_encoded.columns:
        print(f"Warning: Column '{column_to_encode}' not found in the dataframe. Returning original dataframe.")
        return df_encoded

    # initialize and fit the OneHotEncoder
    encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop first to avoid multicollinearity
    encoded_data = encoder.fit_transform(df_encoded[[column_to_encode]])

    # categories learned by the encoder, including the dropped reference one
    all_categories = encoder.categories_[0]
    # Ask the encoder for the output column names instead of slicing
    # categories by hand, so the names always match the columns it produced.
    feature_names = [str(name) for name in encoder.get_feature_names_out([column_to_encode])]

    # create a DataFrame with the encoded data, aligned on the original index
    encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=df_encoded.index)

    # swap the original categorical column for the encoded columns
    df_encoded = pd.concat(
        [df_encoded.drop(columns=[column_to_encode]), encoded_df],
        axis=1
    )

    # print information about the encoding
    # str() is required here: categories may be numbers or NaN, and
    # str.join raises TypeError on any non-string item.
    original_categories = ', '.join(map(str, all_categories))
    encoded_columns = ', '.join(feature_names)
    print(f"Encoded '{column_to_encode}' with values [{original_categories}]")
    print(f"Created {len(feature_names)} columns: {encoded_columns}")
    print(f"First category '{all_categories[0]}' was dropped as the reference category")

    return df_encoded

## 4. Data Preparation

Implement `prepare_data_part3` to handle the train/test split correctly.

In [12]:
def prepare_data_part3(df, test_size=0.2, random_state=42):
    """Encode, select features, split, and mean-impute the dataset.

    Steps: one-hot encode 'smoker_status', keep six numeric health features
    plus the encoded smoker columns, make a stratified train/test split, then
    fill missing values with a mean imputer fitted on the training split only.

    Args:
        df: Raw pandas.DataFrame containing the feature columns listed below
            and the 'disease_outcome' target.
        test_size: Fraction of rows placed in the test split.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``. The feature frames are
        imputed and keep the row index of ``df``.
    """
    # encode categorical features using the encode_categorical_features function
    df_encoded = encode_categorical_features(df, column_to_encode='smoker_status')

    # identify the smoker status encoded columns
    smoker_columns = [col for col in df_encoded.columns if col.startswith('smoker_status_')]

    # select basic health features plus the encoded categorical columns
    selected_features = [
        'age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi', 'heart_rate'
    ] + smoker_columns

    X = df_encoded[selected_features]
    y = df_encoded['disease_outcome']

    # split data into training and testing sets
    # NOTE(review): this is a row-level split. If the data holds several rows
    # per patient, one patient can land in both splits; confirm whether a
    # group-aware split is wanted.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print(f"Training set: {X_train.shape[0]} samples with {X_train.shape[1]} features")
    print(f"Testing set: {X_test.shape[0]} samples with {X_test.shape[1]} features")
    print(f"Features: {', '.join(X.columns)}")

    # Count missing values now, on the split frames themselves. Looking the
    # rows up again afterwards with .iloc[index labels] is only correct when
    # the frame has a default 0..n-1 index.
    missing_train_before = X_train.isna().sum().sum()
    missing_test_before = X_test.isna().sum().sum()

    # handle missing values using SimpleImputer (fit on train only, so no
    # test-set statistics leak into the imputation values)
    imputer = SimpleImputer(strategy='mean')
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )

    X_test = pd.DataFrame(
        imputer.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )

    # report on missing values
    print("\nMissing values before imputation:")
    print(f"  Training set: {missing_train_before}")
    print(f"  Testing set: {missing_test_before}")
    print("Missing values after imputation:")
    print(f"  Training set: {X_train.isna().sum().sum()}")
    print(f"  Testing set: {X_test.isna().sum().sum()}")

    return X_train, X_test, y_train, y_test

## 5. Handling Imbalanced Data

Implement `apply_smote` to oversample the minority class.

In [13]:
def apply_smote(X_train, y_train, random_state=42):
    """Oversample the training data with SMOTE and report class counts.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed handed to the SMOTE sampler.

    Returns:
        tuple: ``(X_resampled, y_resampled)`` as returned by
        ``SMOTE.fit_resample``.
    """
    def _tally(labels):
        # map each distinct label to the number of times it occurs
        classes, sizes = np.unique(labels, return_counts=True)
        return dict(zip(classes, sizes))

    def _show(heading, tally):
        # print one line per class with its count and share of the total
        print(heading)
        total = sum(tally.values())
        for label, size in tally.items():
            print(f"  Class {label}: {size} samples ({size/total:.1%})")

    distribution_before = _tally(y_train)

    # resample with a seeded SMOTE instance
    oversampler = SMOTE(random_state=random_state)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    distribution_after = _tally(y_resampled)

    _show("Class distribution before SMOTE:", distribution_before)
    _show("\nClass distribution after SMOTE:", distribution_after)

    print(f"\nGenerated {len(X_resampled) - len(X_train)} synthetic samples")

    return X_resampled, y_resampled

## 6. Model Training and Evaluation

Train a model on the SMOTE-resampled data and evaluate it.

In [14]:
def train_logistic_regression(X_train, y_train):
    """Fit a logistic regression classifier and print its parameters.

    Args:
        X_train: Training features; must expose ``.columns`` for the
            per-feature coefficient printout.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    # liblinear solver, fixed seed, and a raised iteration cap
    classifier = LogisticRegression(
        solver='liblinear',
        random_state=42,
        max_iter=1000
    )
    classifier.fit(X_train, y_train)

    print("Logistic Regression Model Trained on SMOTE-Resampled Data")
    print("\nModel Coefficients:")
    # first row of coef_ paired with the feature names, in column order
    weights = classifier.coef_[0]
    for name, weight in zip(X_train.columns, weights):
        print(f"{name}: {weight:.4f}")

    bias = classifier.intercept_[0]
    print(f"\nIntercept: {bias:.4f}")

    return classifier

def calculate_evaluation_metrics(model, X_test, y_test):
    """Score a fitted classifier on the test set and print the results.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        dict: Keys 'accuracy', 'precision', 'recall', 'f1_score', 'auc'
        and 'confusion_matrix'.
    """
    predicted_labels = model.predict(X_test)
    # column 1 of predict_proba is used as the positive-class score for AUC
    positive_scores = model.predict_proba(X_test)[:, 1]

    metrics = {
        'accuracy': accuracy_score(y_test, predicted_labels),
        'precision': precision_score(y_test, predicted_labels),
        'recall': recall_score(y_test, predicted_labels),
        'f1_score': f1_score(y_test, predicted_labels),
        'auc': roc_auc_score(y_test, positive_scores),
        'confusion_matrix': confusion_matrix(y_test, predicted_labels),
    }

    # confusion matrix in a readable format
    matrix = metrics['confusion_matrix']
    print("\nConfusion Matrix:")
    print(f"TN: {matrix[0][0]}, FP: {matrix[0][1]}")
    print(f"FN: {matrix[1][0]}, TP: {matrix[1][1]}")

    # scalar metrics only; the matrix was printed above
    print("\nModel Evaluation Metrics (Testing on Original Imbalanced Test Data):")
    scalar_names = [name for name in metrics if name != 'confusion_matrix']
    for name in scalar_names:
        print(f"{name}: {metrics[name]:.4f}")

    return metrics

## 7. Save Results

Save the evaluation metrics to a text file.

In [15]:
def save_results(metrics, file_path='results/results_part3.txt'):
    """Write the evaluation metrics to a text report and a JSON file.

    The JSON file is written next to the text report, using the same name
    with the extension replaced by ``.json``.

    Args:
        metrics: Dict with keys 'accuracy', 'precision', 'recall',
            'f1_score', 'auc' and 'confusion_matrix' (a 2x2 array-like).
        file_path: Destination of the text report. Parent directories are
            created when the path has a directory part.
    """
    import json  # function-scope import: json is not in the notebook's import cell

    def _to_jsonable(value):
        # NumPy arrays and scalars are not reliably JSON serializable;
        # convert them to plain Python lists / numbers.
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        return value

    # create the target directory if needed; a bare file name has no
    # directory part, and os.makedirs('') would raise
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # format metrics as strings
    matrix = metrics['confusion_matrix']
    lines = [
        "SMOTE-BALANCED LOGISTIC REGRESSION MODEL EVALUATION",
        "=" * 50,
        f"Accuracy:  {metrics['accuracy']:.4f}",
        f"Precision: {metrics['precision']:.4f}",
        f"Recall:    {metrics['recall']:.4f}",
        f"F1 Score:  {metrics['f1_score']:.4f}",
        f"AUC:       {metrics['auc']:.4f}",
        "\nCONFUSION MATRIX",
        "=" * 50,
        f"TN: {matrix[0][0]}, FP: {matrix[0][1]}",
        f"FN: {matrix[1][0]}, TP: {matrix[1][1]}",
        "\nNOTES",
        "=" * 50,
        "This model was trained on data resampled with SMOTE to address class imbalance.",
        "The model was evaluated on the original imbalanced test set to ensure realistic performance assessment.",
        "Categorical feature 'smoker_status' was encoded using one-hot encoding with 'drop=first' strategy.",
        "\nEvaluation completed on: " + pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
    ]

    # write metrics to file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    print(f"\nResults saved to {file_path}")

    # Derive the JSON path from the extension only. A plain
    # str.replace('.txt', '.json') leaves the path unchanged when it has no
    # '.txt', which would overwrite the text report just written.
    root, _ = os.path.splitext(file_path)
    json_path = root + '.json'
    if json_path == file_path:
        # the report itself is named *.json; keep the two files apart
        json_path = file_path + '.json'

    # save as JSON
    metrics_for_json = {key: _to_jsonable(value) for key, value in metrics.items()}
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(metrics_for_json, f)

## 8. Main Execution

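Run the complete workflow. The final comparison step calls `compare_models`, which is defined in Section 9 below, so run that cell before this one to see the comparison.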

In [20]:
if __name__ == "__main__":
    # End-to-end workflow: load -> prepare -> SMOTE -> train -> evaluate ->
    # save -> compare against the Part 1 results when they are available.
    import json

    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)

    # 2. Prepare data with categorical encoding
    X_train, X_test, y_train, y_test = prepare_data_part3(df)

    # 3. Apply SMOTE to balance the training data
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)

    # 4. Train model on resampled data
    model = train_logistic_regression(X_train_resampled, y_train_resampled)

    # 5. Evaluate on original test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)

    # 6. Save results
    save_results(metrics)

    # 7. Load Part 1 results for comparison
    # compare_models is defined in a later cell of this notebook, so on a
    # fresh top-to-bottom run it may not exist yet. Look it up instead of
    # calling it directly, so the workflow finishes without a NameError.
    compare = globals().get('compare_models')
    if compare is None:
        print("\ncompare_models is not defined yet. Run the 'Compare Results' cell (Section 9) "
              "and re-run this cell to see the comparison.")
    else:
        try:
            with open('results/results_part1.json', 'r') as f:
                part1_metrics = json.load(f)
        except FileNotFoundError:
            print("\nPart 1 results JSON file not found. Run part1_introduction.ipynb first.")
            print("Alternatively, check if results/results_part1.txt exists but not the JSON version.")
        except json.JSONDecodeError as exc:
            print(f"\nPart 1 results JSON file could not be parsed: {exc}")
        else:
            # 8. Compare models (kept outside the try so errors raised by the
            # comparison itself are not mistaken for a missing results file)
            comparison = compare(part1_metrics, metrics)
            print("\nModel Comparison (improvement percentages):")
            for metric, improvement in comparison.items():
                print(f"{metric}: {improvement:.2f}%")

Loaded 7326 records with 10 features.
Features: patient_id, timestamp, age, systolic_bp, diastolic_bp, glucose_level, bmi, smoker_status, heart_rate, disease_outcome
Number of unique patients: 150

Smoker status categories: ['no' 'former' 'yes']
Smoker status distribution:
smoker_status
no        0.557467
yes       0.241605
former    0.200928
Name: proportion, dtype: float64

Target distribution ('disease_outcome'):
disease_outcome
0    0.902812
1    0.097188
Name: proportion, dtype: float64
Encoded 'smoker_status' with values [former, no, yes]
Created 2 columns: smoker_status_no, smoker_status_yes
First category 'former' was dropped as the reference category
Training set: 5860 samples with 8 features
Testing set: 1466 samples with 8 features
Features: age, systolic_bp, diastolic_bp, glucose_level, bmi, heart_rate, smoker_status_no, smoker_status_yes

Missing values before imputation:
  Training set: 600
  Testing set: 135
Missing values after imputation:
  Training set: 0
  Testing se

## 9. Compare Results

Implement a function to compare the Part 1 model (trained on the original, imbalanced data) with the Part 3 model (trained on SMOTE-balanced data).

In [17]:
def compare_models(part1_metrics, part3_metrics):
    """Print a metric-by-metric comparison and return relative changes.

    Only 'accuracy', 'precision', 'recall', 'f1_score' and 'auc' are
    compared, and only when present in both inputs.

    Args:
        part1_metrics: Dict of metric name -> value for the Part 1 model.
        part3_metrics: Dict of metric name -> value for the Part 3 model.

    Returns:
        dict: Metric name -> percentage change relative to Part 1. A metric
        that is 0 in Part 1 maps to 0.0 when unchanged, otherwise to
        ``inf`` / ``-inf`` depending on the direction of the change. Empty
        when the inputs share none of the compared metrics.
    """
    # metrics to compare (exclude confusion_matrix)
    metrics_to_compare = ['accuracy', 'precision', 'recall', 'f1_score', 'auc']
    improvements = {}

    print("COMPARISON: IMBALANCED vs. SMOTE-BALANCED MODEL")
    print("=" * 50)
    print(f"{'Metric':<12} {'Part 1':<10} {'Part 3':<10} {'Change':<10} {'% Improvement':<15}")
    print("-" * 50)

    # calculate percentage improvement for each metric
    for metric in metrics_to_compare:
        if metric not in part1_metrics or metric not in part3_metrics:
            continue

        part1_value = part1_metrics[metric]
        part3_value = part3_metrics[metric]

        # calculate absolute change
        change = part3_value - part1_value

        # calculate percentage improvement
        if part1_value != 0:
            pct_improvement = (change / part1_value) * 100
        elif change == 0:
            # 0 -> 0 is "no change", not an infinite drop
            pct_improvement = 0.0
        else:
            pct_improvement = float('inf') if change > 0 else float('-inf')

        improvements[metric] = pct_improvement

        # formatted comparison
        change_symbol = "↑" if change >= 0 else "↓"
        print(f"{metric:<12} {part1_value:.4f}    {part3_value:.4f}    {change_symbol} {abs(change):.4f}    {pct_improvement:+.2f}%")

    # nothing in common: max()/min() below would raise on an empty dict
    if not improvements:
        print("\nNo shared metrics to compare.")
        return improvements

    print("\nSUMMARY:")
    best_metric = max(improvements.items(), key=lambda x: x[1])
    worst_metric = min(improvements.items(), key=lambda x: x[1])
    print(f"- Greatest improvement: {best_metric[0]} ({best_metric[1]:+.2f}%)")
    print(f"- Least improvement: {worst_metric[0]} ({worst_metric[1]:+.2f}%)")

    # analyze the impact of SMOTE
    recall_change = improvements.get('recall', 0)
    precision_change = improvements.get('precision', 0)
    if recall_change > precision_change:
        print("\nSMOTE has improved recall more than precision, suggesting better identification of minority class cases.")
    elif precision_change > recall_change:
        print("\nSMOTE has improved precision more than recall, suggesting more accurate positive predictions.")
    else:
        print("\nSMOTE changed recall and precision by the same relative amount.")

    return improvements