# Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.utils import parallel_backend

# Load Dataset

In [2]:
# Directories
data_dir = '../data'
processed_data_dir = f'{data_dir}/processed'

# Dataset name and path
dataset_name = 'brfss_heart_2022.csv'
dataset_path = f'{processed_data_dir}/{dataset_name}'

# Load dataframe
df = pd.read_csv(dataset_path)
df

Unnamed: 0,had_heart_attack,age_category,alcohol_drinker,bmi,chest_scan,covid_positive,difficulty_concentrating,difficulty_dressing_bathing,difficulty_errands,difficulty_hearing,...,last_checkup_time,mental_health_days,physical_activities,physical_health_days,pneumovax_ever,race_ethnicity_category,removed_teeth,sleep_hours,smoker_status,tetanus_last10_tdap
0,0,0.075709,0,27.99,0,0.058272,0,0,0,0,...,0.062507,0.0,1,4.0,1,0.057635,0.028827,9.0,0.082050,0.040204
1,0,0.093510,0,30.13,0,0.058441,0,0,0,0,...,0.062551,0.0,1,0.0,1,0.057738,0.028633,6.0,0.081592,0.062026
2,0,0.114254,1,31.66,1,0.048924,0,0,0,0,...,0.062452,0.0,0,0.0,1,0.057837,0.113620,8.0,0.081567,0.059106
3,0,0.135313,0,31.32,0,0.049145,0,0,0,0,...,0.062507,0.0,1,5.0,1,0.057635,0.028827,9.0,0.037141,0.058362
4,0,0.137355,0,33.07,0,0.058441,0,0,0,0,...,0.062551,15.0,1,3.0,1,0.057738,0.058593,5.0,0.037006,0.058032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,0,0.057494,1,32.28,0,0.058433,0,0,0,0,...,0.025271,0.0,1,0.0,0,0.057837,0.028788,6.0,0.037285,0.062477
246018,0,0.004168,0,24.34,0,0.047966,0,0,0,0,...,0.062581,7.0,1,0.0,0,0.046604,0.028423,7.0,0.036984,0.058493
246019,0,0.074250,1,29.86,0,0.048924,0,0,0,0,...,0.062452,15.0,1,0.0,1,0.062470,0.058299,7.0,0.037285,0.062477
246020,0,0.035057,0,28.66,0,0.058272,0,0,0,0,...,0.062507,2.0,1,2.0,0,0.046833,0.028827,7.0,0.037141,0.062332


# Helpers

In [3]:
# Calculates evaluation metrics
def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, precision, recall, f1

# Displays evaluation metrics
def display_metrics(accuracy, precision, recall, f1):
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

# Calculates and displays training time
def record_training_time(start_time, end_time):
    training_time = end_time - start_time
    print(f'Training Time: {training_time:.4f} seconds')

# Global Variables

In [4]:
num_features = ['bmi', 'mental_health_days', 'physical_health_days', 'sleep_hours']

X = df.drop('had_heart_attack', axis=1) # Features
y = df['had_heart_attack']              # Target

RANDOM_STATE = 42

# Logistic Regression

Train

In [None]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Scaler for numerical columns
preprocessor = ColumnTransformer(
    transformers=[('num', RobustScaler(), num_features)],
    remainder='passthrough'
)

# Create pipeline
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE))
    ]
)

# Hyperparameter grid
param_grid = {
    'model__C': np.logspace(-6, 6, 13),    # Inverse of regularization strength
    'model__penalty': ['elasticnet'],  # Regularization type
    'model__l1_ratio': np.linspace(0, 1, 11),   # L1 Regularization Ratio
    'model__solver': ['saga']    # Solver type
}

# Use GridSearchCV for hyperparameter tuning
logistic_grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

# Fit GridSearchCV
start_time = time.time()
logistic_grid_search.fit(X_train, y_train)
end_time = time.time()
record_training_time(start_time, end_time)

# Best hyperparameters
print(f'Best parameters: {logistic_grid_search.best_params_}')

# Extract best model
best_model = logistic_grid_search.best_estimator_

PicklingError: Could not pickle the task to send it to the workers.

Evaluate

In [9]:
y_pred = best_model.predict(X_test)
acc, prec, rec, f1 = calculate_metrics(y_test, y_pred)

print('Test Metrics:')
display_metrics(acc, prec, rec, f1)
print(f'\nClassification Report:\n{classification_report(y_test, y_pred)}')

Test Metrics:
Accuracy: 0.9483589066151814
Precision: 0.537759336099585
Recall: 0.24620060790273557
F1 Score: 0.3377638780297107

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     46573
           1       0.54      0.25      0.34      2632

    accuracy                           0.95     49205
   macro avg       0.75      0.62      0.66     49205
weighted avg       0.94      0.95      0.94     49205

