In [73]:
# Import necessary modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, f1_score, roc_auc_score, precision_recall_curve, auc
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 1. Load the dataset
heart_disease_uci = pd.read_csv('../Data/heart_disease_uci.csv')

# 2. Examine the Dataset: Investigate the dataset before cleaning to find NA values, duplicates, etc.
# heart_disease_uci.head()
# heart_disease_uci.info()
# heart_disease_uci.describe()
# heart_disease_uci.tail()
# heart_disease_uci.shape #dimensions
# heart_disease_uci.columns
heart_disease_uci.dtypes

# 3. Preprocess the data
# print(heart_disease_uci.isnull().sum())

## A. drop duplicate rows
heart_disease_uci = heart_disease_uci.drop_duplicates()  # Removes duplicate rows

## B. Remove Redundant Features   
def remove_na_features(data, threshold=0.5):
    """Remove redundant or duplicate columns.
    :param data: pandas DataFrame
    :param threshold: float, threshold percentage
    :return: pandas DataFrame
    """
    null_percent = data.isnull().mean() # to get the % of null values
    # Drop columns with null percent greater than the threshold
    to_drop = null_percent[null_percent > threshold].index
    return data.drop(columns=to_drop)

## Drop columns
heart_disease_uci = remove_na_features(heart_disease_uci, threshold=0.5) # this will remove 'ca' and 'thal'

## C. Imputate missing values
def impute_missing_values(data, strategy='mean'):
    """
    Fill missing values in the dataset.
    :param data: pandas DataFrame
    :param strategy: str, imputation method ('mean', 'median', 'mode')
    :return: pandas DataFrame
    """    
    # Isolate numerical columns from the dataset to account for TypeError
    num_cols = data.select_dtypes(include=[np.number]).columns
    cat_cols = data.select_dtypes(include=['object']).columns

    if strategy == "mean":
        data[num_cols] = data[num_cols].fillna(data[num_cols].mean()) # NA values replaced with the mean of the column
    elif strategy == "median":
        data[num_cols] = data[num_cols].fillna(data[num_cols].median()) # NA values replaced with the median of the column
    elif strategy == "mode":
        for col in num_cols:
            data[col] = data[col].fillna(data[col].mode().iloc[0]) # iloc[0] takes the first value as mode
    else:
        raise ValueError("Invalid strategy! Please choose either Mean, Median, or Mode.")
    
    for col in cat_cols:
        data[col] = data[col].astype('category')
        data[col] = data[col].fillna(data[col].mode().iloc[0]) # iloc[0] takes the first value as mode
    
    return data

heart_disease_uci = impute_missing_values(heart_disease_uci, strategy='mean')
print("Missing values handled successfully.")
print(heart_disease_uci.isnull().sum()) # every column returns 0 now


## D. Convert categorical features to numerical - ONE HOT ENCODING
# isolate categorical columns again
cat_columns = heart_disease_uci.select_dtypes(include=['category']).columns 

# apply One-Hot Encoding steps
# initializing OneHotEncoder
onehot = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' column to avoid multicollinearity and redundancy
encoded_data = onehot.fit_transform(heart_disease_uci[cat_columns])

# create the encoded data DataFrame and add column names using OneHotEncoder
encoded_df = pd.DataFrame(encoded_data, columns=onehot.get_feature_names_out(cat_columns))
encoded_df.columns = [col.replace(" ", "_") for col in encoded_df.columns] # to remove spaces in some column names

# Concatenate the original DataFrame with the encoded DataFrame, dropping the original categorical columns
heart_disease_encoded = pd.concat([heart_disease_uci.drop(columns=cat_columns), encoded_df], axis=1)
print(heart_disease_encoded.head()) # print to check



Missing values handled successfully.
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
num         0
dtype: int64
    age  trestbps   chol  thalch  oldpeak  num  sex_Male  dataset_Hungary  \
0  63.0     145.0  233.0   150.0      2.3  0.0       1.0              0.0   
1  67.0     160.0  286.0   108.0      1.5  2.0       1.0              0.0   
2  67.0     120.0  229.0   129.0      2.6  1.0       1.0              0.0   
3  37.0     130.0  250.0   187.0      3.5  0.0       1.0              0.0   
4  41.0     130.0  204.0   172.0      1.4  0.0       0.0              0.0   

   dataset_Switzerland  dataset_VA_Long_Beach  cp_atypical_angina  \
0                  0.0                    0.0                 0.0   
1                  0.0                    0.0                 0.0   
2                  0.0                    0.0                 0.0   
3                  0.0       

In [74]:
# 4. Build and Evaluate Models

# Split dataset into features and target: chol
X = heart_disease_encoded.drop(columns=['chol'])  # chol feature
y = heart_disease_encoded['chol'] # chol target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4.1. Regression Models: Linear Regression and ElasticNet
print(X.isnull().sum())
# print(y.isnull().sum())

# # Linear Regression
# lin_reg = LinearRegression()
# lin_reg.fit(X_train, y_train)
# y_pred_lr = lin_reg.predict(X_test)

# # Evaluation metrics for Linear Regression
# r2_lr = r2_score(y_test, y_pred_lr)
# rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

# # ElasticNet
# elastic_net = ElasticNet()
# elastic_net.fit(X_train, y_train)
# y_pred_en = elastic_net.predict(X_test)

# # Evaluation metrics for ElasticNet
# r2_en = r2_score(y_test, y_pred_en)
# rmse_en = np.sqrt(mean_squared_error(y_test, y_pred_en))

# # ElasticNet Hyperparameter Tuning (alpha and l1_ratio)
# param_grid = {'alpha': np.logspace(-4, 4, 10), 'l1_ratio': np.linspace(0, 1, 10)}
# grid_search = GridSearchCV(ElasticNet(), param_grid, scoring='neg_mean_squared_error', cv=5)
# grid_search.fit(X_train, y_train)

# # Best ElasticNet hyperparameters and performance
# best_alpha = grid_search.best_params_['alpha']
# best_l1_ratio = grid_search.best_params_['l1_ratio']
# best_rmse = np.sqrt(-grid_search.best_score_)

# # Plot Heatmap for R² and RMSE for ElasticNet Hyperparameter Tuning
# scores = grid_search.cv_results_['mean_test_score'].reshape(10, 10)
# plt.figure(figsize=(10, 8))
# sns.heatmap(scores, annot=True, fmt=".2f", cmap='coolwarm', xticklabels=np.linspace(0, 1, 10), yticklabels=np.logspace(-4, 4, 10))
# plt.xlabel('l1_ratio')
# plt.ylabel('alpha')
# plt.title('ElasticNet Hyperparameter Tuning (RMSE)')
# plt.colorbar()
# plt.show()

age                         2
trestbps                    2
thalch                      2
oldpeak                     2
num                         2
sex_Male                    2
dataset_Hungary             2
dataset_Switzerland         2
dataset_VA_Long_Beach       2
cp_atypical_angina          2
cp_non-anginal              2
cp_typical_angina           2
fbs_1.0                     2
restecg_normal              2
restecg_st-t_abnormality    2
exang_1.0                   2
slope_flat                  2
slope_upsloping             2
dtype: int64


In [75]:
# Classification Models: Logistic Regression and k-NN

# Logistic Regression
log_reg = LogisticRegression(solver='liblinear')  # Solver chosen for binary classification
log_reg.fit(X_train, y_train)
y_pred_lr_class = log_reg.predict(X_test)

# Evaluation metrics for Logistic Regression
accuracy_lr = accuracy_score(y_test, y_pred_lr_class)
f1_lr = f1_score(y_test, y_pred_lr_class)
roc_auc_lr = roc_auc_score(y_test, y_pred_lr_class)
precision, recall, _ = precision_recall_curve(y_test, log_reg.predict_proba(X_test)[:, 1])
auprc_lr = auc(recall, precision)

# k-NN Classifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn_class = knn.predict(X_test)

# Evaluation metrics for k-NN
accuracy_knn = accuracy_score(y_test, y_pred_knn_class)
f1_knn = f1_score(y_test, y_pred_knn_class)
roc_auc_knn = roc_auc_score(y_test, y_pred_knn_class)
precision_knn, recall_knn, _ = precision_recall_curve(y_test, knn.predict_proba(X_test)[:, 1])
auprc_knn = auc(recall_knn, precision_knn)

# Plot AUROC and AUPRC for both Logistic Regression and k-NN
# AUROC
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(np.linspace(0, 1, len(roc_auc_lr)), roc_auc_lr, label='Logistic Regression (AUROC)')
plt.plot(np.linspace(0, 1, len(roc_auc_knn)), roc_auc_knn, label='k-NN (AUROC)')
plt.title('AUROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()

# AUPRC
plt.subplot(1, 2, 2)
plt.plot(recall, precision, label='Logistic Regression (AUPRC)')
plt.plot(recall_knn, precision_knn, label='k-NN (AUPRC)')
plt.title('AUPRC Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()

plt.show()

# Print evaluation metrics
print(f"Linear Regression R²: {r2_lr:.4f}, RMSE: {rmse_lr:.4f}")
print(f"ElasticNet R²: {r2_en:.4f}, RMSE: {rmse_en:.4f}")
print(f"Best ElasticNet Hyperparameters: alpha = {best_alpha}, l1_ratio = {best_l1_ratio}")
print(f"Logistic Regression - Accuracy: {accuracy_lr:.4f}, F1: {f1_lr:.4f}, AUROC: {roc_auc_lr:.4f}, AUPRC: {auprc_lr:.4f}")
print(f"k-NN - Accuracy: {accuracy_knn:.4f}, F1: {f1_knn:.4f}, AUROC: {roc_auc_knn:.4f}, AUPRC: {auprc_knn:.4f}")

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values