In [None]:
# general libraries 

import os 
import numpy as np
import pandas as pd 
import seaborn as sns
from tqdm.notebook import tqdm 
import matplotlib.pyplot as plt 

In [None]:
# sklearn libraries 

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score,roc_curve

In [None]:
# PyTorch libraries 

import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from torch.autograd import Variable

In [None]:
# Load the heart dataset from the project's Data folder and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../Data/heart.csv')
df.head(n=5)

In [None]:
# Count the missing (NaN) entries in every column.

df.isnull().sum()

Categorical Feature Analysis

In [None]:
# Categorical columns of the dataset. The target "output" is kept last so that
# `categorical_list[:-1]` selects only the categorical features.
categorical_list = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exng",
    "slp",
    "caa",
    "thall",
    "output",
]

In [None]:
# One count plot per categorical column, with bars split by the target class.
df_categoric = df[categorical_list]
for column in categorical_list:
    plt.figure()
    sns.countplot(data=df_categoric, x=column, hue='output')
    plt.title(column)

Numeric Feature Analysis
* Bivariate data analysis with pairwise KDE plots

In [None]:
# Numeric columns plus the target. "output" is last so it can be used as the plot hue
# and sliced off with `numeric_list[:-1]` when only the features are needed.
numeric_list = [
    "age",
    "trtbps",
    "chol",
    "thalachh",
    "oldpeak",
    "output",
]

In [None]:
# Pairwise KDE plots of the numeric columns, coloured by the target class.
df_numeric = df[numeric_list]
sns.pairplot(data=df_numeric, hue="output", kind="kde")
plt.show()

Standardization

In [None]:
# Create the scaler applied to the numeric features below;
# the bare name on the last line displays it as the cell output.
scaler = RobustScaler()
scaler

In [None]:
# Fit the scaler on the numeric feature columns and keep the scaled values for inspection.
# At this point `numeric_list` ends with the target, which the [:-1] slice removes.
numeric_frame = df[numeric_list[:-1]]
scaled_array = scaler.fit_transform(numeric_frame)
scaled_array

In [None]:
# Wrap the scaled values in a labelled frame so they are easy to read.
df_dummy = pd.DataFrame(data=scaled_array, columns=numeric_list[:-1])
df_dummy.head(n=5)

In [None]:
# Annotated correlation heat map over all columns of the raw data.
plt.figure(figsize=(14, 10))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".1f", linewidths=0.7)
plt.show()

Outlier Detection
* Outliers can disrupt the ML process.


In [None]:
# Rebind `numeric_list` to the five numeric features only (no target column).
# NOTE: after this cell the list no longer ends with "output", so a `numeric_list[:-1]`
# slice would exclude "oldpeak" rather than the target.
numeric_list = ["age", "trtbps", "chol", "thalachh", "oldpeak"]
df_numeric = df[numeric_list]
df_numeric.head(n=5)

Modelling

In [None]:
# Work on an independent copy so the raw frame `df` stays untouched.
df1 = df.copy(deep=True)

In [None]:
# One-hot encode the categorical features (target excluded via [:-1]); drop_first removes
# the first level of each feature so it acts as the baseline.
# Encoding from the raw frame `df` instead of from `df1` itself keeps this cell idempotent:
# re-running it no longer fails because the categorical columns were already replaced.
df1 = pd.get_dummies(df, columns=categorical_list[:-1], drop_first=True)
df1.head()

In [None]:
# Feature matrix: every column except the target.
X = df1.drop(columns=["output"])
# Target kept as a single-column frame.
y = df1.loc[:, ["output"]]

In [None]:
# Fresh scaler for the modelling features (replaces the one created during exploration);
# the bare name on the last line displays it as the cell output.
scaler = RobustScaler()
scaler

In [None]:
# Scale the numeric features of X.
# `numeric_list` is rebound earlier in the notebook to the five features without the target,
# so slicing with [:-1] would silently leave "oldpeak" unscaled. Excluding the target by
# name is correct whichever version of the list is in scope.
columns_to_scale = [name for name in numeric_list if name != "output"]
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])
X.head()

Train/Test Split

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=3
)
# Report the shape of each split.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name}: {split_data.shape}")


Logistic Regression

In [None]:
# Baseline classifier with default hyper-parameters;
# the bare name on the last line displays it as the cell output.
model= LogisticRegression()
model

In [None]:
# fitting = training
# y_train is a single-column DataFrame (it was selected with df1[["output"]]); pass the
# column itself so the estimator receives the 1-D label vector it expects instead of a
# column vector.
model.fit(X_train, y_train["output"])

In [None]:
# Predicted class probabilities for the held-out rows.
y_pred_prob = model.predict_proba(X=X_test)
y_pred_prob

In [None]:
# Predicted label = index of the most probable class in each row.
y_pred = y_pred_prob.argmax(axis=1)
y_pred

In [None]:
# Pass the arguments in the documented (y_true, y_pred) order. Accuracy is symmetric,
# so the printed value is unchanged, but the call now matches the metric's signature.
print("Test accuracy: {}".format(accuracy_score(y_test, y_pred)))

In [None]:
# ROC curve computed from the score of the second probability column (index 1).
positive_class_scores = y_pred_prob[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_class_scores)

In [None]:
# plot curve
plt.plot([0,1],[0,1],"k--")  # diagonal reference line
plt.plot(fpr, tpr, label = "Logistic Regression")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Logistic Regression ROC Curve")
# The curve above is given a label; without a legend call it would never be displayed.
plt.legend()
plt.show()

### Model Inference

In [None]:
# Example request payload, in the shape `preprocess_request` reads.
# NOTE(review): 'cholesterol' is a boolean here while 'chol' is handled as a numeric
# feature elsewhere in the notebook — confirm the intended request schema.

parameters = dict(
    age=55,
    sex='male',
    chest_pain_type=2,
    resting_blood_pressure=154,
    cholesterol=True,
    fasting_blood_sugar=True,
    resting_electro_cardio_graphic_result=2,  # between 0-2
    max_heart_rate_achieved=180,
    exercise_induced_angina=False,
    old_peak=1.5,
)

In [None]:
from sklearn.preprocessing import RobustScaler

numeric_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

# Fit the scaler that will be saved for inference on the RAW numeric columns of `df`.
# Previously it was re-fitted on `X`, whose numeric columns had already been scaled before
# training, so the saved scaler learned the statistics of scaled data and could not map a
# raw request into the model's feature space. That cell also overwrote `X` after the model
# was trained; `X` is now left untouched.
scaler = RobustScaler().fit(df[numeric_features])


In [None]:
def preprocess_request(parameters, feature_columns, trained_scaler):
    """Convert one request dict into the feature row expected by the trained model.

    Parameters
    ----------
    parameters : dict
        Raw request. Keys read: 'age', 'sex', 'chest_pain_type',
        'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar',
        'resting_electro_cardio_graphic_result', 'max_heart_rate_achieved',
        'exercise_induced_angina' and 'old_peak'.
    feature_columns : sequence of str
        Column names of the training matrix, in training order. One-hot columns
        are expected to be named ``"<feature>_<level>"``.
    trained_scaler : object
        Fitted scaler exposing ``transform``; applied to the five numeric columns.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, len(feature_columns))``.

    Raises
    ------
    KeyError
        If a required request key is missing, or a numeric feature is not part
        of ``feature_columns``.
    """
    import pandas as pd

    numeric_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

    # Numeric inputs, renamed to the training column names.
    # NOTE(review): the sample request in this notebook passes a boolean for
    # 'cholesterol' while 'chol' is scaled as a numeric feature — confirm the schema.
    numeric_values = {
        'age': parameters['age'],
        'trtbps': parameters['resting_blood_pressure'],
        'chol': int(parameters['cholesterol']),
        'thalachh': parameters['max_heart_rate_achieved'],
        'oldpeak': parameters['old_peak'],
    }

    # Categorical inputs as the level that names their one-hot column.
    categorical_values = {
        'sex': 1 if parameters['sex'] == 'male' else 0,
        'cp': parameters['chest_pain_type'],
        'fbs': int(parameters['fasting_blood_sugar']),
        'restecg': parameters['resting_electro_cardio_graphic_result'],
        'exng': int(parameters['exercise_induced_angina']),
    }

    feature_columns = list(feature_columns)

    # Start with every expected column at 0, then fill in what the request provides.
    # Columns the request does not cover therefore stay at 0.
    row = dict.fromkeys(feature_columns, 0.0)
    row.update(numeric_values)

    # Switch on the matching one-hot column directly. Running get_dummies with
    # drop_first=True on a single-row frame removes the only level of every
    # categorical, which left all one-hot columns at 0 regardless of the request.
    # A level with no matching column is treated as the dropped baseline level.
    for name, level in categorical_values.items():
        dummy_column = f"{name}_{level}"
        if dummy_column in row:
            row[dummy_column] = 1.0

    # Build the frame in training column order; float dtype keeps the output numeric.
    df = pd.DataFrame([row], columns=feature_columns, dtype=float)

    # Scale numeric features
    df[numeric_features] = trained_scaler.transform(df[numeric_features])

    return df.values


### Saving the model and the scaler

In [None]:
import joblib

# Persist the fitted LogisticRegression baseline (no grid search is run in the visible cells).
# NOTE(review): the file is written by joblib despite the ".pt" suffix — confirm that
# whatever loads it expects a joblib file.
joblib.dump(value=model, filename='../Models/baseline_model.pt')

In [None]:
from joblib import dump

# Persist the scaler next to the model, with compression enabled.
dump(value=scaler, filename='../Models/baseline_scaler.joblib', compress=True)