In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import copy
from sklearn.model_selection import train_test_split


In [2]:
# Read the heart-disease CSV from the sibling data directory into a DataFrame
# and preview its first rows (the bare last expression is the cell output).
df = pd.read_csv("../data/heart_cleveland_upload.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(297, 14)

In [4]:
# Column labels; the feature list used further down is taken from these.
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'condition'],
      dtype='object')

In [5]:
# Count of missing values per column.
df.isnull().sum()

age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64

In [6]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       297 non-null    int64  
 5   fbs        297 non-null    int64  
 6   restecg    297 non-null    int64  
 7   thalach    297 non-null    int64  
 8   exang      297 non-null    int64  
 9   oldpeak    297 non-null    float64
 10  slope      297 non-null    int64  
 11  ca         297 non-null    int64  
 12  thal       297 non-null    int64  
 13  condition  297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB


In [7]:
# Distinct values of the target column.
df['condition'].unique()

array([0, 1])

In [8]:
# Feature matrix: every column except the target, in the frame's column order.
feature_cols = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]
X = df[feature_cols].to_numpy()

# Target kept as a 2-D column (double brackets select a one-column frame).
Y = df[["condition"]].to_numpy()

In [9]:
# Integer-encode the nominal columns as category codes (0..k-1 in sorted
# category order).
cat_cols = ["cp", "restecg", "slope", "thal"]
for col in cat_cols:
    df[col] = df[col].astype("category").cat.codes

# X was extracted from df *before* this cell runs, so the encoding above would
# otherwise never reach the model inputs. Rebuild X from the encoded frame so
# the downstream split/normalisation/training actually sees these columns.
feature_cols = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]
X = df[feature_cols].to_numpy()


In [10]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X with all degree-2 terms (squares and pairwise products); the
# constant bias column is omitted via include_bias=False.
# NOTE(review): X_poly is not referenced by any later cell visible here -- the
# train/test split below is performed on X, not X_poly. Confirm whether the
# polynomial expansion was meant to feed the model or can be dropped.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)


In [11]:
def z_normalization(X_data):
    """Standardise each column of ``X_data`` to zero mean and unit variance.

    Args:
        X_data: array-like of shape (m, n); statistics are taken along axis 0.

    Returns:
        Tuple ``(X_norm, mu, sigma)`` where ``mu`` and ``sigma`` are the
        per-column mean and standard deviation used for the scaling, so the
        same transform can be applied to other data as ``(X_new - mu) / sigma``.
        Columns with zero variance get ``sigma = 1`` so they map to 0 instead
        of producing NaN/inf from a division by zero.
    """
    mu = np.mean(X_data, axis=0)
    sigma = np.std(X_data, axis=0)
    # A constant column has std 0; dividing by it would yield NaN (0/0).
    # Scaling such a column by 1 leaves it at exactly 0 after centring.
    sigma = np.where(sigma == 0, 1.0, sigma)
    X_norm = (X_data - mu) / sigma
    return X_norm, mu, sigma
         

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Print the shape of each split, one per line.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

(237, 13)
(60, 13)
(237, 1)
(60, 1)


In [13]:
# Column statistics come from the training split only and are then applied to
# both splits, so no information from the test rows enters the scaling.
# Caution: this cell overwrites X_train/X_test. Re-running it without first
# re-running the split cell would recompute mu/sigma from already-normalised
# data.
mu, sigma = X_train.mean(axis=0), X_train.std(axis=0)

X_train, X_test = (X_train - mu) / sigma, (X_test - mu) / sigma


In [14]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Args:
        z: scalar or array-like of real values.

    Returns:
        Values in [0, 1] with the same shape as ``z`` (a NumPy scalar for
        scalar input, an ndarray otherwise).
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1], so it can never overflow the way exp(-z) does
    # for large negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
    g = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-rank arrays unchanged.
    return g[()]
    

In [15]:
def probability_function(X, w, b):
    """Return the sigmoid of the linear score ``np.dot(X, w) + b``.

    Args:
        X: feature array (one row per example in the notebook's usage).
        w: weight vector.
        b: scalar bias.
    """
    linear_score = np.dot(X, w) + b
    return sigmoid(linear_score)

In [16]:
def predict(X, w, b):
    """Return hard 0/1 predictions for ``X``.

    The probability from ``probability_function`` is thresholded at 0.5
    (inclusive) and the resulting booleans are cast to ``int``.
    """
    probabilities = probability_function(X, w, b)
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)



In [17]:
def compute_cost_logistic(X, y, w, b):
    """Mean binary cross-entropy of a logistic model (no regularisation).

    Args:
        X: array of shape (m, n), one example per row.
        y: labels of shape (m,) or (m, 1); flattened internally.
        w: weight vector with n entries.
        b: scalar bias.

    Returns:
        Scalar cost ``mean(-y*log(p) - (1-y)*log(1-p))`` with
        ``p = sigmoid(X @ w + b)``.
    """
    X = np.asarray(X, dtype=float)
    # Flatten so (m, 1) labels / (n, 1) weights cannot broadcast to (m, m).
    y = np.asarray(y, dtype=float).reshape(-1)
    w = np.asarray(w, dtype=float).reshape(-1)

    z = np.dot(X, w) + b
    # -log(sigmoid(z)) == log(1 + exp(-z)) and -log(1 - sigmoid(z)) ==
    # log(1 + exp(z)). logaddexp(0, t) evaluates log(1 + exp(t)) without
    # overflow, so the cost stays finite even when the sigmoid saturates
    # (where log(0) would otherwise give inf/nan).
    per_example = y * np.logaddexp(0.0, -z) + (1.0 - y) * np.logaddexp(0.0, z)
    return np.mean(per_example)

In [18]:
def compute_cost_reg(X, y, w, b, lambda_=0.1):
    """Logistic cost plus an L2 penalty on the weights.

    Args:
        X: array of shape (m, n).
        y: labels for the m examples.
        w: weight vector; its first n entries are penalised.
        b: scalar bias (not penalised).
        lambda_: regularisation strength.

    Returns:
        ``compute_cost_logistic(X, y, w, b) + lambda_ / (2 * m) * sum(w_j ** 2)``.
    """
    num_examples, num_features = X.shape

    # Sum of squared weights, accumulated one feature at a time.
    squared_weight_total = 0.
    for j in range(num_features):
        squared_weight_total = squared_weight_total + (w[j]) ** 2

    penalty = (lambda_ / (2 * num_examples)) * squared_weight_total

    return compute_cost_logistic(X, y, w, b) + penalty

In [19]:
def compute_gradient(X, y, w, b, lambda_):
    """Gradient of the L2-regularised logistic cost.

    Args:
        X: array of shape (m, n), one example per row.
        y: labels of shape (m,) or (m, 1); flattened internally.
        w: weight vector of shape (n,).
        b: scalar bias.
        lambda_: regularisation strength; applied to ``w`` only, not to ``b``.

    Returns:
        Tuple ``(dj_dw, dj_db)``: the gradient with respect to ``w`` (shape
        (n,)) and with respect to ``b`` (scalar).
    """
    m = X.shape[0]
    # Flatten so column-vector labels cannot broadcast against the (m,)
    # predictions into an (m, m) matrix.
    y = np.asarray(y).reshape(-1)

    # Prediction error for all examples at once. This replaces the per-sample,
    # per-feature Python loops, which dominate runtime because this function
    # is called on every gradient-descent iteration.
    err = sigmoid(np.dot(X, w) + b) - y

    dj_dw = np.dot(X.T, err) / m
    dj_db = np.sum(err) / m

    # L2 penalty contributes (lambda_ / m) * w to the weight gradient.
    dj_dw = dj_dw + (lambda_ / m) * w
    return dj_dw, dj_db


In [20]:
def compute_gradiest_descent(X, y, w_in, b_in, alpha, num_iters, lambda_=0.1):
    """Run batch gradient descent for regularised logistic regression.

    The function name (including its spelling) is kept as-is because callers
    refer to it.

    Args:
        X: training features, shape (m, n).
        y: training labels, one per example.
        w_in: initial weights; copied, so the caller's array is not modified.
        b_in: initial bias.
        alpha: learning rate.
        num_iters: number of update steps.
        lambda_: L2 regularisation strength passed to ``compute_gradient``.
            Defaults to 0.1, the value that was previously hard-coded.

    Returns:
        Tuple ``(w, b, J_history)``: final weights, final bias, and the list
        of costs recorded after each step. The recorded/printed cost is the
        unregularised ``compute_cost_logistic`` value.
    """
    max_history = 100000  # stop recording past this many steps to bound memory
    J_history = []
    w = copy.deepcopy(w_in)
    b = b_in

    # Progress is printed roughly ten times over the run; the interval is
    # loop-invariant, so it is computed once.
    report_every = math.ceil(num_iters / 10)

    for i in range(num_iters):
        dj_dw, dj_db = compute_gradient(X, y, w, b, lambda_=lambda_)

        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        cost = None
        if i < max_history:
            cost = compute_cost_logistic(X, y, w, b)
            J_history.append(cost)

        if i % report_every == 0:
            # Past the history cap nothing new is appended, so J_history[-1]
            # would be stale; evaluate the current cost for the report instead.
            if cost is None:
                cost = compute_cost_logistic(X, y, w, b)
            print(f"Iteration {i:4d}: Cost {cost}   ")

    return w, b, J_history


In [None]:
# Optimiser settings.
alpha = 0.001
num_iters = 10000

# Start from all-zero parameters, one weight per feature column.
n = X.shape[1]
w_init = np.zeros(n)
b_init = 0

# The training routine expects 1-D labels, so flatten the (m, 1) column.
Y_tr = Y_train.reshape(-1)

w_final, b_final, J_hist = compute_gradiest_descent(
    X_train, Y_tr, w_init, b_init, alpha, num_iters
)
print(f"w: {w_final}, b: {b_final}")



Iteration    0: Cost 0.6927331814947549   


Iteration 1000: Cost 0.47276544960284905   
Iteration 2000: Cost 0.40334472207824446   
Iteration 3000: Cost 0.37165055254295826   


In [None]:

# Compare hard predictions on the held-out split with the flattened labels.
y_test_flat = Y_test.flatten()
y_pred = predict(X_test, w_final, b_final)

# Accuracy = fraction of test rows where prediction and label agree.
matches = y_pred == y_test_flat
accuracy = np.mean(matches)
print("Accuracy:", accuracy)


Accuracy: 0.75


In [None]:
import pickle

# Bundle the learned parameters together with the normalisation statistics
# computed on the training split, so the same scaling can be re-applied to
# new inputs when the file is loaded.
model_data = dict(w=w_final, b=b_final, mu=mu, sigma=sigma)

# Written to the notebook's working directory.
with open("model.pkl", "wb") as f:
    pickle.dump(model_data, f)

print("Model saved successfully as model.pkl")
