In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("heart_disease_uci.csv")
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [2]:
df.shape

(920, 16)

In [3]:
df.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [4]:
# Check unique values in num
print(df['num'].value_counts())

num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64


In [5]:
# Create binary target 
df['target'] = df['num'].apply(lambda x: 0 if x == 0 else 1)

In [6]:
# Check balance of new target
print(df['target'].value_counts())

target
1    509
0    411
Name: count, dtype: int64


In [7]:
# Drop unnecessary columns
df = df.drop(columns=['id', 'dataset', 'num'])
 # Confirm columns
print(df.columns)
print(df.shape)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')
(920, 14)


In [8]:
# Check missing values
print(df.isnull().sum())
# Quick look at data types
print(df.dtypes)

age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
target        0
dtype: int64
age           int64
sex          object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
target        int64
dtype: object


In [9]:
# Drop columns with too many missing values
df = df.drop(columns=['ca', 'thal'])
# Check missing values again
print(df.isnull().sum())

# Check shape
print(df.shape)

age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
target        0
dtype: int64
(920, 12)


In [10]:
# Separate numerical and categorical columns
num_cols = ['trestbps', 'chol', 'thalch', 'oldpeak']
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope']

# Fill numerical with median
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical with mode
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Verify no missing values remain
print(df.isnull().sum())


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
target      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [11]:
print(df.dtypes)


age           int64
sex          object
cp           object
trestbps    float64
chol        float64
fbs            bool
restecg      object
thalch      float64
exang          bool
oldpeak     float64
slope        object
target        int64
dtype: object


In [12]:
from sklearn.preprocessing import LabelEncoder

#Initialize lebalEnoder
le = LabelEncoder()

#columns to encode
cat_cols = ['sex', 'cp', 'restecg', 'slope']

#apply lebal encoding
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Verify encoding
print(df.head())
print(df.dtypes)

   age  sex  cp  trestbps   chol    fbs  restecg  thalch  exang  oldpeak  \
0   63    1   3     145.0  233.0   True        0   150.0  False      2.3   
1   67    1   0     160.0  286.0  False        0   108.0   True      1.5   
2   67    1   0     120.0  229.0  False        0   129.0   True      2.6   
3   37    1   2     130.0  250.0  False        1   187.0  False      3.5   
4   41    0   1     130.0  204.0  False        0   172.0  False      1.4   

   slope  target  
0      0       0  
1      1       1  
2      1       1  
3      0       0  
4      2       0  
age           int64
sex           int32
cp            int32
trestbps    float64
chol        float64
fbs            bool
restecg       int32
thalch      float64
exang          bool
oldpeak     float64
slope         int32
target        int64
dtype: object


In [13]:
#separate feature and target
X = df.drop('target', axis=1)
y= df['target']

print(X.shape)
print(y.shape)

(920, 11)
(920,)


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split (
    X, y, test_size = 0.2, random_state = 42, stratify=y)

print(X_train.shape)
print(X_test.shape)

(736, 11)
(184, 11)


In [15]:
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Fit on training, transform both train and test
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Quick check
print("Mean of scaled training features:", X_train_scaled.mean(axis=0))
print("Std of scaled training features:", X_train_scaled.std(axis=0))


Mean of scaled training features: [ 3.63236011e-16  8.93005476e-17  5.55111512e-17  7.86810230e-16
  9.65411326e-18 -3.37893964e-17 -4.82705663e-18  9.65411326e-17
 -1.44811699e-17  9.65411326e-18  2.00322850e-16]
Std of scaled training features: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [16]:
from sklearn.linear_model import LogisticRegression

# Initialize model
log_reg = LogisticRegression(max_iter=1000)

# Train
log_reg.fit(X_train_scaled, y_train)


In [17]:
#make predictions
y_pred = log_reg.predict(X_test_scaled)


In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Precision, Recall, F1
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.8206521739130435
Confusion Matrix:
 [[65 17]
 [16 86]]

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.79      0.80        82
           1       0.83      0.84      0.84       102

    accuracy                           0.82       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.82      0.82      0.82       184



In [19]:
from sklearn.linear_model import LogisticRegression

# L2 Regularization (default)
log_l2 = LogisticRegression(penalty='l2', C=1.0, max_iter=1000)
log_l2.fit(X_train_scaled, y_train)
y_pred_l2 = log_l2.predict(X_test_scaled)

# L1 Regularization
log_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, max_iter=1000)
log_l1.fit(X_train_scaled, y_train)
y_pred_l1 = log_l1.predict(X_test_scaled)


In [20]:
from sklearn.metrics import accuracy_score, recall_score

print("L2 Accuracy:", accuracy_score(y_test, y_pred_l2))
print("L2 Recall (Disease):", recall_score(y_test, y_pred_l2))

print("\nL1 Accuracy:", accuracy_score(y_test, y_pred_l1))
print("L1 Recall (Disease):", recall_score(y_test, y_pred_l1))


L2 Accuracy: 0.8206521739130435
L2 Recall (Disease): 0.8431372549019608

L1 Accuracy: 0.8206521739130435
L1 Recall (Disease): 0.8431372549019608


In [21]:
import pandas as pd

# Create coefficient DataFrames
coef_l2 = pd.DataFrame({
    'feature': X.columns,
    'L2_coef': log_l2.coef_[0]
})

coef_l1 = pd.DataFrame({
    'feature': X.columns,
    'L1_coef': log_l1.coef_[0]
})

print("L2 Coefficients:\n", coef_l2)
print("\nL1 Coefficients:\n", coef_l1)


L2 Coefficients:
      feature   L2_coef
0        age  0.415540
1        sex  0.520247
2         cp -0.513234
3   trestbps  0.051687
4       chol -0.432728
5        fbs  0.165883
6    restecg -0.071179
7     thalch -0.303508
8      exang  0.542027
9    oldpeak  0.602177
10     slope -0.120325

L1 Coefficients:
      feature   L1_coef
0        age  0.414141
1        sex  0.514990
2         cp -0.507104
3   trestbps  0.042012
4       chol -0.421707
5        fbs  0.157548
6    restecg -0.057481
7     thalch -0.298428
8      exang  0.538773
9    oldpeak  0.597440
10     slope -0.112855


In [22]:
#STRONG L1 REGULARIZATION
log_l1_strong = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=0.1,
    max_iter=1000
)

log_l1_strong.fit(X_train_scaled, y_train)

# Coefficients
coef_l1_strong = pd.DataFrame({
    'feature': X.columns,
    'L1_strong_coef': log_l1_strong.coef_[0]
})

print(coef_l1_strong)


     feature  L1_strong_coef
0        age        0.361572
1        sex        0.439065
2         cp       -0.425880
3   trestbps        0.000000
4       chol       -0.307438
5        fbs        0.082016
6    restecg        0.000000
7     thalch       -0.278533
8      exang        0.478231
9    oldpeak        0.506718
10     slope       -0.051017


In [23]:
#Train a Linear SVM
from sklearn.svm import SVC

# Linear SVM
svm_linear = SVC(kernel='linear', C=1.0)

svm_linear.fit(X_train_scaled, y_train)

# Predictions
y_pred_svm = svm_linear.predict(X_test_scaled)

from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Recall (Disease):", recall_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


SVM Accuracy: 0.8369565217391305
SVM Recall (Disease): 0.8529411764705882
Confusion Matrix:
 [[67 15]
 [15 87]]


In [24]:
#Train RBF SVM
# RBF SVM
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale')

svm_rbf.fit(X_train_scaled, y_train)

y_pred_rbf = svm_rbf.predict(X_test_scaled)

print("RBF SVM Accuracy:", accuracy_score(y_test, y_pred_rbf))
print("RBF SVM Recall (Disease):", recall_score(y_test, y_pred_rbf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rbf))


RBF SVM Accuracy: 0.842391304347826
RBF SVM Recall (Disease): 0.8725490196078431
Confusion Matrix:
 [[66 16]
 [13 89]]
