### Loading and Inspecting the data

In [14]:
import pandas as pd

# Load the dataset
data = pd.read_csv('Data/Heart_Disease_Prediction.csv')

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    object 
dtypes: float64(1), int64(12), 

In [17]:
data.describe()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [18]:
#Checking for null values
print("Check for presence of null values:\n")
print(data.isnull().sum())

Check for presence of null values:

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64


In [19]:
duplicated_rows= data[data.duplicated()]
print("Check for any duplicated values:\n")
print(duplicated_rows)

Check for any duplicated values:

Empty DataFrame
Columns: [Age, Sex, Chest pain type, BP, Cholesterol, FBS over 120, EKG results, Max HR, Exercise angina, ST depression, Slope of ST, Number of vessels fluro, Thallium, Heart Disease]
Index: []


### Data Preprocessing

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Display the first few rows of the original data in a DataFrame format
print("Original Data:")
display(data.head())  # Using display() for better table view

# Show a summary of the dataset
print("\nData Summary:")
display(data.describe())  # Descriptive statistics in a DataFrame view


Original Data:


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence



Data Summary:


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


#### Transformation- Label Encoding

In [22]:
# Convert the target column to binary (1 for 'Presence' and 0 for 'Absence')
print("\nBefore Label Encoding 'Heart Disease':")
display(data[['Heart Disease']].head())

label_encoder = LabelEncoder()
data['Heart Disease'] = label_encoder.fit_transform(data['Heart Disease'])

# Show the transformation of the 'Heart Disease' column
print("\nAfter Label Encoding 'Heart Disease':")
display(data[['Heart Disease']].head())



Before Label Encoding 'Heart Disease':


Unnamed: 0,Heart Disease
0,Presence
1,Absence
2,Presence
3,Absence
4,Absence



After Label Encoding 'Heart Disease':


Unnamed: 0,Heart Disease
0,1
1,0
2,1
3,0
4,0


#### Splitting Features and target

In [29]:
# Separate features (X) and target (y)
X = data.drop('Heart Disease', axis=1)
y = data['Heart Disease']
display("Features(X):\n",X)
display("\nTarget(Y):\n",y)

'Features(X):\n'

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6


'\nTarget(Y):\n'

0      1
1      0
2      1
3      0
4      0
      ..
265    0
266    0
267    0
268    0
269    1
Name: Heart Disease, Length: 270, dtype: int64

#### Scaling Features

In [31]:
# Display features before scaling
print("\nFeatures before Scaling:")
display(X.head())

# Standardize the numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Show the scaled features
print("\nFeatures after Scaling:")
display(pd.DataFrame(X_scaled, columns=X.columns).head())


Features before Scaling:


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3



Features after Scaling:


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,1.712094,0.6895,0.870928,-0.07541,1.402212,-0.417029,0.981664,-1.759208,-0.701222,1.181012,0.676419,2.472682,-0.875706
1,1.38214,-1.450327,-0.183559,-0.916759,6.093004,-0.417029,0.981664,0.446409,-0.701222,0.481153,0.676419,-0.711535,1.189277
2,0.282294,0.6895,-1.238045,-0.41195,0.219823,-0.417029,-1.026285,-0.375291,-0.701222,-0.656118,-0.954234,-0.711535,1.189277
3,1.052186,0.6895,0.870928,-0.18759,0.258589,-0.417029,-1.026285,-1.932198,1.426081,-0.7436,0.676419,0.349871,1.189277
4,2.152032,-1.450327,-1.238045,-0.63631,0.37489,-0.417029,0.981664,-1.240239,1.426081,-0.7436,-0.954234,0.349871,-0.875706


#### Splitting- Training and Testing data

In [36]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=2)

# Show the shapes of training and testing sets
print("\nTraining and Testing Set Shapes:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")

print("\nData Preprocessing Done")


Training and Testing Set Shapes:
X_train: (216, 13), X_test: (54, 13)
y_train: (216,), y_test: (54,)

Data Preprocessing Done


### Training and Evaluating Model

In [7]:
# Import necessary libraries for model building and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Choose the model - Logistic Regression
model = LogisticRegression()

# Step 2: Train the model using the training data
model.fit(X_train, y_train)

# Step 3: Make predictions on the testing data
y_pred = model.predict(X_test)

# Step 4: Evaluate the model's performance

# Accuracy: How many correct predictions were made?
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Confusion Matrix: Gives insight into true positives, false positives, etc.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report: Provides precision, recall, F1-score
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.83

Confusion Matrix:
[[46  3]
 [11 21]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.94      0.87        49
           1       0.88      0.66      0.75        32

    accuracy                           0.83        81
   macro avg       0.84      0.80      0.81        81
weighted avg       0.83      0.83      0.82        81



### Improvement

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Set up the Logistic Regression with class weighting
model = LogisticRegression(class_weight='balanced')

# Step 2: Tune the regularization parameter using GridSearchCV
# We're testing different values of C (regularization strength)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

# Step 3: Train the model using cross-validation
grid_search.fit(X_train, y_train)

# Best parameters and model
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# Step 4: Evaluate the best model
y_pred = best_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Best parameters: {'C': 0.1}

Accuracy: 0.83

Confusion Matrix:
[[45  4]
 [10 22]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.92      0.87        49
           1       0.85      0.69      0.76        32

    accuracy                           0.83        81
   macro avg       0.83      0.80      0.81        81
weighted avg       0.83      0.83      0.82        81

