In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the heart-disease records from the CSV file into a DataFrame
hdf = pd.read_csv('heart_disease_dataset.csv')

In [3]:
# Preview the first five rows of the raw data
hdf.head(5)

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,Current,Heavy,1,No,No,Yes,8,119,Yes,Atypical Angina,1
1,48,Male,204,165,62,Current,,5,No,No,No,9,70,Yes,Typical Angina,0
2,53,Male,234,91,67,Never,Heavy,3,Yes,No,Yes,5,196,Yes,Atypical Angina,1
3,69,Female,192,90,72,Current,,4,No,Yes,No,7,107,Yes,Non-anginal Pain,0
4,62,Female,172,163,93,Never,,6,No,Yes,No,2,183,Yes,Asymptomatic,0


In [4]:
# Inspect the distinct values of each categorical column before encoding them
categorical_columns = [
    'Gender',
    'Smoking',
    'Alcohol Intake',
    'Family History',
    'Diabetes',
    'Obesity',
    'Exercise Induced Angina',
    'Chest Pain Type',
]

for column in categorical_columns:
    print(f"Unique values in '{column}':", hdf[column].unique())

Unique values in 'Gender': ['Female' 'Male']
Unique values in 'Smoking': ['Current' 'Never' 'Former']
Unique values in 'Alcohol Intake': ['Heavy' nan 'Moderate']
Unique values in 'Family History': ['No' 'Yes']
Unique values in 'Diabetes': ['No' 'Yes']
Unique values in 'Obesity': ['Yes' 'No']
Unique values in 'Exercise Induced Angina': ['Yes' 'No']
Unique values in 'Chest Pain Type': ['Atypical Angina' 'Typical Angina' 'Non-anginal Pain' 'Asymptomatic']


In [5]:
# Count missing cells across the whole frame (per-column counts, then summed)
total_missing = hdf.isna().sum().sum()
print("Total missing values in Heart Disease data:", total_missing)

Total missing values in Heart Disease data: 340


In [6]:
# Encode Gender as a single binary column: Male -> 1, Female -> 0.
# Any value that is not a key of the mapping becomes NaN.
gender_codes = {'Male': 1, 'Female': 0}
hdf['Gender'] = hdf['Gender'].map(gender_codes)

In [7]:
# Encode Smoking and Alcohol Intake as integer codes.
smoking_codes = {'Current': 1, 'Never': 0, 'Former': 2}
# Missing alcohol values are labelled 'Unknown' first so they get their own code (0).
alcohol_codes = {'Heavy': 2, 'Moderate': 1, 'Unknown': 0}

hdf['Smoking'] = hdf['Smoking'].map(smoking_codes)
hdf['Alcohol Intake'] = hdf['Alcohol Intake'].fillna('Unknown').map(alcohol_codes)

In [8]:
# Encode the remaining Yes/No columns as 1/0
yes_no_codes = {'Yes': 1, 'No': 0}
for column in ('Family History', 'Diabetes', 'Obesity', 'Exercise Induced Angina'):
    hdf[column] = hdf[column].map(yes_no_codes)

In [9]:
# Encode Chest Pain Type: each label's position in the list is its integer code
chest_pain_labels = ['Typical Angina', 'Atypical Angina', 'Non-anginal Pain', 'Asymptomatic']
chest_pain_codes = {label: code for code, label in enumerate(chest_pain_labels)}
hdf['Chest Pain Type'] = hdf['Chest Pain Type'].map(chest_pain_codes)

In [10]:
# Fill any remaining missing values with the median of their own column
column_medians = hdf.median()
hdf = hdf.fillna(value=column_medians)

In [11]:
# Handling invalid values
# Replace negative ages with the median age. The median is computed once, from the
# non-negative ages only, so the replacement value is not skewed by the invalid
# entries and is not recomputed for every row.
age = hdf['Age']
median_valid_age = age[age >= 0].median()
hdf['Age'] = age.mask(age < 0, median_valid_age)

# Clamp each measurement into its fixed [lower, upper] range (vectorized clip).
clip_bounds = {
    'Cholesterol': (100, 400),
    'Blood Pressure': (80, 200),
    'Blood Sugar': (70, 200),
}
for column, (lower, upper) in clip_bounds.items():
    hdf[column] = hdf[column].clip(lower=lower, upper=upper)

In [12]:
# Keep only the rows whose diagnosis label is 0 or 1
has_valid_label = hdf['Heart Disease'].isin([0, 1])
hdf = hdf[has_valid_label]

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
summary_statistics = hdf.describe()
print(summary_statistics)

               Age       Gender  Cholesterol  Blood Pressure   Heart Rate  \
count  1000.000000  1000.000000  1000.000000       1000.0000  1000.000000   
mean     52.293000     0.497000   249.939000        135.2810    79.204000   
std      15.727126     0.500241    57.914673         26.3883    11.486092   
min      25.000000     0.000000   150.000000         90.0000    60.000000   
25%      39.000000     0.000000   200.000000        112.7500    70.000000   
50%      52.000000     0.000000   248.000000        136.0000    79.000000   
75%      66.000000     1.000000   299.000000        159.0000    89.000000   
max      79.000000     1.000000   349.000000        179.0000    99.000000   

           Smoking  Alcohol Intake  Exercise Hours  Family History  \
count  1000.000000     1000.000000     1000.000000     1000.000000   
mean      0.988000        1.006000        4.529000        0.499000   
std       0.815181        0.828644        2.934241        0.500249   
min       0.000000        

In [14]:
# Class balance of the target column
label_counts = hdf['Heart Disease'].value_counts()
print(label_counts)

Heart Disease
0    608
1    392
Name: count, dtype: int64


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report

In [16]:
# Separate the target column (y) from the feature columns (X)
y = hdf['Heart Disease']
X = hdf.drop(columns=['Heart Disease'])

In [17]:
# Verify that every feature column has a numeric dtype after encoding
feature_dtypes = X.dtypes
print("Data types of features:\n", feature_dtypes)

Data types of features:
 Age                        int64
Gender                     int64
Cholesterol                int64
Blood Pressure             int64
Heart Rate                 int64
Smoking                    int64
Alcohol Intake             int64
Exercise Hours             int64
Family History             int64
Diabetes                   int64
Obesity                    int64
Stress Level               int64
Blood Sugar                int64
Exercise Induced Angina    int64
Chest Pain Type            int64
dtype: object


In [18]:
# Standardize the feature columns with a StandardScaler.
# NOTE(review): the scaler is fitted on the full feature matrix, so its statistics
# include every row, also those that a later train/test split of X_scaled puts in
# the test set.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [19]:
# Splitting data into train and test sets.
# The raw (unscaled) features are split first, stratified on the label so both
# splits keep the same class proportions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2
)

# Fit the scaler on the training rows only, then apply it to both splits.
# Fitting it on the whole dataset would let test-set means and variances leak
# into the features the model is trained on.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [20]:
# Train a support-vector classifier with a linear kernel on the training split
model = svm.SVC(kernel="linear")
model.fit(X=X_train, y=y_train)

In [21]:
# Model evaluation: accuracy on the training split and on the held-out test split
train_score, test_score = (
    model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
for caption, score in (("Training Accuracy:", train_score), ("Test Accuracy:", test_score)):
    print(caption, score)

Training Accuracy: 0.86
Test Accuracy: 0.89


In [22]:
# Per-class precision / recall / F1 on the test split
y_pred = model.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       122
           1       0.89      0.82      0.85        78

    accuracy                           0.89       200
   macro avg       0.89      0.88      0.88       200
weighted avg       0.89      0.89      0.89       200



In [23]:
# Prediction on new data
# One raw record, in the same column order as the training features:
# Age, Gender, Cholesterol, Blood Pressure, Heart Rate, Smoking, Alcohol Intake,
# Exercise Hours, Family History, Diabetes, Obesity, Stress Level, Blood Sugar,
# Exercise Induced Angina, Chest Pain Type
input_data = (75, 'Female', 228, 119, 66, 'Current', 'Heavy', 1, 'No', 'No', 'Yes', 8, 119, 'Yes', 'Atypical Angina')

# Category -> integer code tables. They mirror the encodings applied to the
# training frame; a mismatch here would feed the model a different category
# than the one the record actually has.
gender_codes = {'Male': 1, 'Female': 0}
smoking_codes = {'Current': 1, 'Never': 0, 'Former': 2}
alcohol_codes = {'Heavy': 2, 'Moderate': 1, 'Unknown': 0}
yes_no_codes = {'Yes': 1, 'No': 0}
chest_pain_codes = {
    'Typical Angina': 0,
    'Atypical Angina': 1,
    'Non-anginal Pain': 2,
    'Asymptomatic': 3,
}


def encode_category(value, codes, field):
    """Return the integer code of ``value`` according to ``codes``.

    Parameters
    ----------
    value : the raw category label taken from the input record.
    codes : dict mapping every accepted label to its integer code.
    field : column name, used only in the error message.

    Raises
    ------
    ValueError
        If ``value`` is not a key of ``codes``. Failing loudly avoids silently
        assigning an unknown label to an arbitrary class.
    """
    if value not in codes:
        raise ValueError(
            f"Unknown {field} value {value!r}; expected one of {list(codes)}"
        )
    return codes[value]


# A missing alcohol value is treated as the 'Unknown' category (code 0).
alcohol_raw = input_data[6]
if pd.isna(alcohol_raw):
    alcohol_raw = 'Unknown'

# Convert categorical fields to the same numerical encoding as the training data
input_data_processed = [
    input_data[0],                                                           # Age
    encode_category(input_data[1], gender_codes, 'Gender'),                  # Gender
    input_data[2],                                                           # Cholesterol
    input_data[3],                                                           # Blood Pressure
    input_data[4],                                                           # Heart Rate
    encode_category(input_data[5], smoking_codes, 'Smoking'),                # Smoking
    encode_category(alcohol_raw, alcohol_codes, 'Alcohol Intake'),           # Alcohol Intake
    input_data[7],                                                           # Exercise Hours
    encode_category(input_data[8], yes_no_codes, 'Family History'),          # Family History
    encode_category(input_data[9], yes_no_codes, 'Diabetes'),                # Diabetes
    encode_category(input_data[10], yes_no_codes, 'Obesity'),                # Obesity
    input_data[11],                                                          # Stress Level
    input_data[12],                                                          # Blood Sugar
    encode_category(input_data[13], yes_no_codes, 'Exercise Induced Angina'),  # Exercise Induced Angina
    encode_category(input_data[14], chest_pain_codes, 'Chest Pain Type'),    # Chest Pain Type
]

# Convert to a (1, n_features) float array; any NaN left in a numeric field is
# replaced by 0, as in the original cell.
input_data_np = np.asarray(input_data_processed, dtype=float).reshape(1, -1)
input_data_np = np.nan_to_num(input_data_np)

# Standardize the input with the already-fitted scaler
input_data_scaled = scaler.transform(input_data_np)

prediction = model.predict(input_data_scaled)
if prediction[0] == 0:
    print('The person does not have heart disease')
else:
    print('The person has heart disease')


The person has heart disease


