In [1]:
import numpy as np
import pandas as pd
import csv
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination 

In [3]:
# Read the heart-disease CSV and turn every '?' cell into NaN so that the
# pandas missing-value helpers used further down can detect those cells.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable, relative data directory so the notebook runs elsewhere.
heartDisease = pd.read_csv(
    r'D:\OneDrive\Desktop\Practicals\Practicals\IR\avishkar\heart.csv'
).replace(to_replace='?', value=np.nan)


In [4]:
# Preview the first five rows of the loaded frame.
print('Sample instances from the dataset are given below')
heartDisease.head(n=5)

Sample instances from the dataset are given below


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heartdisease
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [5]:
# Show the dtype pandas inferred for each column; columns reported as
# `object` were not parsed as numbers.
print('\n Attributes and datatypes')
heartDisease.dtypes 


 Attributes and datatypes


age               int64
sex               int64
cp                int64
trestbps          int64
chol              int64
fbs               int64
restecg           int64
thalach           int64
exang             int64
oldpeak         float64
slope             int64
ca               object
thal             object
heartdisease      int64
dtype: object

In [6]:
# Summary statistics. describe() only covers numeric columns by default, so
# columns that currently have dtype `object` do not appear in the result.
heartDisease.describe() 

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,heartdisease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


In [7]:
# Count the missing (NaN) entries in each column before cleaning.
heartDisease.isna().sum()

age             0
sex             0
cp              0
trestbps        0
chol            0
fbs             0
restecg         0
thalach         0
exang           0
oldpeak         0
slope           0
ca              4
thal            2
heartdisease    0
dtype: int64

In [8]:
# Drop the rows in which 'ca' or 'thal' is missing, then convert both columns
# to a numeric dtype. They were loaded with dtype `object` (text) rather than
# as numbers, which is why describe() leaves them out; once the NaN rows are
# removed the remaining values can be parsed with pd.to_numeric.
# pd.to_numeric raises on any value it cannot parse, so unexpected
# non-numeric entries fail loudly here instead of lingering as strings.
heartDisease = (
    heartDisease
    .dropna(subset=['ca', 'thal'])
    .assign(
        ca=lambda frame: pd.to_numeric(frame['ca']),
        thal=lambda frame: pd.to_numeric(frame['thal']),
    )
)


In [9]:
# Re-count missing entries per column to confirm the cleaning step worked.
heartDisease.isna().sum()

age             0
sex             0
cp              0
trestbps        0
chol            0
fbs             0
restecg         0
thalach         0
exang           0
oldpeak         0
slope           0
ca              0
thal            0
heartdisease    0
dtype: int64

In [10]:
# Display the cleaned frame (the notebook renders a truncated view of it).
heartDisease 

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heartdisease
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,4,140,241,0,0,123,1,0.2,2,0,7,1
298,45,1,1,110,264,0,0,132,0,1.2,2,0,7,1
299,68,1,4,144,193,1,0,141,0,3.4,2,2,7,2
300,57,1,4,130,131,0,0,115,1,1.2,2,1,7,3


In [11]:
# Normalise the label column name: some copies of this dataset call it
# 'target'. When the column is already named 'heartdisease' this rename
# changes nothing.
heartDisease = heartDisease.rename({'target': 'heartdisease'}, axis='columns')

# Bayesian network structure: every listed attribute is a direct parent of
# the 'heartdisease' node.
# NOTE(review): newer pgmpy releases appear to rename BayesianModel
# (to BayesianNetwork) - confirm against the installed pgmpy version.
model = BayesianModel(
    [(parent, 'heartdisease') for parent in ('age', 'sex', 'exang', 'cp', 'restecg')]
)

# Estimate the conditional probability tables from the data by maximum
# likelihood.
# NOTE(review): the recorded answer to query 1 is close to uniform over the
# five classes. Presumably this is because raw 'age' is a many-valued parent,
# so most parent combinations never occur in the data - consider binning
# 'age' before fitting; TODO confirm.
print('\nLearning CPD using Maximum likelihood estimators')
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)

# Exact inference engine (variable elimination) over the fitted network.
HeartDiseasetest_infer = VariableElimination(model)

# Query 1: distribution of 'heartdisease' when restecg is observed to be 1.
print('\n1. Probability of HeartDisease given evidence= restecg')
ql = HeartDiseasetest_infer.query(
    evidence={'restecg': 1},
    variables=['heartdisease'],
)
print(ql)

# Query 2: distribution of 'heartdisease' when cp is observed to be 2.
print('\n2. Probability of HeartDisease given evidence= cp')
q2 = HeartDiseasetest_infer.query(
    evidence={'cp': 2},
    variables=['heartdisease'],
)
print(q2)





Learning CPD using Maximum likelihood estimators

1. Probability of HeartDisease given evidence= restecg
+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.1973 |
+-----------------+---------------------+
| heartdisease(1) |              0.1970 |
+-----------------+---------------------+
| heartdisease(2) |              0.1977 |
+-----------------+---------------------+
| heartdisease(3) |              0.1977 |
+-----------------+---------------------+
| heartdisease(4) |              0.2103 |
+-----------------+---------------------+

2. Probability of HeartDisease given evidence= cp
+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.3030 |
+-----------------+---------------------+
| heartdisease(1) |              0.2181 |
+-----------------+---------------------+
| heartdisease(2) |              0.1578 |
+-----------------+---------------------+
| h