In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column names to assign to the data file, listed in the order they are
# applied positionally when the file is read.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

# Remote location of the processed Cleveland heart-disease data file.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'

In [3]:
# Read the remote CSV into a DataFrame. No line of the file is treated as a
# header (header=None), so the column names come from `columns`; any '?'
# entry is parsed as NaN.
data = pd.read_csv(
    url,
    names=columns,
    header=None,
    na_values='?',
)
# Preview the first five rows beneath a heading.
print("Dataset Loaded:", data.head(), sep="\n")

Dataset Loaded:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  target  
0    3.0  0.0   6.0       0  
1    2.0  3.0   3.0       2  
2    2.0  2.0   7.0       1  
3    3.0  0.0   3.0       0  
4    1.0  0.0   3.0       0  


In [4]:
# Tally the NaN entries in each column and show the counts under a heading.
print("Checking for missing values:", data.isna().sum(), sep="\n")

Checking for missing values:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64


In [5]:
# Keep only the rows that have a value in every column. The result is bound to
# a new name, so `data` still holds the unfiltered frame.
data_cleaned = data.dropna(axis=0, how='any')
# Repeat the per-column NaN tally on the filtered frame to confirm none remain.
print("Missing values after dropping rows:", data_cleaned.isna().sum(), sep="\n")

Missing values after dropping rows:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [6]:
# One-hot encode the multi-valued categorical columns: each listed column is
# replaced by one indicator column per distinct value. The source values are
# floats, so the generated column names carry a float suffix (e.g. cp_4.0).
data_encoded = pd.get_dummies(
    data=data_cleaned,
    columns=['cp', 'restecg', 'slope', 'thal'],
)
# Preview the first five rows of the widened frame beneath a heading.
print("Dataset after encoding categorical variables:", data_encoded.head(), sep="\n")

Dataset after encoding categorical variables:
    age  sex  trestbps   chol  fbs  thalach  exang  oldpeak   ca  target  ...  \
0  63.0  1.0     145.0  233.0  1.0    150.0    0.0      2.3  0.0       0  ...   
1  67.0  1.0     160.0  286.0  0.0    108.0    1.0      1.5  3.0       2  ...   
2  67.0  1.0     120.0  229.0  0.0    129.0    1.0      2.6  2.0       1  ...   
3  37.0  1.0     130.0  250.0  0.0    187.0    0.0      3.5  0.0       0  ...   
4  41.0  0.0     130.0  204.0  0.0    172.0    0.0      1.4  0.0       0  ...   

   cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  slope_1.0  slope_2.0  \
0   False        False        False         True      False      False   
1    True        False        False         True      False       True   
2    True        False        False         True      False       True   
3   False         True        False        False      False      False   
4   False        False        False         True       True      False   

   slope_3.0  thal_3.0

In [7]:
# Label-encoding setup for the binary categorical variables.
# NOTE(review): this import sits mid-notebook rather than in the import cell at
# the top; consider moving it there so all dependencies are visible up front.
from sklearn.preprocessing import LabelEncoder
# One encoder instance, bound to the notebook-level name `labelencoder`.
labelencoder = LabelEncoder()

In [8]:
# Convert the binary columns 'sex', 'fbs' and 'exang' from float to integer.
#
# An explicit integer cast is used instead of label encoding: label encoding
# renumbers by the sorted distinct values it sees, so a column that happened to
# contain only 1.0 would silently become 0, whereas a cast always keeps 0 as 0
# and 1 as 1. (scikit-learn also documents LabelEncoder as a tool for target
# labels rather than input features.)
#
# Assumes these columns hold only 0.0/1.0, as the previews show for the first
# rows -- TODO confirm on the full frame. Under that assumption the result is
# the same 0/1 integers the label encoder produced.
binary_columns = ['sex', 'fbs', 'exang']
data_encoded = data_encoded.astype({name: int for name in binary_columns})

In [9]:
# Preview the first five rows again, now that the binary columns hold integers.
print("Dataset after label encoding binary variables:", data_encoded.head(), sep="\n")

Dataset after label encoding binary variables:
    age  sex  trestbps   chol  fbs  thalach  exang  oldpeak   ca  target  ...  \
0  63.0    1     145.0  233.0    1    150.0      0      2.3  0.0       0  ...   
1  67.0    1     160.0  286.0    0    108.0      1      1.5  3.0       2  ...   
2  67.0    1     120.0  229.0    0    129.0      1      2.6  2.0       1  ...   
3  37.0    1     130.0  250.0    0    187.0      0      3.5  0.0       0  ...   
4  41.0    0     130.0  204.0    0    172.0      0      1.4  0.0       0  ...   

   cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  slope_1.0  slope_2.0  \
0   False        False        False         True      False      False   
1    True        False        False         True      False       True   
2    True        False        False         True      False       True   
3   False         True        False        False      False      False   
4   False        False        False         True       True      False   

   slope_3.0  thal_3.

In [10]:
# Write the encoded frame to a CSV file at a relative path, leaving out the
# row index, then report the file name.
data_encoded.to_csv(path_or_buf='encoded_heart_disease_data.csv', index=False)
print("Encoded dataset saved as 'encoded_heart_disease_data.csv'")

Encoded dataset saved as 'encoded_heart_disease_data.csv'
