In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Download the dataset registered under id 45 in the UCI ML repository.
heart_disease = fetch_ucirepo(id=45)

# Split the payload into the feature frame and the target frame
# (both are pandas DataFrames).
X, y = heart_disease.data.features, heart_disease.data.targets

# Dataset-level metadata.
print(heart_disease.metadata)

# Per-variable information.
print(heart_disease.variables)


{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M

In [2]:
# Alias the dataset's data container; the next cell indexes it by
# 'features', 'targets', and 'headers'.
data = heart_disease.data

In [3]:
import pandas as pd

# Pull the feature columns, the target column, and the header labels
# out of the data container.
features, targets, headers = data['features'], data['targets'], data['headers']

# Place features and targets side by side in one DataFrame, then label
# the resulting columns with the names supplied in `headers`.
heart_disease_df = pd.concat([features, targets], axis="columns").set_axis(
    headers, axis="columns"
)

# Leave the combined frame as the cell's displayed output.
heart_disease_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0,1
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0,2
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1


In [4]:
# Human-readable labels for the numerically coded categorical columns.
# Outer key: column name; inner dict: numeric code -> label.
# Values with no entry in a column's mapping (including missing values)
# are left unchanged by Series.replace.
CATEGORY_LABELS = {
    'sex': {1: 'male', 0: 'female'},
    'cp': {1: 'typical angina', 2: 'atypical angina', 3: 'non-anginal pain', 4: 'asymptomatic'},
    'restecg': {
        0: 'normal',
        1: 'having ST-T wave abnormality',
        2: "showing probable or definite left ventricular hypertrophy by Estes' criteria",
    },
    # Note: these are the strings 'False'/'True', not Python booleans.
    'fbs': {0: 'False', 1: 'True'},
    'exang': {0: 'no', 1: 'yes'},
    'slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'thal': {3: 'normal', 6: 'fixed defect', 7: 'reversable defect'},
}

# Apply each column's mapping exactly once (the original cell repeated the
# 'exang' replacement twice; the second pass was a no-op).
for column, labels in CATEGORY_LABELS.items():
    heart_disease_df[column] = heart_disease_df[column].replace(labels)

# 'num' is left as numeric codes. The mapping below is disabled; it only
# covers codes 0 and 1, while the column also contains 2, 3 and 4.
#heart_disease_df['num'] = heart_disease_df['num'].replace({0: '< 50% diameter narrowing', 1: '> 50% diameter narrowing'})

In [5]:
# Display the frame again to check that the categorical codes were replaced with labels.
heart_disease_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,male,typical angina,145,233,True,showing probable or definite left ventricular ...,150,no,2.3,downsloping,0.0,fixed defect,0
1,67,male,asymptomatic,160,286,False,showing probable or definite left ventricular ...,108,yes,1.5,flat,3.0,normal,2
2,67,male,asymptomatic,120,229,False,showing probable or definite left ventricular ...,129,yes,2.6,flat,2.0,reversable defect,1
3,37,male,non-anginal pain,130,250,False,normal,187,no,3.5,downsloping,0.0,normal,0
4,41,female,atypical angina,130,204,False,showing probable or definite left ventricular ...,172,no,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,male,typical angina,110,264,False,normal,132,no,1.2,flat,0.0,reversable defect,1
299,68,male,asymptomatic,144,193,True,normal,141,no,3.4,flat,2.0,reversable defect,2
300,57,male,asymptomatic,130,131,False,normal,115,yes,1.2,flat,1.0,reversable defect,3
301,57,female,atypical angina,130,236,False,showing probable or definite left ventricular ...,174,no,0.0,flat,1.0,normal,1


In [6]:
# Descriptive label for each column, keyed by the short name that column is
# expected to carry at this point. The keys are documentation only: the
# labels are applied by position below, so the order here must match the
# DataFrame's current column order.
column_labels = {
    "age": "Age (in years)",
    "sex": "Sex",
    "cp": "Chest pain type",
    "trestbps": "Resting blood pressure (in mm Hg on admission to the hospital)",
    "chol": "Serum cholesterol (in mg/dl)",
    "fbs": "Fasting blood sugar > 120 mg/dl",
    "restecg": "Resting electrocardiographic results",
    "thalach": "Maximum heart rate achieved",
    "exang": "Exercise-induced angina",
    "oldpeak": "ST depression induced by exercise relative to rest",
    "slope": "Slope of the peak exercise ST segment",
    "ca": "Number of major vessels (0–3) colored by fluoroscopy",
    "thal": "Thalassemia",
    "num": "Diagnosis of heart disease",
}

# Ordered list of the descriptive names (dicts preserve insertion order).
feature_names = list(column_labels.values())

# Relabel positionally: the i-th column receives the i-th descriptive name.
heart_disease_df.columns = feature_names

In [7]:
# (Disabled) Remove rows whose 'Diagnosis of heart disease' value is 2, 3, or 4, since they are not mentioned
# filtered_df = heart_disease_df[
#     (heart_disease_df['Diagnosis of heart disease'] != 2) &
#     (heart_disease_df['Diagnosis of heart disease'] != 3) &
#     (heart_disease_df['Diagnosis of heart disease'] != 4)
# ]
# filtered_df

In [8]:
# List the distinct values present in the diagnosis (target) column.
heart_disease_df['Diagnosis of heart disease'].unique()

array([0, 2, 1, 3, 4])

In [9]:
# Save the relabelled DataFrame to a CSV file.
# NOTE: no rows are filtered out here — the filtering cell above is entirely
# commented out, so every diagnosis value is written.
# index=False omits the row index from the file.
file_path = "pretransformed_heart_disease.csv"
heart_disease_df.to_csv(file_path, index=False)