In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Location of the raw data file and the column names to apply to it
# (the file is read with header=None, so names must be supplied explicitly).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal',
    'target',
]

In [3]:
# Read the remote file into a DataFrame. The file has no header row, so the
# names from `columns` are applied, and '?' entries are parsed as NaN.
data = pd.read_csv(
    url,
    names=columns,
    header=None,
    na_values='?',
)

In [10]:
# Display the loaded frame (a bare last expression is rendered by the notebook).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the stored output may come from a different kernel state —
# re-run with Restart & Run All to confirm.
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [4]:
# Discard every row that contains at least one NaN; `data` itself is left untouched.
data_cleaned = data.dropna(axis=0, how='any')

In [5]:
# Standardize the feature columns (everything except 'target') with StandardScaler.
scaler = StandardScaler()

# Select the features by name rather than by position, so the result does not
# depend on 'target' being the last column.
feature_frame = data_cleaned.drop('target', axis=1)

# fit_transform returns a bare array, so rebuild the DataFrame with the
# original column names AND the original row labels. data_cleaned keeps its
# gapped index after dropna(); without index=..., data_scaled would get a fresh
# 0..n-1 index and no longer line up by label with data_cleaned['target'].
data_scaled = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,
)

In [6]:
# Prepare the inputs for SMOTE: the label column and the scaled feature matrix.
y = data_cleaned['target']
X = data_scaled

In [7]:
# Fit SMOTE on the features/labels and produce the resampled pair.
# The fixed random_state makes the resampling repeatable across runs.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X=X, y=y)

In [8]:
# Put the resampled features and labels back together side by side in one frame.
data_balanced = pd.concat(
    [X_res, y_res],
    axis='columns',
)

In [9]:
# Write the balanced frame to a CSV in the working directory, omitting the row index.
data_balanced.to_csv(
    'balanced_heart_disease_data.csv',
    index=False,
)