In [4]:
import numpy as numpy
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [7]:
# UCI "processed Cleveland" heart-disease data. Column names are supplied via
# `names=`, so read_csv treats every line of the file as a data row.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]
# "?" entries are parsed as NaN.
data = pd.read_csv(
    url,
    names=columns,
    na_values="?",
)

# Quick sanity check: (rows, columns) = total cell count, plus the first record.
print(f"{data.shape} = {data.size}")
print(data.head(1))

(303, 14) = 4242
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   

   slope   ca  thal  target  
0    3.0  0.0   6.0       0  


In [8]:
def save_data(data:pd.DataFrame, filename:str="cleveland.csv", directory:str="../../data/raw/"):
    """Write `data` to `<directory>/<filename>` as CSV unless that file already exists.

    Args:
        data: Frame to persist; written without the index column.
        filename: Name of the CSV file inside `directory`.
        directory: Folder that receives the file. It is created (including
            parents) when missing, so a fresh checkout does not fail on write.

    Returns:
        None. An existing file is left untouched.
    """
    path = os.path.join(directory, filename)
    if os.path.exists(path):
        # Never overwrite a previously saved raw snapshot.
        return

    parent = os.path.dirname(path)
    if parent:
        # to_csv does not create missing folders on its own.
        os.makedirs(parent, exist_ok=True)
    data.to_csv(path, index=False)

In [None]:
# Persist the freshly loaded frame; save_data skips the write when the file already exists.
save_data(data)



In [None]:
'''
Columns:
    age - feature numeric
    sex - feature categorical (0-female, 1-male)
    cp - feature categorical (1-typical angina, 2-atypical angina, 3-non-anginal pain, 4-asymptomatic)
    trestbps - feature numeric
    chol - feature numeric
    fbs - feature categorical (1- >=120mg/dl, 0- <120mg/dl)
    restecg - feature categorical (0-normal, 1-ST-T wave abnormality, 2-left ventricular hypertrophy)
    thalach - feature numeric
    exang - feature categorical (1-yes, 0-no)
    oldpeak - feature numeric
    slope - feature categorical (1-upsloping, 2-flat, 3-downsloping)
    ca - feature numeric
    thal - feature categorical (3-normal, 6-fixed defect, 7-reversible defect)
    target - target numeric
'''

'Columns:\n    age - feature numeric\n    sex - feature categorical (0-female, 1-male)\n    cp - feature categorical (1-typical angina, 2-atypical angina, 3-nin-anginal pain, 4-asymptomatic)\n    trestbps - feature numeric\n    chol - feature numeric\n    fbs - feature categorical (1- >=120mg/dl, 0- <120mg/dl)\n    restecg - feature categorical (0-normal, 1-wave abnormability)\n    thalach - feature numeric\n    exang - feature categorical (1-yes, 0-no)\n    oldpeak - feature numeric\n    slope - feature categorical (1-upsloping, 2-flat, 3-downsloping)\n    ca - feature numeric\n    thal - feature categorical (3-normal, 6-fixed defect, 7-reversable defect)\n    num - target numeric\n'

In [None]:
# Categorical features with more than two codes; these are the columns handed to encode_ohe.
ohe_columns = ['cp', 'restecg', 'slope', 'thal']
# Numeric features. NOTE(review): not referenced in the visible cells —
# presumably the intended input for standartize; confirm.
numeric_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

def encode_ohe(data: pd.DataFrame, columns:list):
    """Return a copy of `data` with `columns` replaced by one-hot indicator columns.

    The encoder is fitted on `data[columns]` itself. Indicator columns are
    named by `get_feature_names_out` (e.g. ``cp_1.0``) and appended after the
    remaining original columns. Missing values are not dropped here: the
    encoder gives them their own indicator column (e.g. ``thal_nan``).

    Args:
        data: Input frame; it is not modified.
        columns: Names of the categorical columns to encode.

    Returns:
        A new DataFrame with the same rows and index as `data`.
    """
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_data = ohe.fit_transform(data[columns])

    # Reuse the caller's index. With a default RangeIndex here, concat(axis=1)
    # would align on labels and misplace rows (or add NaN rows) whenever
    # `data` has been filtered or otherwise carries a non-default index.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=ohe.get_feature_names_out(columns),
        index=data.index,
    )

    remaining = data.drop(columns=columns)
    return pd.concat([remaining, encoded_df], axis=1)

def standartize(data:pd.DataFrame, columns:list):
    """Return a copy of `data` with `columns` standardized by a StandardScaler.

    The scaler is fitted on `data[columns]` and applied to the same rows, so
    each listed column is centred and scaled using its own statistics.
    All other columns, the row order and the index are kept as they are.

    Args:
        data: Input frame; it is not modified.
        columns: Names of the numeric columns to rescale.

    Returns:
        A new DataFrame with the listed columns replaced by their scaled values.
    """
    scaler = StandardScaler()
    result_data = data.copy()
    # fit_transform returns a plain array in the same column order as `columns`.
    result_data[columns] = scaler.fit_transform(data[columns])
    return result_data
    

# One-hot encode the multi-level categorical columns and print the column sets before and after.
# NOTE(review): `data` still contains missing values at this point, so the encoder
# emits an extra `thal_nan` indicator column — confirm that this is intended.
encoded_data = encode_ohe(data, ohe_columns)
print(f"{data.columns}\n{encoded_data.columns}")

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')
Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak',
       'ca', 'target', 'cp_1.0', 'cp_2.0', 'cp_3.0', 'cp_4.0', 'restecg_0.0',
       'restecg_1.0', 'restecg_2.0', 'slope_1.0', 'slope_2.0', 'slope_3.0',
       'thal_3.0', 'thal_6.0', 'thal_7.0', 'thal_nan'],
      dtype='object')


In [15]:
# Collapse the target into a binary label: every value above 0 becomes 1, the rest 0.
# Vectorized comparison instead of a per-row Python lambda; re-running the cell is a no-op.
data['target'] = (data['target'] > 0).astype('int64')
print(data['target'].unique())

[0 1]


In [14]:
# Drop every row that has a missing value in any column and renumber the index from 0.
data_no_ca = data.dropna(ignore_index=True)

print(data_no_ca.shape)
# info() writes its report to stdout and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
data_no_ca.info()

(297, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    float64
 1   sex       297 non-null    float64
 2   cp        297 non-null    float64
 3   trestbps  297 non-null    float64
 4   chol      297 non-null    float64
 5   fbs       297 non-null    float64
 6   restecg   297 non-null    float64
 7   thalach   297 non-null    float64
 8   exang     297 non-null    float64
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    float64
 11  ca        297 non-null    float64
 12  thal      297 non-null    float64
 13  target    297 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 32.6 KB
None
