In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [18]:
# Read the processed Cleveland train and test CSVs (relative paths), in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/cleveland_train.csv',
        '../../data/processed/cleveland_test.csv',
    )
)

In [19]:
# Show (rows, columns) for the train split first, then the test split.
for loaded_split in (train_data, test_data):
    print(loaded_split.shape)

(237, 14)
(60, 14)


In [23]:
# Columns that get one-hot encoded (passed to encode_ohe).
ohe_columns = [
    'cp',
    'restecg',
    'slope',
    'thal',
]

# Columns that get standardised (passed to scale_numeric).
numeric_columns = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

def encode_ohe(data: pd.DataFrame, columns: list, fit_data: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Replace `columns` in `data` with their one-hot encoded representation.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to transform. It is not modified; a new frame is returned.
    columns : list
        Names of the columns to one-hot encode.
    fit_data : pd.DataFrame, optional
        Frame the encoder is fitted on. Defaults to `data` itself (the original
        behaviour). Pass the training frame here when transforming a held-out
        split so both splits share the same categories and output columns.

    Returns
    -------
    pd.DataFrame
        `data` without `columns`, followed by the encoded columns named by
        `OneHotEncoder.get_feature_names_out(columns)`. The index of `data`
        is preserved.
    """
    reference = data if fit_data is None else fit_data

    # handle_unknown='ignore': categories not seen during fit become all-zero rows
    # instead of raising at transform time.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    ohe.fit(reference[columns])

    # Carry over data.index: pd.concat(axis=1) aligns on the index, so a fresh
    # RangeIndex here would misalign rows (and introduce NaNs) whenever `data`
    # does not have a default 0..n-1 index.
    encoded_df = pd.DataFrame(
        ohe.transform(data[columns]),
        columns=ohe.get_feature_names_out(columns),
        index=data.index,
    )

    return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)

def scale_numeric(data: pd.DataFrame, columns: list, fit_data: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Replace `columns` in `data` with their standardised values.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to transform. It is not modified; a new frame is returned.
    columns : list
        Names of the numeric columns to scale with `StandardScaler`.
    fit_data : pd.DataFrame, optional
        Frame the scaler is fitted on. Defaults to `data` itself (the original
        behaviour). Pass the training frame here when transforming a held-out
        split so it is scaled with the training statistics rather than its own.

    Returns
    -------
    pd.DataFrame
        `data` without `columns`, followed by the scaled columns under their
        original names. The index of `data` is preserved.
    """
    reference = data if fit_data is None else fit_data

    scaler = StandardScaler()
    scaler.fit(reference[columns])

    # Carry over data.index: pd.concat(axis=1) aligns on the index, so a fresh
    # RangeIndex here would misalign rows (and introduce NaNs) whenever `data`
    # does not have a default 0..n-1 index.
    scaled_df = pd.DataFrame(
        scaler.transform(data[columns]),
        columns=columns,
        index=data.index,
    )

    return pd.concat([data.drop(columns, axis=1), scaled_df], axis=1)

In [25]:
# Fit the encoder and the scaler on the training split only and reuse the fitted
# transformers for the test split. Re-fitting on the test split would standardise
# it with its own mean/std and could produce a different one-hot column set
# whenever a category is missing from one of the splits.
fitted_ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(train_data[ohe_columns])
fitted_scaler = StandardScaler().fit(train_data[numeric_columns])
ohe_feature_names = fitted_ohe.get_feature_names_out(ohe_columns)


def replace_columns(data: pd.DataFrame, columns: list, values, new_names) -> pd.DataFrame:
    """Return `data` with `columns` dropped and `values` appended under `new_names`."""
    # Reuse the input index so the column-wise concat lines up row for row.
    replacement = pd.DataFrame(values, columns=new_names, index=data.index)
    return pd.concat([data.drop(columns, axis=1), replacement], axis=1)


train_data_encoded = replace_columns(
    train_data, ohe_columns, fitted_ohe.transform(train_data[ohe_columns]), ohe_feature_names
)
train_data_preprocessed = replace_columns(
    train_data_encoded,
    numeric_columns,
    fitted_scaler.transform(train_data_encoded[numeric_columns]),
    numeric_columns,
)

test_data_encoded = replace_columns(
    test_data, ohe_columns, fitted_ohe.transform(test_data[ohe_columns]), ohe_feature_names
)
test_data_preprocessed = replace_columns(
    test_data_encoded,
    numeric_columns,
    fitted_scaler.transform(test_data_encoded[numeric_columns]),
    numeric_columns,
)

# Both splits must expose the same features in the same order.
assert list(train_data_preprocessed.columns) == list(test_data_preprocessed.columns)

print(test_data_preprocessed.columns)
print(train_data_preprocessed.columns)

Index(['sex', 'fbs', 'exang', 'ca', 'target', 'cp_1.0', 'cp_2.0', 'cp_3.0',
       'cp_4.0', 'restecg_0.0', 'restecg_1.0', 'restecg_2.0', 'slope_1.0',
       'slope_2.0', 'slope_3.0', 'thal_3.0', 'thal_6.0', 'thal_7.0', 'age',
       'trestbps', 'chol', 'thalach', 'oldpeak'],
      dtype='object')
Index(['sex', 'fbs', 'exang', 'ca', 'target', 'cp_1.0', 'cp_2.0', 'cp_3.0',
       'cp_4.0', 'restecg_0.0', 'restecg_1.0', 'restecg_2.0', 'slope_1.0',
       'slope_2.0', 'slope_3.0', 'thal_3.0', 'thal_6.0', 'thal_7.0', 'age',
       'trestbps', 'chol', 'thalach', 'oldpeak'],
      dtype='object')


In [31]:
# Separate the 'target' label column from the feature columns for each split.
y_train = train_data_preprocessed['target']
x_train = train_data_preprocessed.drop(columns='target')

y_test = test_data_preprocessed['target']
x_test = test_data_preprocessed.drop(columns='target')

# Preview the full training frame, then its features, then its labels.
for preview in (train_data_preprocessed.head(), x_train.head(), y_train.head()):
    print(preview)

   sex  fbs  exang   ca  target  cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  \
0  0.0  0.0    0.0  0.0       0     0.0     0.0     1.0     0.0          1.0   
1  0.0  0.0    0.0  0.0       0     1.0     0.0     0.0     0.0          1.0   
2  0.0  0.0    0.0  2.0       0     1.0     0.0     0.0     0.0          1.0   
3  1.0  0.0    0.0  0.0       1     0.0     1.0     0.0     0.0          0.0   
4  1.0  0.0    0.0  0.0       0     0.0     0.0     1.0     0.0          1.0   

   ...  slope_2.0  slope_3.0  thal_3.0  thal_6.0  thal_7.0       age  \
0  ...        1.0        0.0       1.0       0.0       0.0 -1.741679   
1  ...        0.0        0.0       1.0       0.0       0.0  0.601114   
2  ...        0.0        0.0       1.0       0.0       0.0  1.605169   
3  ...        1.0        0.0       1.0       0.0       0.0  0.377991   
4  ...        0.0        0.0       1.0       0.0       0.0 -0.849186   

   trestbps      chol   thalach   oldpeak  
0  0.319284 -0.544213  0.151798 -0.915041 