In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the heart-failure clinical records; the relative path means the CSV
# must sit in the notebook's working directory.
heart_data = pd.read_csv('heart_failure_clinical_records_dataset.csv')

In [5]:
# Preview the first rows to sanity-check the columns and value ranges.
heart_data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [7]:
# Check dtypes and non-null counts for every column.
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [9]:
# Separate the label column (DEATH_EVENT) from the feature columns.
y = heart_data['DEATH_EVENT']
X = heart_data.drop(columns=['DEATH_EVENT'])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every metric below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 13)

In [13]:
# Confirm the split sizes: features and labels must have matching row counts.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((239, 12), (239,), (60, 12), (60,))

In [15]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted directly on the DEATH_EVENT label,
# so its predictions are continuous values rather than class labels.
base_model = LinearRegression()
base_model.fit(X_train, y_train)

In [17]:
# Predictions on both splits; the training predictions feed the metric
# cells below.
pred_y_train = base_model.predict(X_train)
pred_y_test = base_model.predict(X_test)

### Mean Absolute Error

In [19]:
def mae_cal(actuals, predictions):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    actuals : array-like supporting elementwise subtraction (e.g. ndarray)
        Observed target values.
    predictions : array-like of the same length
        Model predictions, paired positionally with ``actuals``.

    Returns
    -------
    The mean of ``|actuals - predictions|``.
    """
    return np.abs(actuals - predictions).mean()

In [21]:
# np.asarray is idempotent, so this cell can be re-run safely. Calling
# Series.to_numpy() here fails on a second run because y_train is already
# an ndarray by then.
y_train = np.asarray(y_train)
mae = mae_cal(y_train, pred_y_train)
print(f'Mean Absolute Error (Training): {mae}')

Mean Absolute Error (Training): 0.2974461866298427


In [23]:
from sklearn.metrics import mean_absolute_error

# Cross-check the hand-written MAE against scikit-learn's implementation.
print(mean_absolute_error(y_train, pred_y_train))

0.2974461866298427


### Mean Squared Error

In [25]:
def mse_cal(actuals, predictions):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    actuals : array-like supporting elementwise subtraction (e.g. ndarray)
        Observed target values.
    predictions : array-like of the same length
        Model predictions, paired positionally with ``actuals``.

    Returns
    -------
    The mean of ``(actuals - predictions) ** 2``.
    """
    # The parentheses are essential: ``**`` binds tighter than ``-``, so
    # ``actuals - predictions ** 2`` would square only the predictions.
    squared_errors = (actuals - predictions) ** 2
    return squared_errors.mean()

In [27]:
# y_train was already converted to a NumPy array in the MAE cell, so no
# conversion is needed here.
mse = mse_cal(y_train, pred_y_train)
print(f'Mean Squared Error (Training): {mse}')

Mean Squared Error (Training): 0.13150457697600088


In [29]:
from sklearn.metrics import mean_squared_error

# Cross-check the hand-written MSE against scikit-learn's implementation.
print(mean_squared_error(y_train, pred_y_train))

0.13150457697603035


### Root Mean Squared Error

In [31]:
def rmse_cal(actuals, predictions):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    actuals : array-like supporting elementwise subtraction (e.g. ndarray)
        Observed target values.
    predictions : array-like of the same length
        Model predictions, paired positionally with ``actuals``.

    Returns
    -------
    The square root of the mean of ``(actuals - predictions) ** 2``.
    """
    # Square the residuals, not just the predictions: ``**`` binds tighter
    # than ``-``, and the unparenthesized form can yield a negative mean
    # (and therefore NaN from the square root).
    squared_errors = (actuals - predictions) ** 2
    return np.sqrt(squared_errors.mean())

In [33]:
# y_train was already converted to a NumPy array in the MAE cell, so no
# conversion is needed here.
rmse = rmse_cal(y_train, pred_y_train)
print(f'Root Mean Squared Error (Training): {rmse}')

Root Mean Squared Error (Training): 0.3626355980540257


In [35]:
from sklearn.metrics import root_mean_squared_error

# Cross-check the hand-written RMSE against scikit-learn's implementation.
print(root_mean_squared_error(y_train, pred_y_train))

0.3626355980540663


### Standard Scaling

In [115]:
def cal_sigma(data):
    """Return the population standard deviation of ``data``.

    ``data`` must provide ``to_numpy()`` (for example a pandas Series).
    The squared deviations from the mean are averaged over all N values,
    i.e. no N-1 (sample) correction is applied.
    """
    values = data.to_numpy()
    deviations = values - values.mean()
    return np.sqrt(np.mean(deviations ** 2))

In [117]:
# Seed the dictionary with one key per column. The positional index stored
# here is only a placeholder; a later cell overwrites every value with the
# column's sigma.
sigma_of_columns = dict(zip(heart_data.columns, range(len(heart_data.columns))))
sigma_of_columns

{'age': 0,
 'anaemia': 1,
 'creatinine_phosphokinase': 2,
 'diabetes': 3,
 'ejection_fraction': 4,
 'high_blood_pressure': 5,
 'platelets': 6,
 'serum_creatinine': 7,
 'serum_sodium': 8,
 'sex': 9,
 'smoking': 10,
 'time': 11,
 'DEATH_EVENT': 12}

In [119]:
# Store each column's population standard deviation under its column name.
for col in heart_data.columns:
    sigma_of_columns[col] = cal_sigma(heart_data[col])

In [131]:
# Inspect the computed per-column standard deviations.
sigma_of_columns

{'age': 11.874901429842655,
 'anaemia': 0.49527696249988684,
 'creatinine_phosphokinase': 968.6639668032415,
 'diabetes': 0.49324017403854936,
 'ejection_fraction': 11.815033462318585,
 'high_blood_pressure': 0.4773361502524231,
 'platelets': 97640.54765451424,
 'serum_creatinine': 1.0327786652795918,
 'serum_sodium': 4.405092379513557,
 'sex': 0.4773361502524231,
 'smoking': 0.46688771549471964,
 'time': 77.48430960326975,
 'DEATH_EVENT': 0.46688771549471964}

In [147]:
# DataFrame.std() defaults to the sample standard deviation (ddof=1), so
# these values differ slightly from cal_sigma, which divides by N.
std = heart_data.std()
type(std)

pandas.core.series.Series

In [133]:
# Scale a copy so the original heart_data frame stays unmodified.
std_scaled_data = heart_data.copy()

In [141]:
# Standardize every column to z-scores: (value - column mean) / column sigma.
# The result must be assigned back to the DataFrame column; rebinding a
# per-element loop variable would leave std_scaled_data unchanged.
# NOTE(review): this also rescales the DEATH_EVENT label column, as the
# original loop intended -- confirm that is desired.
for col in list(std_scaled_data):
    mean_of_col = std_scaled_data[col].mean()
    sigma_of_col = sigma_of_columns[col]
    std_scaled_data[col] = (std_scaled_data[col] - mean_of_col) / sigma_of_col