In [15]:
import pandas as pd

In [16]:
# Load the heart-failure clinical records and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column that appears to be a
# saved row index (0, 1, 2, ...) — confirm and consider reading it as the index.
csv_path = 'heart_failure_clinical_records_dataset_edited.csv'
df = pd.read_csv(csv_path)
print(df.head())

   Unnamed: 0   age  anaemia  creatinine_phosphokinase diabetes  \
0           0  75.0        0                       582      yes   
1           1  55.0        0                      7861      yes   
2           2  65.0        0                       146      yes   
3           3  50.0        1                       111      yes   
4           4  65.0        1                       160       no   

   ejection_fraction  high_blood_pressure  platelets  serum_creatinine  \
0                 20                    1  265000.00               1.9   
1                 38                    0  263358.03               1.1   
2                 20                    0  162000.00               1.3   
3                 20                    0  210000.00               1.9   
4                 20                    0  327000.00               2.7   

   serum_sodium     sex  smoking  time  DEATH_EVENT  
0           130    male        0     4            1  
1           136    male        0     6      

In [17]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
numeric_summary = df.describe()
print(numeric_summary)

       Unnamed: 0         age     anaemia  creatinine_phosphokinase  \
count  299.000000  299.000000  299.000000                299.000000   
mean   149.000000   60.833893    0.431438                581.839465   
std     86.458082   11.894809    0.496107                970.287881   
min      0.000000   40.000000    0.000000                 23.000000   
25%     74.500000   51.000000    0.000000                116.500000   
50%    149.000000   60.000000    0.000000                250.000000   
75%    223.500000   70.000000    1.000000                582.000000   
max    298.000000   95.000000    1.000000               7861.000000   

       ejection_fraction  high_blood_pressure      platelets  \
count         299.000000           299.000000     299.000000   
mean           38.083612             0.351171  263358.029264   
std            11.834841             0.478136   97804.236869   
min            14.000000             0.000000   25100.000000   
25%            30.000000             0.0

In [18]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                299 non-null    int64  
 1   age                       299 non-null    float64
 2   anaemia                   299 non-null    int64  
 3   creatinine_phosphokinase  299 non-null    int64  
 4   diabetes                  299 non-null    object 
 5   ejection_fraction         299 non-null    int64  
 6   high_blood_pressure       299 non-null    int64  
 7   platelets                 299 non-null    float64
 8   serum_creatinine          299 non-null    float64
 9   serum_sodium              299 non-null    int64  
 10  sex                       299 non-null    object 
 11  smoking                   299 non-null    int64  
 12  time                      299 non-null    int64  
 13  DEATH_EVENT               299 non-null    int64  
dtypes: float64

In [19]:
# Class balance of the target: number of rows per DEATH_EVENT value.
outcome_counts = df['DEATH_EVENT'].value_counts()
print(outcome_counts)
# The classes are somewhat imbalanced, but not severely skewed.

DEATH_EVENT
0    203
1     96
Name: count, dtype: int64


In [20]:
#Convert sex and diabetes into dummy variables (e.g., male = 1, female = 0)

In [21]:
# Encode the two string-labelled columns as 0/1 indicators:
#   sex:      'male' -> 1, anything else -> 0
#   diabetes: 'yes'  -> 1, anything else -> 0
# The numeric-dtype guard keeps this cell idempotent. Without it, re-running the
# cell would compare the already-encoded 0/1 values against the string labels
# and silently overwrite both columns with zeros.
for column, positive_label in (('sex', 'male'), ('diabetes', 'yes')):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = (df[column] == positive_label).astype('int64')

In [22]:
#Scale Numerical Variables

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
# Continuous features to standardize (zero mean, unit variance). The list is
# defined once so the scaler input and the resulting column labels cannot
# drift apart.
numeric_cols = ['age', 'creatinine_phosphokinase',
                'ejection_fraction', 'platelets',
                'serum_creatinine', 'serum_sodium', 'time']
# Binary indicators and the target are carried over unscaled.
passthrough_cols = ['anaemia', 'high_blood_pressure',
                    'diabetes', 'smoking', 'DEATH_EVENT']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numeric_cols])
# Reuse df's index so the column-wise concat below aligns row-for-row even if
# df does not have the default 0..n-1 index (otherwise rows would misalign and
# NaNs would be introduced).
df_scaled = pd.DataFrame(scaled_features, columns=numeric_cols, index=df.index)
df_scaled = pd.concat([df_scaled, df[passthrough_cols]], axis=1)
# NOTE(review): 'sex' is encoded in an earlier cell but is not included in
# either list, so it never reaches the model — confirm whether that is intended.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling; consider fitting on the
# training split only.
# Step 1: scale the numeric features. Step 2: build a new DataFrame from the
# scaled values. Step 3: add the unscaled binary columns and the target back.

In [25]:
#Split Data into Train and Test Sets

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# X holds the independent variables (every column except the target);
# y holds the dependent variable (DEATH_EVENT).
target_col = 'DEATH_EVENT'
X = df_scaled.drop(columns=target_col)
y = df_scaled[target_col]
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [28]:
#Build and Train the Logistic Regression Model

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [31]:
# Fit a logistic regression on the training split (fit() returns the fitted
# estimator), then predict the class label for every held-out row.
logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [32]:
# Rows are the actual class, columns the predicted class (0 = survived, 1 = died).
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
# Reading the matrix: TN = 33, FP = 2, FN = 11, TP = 14
# TN: 33 patients survived and the model correctly predicted survival
# FP: 2 patients survived but the model incorrectly predicted death
# FN: 11 patients died but the model incorrectly predicted survival
# TP: 14 patients died and the model correctly predicted death

Confusion Matrix:
[[33  2]
 [11 14]]


In [33]:
# Per-class precision, recall and F1 on the held-out test set.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
# Recall for class 1 is 0.56: the model identifies only 56% of the patients
# who actually died.


Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.94      0.84        35
           1       0.88      0.56      0.68        25

    accuracy                           0.78        60
   macro avg       0.81      0.75      0.76        60
weighted avg       0.80      0.78      0.77        60



In [None]:
#Interpret the Coefficients

In [34]:
import numpy as np

In [35]:
# Exponentiating a logistic-regression coefficient turns it from a log-odds
# change into an odds ratio; list the features from highest to lowest ratio.
odds_ratios = np.exp(logreg.coef_[0])
coef_df = pd.DataFrame({'Odds Ratio': odds_ratios}, index=X.columns)
coef_df = coef_df.sort_values(by='Odds Ratio', ascending=False)
print(coef_df)

                          Odds Ratio
serum_creatinine            2.128179
age                         1.830023
creatinine_phosphokinase    1.087660
anaemia                     0.943902
high_blood_pressure         0.934834
smoking                     0.889849
platelets                   0.884606
serum_sodium                0.797119
diabetes                    0.697478
ejection_fraction           0.425940
time                        0.197769


In [None]:
#Serum Creatinine and Age are the strongest predictors for increased risk.
#Ejection Fraction and Follow-Up Time have the most protective effects

In [None]:
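#Ejection fraction was standardized before fitting, so its odds ratio (0.43) applies per
#one-standard-deviation increase (about 11.8 percentage points), not per 1% increase:
#a one-standard-deviation increase in ejection fraction decreases the odds of death by about 57%.
#This strong protective effect reflects the critical role of heart function in survival.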

In [None]:
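#Follow-up time was standardized before fitting, so its odds ratio (0.20) applies per
#one-standard-deviation increase, not per additional day:
#a one-standard-deviation increase in follow-up time decreases the odds of death by about 80%.
#Longer monitoring likely indicates stable patients who survived longer; note that follow-up
#time is itself partly determined by the outcome, so this effect should be interpreted with caution.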