# Importing all the necessary libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from catboost import CatBoostClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

try:
    # Removed in scikit-learn 1.2; only importable on older environments.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Reading the CSV file

In [None]:
# Load the heart-failure clinical records and preview the first rows.
csv_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

**As we can see, there are no null/missing values and no categorical columns, which is great!!**

**Just Renaming few columns**

In [None]:
# Shorten the CPK column name and lower-case the target column (in place).
column_aliases = {'creatinine_phosphokinase': 'cpk', 'DEATH_EVENT': 'death_event'}
df.rename(columns=column_aliases, inplace=True)
df.head()

Printing out the number of unique values for each column

In [None]:
# One line per column: its name followed by its number of distinct values.
for column_name, unique_count in df.nunique().items():
    print(column_name, unique_count)

# **EDA**

# **Anaemia**

In [None]:
# Tally patients per anaemia flag and draw the tallies as a bar chart.
ds = (
    df['anaemia']
    .value_counts()
    .rename_axis('anaemia')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='anaemia',
    y='count',
    orientation='v',
    title='Count of Patients with Anaemia',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each anaemia flag.
ax = pd.crosstab(df['anaemia'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event as per Anaemia', xlabel='Anaemia', ylabel='Death')
plt.show()

# Smoking

In [None]:
# Tally patients per smoking flag and draw the tallies as a bar chart.
ds = (
    df['smoking']
    .value_counts()
    .rename_axis('smoking')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='smoking',
    y='count',
    orientation='v',
    title='Count of Patients who Smoke',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each smoking flag.
ax = pd.crosstab(df['smoking'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event as per Smoking ', xlabel='Smoking', ylabel='Death')
plt.show()

In [None]:
# Mortality rate among smokers: the share of rows with smoking == 1 whose
# death_event equals 1, as a percentage. Using a boolean mean instead of
# value_counts()[1] avoids a KeyError when a group contains no deaths.
smoker_outcomes = df.loc[df["smoking"] == 1, "death_event"]
print("Percentage of smokers who died:", smoker_outcomes.eq(1).mean() * 100)


In [None]:
# Mortality rate among non-smokers: the share of rows with smoking == 0 whose
# death_event equals 1, as a percentage. Using a boolean mean instead of
# value_counts()[1] avoids a KeyError when a group contains no deaths.
non_smoker_outcomes = df.loc[df["smoking"] == 0, "death_event"]
print("Percentage of non-smokers who died:", non_smoker_outcomes.eq(1).mean() * 100)


**Quite surprisingly, people who smoke have a mortality rate of around 31%, while people who don't smoke have a mortality rate of around 32.5%, that means..😶**

# Blood Pressure

In [None]:
# Tally patients per high-blood-pressure flag and draw the tallies as a bar chart.
ds = (
    df['high_blood_pressure']
    .value_counts()
    .rename_axis('high_blood_pressure')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='high_blood_pressure',
    y='count',
    orientation='v',
    title='Count of Patients with high blood pressure',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each high-blood-pressure flag.
ax = pd.crosstab(df['high_blood_pressure'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event as per BLood Pressure ', xlabel='BP', ylabel='Death')
plt.show()

# Diabetes

In [None]:
# Tally patients per diabetes flag and draw the tallies as a bar chart.
ds = (
    df['diabetes']
    .value_counts()
    .rename_axis('diabetes')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='diabetes',
    y='count',
    orientation='v',
    title='Count of Patients with diabetes',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each diabetes flag.
ax = pd.crosstab(df['diabetes'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event as per diabetes ', xlabel='diabetes ', ylabel='Death')
plt.show()

# Sex

In [None]:
# Tally patients per sex code and draw the tallies as a bar chart.
ds = (
    df['sex']
    .value_counts()
    .rename_axis('sex')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='sex',
    y='count',
    orientation='v',
    title='Count of Patients according to sex',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each sex code.
ax = pd.crosstab(df['sex'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event as per Sex', xlabel='Sex', ylabel='Death')
plt.show()

In [None]:
# Percentage of death_event == 1 within each sex code (0 printed as Females, 1 as Males).
for label, sex_code in (("Females:", 0), ("Males:", 1)):
    outcome_shares = df.loc[df["sex"] == sex_code, "death_event"].value_counts(normalize=True)
    print(label, outcome_shares.loc[1] * 100)


**As we can see, there is a much higher number of entries for males, but if we calculate by percentage we can observe that females have a higher mortality rate**

# Age

**As there are too many unique values for age , I have grouped them in 4 groups - Age between 0-30, Age between 30-50, Age between 50-70 and Age greater than 70**

In [None]:
# Build one 0/1 indicator list per age band:
#   g_30      -> age <= 30
#   g_50      -> 30 < age <= 50
#   g_70      -> 50 < age <= 70
#   greater70 -> age > 70
# Vectorized comparisons replace the row-by-row loop. Every list has exactly
# len(df) entries: a missing age yields 0 in all four bands instead of being
# skipped, so the lists can always be assigned back as columns.
ages = df['age']
g_30 = (ages <= 30).astype(int).tolist()
g_50 = ((ages > 30) & (ages <= 50)).astype(int).tolist()
g_70 = ((ages > 50) & (ages <= 70)).astype(int).tolist()
greater70 = (ages > 70).astype(int).tolist()

In [None]:
# Attach the four age-band indicator lists to the frame as new columns.
age_band_columns = {
    'age_till_30': g_30,
    'age_bet_30_50': g_50,
    'age_bet_50_70': g_70,
    'age_gret_70': greater70,
}
for column_name, flags in age_band_columns.items():
    df[column_name] = flags


# Age below 30

**In the graphs below, look only at value = 1 on the x-axis: it marks the patients whose age falls in that group (value = 0 is everyone else).**

In [None]:
# How many patients fall inside (1) / outside (0) the "age <= 30" band.
df['age_till_30'].value_counts()

**As we can see there are NO entries for ages between 0-30**

In [None]:
# Tally patients per "age <= 30" flag and draw the tallies as a bar chart.
ds = (
    df['age_till_30']
    .value_counts()
    .rename_axis('age_till_30')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='age_till_30',
    y='count',
    orientation='v',
    title='Count of Patients with age till 30',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each "age <= 30" flag.
ax = pd.crosstab(df['age_till_30'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event for people with Age till 30', xlabel='Age', ylabel='Death')
plt.show()

# Age between 30 and 50

In [None]:
# How many patients fall inside (1) / outside (0) the "30 < age <= 50" band.
df['age_bet_30_50'].value_counts()

In [None]:
# Tally patients per "30 < age <= 50" flag and draw the tallies as a bar chart.
ds = (
    df['age_bet_30_50']
    .value_counts()
    .rename_axis('age_bet_30_50')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='age_bet_30_50',
    y='count',
    orientation='v',
    title='Count of Patients with age between 30 and 50',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each "30 < age <= 50" flag.
ax = pd.crosstab(df['age_bet_30_50'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event for people with Age between 30 and 50', xlabel='Age', ylabel='Death')
plt.show()

In [None]:
# Mortality rate for the "30 < age <= 50" band: the share of rows in the band
# whose death_event equals 1, as a percentage. The label is spelled
# "Mortality Rate:" to match the other age-band cells, and the boolean mean
# avoids a KeyError when a band contains no deaths.
band_outcomes = df.loc[df["age_bet_30_50"] == 1, "death_event"]
print("Mortality Rate:", band_outcomes.eq(1).mean() * 100)

**As we can see, there is a decent number of entries for this age group as well, with a mortality rate of 25%**

# Age between 50 and 70

In [None]:
# How many patients fall inside (1) / outside (0) the "50 < age <= 70" band.
df['age_bet_50_70'].value_counts()

In [None]:
# Tally patients per "50 < age <= 70" flag and draw the tallies as a bar chart.
ds = (
    df['age_bet_50_70']
    .value_counts()
    .rename_axis('age_bet_50_70')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='age_bet_50_70',
    y='count',
    orientation='v',
    title='Count of Patients with age between 50 and 70',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each "50 < age <= 70" flag.
ax = pd.crosstab(df['age_bet_50_70'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event for people with Age bet 50 and 70', xlabel='Age', ylabel='Death')
plt.show()

In [None]:
# Percentage of death_event == 1 among patients in the "50 < age <= 70" band.
band_shares = df.loc[df["age_bet_50_70"] == 1, "death_event"].value_counts(normalize=True)
print("Mortality Rate:", band_shares.loc[1] * 100)

**We can clearly see that this is the age group with the most entries, with a mortality rate of around 26.5%**

# Age greater than 70

In [None]:
# How many patients fall inside (1) / outside (0) the "age > 70" band.
df['age_gret_70'].value_counts()

In [None]:
# Tally patients per "age > 70" flag and draw the tallies as a bar chart.
ds = (
    df['age_gret_70']
    .value_counts()
    .rename_axis('age_gret_70')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='age_gret_70',
    y='count',
    orientation='v',
    title='Count of Patients with age greater than 70',
    width=500,
)
fig.show()

In [None]:
# Grouped bars: death_event counts for each "age > 70" flag.
ax = pd.crosstab(df['age_gret_70'], df['death_event']).plot(kind='bar')
ax.set(title='Death Event for people with Age greater than 70', xlabel='Age', ylabel='Death')
plt.show()

In [None]:
# Percentage of death_event == 1 among patients in the "age > 70" band.
band_shares = df.loc[df["age_gret_70"] == 1, "death_event"].value_counts(normalize=True)
print("Mortality Rate:", band_shares.loc[1] * 100)


**Not so surprisingly, this is the age group with the highest mortality rate, at 59.6%**

# Long Form Graphs

In [None]:
# Bar chart of age against the diabetes flag, coloured by death_event.
fig = px.bar(
    data_frame=df,
    x="diabetes",
    y="age",
    color="death_event",
    title="Long-Form Input",
)
fig.show()

In [None]:
# Bar chart of age against the sex code, coloured by death_event.
fig = px.bar(
    data_frame=df,
    x="sex",
    y="age",
    color="death_event",
    title="Long-Form Input",
)
fig.show()

In [None]:
# Bar chart of age against the smoking flag, coloured by death_event.
fig = px.bar(
    data_frame=df,
    x="smoking",
    y="age",
    color="death_event",
    title="Long-Form Input",
)
fig.show()

# Violin Plots (with embedded box plots)

In [None]:
# Violin plot of age per sex code, split by death_event, with an embedded box
# plot and every data point drawn. plotly.express is already imported as `px`
# in the notebook's import cell, so it is not re-imported here.
fig = px.violin(
    df,
    y="age",
    x="sex",
    color="death_event",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.update_layout(title_text="Analysis of Age and Sex on Death Event")
fig.show()

> **According to this dataset median age of males who die due to heart failure is more than median age of females**

In [None]:
# Violin plot of age per smoking flag, split by death_event, with an embedded
# box plot and every data point drawn. plotly.express is already imported as
# `px` in the notebook's import cell, so it is not re-imported here.
fig = px.violin(
    df,
    y="age",
    x="smoking",
    color="death_event",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.update_layout(title_text="Analysis of Age and Smoking on Death Event")
fig.show()

# 🚬!=💘

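**People who smoke and die have a median age of 68.5, while people who don't smoke and die have a median age of 60.33, which on its face suggests smoking isn't very risky for the heart. Keep in mind this only compares ages at death within this one dataset; it is not evidence about the effect of smoking.**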

**Although that doesn't mean you should start smoking 🙅‍♂️🙅‍♂️🙅‍♂️**

# **Training**

> **I am using CatBoostClassifier for Classification**

In [None]:
# Features are every column except the target; the target is death_event.
target_column = 'death_event'
y = df[target_column]
X = df.drop(columns=[target_column])

> I am applying min-max scaling to only the platelets and cpk columns because that seemed to work out fine for me

In [None]:
# Rescale platelets and cpk to the 0-100 range, refitting the same scaler for
# each column in turn.
# NOTE(review): the scaler is fitted on the full feature frame, i.e. before
# the train/test split — confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 100))
for column_name in ('platelets', 'cpk'):
    X[column_name] = scaler.fit_transform(X[[column_name]])
X.head()

# Splitting the data in train and test set

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Fitting the model

In [None]:
# Number of rows per target class, to gauge class balance.
y.value_counts()

> **There is a high class imbalance in the dataset, so I tried giving the classes weights.**

# With Class Weights

In [None]:
# Weight class 1 twice as heavily as class 0 when fitting.
class_weight = {0: 1, 1: 2}
model = CatBoostClassifier(
    n_estimators=400,
    depth=4,
    class_weights=class_weight,
)
model.fit(X_train, y_train)

In [None]:
# Accuracy of the current model on the training and held-out sets.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print(f'Training Accuracy: {train_accuracy:.3f}')
print(f'Testing Accuracy: {test_accuracy:.3f}')

**but that didn't seem to work for me very well!**

# Without Class Weights

In [None]:
# Same configuration as the weighted run, but without class weights.
model = CatBoostClassifier(
    n_estimators=400,
    depth=4,
)
model.fit(X_train, y_train)

In [None]:
# Accuracy of the current model on the training and held-out sets.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print(f'Training Accuracy: {train_accuracy:.3f}')
print(f'Testing Accuracy: {test_accuracy:.3f}')

**As we can see, my model is overfitting very slightly and gives an accuracy of around 93%, which is great with so little data**

# Classification Report

In [None]:
# Predict on the held-out set and print per-class precision/recall/F1.
pred = model.predict(X_test)
report = classification_report(y_test, pred)
print(report)

**This is the classification report with test scores on various metrics**

# Confusion Matrix

In [None]:
# scikit-learn lays the matrix out as cm[true_label, predicted_label], so for
# binary labels 0/1 with 1 (death) as the positive class:
#   cm[0, 0] = true negatives    cm[0, 1] = false positives
#   cm[1, 0] = false negatives   cm[1, 1] = true positives
cm = confusion_matrix(y_test, pred)
print(cm)
tn, fp, fn, tp = cm.ravel()
print('True Positive' , tp)
print('False Positive' , fp)
print('True Negative' , tn)
print('False Negative' , fn)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2. Build the display
# from the test-set confusion matrix instead; passing display_labels
# explicitly keeps this working on older releases where it was required.
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, model.predict(X_test)),
    display_labels=model.classes_,
).plot()
plt.show()

# If you found my work useful don't forget to upvote!😉