In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load Dataset

In [None]:
# Read the heart-failure clinical records CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")

## Descriptive Analysis

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Summary statistics as produced by DataFrame.describe().
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

## Unique Values

In [None]:
# Print each column name next to its number of distinct values.
for column_name in df.columns:
    print(column_name, df[column_name].nunique())

## Find Null Values

In [None]:
# Bar chart of the number of null entries per column.
df.isnull().sum().plot(kind="bar")

No null values are present in any column.

## Drop Features

In [None]:
# Remove the "time" column. A new frame is assigned instead of dropping in
# place, and a missing column is ignored, so re-running this cell on an
# already-trimmed frame does not raise KeyError.
df = df.drop(columns="time", errors="ignore")

In [None]:
# (rows, columns) after removing "time".
df.shape

## Feature Engineering

In [None]:
def to_range(x, interval=10):
    """Return the label of the fixed-width band that contains ``x``.

    Bands start at multiples of ``interval`` and are labelled
    ``"<start>-<start + interval - 1>"``, e.g. ``to_range(45)`` -> ``"40-49"``.

    Parameters
    ----------
    x : int or float
        Value to bucket.
    interval : int, default 10
        Width of each band.

    Returns
    -------
    str
        Band label of the form ``"start-end"``.
    """
    quotient = x / interval
    bucket = int(quotient)
    # int() truncates toward zero; step down one bucket for negative
    # non-multiples so the band really contains x (floor semantics).
    if bucket > quotient:
        bucket -= 1

    start = bucket * interval
    end = start + (interval - 1)

    return str(start) + "-" + str(end)

In [None]:
# Bucket every age into a band label using to_range's default interval of 10.
df["Age_Band"] = df["age"].apply(to_range)

### What is a healthy platelet count?
> A normal platelet count ranges from 150,000 to 450,000 platelets per microliter of blood. Having more than 450,000 platelets is a condition called thrombocytosis; having less than 150,000 is known as thrombocytopenia. You get your platelet number from a routine blood test called a complete blood count (CBC).

[https://www.hopkinsmedicine.org/health/conditions-and-diseases/what-are-platelets-and-why-are-they-important](https://www.hopkinsmedicine.org/health/conditions-and-diseases/what-are-platelets-and-why-are-they-important)

In [None]:
def plat_range(x):
    """Classify a platelet count as "low", "normal" or "high".

    Counts from 150000 to 450000 (both inclusive) are "normal"; anything
    below 150000 is "low"; everything else is "high".
    """
    if x < 150000:
        return "low"
    if x <= 450000:
        return "normal"
    return "high"

# Label every platelet measurement as low / normal / high.
df['plat_count'] = df['platelets'].apply(plat_range)

### Creatine phosphokinase test
> Total CPK normal values:
10 to 120 micrograms per liter (mcg/L)

> Normal value ranges may vary slightly among different laboratories. Some labs use different measurements or test different samples. Talk to your provider about the meaning of your specific test results.
> 
[https://www.ucsfbenioffchildrens.org/medical-tests/creatine-phosphokinase-test](https://www.ucsfbenioffchildrens.org/medical-tests/creatine-phosphokinase-test)

In [None]:
def cpk_range(x):
    """Return "normal" when ``x`` lies in 10..120 (inclusive), else "abnormal"."""
    return "normal" if 10 <= x <= 120 else "abnormal"

# Flag each CPK measurement as normal / abnormal.
df['creatinine_phosphokinase_Band'] = df['creatinine_phosphokinase'].apply(cpk_range)

###  Ejection fraction

> A normal left ventricular ejection fraction (LVEF) ranges from 55% to 70%. An LVEF of 65%, for example, means that 65% of the total amount of blood in the left ventricle is pumped out with each heartbeat. Your EF can go up and down, based on your heart condition and how well your treatment works.

[https://my.clevelandclinic.org/health/articles/16950-ejection-fraction#:~:text=A%20normal%20left%20ventricular%20ejection,how%20well%20your%20treatment%20works.](https://my.clevelandclinic.org/health/articles/16950-ejection-fraction#:~:text=A%20normal%20left%20ventricular%20ejection,how%20well%20your%20treatment%20works.)

In [None]:
def ejecfrac_range(x):
    """Map an ejection-fraction percentage to a severity label.

    Bands are contiguous, so fractional values between the integer
    boundaries (e.g. 54.5 or 39.5) are assigned to the band below instead
    of falling through to "High":

    * ``x < 35``       -> "Severely below normal"
    * ``35 <= x < 40`` -> "Moderately below normal"
    * ``40 <= x < 55`` -> "Slightly Below Normal"
    * ``55 <= x <= 70``-> "normal"
    * otherwise        -> "High"

    Parameters
    ----------
    x : int or float
        Ejection fraction in percent.

    Returns
    -------
    str
        One of the labels listed above.
    """
    if x < 35:
        return "Severely below normal"
    if x < 40:
        return "Moderately below normal"
    if x < 55:
        return "Slightly Below Normal"
    if x <= 70:
        return "normal"
    # Above 70 (or not comparable, e.g. NaN) — same fallback as before.
    return "High"

# Replace the numeric ejection fraction with its severity label.
df['ejection_fraction_Band'] = df['ejection_fraction'].apply(ejecfrac_range)

### Serum creatinine

> The typical reference range for serum creatinine is 60 to 110 micromoles per liter (μmol/L) (0.7 to 1.2 milligrams per deciliter (mg/dL)) for men and 45 to 90 μmol/L (0.5 to 1.0 mg/dL) for women.
>

[https://www.medicalnewstoday.com/articles/322380#what-does-the-test-involve](https://www.medicalnewstoday.com/articles/322380#what-does-the-test-involve)

In [None]:
def creatinine_range(x):
    """Label a row's serum creatinine as "less", "normal" or "high".

    ``x`` is a row exposing ``sex`` and ``serum_creatinine``. Rows with
    ``sex == 1`` are judged against 0.7..1.2, all other rows against
    0.5..1.0 (bounds inclusive).
    """
    # Pick the reference interval once, then classify against it.
    lower, upper = (0.7, 1.2) if x.sex == 1 else (0.5, 1.0)
    level = x['serum_creatinine']

    if lower <= level <= upper:
        return "normal"
    if level < lower:
        return "less"
    return "high"
        
        
    
# Row-wise apply (axis=1) because the reference interval depends on the row's sex.
df['serum_creatinine_Band'] = df[['serum_creatinine','sex']].apply(creatinine_range, axis =1)

### Serum sodium

A normal blood sodium level is between 135 and 145 milliequivalents per liter (mEq/L). Hyponatremia occurs when the sodium in your blood falls below 135 mEq/L.


[https://www.mayoclinic.org/diseases-conditions/hyponatremia/symptoms-causes/syc-20373711#:~:text=A%20normal%20blood%20sodium%20level,Certain%20medications.](https://www.mayoclinic.org/diseases-conditions/hyponatremia/symptoms-causes/syc-20373711#:~:text=A%20normal%20blood%20sodium%20level,Certain%20medications.)

In [None]:
def sodium_range(x):
    """Classify a serum sodium level.

    Values below 135 are "Hyponatremia", values from 135 to 145
    (inclusive) are "normal", and everything else is "hypernatremia".
    """
    if x < 135:
        return "Hyponatremia"
    if x <= 145:
        return "normal"
    return "hypernatremia"
        
        
    
# Replace the numeric sodium level with its category label.
df['serum_sodium_Band'] = df['serum_sodium'].apply(sodium_range)

In [None]:
# Remove the raw measurements now represented by the derived band/category
# columns. Assigning a new frame and ignoring already-missing columns keeps
# this cell safe to re-run (an in-place drop raises KeyError the second time).
df = df.drop(
    columns=[
        "serum_sodium",
        "serum_creatinine",
        "ejection_fraction",
        "creatinine_phosphokinase",
        "platelets",
        "age",
    ],
    errors="ignore",
)

In [None]:
# Preview the frame after the raw measurement columns were dropped.
df.head()

## EDA

In [None]:
import seaborn as sns
from scipy.stats import chi2_contingency

# Chi-square test of independence for every ordered pair of columns.
columns = df.columns.values
factors_paired = [(i, j) for i in columns for j in columns]

chi2, p_values = [], []

for first, second in factors_paired:
    if first != second:
        chitest = chi2_contingency(pd.crosstab(df[first], df[second]))
        chi2.append(chitest[0])      # test statistic
        p_values.append(chitest[1])  # p-value
    else:
        # Same factor on both sides: no test is run, 0 is stored as a placeholder.
        chi2.append(0)
        p_values.append(0)

# Size the square matrix from the actual column count instead of a
# hard-coded 12, so the cell keeps working if columns are added or removed.
n_factors = len(columns)
chi2 = np.array(chi2).reshape((n_factors, n_factors))
chi2 = pd.DataFrame(chi2, index=columns, columns=columns)

In [None]:
# Chi-square statistic of every column against DEATH_EVENT, largest first.
chi2['DEATH_EVENT'].sort_values(ascending = False)

In [None]:
# Annotated heatmap of the pairwise chi-square statistics.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(chi2, annot=True, ax=ax)

## Label Encoding

In [None]:

from sklearn import preprocessing
from collections import defaultdict

# One LabelEncoder per column: the defaultdict creates an encoder the first time a column name is looked up.
d = defaultdict(preprocessing.LabelEncoder)
# Integer-encode every column of df; `fit` is the encoded DataFrame and the fitted encoders remain in `d`.
fit = df.apply(lambda x: d[x.name].fit_transform(x))

In [None]:
# Display the label-encoded frame.
fit

In [None]:
!pip install pgmpy

In [None]:
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD

# NOTE(review): pgmpy is installed unpinned; newer releases may expose this
# class under a different name — confirm against the installed version.

# Hand-specified directed edges of the network, one (source, target) pair per line.
network_edges = [
    ("high_blood_pressure", "ejection_fraction_Band"),
    ("Age_Band", "DEATH_EVENT"),
    ("plat_count", "DEATH_EVENT"),
    ("Age_Band", "high_blood_pressure"),
    ("ejection_fraction_Band", "DEATH_EVENT"),
    ("anaemia", "plat_count"),
    ("serum_sodium_Band", "DEATH_EVENT"),
    ("serum_creatinine_Band", "DEATH_EVENT"),
    ("sex", "ejection_fraction_Band"),
    ("diabetes", "Age_Band"),
    ("smoking", "high_blood_pressure"),
    ("creatinine_phosphokinase_Band", "DEATH_EVENT"),
]
model = BayesianModel(network_edges)

In [None]:
import networkx as nx
# Import pyplot directly: `import pylab as plt` would rebind the notebook-wide
# `plt` name to the pylab module for every later cell.
import matplotlib.pyplot as plt

# Draw the network structure with node labels.
nx.draw(model, with_labels=True)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Seed the shuffle as well as the split: an unseeded shuffle changes the row
# order on every run, so the fixed split seed alone would not give the same
# train/test rows twice.
SPLIT_SEED = 20781
sub_df = shuffle(fit, random_state=SPLIT_SEED)
train, test = train_test_split(sub_df, test_size=0.15, random_state=SPLIT_SEED)

In [None]:
# Estimate the network's parameters from the training rows
# (uses pgmpy's default estimator — presumably maximum likelihood; TODO confirm).
model.fit(train)

In [None]:
# Ground-truth labels of the held-out rows; must run before DEATH_EVENT is dropped from `test`.
y = test['DEATH_EVENT']

In [None]:
# Keep only the evidence columns for prediction and renumber the rows from 0.
test = test.drop(columns=["DEATH_EVENT"]).reset_index(drop=True)

In [None]:
# Predict the column missing from `test` (DEATH_EVENT) for every held-out row.
predictions = model.predict(test)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score

# Macro-averaged precision / recall / F-score plus accuracy for the Bayesian network.
precision, recall, f_score, _ = precision_recall_fscore_support(
    y, predictions, average='macro'
)
accuracy = accuracy_score(y, predictions)
# NOTE(review): roc_auc_score is fed hard class predictions here, not
# probabilities — confirm this is the intended ROC AUC.
roc = roc_auc_score(y, predictions)

# Single-row record (one-element lists) so it converts directly to a DataFrame.
results_dict = dict(
    model=["bayes belief network"],
    precision=[precision],
    recall=[recall],
    roc=[roc],
    f_score=[f_score],
    accuracy=[accuracy],
)

In [None]:
# Display the Bayesian network scores.
results_dict

### Random Forest

In [None]:
# Separate the predictors from the DEATH_EVENT target for the random forest.
X = sub_df.drop(columns="DEATH_EVENT")
y = sub_df['DEATH_EVENT']

In [None]:
# Same test_size and random_state as the split used for the Bayesian network.
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.15,random_state=20781)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow forest (max_depth=2) with a fixed seed, trained on the training split.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_X, train_y)

In [None]:
# Hard class predictions for the held-out rows (overwrites the Bayesian network's `predictions`).
predictions = clf.predict(test_X)

In [None]:
# Macro-averaged precision / recall / F-score plus accuracy for the random forest.
precision, recall, f_score, _ = precision_recall_fscore_support(
    test_y, predictions, average='macro'
)
accuracy = accuracy_score(test_y, predictions)
# NOTE(review): roc_auc_score is fed hard class predictions here, not
# probabilities — confirm this is the intended ROC AUC.
roc = roc_auc_score(test_y, predictions)

# Single-row record (one-element lists) so it converts directly to a DataFrame.
results_rfc_dict = dict(
    model=["random forest"],
    precision=[precision],
    recall=[recall],
    roc=[roc],
    f_score=[f_score],
    accuracy=[accuracy],
)

In [None]:
# One-row DataFrame holding the random forest scores.
rfc_res_pd = pd.DataFrame(results_rfc_dict)

In [None]:
# One-row DataFrame holding the Bayesian network scores.
bayes_pd = pd.DataFrame(results_dict)

In [None]:
# Stack both score rows into one comparison table (each row keeps its original index 0).
pd.concat([rfc_res_pd, bayes_pd],axis= 0)

In [None]:
# Display the random forest scores.
results_rfc_dict