In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the heart-failure clinical records CSV into the working DataFrame `data`.
csv_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first ten records.
data.head(n=10)

In [None]:
# Summary statistics of the DataFrame's columns, shown as the cell output.
data.describe()

In [None]:
# Number of records (rows) and features (columns) as a (rows, columns) tuple.
data.shape

In [None]:
# Count the missing (NaN) entries in every column.
missing_mask = data.isna()
missing_mask.sum()

In [None]:
# Frequency of each value in the anaemia column.
data.anaemia.value_counts()

In [None]:
# Print the value counts of each selected column.
# The column list deliberately is NOT named `list`: binding that name at cell
# level would shadow the builtin `list` type for the rest of the kernel session.
count_columns = ['diabetes','ejection_fraction','high_blood_pressure','sex','smoking','DEATH_EVENT']
for col in count_columns:
    print(data[col].value_counts())

**Getting some Insights for Heart Failure DataSet**

In [None]:
# Death-event counts, split by diabetes status.
sns.countplot(
    data=data,
    x='DEATH_EVENT',
    hue='diabetes',
    palette='RdBu_r',
)

More non-diabetic than diabetic people died of heart disease, which suggests that death due to heart disease is not related to diabetes.



In [None]:
# Count of records per ejection_fraction value, split by death event.
sns.countplot(
    data=data,
    x="ejection_fraction",
    hue="DEATH_EVENT",
)

Ejection fraction values of 20 and 25 are the ones where the count of death events is very high.

An ejection fraction of 20 is the category where the number of survivors is extremely low compared to the number of deaths (the critical one).

In [None]:
# Merge selected ejection_fraction values into other categories, then re-plot.
# The result is assigned back to the column instead of calling
# `data[col].replace(..., inplace=True)`: that chained in-place form operates on
# a temporary Series, emits a FutureWarning on recent pandas, and leaves `data`
# unchanged once Copy-on-Write is active.
# No target value (25, 14) is also a source value, so one dict-based replace is
# equivalent to the five sequential replacements it supersedes.
ejection_fraction_merge = {17: 25, 62: 25, 65: 14, 15: 14, 70: 14}
data['ejection_fraction'] = data['ejection_fraction'].replace(ejection_fraction_merge)
sns.countplot(x="ejection_fraction", hue="DEATH_EVENT", data=data)

Replaced some ejection_fraction values with other values that showed similar results.

In [None]:
# Death-event counts, split by high_blood_pressure.
sns.countplot(
    data=data,
    x='DEATH_EVENT',
    hue="high_blood_pressure",
    palette='RdBu_r',
)

High Blood Pressure is not the right parameter to judge the death event

In [None]:
# Death-event counts, split by sex.
sns.countplot(
    data=data,
    x='DEATH_EVENT',
    hue="sex",
    palette='RdBu_r',
)

The male category has a higher number of death events.

In [None]:
# Death-event counts, split by smoking status.
sns.countplot(
    data=data,
    x='DEATH_EVENT',
    hue="smoking",
    palette='RdBu_r',
)

Non-smokers have a higher number of death events, so nothing can be concluded from the smoking category.

In [None]:
# Death-event counts, split by anaemia.
sns.countplot(
    data=data,
    x='DEATH_EVENT',
    hue='anaemia',
    palette='RdBu_r',
)

The graph above shows that the number of people who had anaemia and died is close to the number who had anaemia and survived.

In [None]:
# Overlay the age distribution of each DEATH_EVENT class on one set of axes.
# FacetGrid's facet-size argument is `height` (it was renamed from `size` in
# seaborn 0.9; current releases no longer accept the old spelling).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — consider
# migrating to histplot/kdeplot once the seaborn version is confirmed.
sns.FacetGrid(data,hue='DEATH_EVENT',height=5).map(sns.distplot,"age").add_legend()

People with age around 60 died the most

In [None]:
# Overlay the creatinine_phosphokinase distribution of each DEATH_EVENT class.
# FacetGrid's facet-size argument is `height` (it was renamed from `size` in
# seaborn 0.9; current releases no longer accept the old spelling).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — consider
# migrating to histplot/kdeplot once the seaborn version is confirmed.
sns.FacetGrid(data,hue='DEATH_EVENT',height=5).map(sns.distplot,"creatinine_phosphokinase").add_legend()

creatinine_phosphokinase values between 0 and 1000 have recorded the highest number of death events.

In [None]:
# Overlay the serum_creatinine distribution of each DEATH_EVENT class.
# FacetGrid's facet-size argument is `height` (it was renamed from `size` in
# seaborn 0.9; current releases no longer accept the old spelling).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — consider
# migrating to histplot/kdeplot once the seaborn version is confirmed.
sns.FacetGrid(data,hue='DEATH_EVENT',height=5).map(sns.distplot,"serum_creatinine").add_legend()

serum_creatinine values between 0.5 and 2 have recorded the highest number of death events.

In [None]:
# Overlay the serum_sodium distribution of each DEATH_EVENT class.
# FacetGrid's facet-size argument is `height` (it was renamed from `size` in
# seaborn 0.9; current releases no longer accept the old spelling).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — consider
# migrating to histplot/kdeplot once the seaborn version is confirmed.
sns.FacetGrid(data,hue='DEATH_EVENT',height=5).map(sns.distplot,"serum_sodium").add_legend()

serum_sodium values between 130 and 140 have recorded the highest number of death events.

In [None]:
# Outlier check: box plot of serum_sodium (treatment follows in later cells).
data.boxplot(column='serum_sodium')

There are only 4 unexpectedly low values, so they are replaced with 125.

In [None]:
# serum_sodium quantiles at every 1% step from 0.00 up to (not including) 1.00.
quantile_levels = np.arange(0, 1, 0.01)
data['serum_sodium'].quantile(quantile_levels)

In [None]:
# Raise every serum_sodium reading below 125 up to 125.
low_sodium_rows = data['serum_sodium'] < 125
data.loc[low_sodium_rows, 'serum_sodium'] = 125

In [None]:
# Box plot of serum_sodium again, to inspect the column after the capping step.
data.boxplot(column='serum_sodium')

In [None]:
# Box plot of serum_creatinine.
data.boxplot(column='serum_creatinine')

In [None]:
# Box plot of creatinine_phosphokinase.
data.boxplot(column='creatinine_phosphokinase')

In [None]:
# Heatmap of the pairwise correlation matrix of the DataFrame's columns.
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix)

In [None]:
from sklearn.preprocessing import StandardScaler

# Target is the DEATH_EVENT column; the features are every other column.
y = data["DEATH_EVENT"]
X = data.drop(columns="DEATH_EVENT")

# Columns to standardize; each one is fit and transformed on its own.
cnames = [
    "time",
    "serum_sodium",
    "serum_creatinine",
    "platelets",
    "creatinine_phosphokinase",
    "age",
]
sc = StandardScaler()
# NOTE(review): the scaler is fit on the full dataset here, i.e. before the
# train/test split performed in a later cell, so test-set statistics leak into
# the scaling. Consider fitting on the training rows only.
for feature in cnames:
    X[feature] = sc.fit_transform(X[feature].to_frame())

The cell above brings the selected columns to the same scale for our model.

In [None]:
# (rows, columns) of the feature matrix X.
X.shape

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts


Applying Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split;
# the fitted estimator is the cell's displayed value.
logreg = LogisticRegression().fit(X_train, y_train)
logreg


In [None]:
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,recall_score,precision_score

# Score the logistic regression on the held-out test split.
y_pred = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_f1 = f1_score(y_test, y_pred)
print("Accuracy:", logreg_accuracy)
print('f1 score', logreg_f1)

In [None]:
# Annotated heatmap of the confusion matrix for the current predictions.
confusion_counts = confusion_matrix(y_test, y_pred)
sns.heatmap(confusion_counts, annot=True)

**Upsampling for better F1 Score**

In [None]:
# Class balance of the target column.
data['DEATH_EVENT'].value_counts()

In [None]:
# Separate the rows by target class: DEATH_EVENT == 0 and DEATH_EVENT == 1.
df_majority = data.loc[data['DEATH_EVENT'] == 0]
df_min = data.loc[data['DEATH_EVENT'] == 1]

In [None]:
import sklearn.utils as ut

# Resample the DEATH_EVENT == 1 rows with replacement until there are as many
# of them as DEATH_EVENT == 0 rows. The target size is taken from df_majority
# instead of a hard-coded count, so it stays correct if the data changes.
# random_state fixes the draw so the resampled frame is reproducible.
df_minority_upsample = ut.resample(
    df_min,
    replace=True,
    n_samples=len(df_majority),
    random_state=1,
)

In [None]:
# Print the shape of each class frame so the row counts can be compared.
for class_frame in (df_majority, df_minority_upsample):
    print(class_frame.shape)

In [None]:
# Stack the DEATH_EVENT == 0 rows and the resampled DEATH_EVENT == 1 rows into one frame.
class_frames = [df_majority, df_minority_upsample]
df_upsampled = pd.concat(class_frames)

In [None]:
# Print the class counts of the combined frame.
upsampled_counts = df_upsampled['DEATH_EVENT'].value_counts()
print(upsampled_counts)

In [None]:
# Target and feature matrix taken from the upsampled frame.
Y1 = df_upsampled["DEATH_EVENT"]
X1 = df_upsampled.drop(columns="DEATH_EVENT")

Again, the columns have to be brought to the same scale, so StandardScaler is applied.

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize in the upsampled feature matrix; each one is fit and
# transformed on its own.
cnames = [
    "time",
    "serum_sodium",
    "serum_creatinine",
    "platelets",
    "creatinine_phosphokinase",
    "age",
]
sc = StandardScaler()
for feature in cnames:
    X1[feature] = sc.fit_transform(X1[feature].to_frame())

In [None]:
# Hold out 20% of the upsampled rows as a test set (fixed seed for repeatability).
# NOTE(review): X1/Y1 come from a frame that was resampled with replacement
# before this split, so copies of the same original row can land in both the
# training and the test part; scores measured on this test set are likely
# optimistic. Consider splitting first and upsampling the training part only.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X1, Y1, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

Applying Random forest classifier

In [None]:
# Random forest classifier: 100 trees with a fixed seed, fit on the training split.
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

rf = RandomForestClassifier(random_state=0, n_estimators=100).fit(X_train, y_train)
rf

In [None]:
# Predict the test-split labels with the random forest; overwrites the earlier y_pred.
y_pred=rf.predict(X_test)

In [None]:
# Report accuracy and F1 for the random forest predictions.
rf_accuracy = accuracy_score(y_test, y_pred)
rf_f1 = f1_score(y_test, y_pred)
print("Accuracy by random forest:", rf_accuracy)
print('f1 score ', rf_f1)

In [None]:
# Decision tree using the entropy split criterion, fit on the training split.
from sklearn.tree import DecisionTreeClassifier

# `criterion` is passed by keyword: scikit-learn >= 1.0 makes the constructor
# arguments keyword-only, so DecisionTreeClassifier("entropy") raises TypeError.
# random_state is fixed (as for the random forest) so repeated runs are reproducible.
clf = DecisionTreeClassifier(criterion="entropy", random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Score the decision tree on the held-out test split.
y_pred = clf.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred)
tree_f1 = f1_score(y_test, y_pred)
print("Accuracy:", tree_accuracy)
print('f1 score', tree_f1)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap.
cf = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(data=cf, annot=True)

Since accuracy alone was not a good metric in this case, we had to increase our F1 score.

Thus, I used the upsampling technique to treat the imbalance in the death event classes.

I am a beginner and this is my first submission. Thanks!!