In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in traversal order.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-failure clinical records into a DataFrame and preview the first rows.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
clinical_data = pd.read_csv(DATA_PATH)
clinical_data.head()

In [None]:
# Summary statistics for the columns of the loaded frame.
clinical_data.describe()

In [None]:
# Number of missing values in each column.
clinical_data.isna().sum()

# EDA 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'darkgrid' style for the plots that follow.
sns.set_style('darkgrid')

In [None]:
# Inspect the dtype of each column.
clinical_data.dtypes

In [None]:
# Re-type these columns (including the target DEATH_EVENT) as pandas
# categoricals, then show the resulting dtypes.
categorical_columns = ['anaemia', 'high_blood_pressure', 'sex', 'smoking', 'diabetes', 'DEATH_EVENT']
for column in categorical_columns:
    clinical_data[column] = clinical_data[column].astype('category')

clinical_data.dtypes

<h3>Our target feature is DEATH_EVENT, so let's check the correlation between the target feature and the rest of the features</h3>

In [None]:
# Count the patients with DEATH_EVENT == 1 in each age band and draw the
# counts as a bar chart. The age range is cut into 5 equal-width bins; the
# labels are hand-written names for those bins.
age_groups = pd.cut(clinical_data['age'],5,labels=['39-50','51-61','62-72','73-83','84-95'])
# Attach the age band before filtering, so no column is assigned onto a
# filtered slice (a pattern pandas may flag with SettingWithCopyWarning).
cd = clinical_data.assign(age_groups=age_groups)
cd = cd[cd['DEATH_EVENT'] == 1]
# observed=False is stated explicitly: it keeps every age band in the result
# (even one without deaths) regardless of the pandas default for categorical
# groupers, which newer pandas versions warn about and plan to change.
cd = cd.groupby(by='age_groups', observed=False).count()
fig, ax = plt.subplots(figsize=(11,6))
sns.barplot(x=cd.index,y='DEATH_EVENT',data=cd,ax=ax)
# After count() the DEATH_EVENT column holds a row count, so label it as such.
ax.set(title="Number Of Deaths In A Particular Age Group",
       xlabel='Age group', ylabel='Number of deaths')
plt.show()

# **We can see that most deaths in this dataset occur between ages 51 and 72**

**Let's take a more global view of any correlations between our features using Pearson correlation.**

In [None]:
# Pearson correlation heatmap over every column, target included.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(11,7))
de = clinical_data[clinical_data['DEATH_EVENT']==1]  # NOTE(review): not used in this cell
# Columns typed as 'category' are cast back to integers before correlating.
# DataFrame.corr treats non-numeric columns differently across pandas versions
# (older releases silently drop them), which would remove DEATH_EVENT and the
# other flag columns from the heatmap.
category_columns = clinical_data.select_dtypes(include='category').columns
numeric_data = clinical_data.astype({column: 'int64' for column in category_columns})
cor = numeric_data.corr(method='pearson')
sns.heatmap(cor,cmap='Blues',annot=True,ax=ax)
ax.set_title('Pearson Correlation Between Features')
plt.show()

### We can see there is no major correlation between any of our features.

# Feature Engineering
Let's try to create some interesting features which will boost our chances of a successful model.

In [None]:
# Build the working frame as a new object so the original data stays intact
# and never needs reloading after a mistake.
age_groups = pd.cut(clinical_data['age'],5,labels=['39-50','51-61','62-72','73-83','84-95'])
w_data = clinical_data.assign(**{
    # age band of each patient (5 equal-width bins)
    'age groups': age_groups,
    # ratio features derived from the raw measurements
    'platelets/age': lambda frame: frame['platelets'] / frame['age'],
    'sodium/creatinine': lambda frame: frame['serum_sodium'] / frame['serum_creatinine'],
})
w_data

In [None]:
# Split the working frame into the target vector `y` and the feature matrix `X`.
# `pop` mutates w_data, so it is guarded: re-running this cell used to raise
# KeyError because DEATH_EVENT had already been removed on the first run.
if 'DEATH_EVENT' in w_data.columns:
    # First run: take the target out of the working frame.
    y = w_data.pop('DEATH_EVENT')
else:
    # Re-run: the target is already gone from w_data; take it from the frame
    # that w_data was copied from.
    y = clinical_data['DEATH_EVENT'].copy()
# Features passed to the models.
r_features = ['anaemia','diabetes','ejection_fraction','high_blood_pressure','sex','platelets/age',
     'sodium/creatinine','smoking','time']
X=w_data[r_features]

# Model Selection And Evaluation 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score as f1

In [None]:
def optimal_n_leaf(train_x,train_y,test_x,test_y,leaf_list,rstate=0):
    """Score a random forest for every candidate size in ``leaf_list``.

    For each ``n`` a ``RandomForestRegressor`` is fitted with both
    ``n_estimators=n`` and ``max_leaf_nodes=n``. Its continuous predictions on
    ``test_x`` are rounded to the nearest integer to obtain class labels, and
    the F1 score against ``test_y`` is recorded.

    Parameters
    ----------
    train_x, train_y : training features and target.
    test_x, test_y : held-out features and target used for scoring.
    leaf_list : iterable of int
        Candidate values; each is used for both the number of trees and the
        maximum number of leaf nodes.
    rstate : int, default 0
        ``random_state`` forwarded to the forest.

    Returns
    -------
    list
        One F1 score per entry of ``leaf_list``, in the same order
        (empty when ``leaf_list`` is empty).
    """
    result = []
    for n in leaf_list:
        model = RandomForestRegressor(random_state=rstate,n_estimators=n,max_leaf_nodes=n)
        model.fit(train_x,train_y)
        # The regressor outputs continuous values; round them to class labels.
        pred = np.round(model.predict(test_x))
        # f1_score's signature is (y_true, y_pred): ground truth goes first.
        result.append(f1(test_y,pred))
    return result
        
# Hold out a test split shared by every model comparison in this notebook.
# random_state fixes the shuffle so the scores are reproducible from run to
# run; stratify keeps the share of death events the same in both splits.
train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=0,stratify=y)
# Candidate sizes evaluated for the random forest.
leaf_candidates = [2,3,5,7,12,19,31,50]


In [None]:
# Score every random-forest candidate and plot F1 against the candidate value.
leaf_check = optimal_n_leaf(train_x,train_y,test_x,test_y,leaf_candidates)
fig, ax = plt.subplots(figsize=(12,8))
# Points are drawn at evenly spaced positions 0..len-1, one per candidate.
positions = np.arange(len(leaf_candidates))
sns.lineplot(x=positions,y=leaf_check,ax=ax)
# Pin the tick positions before labelling them. Relabelling the automatic
# ticks (with a dummy leading -1) left labels misaligned with the data points.
ax.set_xticks(positions)
ax.set_xticklabels(leaf_candidates)
ax.set_title('F1 Score Via Random Forest Size')
ax.set_xlabel('n_estimators = max_leaf_nodes',fontsize=15)
ax.set_ylabel('F1 Score',fontsize=15)
plt.show()


<h3>We can see that, even taking into account the non-deterministic nature of the random forest algorithm and its uncontrollable random states, we can still get an f1_score of about 0.80 on average</h3>

__Let's try and see if a KNN model can do better!__

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def optimal_k_value(train_x,train_y,test_x,test_y,k_list,rstate=0):
    """Score a k-nearest-neighbours classifier for every ``k`` in ``k_list``.

    Parameters
    ----------
    train_x, train_y : training features and target.
    test_x, test_y : held-out features and target used for scoring.
    k_list : iterable of int
        Candidate values for ``n_neighbors``.
    rstate : int, default 0
        Unused; kept so the signature matches the other ``optimal_*`` helpers.

    Returns
    -------
    list
        One F1 score per entry of ``k_list``, in the same order
        (empty when ``k_list`` is empty).

    Notes
    -----
    Features are used exactly as passed in. NOTE(review): KNN is
    distance-based, so consider scaling the features before calling this.
    """
    result = []
    for k in k_list:
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(train_x,train_y)
        # The classifier already returns class labels, so no rounding is needed.
        pred = model.predict(test_x)
        # f1_score's signature is (y_true, y_pred): ground truth goes first.
        result.append(f1(test_y,pred))
    return result

In [None]:
# Score every candidate k and plot F1 against k.
k_candidates = [1,2,3,4,5,6,7,8,9,10,20]
k_result = optimal_k_value(train_x,train_y,test_x,test_y,k_candidates)
fig, ax = plt.subplots(figsize=(12,8))
# Points are drawn at evenly spaced positions 0..len-1, one per candidate.
positions = np.arange(len(k_candidates))
sns.lineplot(x=positions,y=k_result,ax=ax)
# Pin the tick positions before labelling them. Relabelling the automatic
# ticks (with a dummy leading -1) left labels misaligned with the data points.
ax.set_xticks(positions)
ax.set_xticklabels(k_candidates)
ax.set_title('Best F1 Score Via K Variable')
ax.set_xlabel('K Value',fontsize=15)
ax.set_ylabel('F1 Score',fontsize=15)
plt.show()

<h3>We can see that using small k values (which are usually prone to overfitting) gives us an F1 score of less than 0.6 on average, and as the value of k increases the F1 score drops</h3>

<h3>Let's try using an AdaBoost model and see if it can outscore our RandomForest model</h3>

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
def optimal_n_value(train_x,train_y,test_x,test_y,n_list,rstate=0):
    """Score an AdaBoost classifier for every estimator count in ``n_list``.

    Each model uses ``learning_rate=0.01`` and the ``'SAMME'`` algorithm.

    Parameters
    ----------
    train_x, train_y : training features and target.
    test_x, test_y : held-out features and target used for scoring.
    n_list : iterable of int
        Candidate values for ``n_estimators``.
    rstate : int, default 0
        ``random_state`` forwarded to the classifier so repeated calls give
        the same scores (it was previously accepted but ignored).

    Returns
    -------
    list
        One F1 score per entry of ``n_list``, in the same order
        (empty when ``n_list`` is empty).
    """
    result = []
    for n in n_list:
        model = AdaBoostClassifier(n_estimators=n,learning_rate=0.01,algorithm='SAMME',
                                   random_state=rstate)
        model.fit(train_x,train_y)
        # The classifier already returns class labels, so no rounding is needed.
        pred = model.predict(test_x)
        # f1_score's signature is (y_true, y_pred): ground truth goes first.
        result.append(f1(test_y,pred))
    return result

In [None]:
# Score every candidate number of estimators and plot F1 against it.
n_candidates = [100,200,300,600,900]
best_n = optimal_n_value(train_x,train_y,test_x,test_y,n_candidates)
fig, ax = plt.subplots(figsize=(12,8))
# Points are drawn at evenly spaced positions 0..len-1, one per candidate.
positions = np.arange(len(n_candidates))
sns.lineplot(x=positions,y=best_n,ax=ax)
# Pin the tick positions before labelling them. Relabelling the automatic
# ticks (with a dummy leading -1) left labels misaligned with the data points.
ax.set_xticks(positions)
ax.set_xticklabels(n_candidates)
ax.set_title('Best F1 Score Via Number Of Estimators')
ax.set_xlabel('Number Of Estimators',fontsize=15)
ax.set_ylabel('F1 Score',fontsize=15)
plt.show()

<h2>We got a great F1 score of 0.8 for most numbers of estimators up to 300, and it is much more stable than the random forest, which may sometimes give us better results but is less stable</h2>

<h2>Considering the analysis we did, I prefer to use the AdaBoost model, as it is much more stable than the random forest model, which was the only other model tested that showed similar F1 scores</h2>

In [None]:
# Fit the chosen AdaBoost model on the full, standardised feature set and
# export its predictions alongside the raw records.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Keep X untouched: overwriting it with the scaled array made the notebook
# depend on execution order (earlier cells re-run afterwards would see an
# ndarray instead of the feature DataFrame).
X_scaled = scaler.fit_transform(X)
# NOTE(review): learning_rate is 0.05 here but 0.01 in optimal_n_value, the
# setting that was actually evaluated - confirm which one is intended.
# random_state is fixed so the exported predictions are reproducible.
model = AdaBoostClassifier(n_estimators=100,learning_rate=0.05,algorithm='SAMME',random_state=0)
model.fit(X_scaled,y)
# NOTE(review): these are in-sample predictions - the model predicts the same
# rows it was trained on, so they do not measure generalisation.
predictions = model.predict(X_scaled)
submission = pd.read_csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
submission['Prediction'] = predictions
submission.to_csv("submission_AdaBoost.csv", index=False)
# Show a preview as the cell output (the bare `submission` in the middle of
# the cell displayed nothing).
submission.head()