In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, found_files in os.walk('/kaggle/input'):
    for found_file in found_files:
        print(os.path.join(folder, found_file))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/heart-failure-prediction/heart.csv


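**Aim of the model:** To predict whether someone is likely to have heart failure or not based on their cholesterol level. Note that the code in this notebook does not yet do this: it fits regression models that predict the `Cholesterol` column from `Age` and `RestingBP`.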

**References:**

Becker, D. (2022). Your First Machine Learning Model. Kaggle. https://www.kaggle.com/code/dansbecker/your-first-machine-learning-model

Becker, D. (2022). Model Validation. Kaggle. https://www.kaggle.com/code/dansbecker/model-validation

Becker, D. (2022). Underfitting and Overfitting. Kaggle. https://www.kaggle.com/code/dansbecker/underfitting-and-overfitting

Becker, D. (2022). Random Forests. Kaggle. https://www.kaggle.com/code/dansbecker/random-forests

Fedesoriano. (September 2021). Heart Failure Prediction Dataset. Retrieved [23/11/2022] from https://www.kaggle.com/fedesoriano/heart-failure-prediction.

In [2]:
import pandas as pd
# Read the heart-failure CSV from the Kaggle input directory into a DataFrame.
heart_file_path = '../input/heart-failure-prediction/heart.csv'
heart_data = pd.read_csv(filepath_or_buffer=heart_file_path)
# Bare last expression so the notebook displays the available column names.
heart_data.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [3]:
# Discard every row that contains at least one missing (NaN) value.
# NOTE(review): this only removes NaN; the RestingBP summary shown further down
# has a minimum of 0, so zero-valued readings (possibly placeholders — confirm)
# are not filtered out here.
heart_data = heart_data.dropna(axis='index', how='any')

In [4]:
# Prediction target: the Cholesterol column.
y = heart_data['Cholesterol']
# Columns used as model inputs.
heart_features = ['Age', 'RestingBP']
X = heart_data.loc[:, heart_features]
# Summary statistics of the feature matrix (displayed as the cell output).
X.describe()

Unnamed: 0,Age,RestingBP
count,918.0,918.0
mean,53.510893,132.396514
std,9.432617,18.514154
min,28.0,0.0
25%,47.0,120.0
50%,54.0,130.0
75%,60.0,140.0
max,77.0,200.0


In [5]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Age,RestingBP
0,40,140
1,49,160
2,37,130
3,48,138
4,54,150


In [6]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree regressor with a fixed seed so repeated runs give the same tree.
heart_model = DecisionTreeRegressor(random_state=1)
# Fit on the full data set; kept as the bare last expression so the notebook
# displays the fitted estimator.
heart_model.fit(X=X, y=y)

DecisionTreeRegressor(random_state=1)

In [7]:
# Show the first five feature rows next to the model's predictions for them.
# The columns of X are Age and RestingBP; Cholesterol is the target being
# predicted, so the message names the features rather than the target.
print("Making predictions for the following 5 measurements of age and restingBP:")
print(X.head())
print("The predictions are")
print(heart_model.predict(X.head()))

Making predictions for the following 5 measurements of cholesterol and restingBP:
   Age  RestingBP
0   40        140
1   49        160
2   37        130
3   48        138
4   54        150
The predictions are
[241.         180.         237.66666667 214.         255.5       ]


In [8]:
from sklearn.metrics import mean_absolute_error
# In-sample error: the model is scored on the same rows it was fitted on.
predicted_cholesterol_levels = heart_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_cholesterol_levels)

40.687761995931936

In [9]:
from sklearn.model_selection import train_test_split
# Hold out part of the data so the error is measured on rows the model has not
# seen; random_state=0 makes the split itself repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
# Seed the tree as well (same seed as the first heart_model above) so the
# validation MAE printed below is reproducible from run to run.
heart_model = DecisionTreeRegressor(random_state=1)
heart_model.fit(train_X, train_y)
val_predictions = heart_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

116.81123188405796


In [10]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``train_X``/``train_y``; its predictions
    for ``val_X`` are then scored against ``val_y`` with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    fitted_tree = tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, fitted_tree.predict(val_X))
    

In [11]:
# Compare validation error across tree sizes.
# The %d conversion prints the MAE truncated to a whole number.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
    print("Max leaf nodes: %d \t \t Mean Absolute Error: %d" % (max_leaf_nodes, my_mae))

Max leaf nodes: 5 	 	 Mean Absolute Error: 93
Max leaf nodes: 50 	 	 Mean Absolute Error: 98
Max leaf nodes: 500 	 	 Mean Absolute Error: 118
Max leaf nodes: 5000 	 	 Mean Absolute Error: 118


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a seeded random forest on the training split and score it on the
# validation split, using the same metric as the decision trees above.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X=train_X, y=train_y)
heart_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, heart_preds)
print(forest_mae)

104.82874175848306
