In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

In [3]:
#setting file path
file_path = r'Heart Disease Indicators.csv'
heart_data = pd.read_csv(file_path)

In [4]:
heart_data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [6]:
#setting file path to the modified dataset
file_path = r'Heart Disease Indicators modified.csv'
heart_data = pd.read_csv(file_path)

In [7]:
#no = 0 yes = 1, female = 0 male = 1
heart_data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3,30,0,0,57,White,1,1,Very good,5,1,0,1
1,0,20.34,0,0,1,0,0,0,0,80,White,0,1,Very good,7,0,0,0
2,0,26.58,1,0,0,20,30,0,1,67,White,1,1,Fair,8,1,0,0
3,0,24.21,0,0,0,0,0,0,0,77,White,0,0,Good,6,0,0,1
4,0,23.71,0,0,0,28,0,1,0,42,White,0,1,Very good,8,0,0,0


In [8]:
#dropping some columns we wont need
heart_data.drop(['Race', 'GenHealth'], axis = 1)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3,30,0,0,57,1,1,5,1,0,1
1,0,20.34,0,0,1,0,0,0,0,80,0,1,7,0,0,0
2,0,26.58,1,0,0,20,30,0,1,67,1,1,8,1,0,0
3,0,24.21,0,0,0,0,0,0,0,77,0,0,6,0,0,1
4,0,23.71,0,0,0,28,0,1,0,42,0,1,8,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7,0,1,1,62,1,0,6,1,0,0
319791,0,29.84,1,0,0,0,0,0,1,37,0,1,5,1,0,0
319792,0,24.24,0,0,0,0,0,0,0,47,0,1,6,0,0,0
319793,0,32.81,0,0,0,0,0,0,0,27,0,0,12,0,0,0


In [9]:
#setting prediction value
y = heart_data.HeartDisease

In [10]:
#setting prediction metrics
disease_metrics = ['BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Diabetic',
        'PhysicalActivity', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer']
X = heart_data[disease_metrics]

In [11]:
#splitting the data into a training and validation set
train_X, val_X, train_y, val_y = train_test_split(X, y)

In [12]:
#decision tree model
tree_model = DecisionTreeRegressor()
tree_model.fit(train_X, train_y)
preds = tree_model.predict(val_X)
mae = mean_absolute_error(val_y, preds)
print(mae)

0.1390753976830726


In [13]:
#function to find mae of different tree sized
def tree_depth_error(max_nodes, train_X, val_X, train_y, val_y):
    heart_model = DecisionTreeRegressor(max_leaf_nodes=max_nodes)
    heart_model.fit(train_X, train_y)
    preds = heart_model.predict(val_X)
    mae = mean_absolute_error(val_y, preds)
    return mae

In [14]:
#using tree_depth_error function to find best tree size
rel = {}
for nodes in (5, 50, 500, 5000, 50000):
    my_mae = tree_depth_error(nodes, train_X, val_X, train_y, val_y)
    rel[my_mae] = nodes
best_tree_size = rel[min(rel)]
tree_error = min(rel)
print("best tree size: %d \t mae: %18.17f" %(best_tree_size, min(rel)))

best tree size: 500 	 mae: 0.13302428663842283


In [15]:
#random forest model
forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)
forest_preds = forest_model.predict(val_X)
forest_model_error = mean_absolute_error(val_y, forest_preds)
print("forest error: %18.17f" %(forest_model_error))

forest error: 0.13926727154222884


In [16]:
#linear regression model
lr_model = LinearRegression()
lr_model.fit(train_X, train_y)
lr_preds = lr_model.predict(val_X)
lr_error = mean_absolute_error(val_y, lr_preds)
print("error: %18.17f" %(lr_error))

error: 0.14578794903795522


In [17]:
#neural network regression model
nnr_model = MLPRegressor()
nnr_model.fit(train_X, train_y)
nnr_ans = nnr_model.predict(val_X)
nnr_error = mean_absolute_error(val_y, nnr_ans)
print("error: %18.17f" %(nnr_error))

error: 0.16311311198744199


In [19]:
errors = {'Decision Tree': [tree_error*100], 'Random Forest': [forest_model_error*100], 
            'Linear Regression': [lr_error*100], 'Neural Network Regression': [nnr_error*100]}
errors_df = pd.DataFrame(errors)
errors_df
# we take the decision tree as the final model as MLPRegression gives very varying results

Unnamed: 0,Decision Tree,Random Forest,Linear Regression,Neural Network Regression
0,13.302429,13.926727,14.578795,16.311311


In [20]:
#defining final model
f_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size)
f_model.fit(X, y)

In [21]:
#creating a model function for ease of use
def final_model(data):
    f_ans = f_model.predict(data)
    return f_ans

In [22]:
#custom data must be entered as a list // defaults set to 0 // refer README.md for key
my_data = {'BMI': [0.0], 'Smoking': [0],
            'AlcoholDrinking': [0], 'Stroke': [0], 'PhysicalHealth': [0], 
            'MentalHealth': [0], 'DiffWalking': [0], 'Sex': [0], 'AgeCategory': [0], 'Diabetic': [0],
            'PhysicalActivity': [0], 'SleepTime': [0], 'Asthma': [0], 
            'KidneyDisease': [0], 'SkinCancer': [0]}

df = pd.DataFrame(my_data)
df

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
#predictions from custom data
print(final_model(df))

[0.00661275]
