# 2.3 

In this notebook I will begin to sort the data so that I can test different types of models to see which is best suited to predicting if the patient has a cardiovascular disease or not.

In [32]:
# importing necessary packages
import pandas as pd
import numpy as np

In [33]:
# importing the saved data frame that I had cleaned and added features to.
disease_data = "..\data\Heart_disease.csv"
df_disease = pd.read_csv(disease_data)

In [34]:
df_disease.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI,BMI_cat,bp_category
0,1,55.38,1,156,85.0,140,90,3,1,0,0,1,1,34.93,obese (class I),Stage 2
1,3,48.25,2,169,82.0,150,100,1,1,0,0,1,1,28.71,overweight,Stage 2
2,12,61.83,2,178,95.0,130,90,3,3,0,0,1,1,29.98,overweight,Stage 1
3,32,63.1,1,158,90.0,145,85,2,2,0,0,1,1,36.05,obese (class II),Stage 1
4,46,60.07,2,173,82.0,140,90,3,1,0,0,0,1,27.4,overweight,Stage 2


In [35]:
df_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19788 entries, 0 to 19787
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           19788 non-null  int64  
 1   age          19788 non-null  float64
 2   gender       19788 non-null  int64  
 3   height       19788 non-null  int64  
 4   weight       19788 non-null  float64
 5   ap_hi        19788 non-null  int64  
 6   ap_lo        19788 non-null  int64  
 7   cholesterol  19788 non-null  int64  
 8   gluc         19788 non-null  int64  
 9   smoke        19788 non-null  int64  
 10  alco         19788 non-null  int64  
 11  active       19788 non-null  int64  
 12  cardio       19788 non-null  int64  
 13  BMI          19788 non-null  float64
 14  BMI_cat      19788 non-null  object 
 15  bp_category  19788 non-null  object 
dtypes: float64(3), int64(11), object(2)
memory usage: 2.4+ MB


In [36]:
# This is the criteria for the first dataframe 
# I drop all the columns that are not needed
df_one = df_disease.drop(columns=["ap_hi", "ap_lo", "height", "weight", "BMI"])
# Get dummies (one hot encoding) makes columns for categorical data and then represents them with a 1 if they are applicable
df_one = pd.get_dummies(df_one, columns=["gender","BMI_cat", "bp_category"])
# I rename the column for gender so I know that they are either male or female
df_one.rename(columns= {"gender_1" : "female", "gender_2" : "male"}, inplace=True)
df_one.head(1)

Unnamed: 0,id,age,cholesterol,gluc,smoke,alco,active,cardio,female,male,BMI_cat_normal range,BMI_cat_obese (class I),BMI_cat_obese (class II),BMI_cat_obese (class III),BMI_cat_overweight,BMI_cat_underweight,bp_category_Elevated,bp_category_Healthy,bp_category_Stage 1,bp_category_Stage 2
0,1,55.38,3,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,1


In [37]:
# dropping the columns for the criteria of the second data frame
df_two = df_disease.drop(columns=["BMI_cat", "bp_category", "height", "weight"])
# one hot encoding one the gender column this time
df_two = pd.get_dummies(df_two, columns=["gender"])
# changing the gender columns to male and female
df_two.rename(columns= {"gender_1" : "female", "gender_2" : "male"}, inplace=True)
df_two.head(1)

Unnamed: 0,id,age,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI,female,male
0,1,55.38,140,90,3,1,0,0,1,1,34.93,1,0


# 2.4

In this part of the project I will begin to test some different models

In [38]:
# importing scaling so that it can improve the algorithm by ensures the model will not be biased
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# train test split will split the data so we can train the model then test it.
from sklearn.model_selection import train_test_split

In [39]:
# Creating train test split and validation for the first data frame, Cardio is on its own as this is what we want to predict.
X,y = df_one.drop("cardio", axis = "columns"), df_one["cardio"]

X.shape, y.shape # The shapes of the training data and the removed cardio column

((19788, 19), (19788,))

In [40]:
# Splitting The data frame into train|testsplit, using random_state = 42 to keep the same split over the data set, doing a 70/30 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 
# splitting the test data 50/50 to get validation data to train on. This is good so that we dont leak the test data all the time 
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

print(f"X train = {X_train.shape}\n X test = {X_test.shape}\n X val = {X_val.shape}\n y train = {y_train.shape}\n y test = {y_test.shape}\n y val = {y_val.shape}"  )

X train = (13257, 19)
 X test = (3266, 19)
 X val = (3265, 19)
 y train = (13257,)
 y test = (3266,)
 y val = (3265,)


In [41]:
# Feature scaling and normalization

# creating a standard scaler object, this subtracts the mean of the feature and dividing it by the standard deviation
scaler = StandardScaler()

# creating a MinMaxScaler object, This scales to a fixed range, normally 0 to 1. It subtracts the min value from the feature and then divides by the range.
minmax_scaler = MinMaxScaler()

# Fitting and transforming the data using standard scaler
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Fit and transforming the data using the minmax scaler
X_train_norm = minmax_scaler.fit_transform(X_train)
X_val_norm = minmax_scaler.transform(X_val)


### Train test split the second datframe and scaling

In [42]:
X2,y2 = df_two.drop("cardio", axis = "columns"), df_two["cardio"]
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size=0.33, random_state=42) 
X_val_2, X_test_2, y_val_2, y_test_2 = train_test_split(X_test_2, y_test_2, test_size=0.5, random_state=42)

scaler = StandardScaler()
# creating a MinMaxScaler object
minmax_scaler = MinMaxScaler()
# Fitting and transforming the data using standard scaler
X_train_2_scaled = scaler.fit_transform(X_train_2)
X_val_2_scaled = scaler.transform(X_val_2)
# Fit and transforming the data using the minmax scaler
X_train_2_norm = minmax_scaler.fit_transform(X_train_2)
X_val_2_norm = minmax_scaler.transform(X_val_2)

print(f"X train = {X_train_2.shape}\n X test = {X_test_2.shape}\n X val = {X_val_2.shape}\n y train = {y_train_2.shape}\n y test = {y_test_2.shape}\n y val = {y_val_2.shape}"  )

X train = (13257, 12)
 X test = (3266, 12)
 X val = (3265, 12)
 y train = (13257,)
 y test = (3266,)
 y val = (3265,)


### Logistic regression Data set 1

I will now train and test a logistic regression model on my split and processed data. I have chosen this algorithm as it is good for classifying data into different groups. As we have many features with categorical data such as binary and nominal. It will produce probability scores using a sigmoid function. Then predict whether the outcome is likely on a probability scale from 0 to 1.

#### standard scaler

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Defining the model and training it
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# defining the hyper parameters I wish to test on the model
hyper_param = {
    # This controls the coefficients, a small c means stronger regularization and simpler model,larger C means weaker regularization and more complex model 
    # np logspace searches the values between -2 and 20
    "C": np.logspace(-2,0,20),  
    # This determines the type of regularization to used. L1(Lasso), L2(Ridge)and elastic net is a combination of the two.
    "penalty" : ["l1", "l2", "elasticnet","None"],
    # This defines which algorithm will be used to optimize the model. 
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]}

# This is the grid search which will test the hyper parameters that we have chosen
# refit will find the best parameters based on accuracy, cv is the amount of cross folds that will be tested and verbose means that progress will not be printed in the console  
classifier_log_reg_ss = GridSearchCV(model, hyper_param, refit="accuracy", cv=5, verbose=0)
# Fitting the Grid search to the training data
log_reg_ss_cv = classifier_log_reg_ss.fit(X_train_scaled, y_train)

# Printing out the accuracy of the model and the best parameters. I also printed out the best estimator score based on the validation data.
print(f"Train accuracy {log_reg_ss_cv.best_score_}")
print(f"best params{log_reg_ss_cv.best_params_}")
print(f"Val accuracy {log_reg_ss_cv.best_estimator_.score(X_val_scaled, y_val)}")


1600 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise Valu

Train accuracy 0.7085307175128996
best params{'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}
Val accuracy 0.6934150076569678


#### min max scaler

I will now repeat the process with the Min max scaler on the first dataset

In [44]:
# Defining the model and training it
model = LogisticRegression()
model.fit(X_train_norm, y_train)

# defining the hyper parameters I wish to search on the model, I have selected the same parameters
hyper_param = {
    "C": np.logspace(-2,0,20),
    "penalty" : ["l1", "l2", "elasticnet","None"],
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]}

classifier_log_reg_norm = GridSearchCV(model, hyper_param,refit="accuracy", cv=5)
log_reg_norm_cv = classifier_log_reg_norm.fit(X_train_norm, y_train)

print(f"Train accuracy {log_reg_norm_cv.best_score_}")
print(f"best params{log_reg_norm_cv.best_params_}")
print(f"Val accuracy {log_reg_norm_cv.best_estimator_.score(X_val_norm, y_val)}")

Train accuracy 0.7072482964110984
best params{'C': 0.04281332398719394, 'penalty': 'l1', 'solver': 'liblinear'}
Val accuracy 0.692802450229709


1600 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise Valu

### data set 2

I will now do the same process on the second dataset

### scaler

In [45]:
# Defining the model and training it
model = LogisticRegression()
model.fit(X_train_2_scaled, y_train_2)

# defining the hyperparameters I wish to test on the model
hyper_param = {
    "C": np.logspace(-2,0,20),
    "penalty" : ["l1", "l2", "elasticnet","None"],
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]}

grid_search = GridSearchCV(model, hyper_param,refit="accuracy", cv=5)
log_reg2_ss_cv = grid_search.fit(X_train_2_scaled, y_train_2)

print(f"best score {log_reg2_ss_cv.best_score_}")
print(f"best params{log_reg2_ss_cv.best_params_}")
print(f"best estimator {log_reg2_ss_cv.best_estimator_.score(X_val_2_scaled, y_val_2)}")

best score 0.7347056490820221
best params{'C': 0.026366508987303583, 'penalty': 'l1', 'solver': 'saga'}
best estimator 0.7231240428790199


1600 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise Valu

### min max

In [46]:
model = LogisticRegression()
model.fit(X_train_2_norm, y_train_2)

# defining the hyperparameters I wish to test on the model
hyper_param = {
    "C": np.logspace(-2,0,20),
    "penalty" : ["l1", "l2", "elasticnet","None"],
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]}

grid_search = GridSearchCV(model, hyper_param,refit="accuracy", cv=5)
log_reg2_norm_cv = grid_search.fit(X_train_2_norm, y_train_2)

print(f"best score {log_reg2_norm_cv.best_score_}")
print(f"best params{log_reg2_norm_cv.best_params_}")
print(f"best estimator {log_reg2_norm_cv.best_estimator_.score(X_val_2_norm, y_val_2)}")

1600 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Sam Glass ITHS\.virtualenvs\Machine-learning-NSZCLOcg\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise Valu

best score 0.734781405235396
best params{'C': 0.026366508987303583, 'penalty': 'l1', 'solver': 'liblinear'}
best estimator 0.7200612557427258


I have now tested Logistic regression. It had a reasonable result, however I will now try other models to see if i get a better result. The best performing model was that of the second data set with standard scaler. This had the best estimator of the four

## Random Forest

I will now test Random Forest to see what kind of results that will predict with the four different scenario's. Random forest classifier is an algorithm thats classifies data by combining multiple decision trees. Each decision tree is trained on a random It helps in reducing over fitting as each tree is less likely to be biased. It takes the majority vote of all the individual decision trees to make a final classification. This makes it more accurate and robust.

In [47]:
# Importing random forest
from sklearn.ensemble import RandomForestClassifier

hyper_param = {
    # This determines the number of of decision trees to be used.More trees means more accuracy, however it can be slower to train and cause overfitting
    "n_estimators": [100, 150, 200],
    # The amount of features to be considered when splitting a node, sqrt does this by square root and log2 on logarithm.
    "max_features": ['sqrt', 'log2'],
    # This evaluated the quality of each split in the decision tree, gini measures the nodes impurity by the incorrect probability
    # entropy measures the amount of info gained splitting the a node based on the distribution class
    "criterion": ['gini', 'entropy']}

# chosing the model
model_forest = RandomForestClassifier()
# implementing grid search , entering in hyper parameters and crossfold at 5
grid_search = GridSearchCV(model_forest, hyper_param, cv=5,refit="accuracy", verbose=0)
# fitiing gridsearch to the data
rand_for_ss_cv = grid_search.fit(X_train_scaled, y_train)

print(f"best score {rand_for_ss_cv.best_score_}")
print(f"best params{rand_for_ss_cv.best_params_}")
print(f"best estimator {rand_for_ss_cv.best_estimator_.score(X_val_scaled, y_val)}")

best score 0.6736828869608953
best params{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 150}
best estimator 0.6643185298621745


### Min max scaler

In [48]:
hyper_param = {
    "n_estimators": [100, 150, 200],
    "max_features": ['sqrt', 'log2'],
    "criterion": ['gini', 'entropy']}

model_forest = RandomForestClassifier()
grid_search = GridSearchCV(model_forest, hyper_param, cv=5,refit="accuracy", verbose=0)
rand_for_norm_cv = grid_search.fit(X_train_norm, y_train)

print(f"best score {rand_for_norm_cv.best_score_}")
print(f"best params{rand_for_norm_cv.best_params_}")
print(f"best estimator {rand_for_norm_cv.best_estimator_.score(X_val_norm, y_val)}")

best score 0.6736077566563288
best params{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 200}
best estimator 0.6621745788667688


### Standard Scaler dataset 2

In [49]:
hyper_param = {
    "n_estimators": [100, 150, 200],
    "max_features": ['sqrt', 'log2'],
    "criterion": ['gini', 'entropy']}

model_forest = RandomForestClassifier()
grid_search = GridSearchCV(model_forest, hyper_param, cv=5,refit="accuracy", verbose=0)
rand_for2_ss_cv = grid_search.fit(X_train_2_scaled, y_train_2)

print(f"best score {rand_for2_ss_cv.best_score_}")
print(f"best params{rand_for2_ss_cv.best_params_}")
print(f"best estimator {rand_for2_ss_cv.best_estimator_.score(X_val_2_scaled, y_val_2)}")

best score 0.7276921028690617
best params{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 150}
best estimator 0.7261868300153139


### Min max scaler dataset 2

In [50]:
hyper_param = {
    "n_estimators": [100, 150, 200],
    "max_features": ['sqrt', 'log2'],
    "criterion": ['gini', 'entropy']}

model_forest = RandomForestClassifier()
grid_search = GridSearchCV(model_forest, hyper_param, cv=5,refit="accuracy", verbose=0)
rand_for2_norm_cv = grid_search.fit(X_train_2_norm, y_train)

print(f"best score {rand_for2_norm_cv.best_score_}")
print(f"best params{rand_for2_norm_cv.best_params_}")
print(f"best estimator {rand_for2_norm_cv.best_estimator_.score(X_val_2_norm, y_val)}")

best score 0.7271638864755781
best params{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 200}
best estimator 0.7283307810107198


## Bernoulli naive bayes model

For my 3rd model I have chosen to try a Bernoulli naive bayes algorithm. It uses probability distributions to represent different possible outcomes. It will calculate the probability of each possible outcome based on observed features, then choose the outcome with the highest probability. It works very well for binary features, but can be inaccurate if there is high correlations between features and if they are not binary. Within the dataset there is a good portion of binary features. So I thought it would be a good model to try

### Data set one standard scaler

In [51]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

param_grid = {
    # alpha controls the smoothing of the probabilities for each feature. a larger alpha causes more generalization and a smaller could cause over fitting
    "alpha": np.logspace(-3, 3, 7), # tests values from 0.001 to 1000
    #  This contols whether to learn class prior probabilities from the data
    "fit_prior": [True, False],
    # This converts features that are not binary to binary 
    "binarize": np.linspace(0.0, 1.0, 11)} # will test 11 values between 0.0 to 1.0

model_mnb = BernoulliNB()
grid_search = GridSearchCV(model_mnb, param_grid,refit="accuracy", cv=5)
mnb_ss_cv = grid_search.fit(X_train_scaled, y_train)

print(f"best score {mnb_ss_cv.best_score_}")
print(f"best params{mnb_ss_cv.best_params_}")
print(f"best estimator {mnb_ss_cv.best_estimator_.score(X_val_scaled, y_val)}")

best score 0.6997051967640203
best params{'alpha': 0.001, 'binarize': 0.2, 'fit_prior': True}
best estimator 0.6909647779479327


### Max min scaler

In [52]:
param_grid = {
    "alpha": np.logspace(-3, 3, 7), # tests values from 0.001 to 1000
    "fit_prior": [True, False],
    "binarize": np.linspace(0.0, 1.0, 11)} # will test 11 values between 0.0 to 1.0

model_mnb = BernoulliNB()
grid_search = GridSearchCV(model_mnb, param_grid,refit="accuracy", cv=5)
mnb_norm_cv = grid_search.fit(X_train_norm, y_train)

print(f"best score {mnb_norm_cv.best_score_}")
print(f"best params{mnb_norm_cv.best_params_}")
print(f"best estimator {mnb_norm_cv.best_estimator_.score(X_val_scaled, y_val)}")

best score 0.6991782036204784
best params{'alpha': 1.0, 'binarize': 0.7000000000000001, 'fit_prior': True}
best estimator 0.6900459418070444


### Data set 2 Standard scaler

In [53]:
param_grid = {
    "alpha": np.logspace(-3, 3, 7), # tests values from 0.001 to 1000
    "fit_prior": [True, False],
    "binarize": np.linspace(0.0, 1.0, 11)} # will test 11 values between 0.0 to 1.0

model_mnb = BernoulliNB()
grid_search = GridSearchCV(model_mnb, param_grid,refit="accuracy", cv=5)
mnb_ss2_cv = grid_search.fit(X_train_2_scaled, y_train)

print(f"best score {mnb_ss2_cv.best_score_}")
print(f"best params{mnb_ss2_cv.best_params_}")
print(f"best estimator {mnb_ss2_cv.best_estimator_.score(X_val_2_scaled, y_val)}")

best score 0.7232408385691276
best params{'alpha': 10.0, 'binarize': 0.0, 'fit_prior': True}
best estimator 0.7075038284839203


### Min max scaler dataset 2

In [54]:
param_grid = {
    "alpha": np.logspace(-3, 3, 7), # tests values from 0.001 to 1000
    "fit_prior": [True, False],
    "binarize": np.linspace(0.0, 1.0, 11)} # will test 11 values between 0.0 to 1.0

model_mnb = BernoulliNB()
grid_search = GridSearchCV(model_mnb, param_grid,refit="accuracy", cv=5)
mnb_ss2_norm_cv = grid_search.fit(X_train_2_norm, y_train)

print(f"best score {mnb_ss2_norm_cv.best_score_}")
print(f"best params{mnb_ss2_norm_cv.best_params_}")
print(f"best estimator {mnb_ss2_norm_cv.best_estimator_.score(X_val_2_scaled, y_val)}")

best score 0.7239194578101096
best params{'alpha': 0.001, 'binarize': 0.5, 'fit_prior': True}
best estimator 0.691271056661562
