In [1]:
from preproc import create_dataset
from sklearn.metrics import confusion_matrix, accuracy_score
# StandardScaler because it scales the data to unit variance, which is good for this data since there's age data into it and StandardScaler makes all data into distrib of mean 0 unit variance => remove much of the magnitude from consideration (it improves the model by 2%)

In [2]:
X_train, X_test, y_train, y_test = create_dataset(
    1,
    1,
    test_size=0.4,
    return_tensors=False
)

<h1>FIT EVAL FUNCTION</h1>

In [57]:
def fit_and_eval(model, name):
  model.fit(X_train, y_train)
  preds1 = model.predict(X_train)
  preds = [round(val) for val in preds1]
  print('Accuracy on training data of', name, 'using %.3f' % (accuracy_score(preds, y_train)))
  # on new data
  test_preds = model.predict(X_test)
  print('Accuracy on training data of', name, 'using %.3f' % (accuracy_score(test_preds, y_test)))
  print(f'Confusion matrix on test data with {name}: \n', confusion_matrix(test_preds, y_test))

<h1>BUILDING THE MODELS</h1>

In [91]:
#Basic Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

models = dict()

lr = LogisticRegression(
  C=0.05,
  penalty='l2'
)
lr_pipe = make_pipeline(StandardScaler(), lr)
models['LogRes'] = lr_pipe

### RF grossly overfits
#XGBoost
from xgboost import XGBClassifier
import xgboost as xgb
xgbc = XGBClassifier(
  n_estimators=5, max_depth=5, 
  gamma=0.1
)
models['XGBC'] = xgbc

#SVM
from sklearn.svm import SVC
svc = make_pipeline(
  StandardScaler(),
  SVC(gamma=0.01, kernel='linear', probability=True)
)
models['SVC'] = svc
## Using Tree classifiers
from sklearn.ensemble import RandomForestClassifier
## Random Forest:
rf = RandomForestClassifier(
  n_estimators=3,
  max_depth=4, 
  criterion='gini',
)
models['RF'] = rf

#Naive Bayes
'''
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
models['GNB'] = gnb
'''

# K Neighbors
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(5)
models['KNC']=knc

<h1>FIT AND EVAL THE MODELS</h1>

In [92]:
for name in models.keys():
    fit_and_eval(models[name], name)

Accuracy on training data of LogRes using 0.745
Accuracy on training data of LogRes using 0.721
Confusion matrix on test data with LogRes: 
 [[188  75]
 [ 47 128]]
Accuracy on training data of XGBC using 0.803
Accuracy on training data of XGBC using 0.731
Confusion matrix on test data with XGBC: 
 [[195  78]
 [ 40 125]]
Accuracy on training data of SVC using 0.738
Accuracy on training data of SVC using 0.731
Confusion matrix on test data with SVC: 
 [[185  68]
 [ 50 135]]
Accuracy on training data of RF using 0.745
Accuracy on training data of RF using 0.715
Confusion matrix on test data with RF: 
 [[195  85]
 [ 40 118]]
Accuracy on training data of KNC using 0.780
Accuracy on training data of KNC using 0.685
Confusion matrix on test data with KNC: 
 [[180  83]
 [ 55 120]]


Model is bad at predicting if people don't have heart disease in general ==> we need to regularize more and not let them predict harshly

In [93]:
#### K fold cross validation
from numpy import mean, std
from sklearn.model_selection import KFold, cross_val_score
cv = KFold(n_splits=5, random_state=1, shuffle=True)
scores = cross_val_score(
  svc, X_test, y_test,
  scoring='accuracy', cv=cv, n_jobs=-1
)
print('Cross val accuracy: %.3f (%.3f) ' % (mean(scores), std(scores)))

Cross val accuracy: 0.729 (0.056) 


<h1>Stacked Classifier</h1>

In [94]:
from sklearn.ensemble import StackingClassifier
all_models = [*models.items()]
stacking = StackingClassifier(estimators=all_models, final_estimator=LogisticRegression())
fit_and_eval(stacking, "StackedModel")

Accuracy on training data of StackedModel using 0.799
Accuracy on training data of StackedModel using 0.747
Confusion matrix on test data with StackedModel: 
 [[194  70]
 [ 41 133]]


**Conclusion**
+ Stacked model is better than all other model by 1.4% to the best individual SVC 
- Overfits more and its precision is suffering (overpredicts ppl with no actual disease as having it)
+ Predicts ppl with heart disease better than ppl without it => useful because we want real ppl to be cautious of it anyways, and there's no harm with ppl being wrongly diagnosed with it because there's not much risk.
- However, around **<10%** of the diagnosis is risky (41 / 438 ppl to be false negatives i.e they actually have it but we say they don't). We want to *reduce this amount of false no-disease diagnoses*