# Machine Learning algorithm testing

In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# List the working directory so the available dataset variants are visible.
for file in os.listdir():
    print(file)

# Train and test must go through the same preprocessing. The original cell
# paired the *unscaled* undersampled training file with the *scaled* test file,
# so models were fitted and evaluated on differently scaled features.
# NOTE(review): file contents are inferred from the file names - confirm that
# "scaled_undersampling_train" is the scaled counterpart of the undersampled set.
df_train= pd.read_csv("02_framingham_scaled_undersampling_train.csv")
df_test= pd.read_csv("02_framingham_scaled_test.csv")


00_framingham.csv
01_Data_preprocessing.ipynb
01_framingham_clean.csv
02_framingham_oversampling_scaled_train.csv
02_framingham_oversampling_train.csv
02_framingham_scaled_smote_train.csv
02_framingham_scaled_test.csv
02_framingham_scaled_train.csv
02_framingham_scaled_undersampling_train.csv
02_framingham_smote_train.csv
02_framingham_test.csv
02_framingham_undersampling_train.csv
02_Statistical_analyse.ipynb
04_ml_training.ipynb
05_risk_factors.ipynb
cvd_risk_model.pkl
feature_importances.csv
ml_training_smote.ipynb


In [125]:
# Class balance of the target column in the training set.
sizes = df_train['TenYearCHD'].value_counts()
print(sizes)
# Share of each class in percent. The counts are indexed by class label:
# 0 is reported as "no CVD risk", 1 as "CVD risk" (the original printed the
# "no CVD Risk" label for both classes).
print('Size of no CVD Risk',np.round(100/sizes.sum()*sizes[0],2),'%')
print('Size of CVD Risk',np.round(100/sizes.sum()*sizes[1],2),'%')

TenYearCHD
0    2871
1    2871
Name: count, dtype: int64
Size of no CVD Risk 50.0 %
Size of no CVD Risk 50.0 %


In [None]:
# Separate features (X) from the target (y) for both data sets:
# X is every column except "TenYearCHD", y is the "TenYearCHD" column itself.
X_train, y_train = df_train.drop(columns=["TenYearCHD"]), df_train["TenYearCHD"]
X_test, y_test = df_test.drop(columns=["TenYearCHD"]), df_test["TenYearCHD"]

In [None]:
# Re-attach the target column to the training features, aligned on the row
# index, so that features and target live in one frame again.
train_set = X_train.join(y_train, how="inner")
train_set.head()

In [None]:
import matplotlib.pyplot as plt

# Absolute pairwise correlations of all columns, computed once and reused
# for both the mask and the plot.
abs_corr = train_set.corr().abs()

# True on the diagonal and above it: those cells are hidden so that only the
# lower triangle of the symmetric matrix is drawn.
mask = np.triu(np.ones_like(abs_corr, dtype=bool))

plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(
    abs_corr,
    mask=mask,
    vmin=0,
    vmax=0.6,
    annot=True,
    cmap="YlGnBu",
    fmt=".2f",
)
heatmap.set_title("Triangle Correlation Heatmap", fontdict={"fontsize": 18}, pad=16)
plt.show()

In [None]:
# Correlation of the independent variables with the dependent variable.
plt.figure(figsize=(6,8))
# Absolute correlation of every column with the target, strongest first.
# The target's own row is removed by label: the previous `correlation < 1`
# filter only masked it to NaN (leaving an empty row in the heatmap) and
# depended on the self-correlation being exactly 1.0.
correlation = (
    train_set.corr()[["TenYearCHD"]]
    .abs()
    .drop(index="TenYearCHD")
    .sort_values(by= "TenYearCHD", ascending= False)
)
heatmap = sns.heatmap(correlation, annot= True, cmap= "YlGnBu")
heatmap.set_title("Correlation of Independent Variables with the Dependent Variable", fontdict= {"fontsize": 18}, pad= 16)
plt.show()

# Machine Learning Models

In [None]:
# !pip install xgboost

In [None]:
# import all necessary models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve


In [None]:
# model 1: logistic regression, fitted on the training set and scored on the test set
m1 = "Logistic Regression"
# max_iter is raised from the default of 100: with the default the solver can
# stop before converging (ConvergenceWarning), which leaves the coefficients
# - and therefore the reported metrics - dependent on where it happened to stop.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)
# Confusion matrix and accuracy of the test-set predictions.
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)
print("confussion matrix")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Logistic Regression model:",lr_acc_score*100, "\n")
# Per-class precision / recall / F1.
print(classification_report(y_test, lr_predict))

In [None]:
# model 2: Gaussian naive Bayes, fitted on the training set
m2 = "Naive Bayes"
nb = GaussianNB().fit(X_train, y_train)

# Score the test-set predictions.
nb_pred = nb.predict(X_test)
nb_acc_score = accuracy_score(y_test, nb_pred)
nb_conf_matrix = confusion_matrix(y_test, nb_pred)

# Report: confusion matrix, accuracy in percent, per-class metrics.
print("confussion matrix", nb_conf_matrix, "\n", sep="\n")
print("Accuracy of Naive Bayes model:", nb_acc_score*100, "\n")
print(classification_report(y_test, nb_pred))

In [None]:
# model 3: a small random forest (20 trees, depth limited to 5, fixed seed)
m3 = "Random Forest Classifier"
rf = RandomForestClassifier(
    n_estimators=20,
    max_depth=5,
    random_state=12,
).fit(X_train, y_train)

# Score the test-set predictions.
rf_predicted = rf.predict(X_test)
rf_acc_score = accuracy_score(y_test, rf_predicted)
rf_conf_matrix = confusion_matrix(y_test, rf_predicted)

# Report: confusion matrix, accuracy in percent, per-class metrics.
print("confussion matrix", rf_conf_matrix, "\n", sep="\n")
print("Accuracy of Random Forest model:", rf_acc_score*100, "\n")
print(classification_report(y_test, rf_predicted))

In [None]:
import joblib

# Feature importances of the fitted forest `rf`, labelled with the training
# column names.
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)

# Horizontal bar chart of the importances, sorted in ascending order.
ax = feature_importances.sort_values().plot(kind='barh')
ax.set_title("Feature Importances")
ax.set_xlabel("Importance")
ax.set_ylabel("Features")
plt.show()

# Persist the (unsorted) importances as CSV ...
feature_importances.to_csv("feature_importances.csv")

# ... and the fitted forest itself as a pickle file.
cvd_risk_model = rf
joblib.dump(cvd_risk_model, "cvd_risk_model.pkl")

print("Feature importances and model saved successfully.")


In [None]:
# model 4: XGBoost classifier with hand-picked hyperparameters
m4 = 'Extreme Gradient Boost'
xgb = XGBClassifier(
    booster='dart',
    learning_rate=0.01,
    n_estimators=25,
    max_depth=15,
    gamma=0.6,
    reg_lambda=2,
    subsample=0.52,
    colsample_bytree=0.6,
    colsample_bylevel=0.6,
    colsample_bynode=0.5,
    seed=27,
)
xgb.fit(X_train, y_train)

# Score the test-set predictions.
xgb_predicted = xgb.predict(X_test)
xgb_acc_score = accuracy_score(y_test, xgb_predicted)
xgb_conf_matrix = confusion_matrix(y_test, xgb_predicted)

# Report: confusion matrix, accuracy in percent, per-class metrics.
print("confussion matrix", xgb_conf_matrix, "\n", sep="\n")
print("Accuracy of Extreme Gradient Boost:",xgb_acc_score*100,'\n')
print(classification_report(y_test,xgb_predicted))

In [None]:
# model 5: k-nearest neighbours with k = 10
m5 = 'K-NeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Score the test-set predictions.
knn_predicted = knn.predict(X_test)
knn_acc_score = accuracy_score(y_test, knn_predicted)
knn_conf_matrix = confusion_matrix(y_test, knn_predicted)

# Report: confusion matrix, accuracy in percent, per-class metrics.
print("confussion matrix", knn_conf_matrix, "\n", sep="\n")
print("Accuracy of K-NeighborsClassifier:",knn_acc_score*100,'\n')
print(classification_report(y_test,knn_predicted))

In [None]:
# model 6: single decision tree (entropy criterion, depth limited to 6, fixed seed)
m6 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    random_state=0,
).fit(X_train, y_train)

# Score the test-set predictions.
dt_predicted = dt.predict(X_test)
dt_acc_score = accuracy_score(y_test, dt_predicted)
dt_conf_matrix = confusion_matrix(y_test, dt_predicted)

# Report: confusion matrix, accuracy in percent, per-class metrics.
print("confussion matrix", dt_conf_matrix, "\n", sep="\n")
print("Accuracy of DecisionTreeClassifier:",dt_acc_score*100,'\n')
print(classification_report(y_test,dt_predicted))

In [None]:
# model 7: support vector classifier with an RBF kernel and C = 2
m7 = 'Support Vector Classifier'
svc = SVC(C=2, kernel='rbf').fit(X_train, y_train)

# Score the test-set predictions.
svc_predicted = svc.predict(X_test)
svc_acc_score = accuracy_score(y_test, svc_predicted)
svc_conf_matrix = confusion_matrix(y_test, svc_predicted)

# Report: confusion matrix, accuracy in percent, per-class metrics.
print("confussion matrix", svc_conf_matrix, "\n", sep="\n")
print("Accuracy of Support Vector Classifier:",svc_acc_score*100,'\n')
print(classification_report(y_test,svc_predicted))

In [None]:
from pprint import pprint

# Starting point for tuning the random forest (manual from Towards Data Science):
# a forest with library-default hyperparameters and a fixed seed.
# model 3
m3 = "Random Forest Classifier"
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the test-set predictions.
rf_predicted = rf.predict(X_test)
rf_acc_score = accuracy_score(y_test, rf_predicted)
rf_conf_matrix = confusion_matrix(y_test, rf_predicted)

# Report: confusion matrix, accuracy in percent, per-class metrics.
print("confussion matrix", rf_conf_matrix, "\n", sep="\n")
print("Accuracy of Random Forest model:", rf_acc_score*100, "\n")
print(classification_report(y_test, rf_predicted))

# Pretty-print every hyperparameter the current forest uses (i.e. the defaults).
print('Parameters currently in use:\n')
pprint(rf.get_params())


# RandomizedSearchCV

In [None]:
# Search space for tuning the random forest with RandomizedSearchCV.
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]

# Number of features to consider at every split.
# 'auto' is no longer listed: for classifiers it was an alias of 'sqrt' (so it
# only duplicated that candidate) and scikit-learn 1.3 removed it, which makes
# every sampled candidate using it fail to fit.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = no limit)
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# random grid for the predefined parameters
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# The base estimator gets its own name: RandomizedSearchCV fits clones of the
# estimator it is given, so rebinding `rf` here would replace the fitted forest
# from the earlier cell with an estimator that is never fitted.
# A fixed random_state makes the candidate forests reproducible between runs.
rf_base = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=random_grid,
    n_iter=100,        # number of sampled parameter combinations
    cv=3,              # folds per combination
    verbose=2,
    random_state=42,   # seed for sampling the combinations
    n_jobs=-1,         # use all available cores
)

# fit the random search
rf_random.fit(X_train,y_train)

# documentation of random search CV
n_iter: controls the number of different combinations to try

cv: which number of folds to use for cross validation

More iterations cover a wider search space and more CV folds reduce the chance of overfitting, but increasing either leads to longer run times.

In [None]:
# View the best parameters: the sampled hyperparameter combination that
# achieved the best mean cross-validated score in the random search.
rf_random.best_params_

In [None]:
# Evaluate a forest built with tuned hyperparameters.
# NOTE(review): the values are hard-coded - presumably copied from an earlier
# run of the random search; confirm they still match rf_random.best_params_.
m3 = "Random Forest Classifier"
rf2 = RandomForestClassifier(
    n_estimators=1600,
    max_depth=20,
    max_features="sqrt",
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=True,
    random_state=42,
).fit(X_train, y_train)

# Score the test-set predictions.
rf2_predicted = rf2.predict(X_test)
rf2_acc_score = accuracy_score(y_test, rf2_predicted)
rf2_conf_matrix = confusion_matrix(y_test, rf2_predicted)

# Report: confusion matrix, accuracy in percent, per-class metrics.
print("confussion matrix", rf2_conf_matrix, "\n", sep="\n")
print("Accuracy of Random Forest model:", rf2_acc_score*100, "\n")
print(classification_report(y_test, rf2_predicted))


In [None]:
# Feature importances of the tuned forest rf2, plotted from most to least important.
importances = rf2.feature_importances_
# Positions of the features ordered by descending importance.
indices = np.argsort(importances)[::-1]
features = X_train.columns

plt.figure(figsize=(10,5))
plt.title("Feature importances using Random Forest")
plt.bar(range(X_train.shape[1]), importances[indices])
# The tick labels must be reordered with the same `indices` as the bars;
# using `features` unsorted put the column names under the wrong bars.
plt.xticks(range(X_train.shape[1]), features[indices], rotation=90)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Refined (narrower) hyperparameter candidates for a second random search.
n_estimators = [100, 300, 500, 700, 1000]
# 'auto' is no longer listed: for classifiers it was an alias of 'sqrt' (so it
# only duplicated that candidate) and scikit-learn 1.3 removed it, which makes
# every sampled candidate using it fail to fit.
max_features = ['sqrt', 'log2']
max_depth = [10, 20, 30, 40, 50]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Refined random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

# Instantiate RandomForest and RandomizedSearchCV:
# 50 sampled combinations, 5-fold cross-validation, ranked by ROC AUC.
rf3 = RandomForestClassifier(random_state=42)
rf3_random = RandomizedSearchCV(estimator=rf3, param_distributions=random_grid,
                               n_iter=50, cv=5, verbose=2, random_state=42,
                               n_jobs=-1, scoring='roc_auc')

# Fit the model
rf3_random.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the refined random search
# (ranked by mean cross-validated ROC AUC, the `scoring` passed to the search).
rf3_random.best_params_

In [None]:
# Evaluate the optimized model again, this time with the hyperparameters of
# the refined search.
# NOTE(review): the values are hard-coded - presumably copied from an earlier
# run of the refined search; confirm they still match rf3_random.best_params_.

rf4 = RandomForestClassifier(n_estimators = 1000, min_samples_split = 2, min_samples_leaf = 1, max_features = "sqrt", max_depth = 10, bootstrap = True, random_state=42)
rf4.fit(X_train,y_train)
rf4_predicted = rf4.predict(X_test)
rf4_conf_matrix = confusion_matrix(y_test, rf4_predicted)
rf4_acc_score = accuracy_score(y_test, rf4_predicted)

print("confussion matrix")
# Print the matrix of *this* model; the original printed rf2_conf_matrix, so
# the matrix shown did not belong to the accuracy and report below it.
print(rf4_conf_matrix)
print("\n")
print("Accuracy of Random Forest model:", rf4_acc_score*100, "\n")
print(classification_report(y_test, rf4_predicted))

In [None]:
# Feature importances of the tuned forest rf4, plotted from most to least important.
importances = rf4.feature_importances_
# Positions of the features ordered by descending importance.
indices = np.argsort(importances)[::-1]
features = X_train.columns

plt.figure(figsize=(10,5))
plt.title("Feature importances using Random Forest")
plt.bar(range(X_train.shape[1]), importances[indices])
# The tick labels must be reordered with the same `indices` as the bars;
# using `features` unsorted put the column names under the wrong bars.
plt.xticks(range(X_train.shape[1]), features[indices], rotation=90)
plt.show()

# save the model and feature importance

In [None]:
import joblib

# Persist the tuned forest that was evaluated last (rf4).
# The original saved `rf`, but by this point `rf` has been rebound to the
# never-fitted base estimator of the random search, so the pickle overwrote the
# earlier working model with one that cannot predict.
# NOTE(review): rf4 is chosen as the latest tuned model - switch to rf2 if that
# is the model to ship.
cvd_risk_model = rf4

# save the model as pkl
joblib.dump(cvd_risk_model, "cvd_risk_model.pkl")

# save the model importances (same CSV layout as the earlier export: one row
# per feature, indexed by column name), so model and importances stay in sync
pd.Series(cvd_risk_model.feature_importances_, index=X_train.columns).to_csv("feature_importances.csv")



In [None]:
# EMRBots: artificially generated electronic medical records
# CDSS in R or Python

# exercise folder moodle

# 100 patients folder

In [None]:
# min max scaler!

# TPOT

In [None]:
# !pip install tpot

In [None]:
from tpot import TPOTClassifier

# Automated pipeline search with TPOT: 4 generations with a population of 10.
# TPOT's search is stochastic; a fixed random_state makes the selected pipeline
# and the metrics below reproducible across runs.
tpot = TPOTClassifier(generations = 4, population_size=10, verbosity = 3, random_state=42)
fitting = tpot.fit(X_train, y_train)
# print('TPOT Score: ',tpot.score(X_test, y_test))

# Evaluate the best pipeline found on the held-out test set.
y_pred = tpot.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_pred))