In [27]:
from ensemble_methods import BaggingClassifier, RandomForestClassifier, AdaBoost
import pandas as pd

# Read the cleaned dataset from the CSV that sits next to the notebook.
df = pd.read_csv('tahkeer_data_cleaned.csv')

# "smoking" is the target; every other column in the frame is used as a predictor.
class_y = df["smoking"]
columns = list(df.columns)
columns.remove("smoking")
features_x = df[columns]

# Bare last expression: show the loaded frame as this cell's output.
df

Unnamed: 0,Gtp,triglyceride,weight(kg)_waist(cm)_interaction,Cholesterol_LDL_aggregation,HDL,serum creatinine,fasting blood sugar,systolic,AST,smoking
0,0.548520,0.927055,0.144928,0.647329,0.204545,0.768622,0.288591,0.611111,0.461818,1
1,0.614924,0.493127,0.214171,0.742554,0.397727,0.800437,0.644295,0.733333,0.534610,0
2,0.690662,0.819477,0.217391,0.680603,0.261364,0.694135,0.187919,0.422222,0.534610,1
3,0.570725,0.827151,0.565217,0.693479,0.181818,0.768622,0.268456,0.566667,0.427941,0
4,0.451021,0.610424,0.142512,0.625588,0.250000,0.694135,0.268456,0.455556,0.409709,1
...,...,...,...,...,...,...,...,...,...,...
157933,0.394484,0.452922,0.043478,0.845072,0.568182,0.694135,0.087248,0.522222,0.507255,0
157934,0.463068,0.825888,0.224638,0.765632,0.477273,0.598104,0.255034,0.444444,0.445283,0
157935,0.377615,0.441799,0.048309,0.704446,0.738636,0.598104,0.221477,0.377778,0.325687,0
157936,0.614924,0.746325,0.297101,0.636700,0.375000,0.800437,0.476510,0.455556,0.461818,1


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# First cut: 70% train / 30% hold-out. shuffle=False keeps the file order, so the
# training set is the leading 70% of the rows and the hold-out is the trailing 30%.
# NOTE(review): if the CSV rows are ordered in any meaningful way the hold-out may
# not be representative of the training rows — confirm the unshuffled split is intended.
xtrain, xholdout, ytrain, yholdout = train_test_split(
    features_x, class_y, test_size=0.30, shuffle=False, train_size=0.70
)

# Second cut: divide the hold-out evenly into the test and validation sets.
# This split shuffles, so random_state is pinned; without it the membership of the
# two sets (and every accuracy reported below) changes on each Restart & Run All.
xtest, xvalidate, ytest, yvalidate = train_test_split(
    xholdout, yholdout, test_size=0.50, shuffle=True, train_size=0.50, random_state=42
)

# Bagging

In [29]:
# Build the bagging ensemble and fit it on the training split.
model_bagging = BaggingClassifier(n_estimators=300, max_depth=90)
model_bagging.fit(xtrain, ytrain)

# Predict on both held-out splits (test first, then validation).
test_predications = model_bagging.predict(xtest)
validation_predications = model_bagging.predict(xvalidate)

# Fraction of correctly classified rows per split; printed by the following cell.
test_accuracy = accuracy_score(y_true=ytest, y_pred=test_predications)
validation_accuracy = accuracy_score(y_true=yvalidate, y_pred=validation_predications)

In [30]:
# Report the bagging accuracies as percentages rounded to two decimal places.
# test_accuracy / validation_accuracy are reassigned by every model cell, so this
# cell reports whichever model cell ran last — run it right after the bagging cell.
test_pct = round(test_accuracy*100, 2)
validation_pct = round(validation_accuracy*100, 2)
print(f"Bagging Test Accuracy: {test_pct}%")
print(f"Bagging Validation Accuracy: {validation_pct}%")

Bagging Test Accuracy: 72.99%
Bagging Validation Accuracy: 73.79%


# AdaBoost

In [31]:
# Build the AdaBoost ensemble and fit it on the training split.
model_boost = AdaBoost(n_estimators=300)
model_boost.fit(xtrain, ytrain)

# Predict on both held-out splits (test first, then validation).
test_predications = model_boost.predict(xtest)
validation_predications = model_boost.predict(xvalidate)

# Fraction of correctly classified rows per split; printed by the following cell.
test_accuracy = accuracy_score(y_true=ytest, y_pred=test_predications)
validation_accuracy = accuracy_score(y_true=yvalidate, y_pred=validation_predications)

In [32]:
# Report the AdaBoost accuracies as percentages rounded to two decimal places.
# test_accuracy / validation_accuracy are reassigned by every model cell, so this
# cell reports whichever model cell ran last — run it right after the AdaBoost cell.
test_pct = round(test_accuracy*100, 2)
validation_pct = round(validation_accuracy*100, 2)
print(f"Boosting Test Accuracy: {test_pct}%")
print(f"Boosting Validation Accuracy: {validation_pct}%")

Boosting Test Accuracy: 71.72%
Boosting Validation Accuracy: 72.02%


# Random Forest

In [33]:
# Build the random forest and fit it on the training split.
model_rf = RandomForestClassifier(
    n_estimators=250,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
)
model_rf.fit(xtrain, ytrain)

# Predict on both held-out splits (test first, then validation).
test_predications = model_rf.predict(xtest)
validation_predications = model_rf.predict(xvalidate)

# Fraction of correctly classified rows per split; printed by the following cell.
test_accuracy = accuracy_score(y_true=ytest, y_pred=test_predications)
validation_accuracy = accuracy_score(y_true=yvalidate, y_pred=validation_predications)

In [34]:
# Report the random forest accuracies as percentages rounded to two decimal places.
# test_accuracy / validation_accuracy are reassigned by every model cell, so this
# cell reports whichever model cell ran last — run it right after the random forest cell.
test_pct = round(test_accuracy*100, 2)
validation_pct = round(validation_accuracy*100, 2)
print(f"Random Forest Test Accuracy: {test_pct}%")
print(f"Random Forest Validation Accuracy: {validation_pct}%")

Random Forest Test Accuracy: 72.36%
Random Forest Validation Accuracy: 72.52%


# Testing with real data

In [37]:
from statistics import mode

def _smoker_label(flag):
    """Return "Smoker" for a truthy prediction and "Non smoker" for a falsy one."""
    return "Smoker" if flag else "Non smoker"


# One hand-entered sample that the author labels as a smoker.
# NOTE(review): these nine values are identical to row 0 of the frame displayed by
# the loading cell, and the first split is unshuffled, so this row is part of the
# training split — treat this cell as a sanity check, not an out-of-sample test.
sample = {
    'Gtp': 0.548520,
    'triglyceride': 0.927055,
    'weight(kg)_waist(cm)_interaction': 0.144928,
    'Cholesterol_LDL_aggregation': 0.647329,
    'HDL': 0.204545,
    'serum creatinine': 0.768622,
    'fasting blood sugar': 0.288591,
    'systolic': 0.611111,
    'AST': 0.461818,
}
data = pd.DataFrame([sample])

# Ask each fitted model for its prediction on the single row.
# Only the boosting result is cast to int, as in the original cell.
prediction_bagging = model_bagging.predict(data)[0]
prediction_boost = int(model_boost.predict(data)[0])
prediction_rf = model_rf.predict(data)[0]

print("Bagging: " + _smoker_label(prediction_bagging) + " " + str(prediction_bagging))
print("Boost: " + _smoker_label(prediction_boost) + " " + str(prediction_boost))
print("Random Forest: " + _smoker_label(prediction_rf) + " " + str(prediction_rf))

# Majority vote: the most common of the three individual predictions.
prediction = mode([prediction_bagging, prediction_boost, prediction_rf])

print("Total: " + _smoker_label(prediction) + " " + str(prediction))

Bagging: Smoker 1
Boost: Smoker 1
Random Forest: Smoker 1
Total: Smoker 1
