In [1]:
import pandas as pd

# Read the blood transfusion table, then separate the "Class" column (the
# prediction target) from every other column (the input features).
blood_transfusion = pd.read_csv("../datasets/blood_transfusion.csv")
target = blood_transfusion["Class"]
data = blood_transfusion.drop(columns=["Class"])

In [7]:
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.dummy import DummyClassifier

# Baseline: a dummy classifier using the "most_frequent" strategy, scored
# with plain accuracy over 10 cross-validation folds.
cv_res = cross_validate(
    DummyClassifier(strategy="most_frequent"),
    X=data,
    y=target,
    scoring="accuracy",
    cv=10,
)

# Report the test accuracy averaged over the folds.
print(cv_res["test_score"].mean())

0.762054054054054


In [9]:
# Same "most_frequent" dummy baseline as a reference point, but scored with
# balanced accuracy instead of plain accuracy.
cv_res = cross_validate(
    DummyClassifier(strategy="most_frequent"),
    X=data,
    y=target,
    scoring="balanced_accuracy",
    cv=10,
)

# Report the balanced accuracy averaged over the folds.
print(cv_res["test_score"].mean())

0.5


In [12]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default hyper-parameters, evaluated with
# 10-fold cross-validation on balanced accuracy.
cv_res = cross_validate(
    DecisionTreeClassifier(),
    X=data,
    y=target,
    scoring="balanced_accuracy",
    cv=10,
)

# Report the balanced accuracy averaged over the folds.
print(cv_res["test_score"].mean())

0.5051857585139319


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 300 trees, evaluated with 10-fold cross-validation on
# balanced accuracy.
cv_res = cross_validate(
    RandomForestClassifier(n_estimators=300),
    X=data,
    y=target,
    scoring="balanced_accuracy",
    cv=10,
)

# Report the balanced accuracy averaged over the folds.
print(cv_res["test_score"].mean())

0.5292741658066735


In [26]:
from sklearn.ensemble import RandomForestClassifier

# Repeat the 10-fold cross-validation ten times and keep the mean accuracy of
# each repetition in `scores_rf`.
#
# With an integer `cv` and a classifier, scikit-learn documents that the
# stratified folds are built without shuffling, so every repetition sees the
# same splits and the only thing that varies between passes is the forest's
# own randomness. Seeding each repetition with its index keeps the ten runs
# distinct from one another while making the cell give the same numbers on a
# fresh kernel (Restart & Run All).
scores_rf = []

for seed in range(10):
    cv_res = cross_validate(
        RandomForestClassifier(n_estimators=300, random_state=seed),
        data, target, cv=10, scoring="accuracy")

    scores_rf.append(cv_res["test_score"].mean())

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

# Repeat the 10-fold cross-validation ten times and keep the mean accuracy of
# each repetition in `scores_grd`.
#
# With an integer `cv` and a classifier, scikit-learn documents that the
# stratified folds are built without shuffling, so every repetition sees the
# same splits and the only thing that varies between passes is the
# estimator's own randomness. Seeding each repetition with its index keeps
# the ten runs distinct from one another while making the cell give the same
# numbers on a fresh kernel (Restart & Run All).
scores_grd = []

for seed in range(10):
    cv_res = cross_validate(
        GradientBoostingClassifier(n_estimators=300, random_state=seed),
        data, target, cv=10, scoring="accuracy")

    scores_grd.append(cv_res["test_score"].mean())

In [32]:
import pandas as pd
import numpy as np

# Print the average of the per-repetition accuracies collected for the
# random forest and for gradient boosting, one line per model.
for label, scores in (
    ("Mean of RF Trees:", scores_rf),
    ("Mean of GB Trees:", scores_grd),
):
    print(label, np.mean(scores))

Mean of RF Trees: 0.6680738738738741
Mean of GB Trees: 0.691064864864865


In [50]:
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting with up to 1000 iterations and early
# stopping enabled, evaluated with 10-fold cross-validation on two metrics.
# `return_estimator=True` keeps the fitted estimator of every fold under
# cv_res["estimator"].
cv_res = cross_validate(
    HistGradientBoostingClassifier(max_iter=1000, early_stopping=True),
    X=data,
    y=target,
    cv=10,
    scoring=["accuracy", "balanced_accuracy"],
    return_estimator=True,
)

# With a list of scorers the results are keyed as "test_<scorer name>".
print("Mean Accuracy: ", cv_res["test_accuracy"].mean())
print("Mean Balanced Accuracy: ", cv_res["test_balanced_accuracy"].mean())

Mean Accuracy:  0.7247387387387387
Mean Balanced Accuracy:  0.5573271413828689


In [55]:
# Read the `n_iter_` attribute of each fitted estimator stored in
# cv_res["estimator"] and print the average across the folds.
tree_numbers = [fold_model.n_iter_ for fold_model in cv_res["estimator"]]

print(np.mean(tree_numbers))

27.5


In [57]:
from imblearn.ensemble import BalancedBaggingClassifier

# Balanced bagging ensemble of 50 histogram-based gradient boosting models
# (default hyper-parameters), evaluated with 10-fold cross-validation on both
# accuracy and balanced accuracy. The fitted estimator of each fold is kept.
cv_res = cross_validate(
    BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(),
        n_estimators=50,
    ),
    X=data,
    y=target,
    cv=10,
    scoring=["accuracy", "balanced_accuracy"],
    return_estimator=True,
)

# With a list of scorers the results are keyed as "test_<scorer name>".
print("Mean Accuracy: ", cv_res["test_accuracy"].mean())
print("Mean Balanced Accuracy: ", cv_res["test_balanced_accuracy"].mean())

Mean Accuracy:  0.6073693693693694
Mean Balanced Accuracy:  0.5918386652906775
