In [68]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [92]:
X,y = make_classification(n_samples=1000, n_features=35, n_redundant=5, n_informative=15, class_sep=1.2, n_classes=2, n_clusters_per_class=1)
# plt.scatter(X[:,0], X[:,1], c=y, cmap='plasma')
# plt.show()
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=123)

In [93]:
from sklearn.model_selection import GridSearchCV


In [98]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier
dcc = DecisionTreeClassifier()
# param_grid_dcc = {
#     "criterion": ['gini', 'entropy','log_loss'],
#     "splitter": ['best','random'],
#     "max_depth": [i for i in range(11)],
#     "min_samples_split": [i for i in range(5,20)],
#     "max_features": [1,2],
#     "min_impurity_decrease": np.arange(0,2,0.01).tolist(),
# }

param_grid_dcc = {
    "criterion": ['gini'],
    "splitter": ['best','random'],
    "max_depth": [i for i in range(3,12)],
    "min_samples_split": [i for i in range(10,30)],
    "max_features": [10,15,20,25,30],
    "min_impurity_decrease": np.arange(0.001,0.1,0.01).tolist(),
}

dcc_grid = GridSearchCV(estimator = dcc, 
                       param_grid = param_grid_dcc, 
                       cv = 5, 
                       verbose=2, 
                       n_jobs = -1)
dcc_grid.fit(X_train, y_train)


Fitting 5 folds for each of 18000 candidates, totalling 90000 fits


In [99]:
print(dcc_grid.best_params_)
print(dcc_grid.best_score_)

{'criterion': 'gini', 'max_depth': 9, 'max_features': 30, 'min_impurity_decrease': 0.001, 'min_samples_split': 10, 'splitter': 'best'}
0.9275


In [100]:
dcc_grid.predict(X_test)
print(dcc_grid.score(X_test, y_test))

0.915


In [101]:
from sklearn.ensemble import BaggingClassifier

In [102]:
dcc_bagging = DecisionTreeClassifier(criterion = 'gini', max_depth=9, max_features= 30, min_impurity_decrease= 0.001, min_samples_split= 10, splitter= 'best')
# dcc_bagging = DecisionTreeClassifier()


In [112]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb.score(X_test, y_test)

0.94

In [113]:
param_grid_bag = {
    # Hyperparameters for the BaggingClassifier
    'n_estimators': [200],          # Number of base estimators
    'max_samples': [0.25,0.35,0.5,0.7],        # Fraction of samples for bootstrapping
    'max_features': [0.25,0.5, 1.0],       # Fraction of features for bootstrapping
    'bootstrap': [True, False],             # Whether to use bootstrap samples
    'bootstrap_features': [True, False], # Whether to use bootstrap for features
}
# bagging_class = BaggingClassifier(estimator=dcc_bagging, oob_score=True)
bagging_class = BaggingClassifier(estimator=gnb)
bag_grid = GridSearchCV(estimator = bagging_class, 
                       param_grid = param_grid_bag, 
                       cv = 5, 
                       verbose=5, 
                       n_jobs = -1)
bag_grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [114]:
print(bag_grid.best_params_)
print(bag_grid.best_score_)

{'bootstrap': True, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 0.7, 'n_estimators': 200}
0.95625


In [115]:
bag_grid.score(X_test, y_test)

0.945