In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

X = data_breast_cancer['data'][['mean texture', 'mean symmetry']]
y = data_breast_cancer['target']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
# Voting Classifier

tree_clf = DecisionTreeClassifier()
reg_log = LogisticRegression()
knn_clf = KNeighborsClassifier()

voting_clf_hard = VotingClassifier(
    estimators=[('tc', tree_clf),
                ('rl', reg_log), 
                ('kc', knn_clf)],
    voting='hard'
)
voting_clf_soft = VotingClassifier(
    estimators=[('tc', tree_clf),
                ('rl', reg_log), 
                ('kc', knn_clf)],
    voting='soft'
)

models = {"DecisionTreeClassifier" : tree_clf, "LogisticRegression" :  reg_log, "KNeighborsClassifier" :  knn_clf, "VotingClassifierHard" : voting_clf_hard, "VotingClassifierSoft" :  voting_clf_soft}
acc_results = []

for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_results.append((acc_train, acc_test))
    print(f'==== {name} ==== \ntrain data: {acc_train}, \ntest data: {acc_test}\n')

==== DecisionTreeClassifier ==== 
train data: 1.0, 
test data: 0.6666666666666666

==== LogisticRegression ==== 
train data: 0.6945054945054945, 
test data: 0.7192982456140351

==== KNeighborsClassifier ==== 
train data: 0.7824175824175824, 
test data: 0.6052631578947368

==== VotingClassifierHard ==== 
train data: 0.8615384615384616, 
test data: 0.6666666666666666

==== VotingClassifierSoft ==== 
train data: 0.9714285714285714, 
test data: 0.7017543859649122



In [5]:
with open("acc_vote.pkl", "wb") as f:
    pickle.dump(acc_results, f)
    
with open("vote.pkl", "wb") as f:
    pickle.dump(list(models.values()), f)

with open("acc_vote.pkl", "rb") as f:
    print(pickle.load(f))
    
with open("vote.pkl", "rb") as f:
    print(pickle.load(f))

[(1.0, 0.6666666666666666), (0.6945054945054945, 0.7192982456140351), (0.7824175824175824, 0.6052631578947368), (0.8615384615384616, 0.6666666666666666), (0.9714285714285714, 0.7017543859649122)]
[DecisionTreeClassifier(), LogisticRegression(), KNeighborsClassifier(), VotingClassifier(estimators=[('tc', DecisionTreeClassifier()),
                             ('rl', LogisticRegression()),
                             ('kc', KNeighborsClassifier())]), VotingClassifier(estimators=[('tc', DecisionTreeClassifier()),
                             ('rl', LogisticRegression()),
                             ('kc', KNeighborsClassifier())],
                 voting='soft')]


In [6]:
# Bagging and Pasting

estimators_amount = 30
base_clf = DecisionTreeClassifier()

bag_clf = BaggingClassifier(base_clf, n_estimators=estimators_amount, bootstrap=True)
bag_clf_half = BaggingClassifier(base_clf, n_estimators=estimators_amount, max_samples=0.5, bootstrap=True)
past_clf = BaggingClassifier(base_clf, n_estimators=estimators_amount, bootstrap=False)
past_clf_half = BaggingClassifier(base_clf, n_estimators=estimators_amount, max_samples=0.5, bootstrap=False)
rnd_forest_clf = RandomForestClassifier(n_estimators=estimators_amount)
adaboost = AdaBoostClassifier(n_estimators=estimators_amount)
gradientboost = GradientBoostingClassifier(n_estimators=estimators_amount)

models = {"BaggingClassifier" : bag_clf, "BaggingClassifier_half_samples" : bag_clf_half, "PastingClassifier" : past_clf, "PastingClassifier_half_samples" : past_clf_half, 
          "RandomForest" : rnd_forest_clf, "AdaBoostClassifier" : adaboost, "GradientBoostingClassifier" : gradientboost}
acc_results_2 = []

for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_results_2.append((acc_train, acc_test))
    print(f'==== {name} ==== \ntrain data: {acc_train}, \ntest data: {acc_test}\n')

==== BaggingClassifier ==== 
train data: 0.9934065934065934, 
test data: 0.6491228070175439

==== BaggingClassifier_half_samples ==== 
train data: 0.9428571428571428, 
test data: 0.6754385964912281

==== PastingClassifier ==== 
train data: 1.0, 
test data: 0.6578947368421053

==== PastingClassifier_half_saples ==== 
train data: 0.9648351648351648, 
test data: 0.6754385964912281

==== RandomForest ==== 
train data: 0.9956043956043956, 
test data: 0.631578947368421

==== AdaBoostClassifier ==== 
train data: 0.7714285714285715, 
test data: 0.7192982456140351

==== GradientBoostingClassifier ==== 
train data: 0.8285714285714286, 
test data: 0.7456140350877193



In [7]:
with open("acc_bag.pkl", "wb") as f:
    pickle.dump(acc_results_2, f)

with open("bag.pkl", "wb") as f:
    pickle.dump(list(models.values()), f)

with open("acc_bag.pkl", "rb") as f:
    print(pickle.load(f))

with open("bag.pkl", "rb") as f:
    print(pickle.load(f))

[(0.9934065934065934, 0.6491228070175439), (0.9428571428571428, 0.6754385964912281), (1.0, 0.6578947368421053), (0.9648351648351648, 0.6754385964912281), (0.9956043956043956, 0.631578947368421), (0.7714285714285715, 0.7192982456140351), (0.8285714285714286, 0.7456140350877193)]
[BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(),
                  n_estimators=30), BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(),
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [8]:
# Sampling

fea_clf = BaggingClassifier(base_clf, n_estimators=estimators_amount, max_samples=0.5, max_features=2, bootstrap=True, bootstrap_features=False)

fea_clf.fit(X_train, y_train)
y_pred_train = fea_clf.predict(X_train)
y_pred_test = fea_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
acc_results_3 = [acc_train, acc_test]
print(f'==== BaggingClassifierWithSampling ==== \ntrain data: {acc_train}, \ntest data: {acc_test}\n')

==== BaggingClassifierWithSampling ==== 
train data: 0.9296703296703297, 
test data: 0.6666666666666666



In [9]:
with open("acc_fea.pkl", "wb") as f:
    pickle.dump(acc_results_3, f)

with open("fea.pkl", "wb") as f:
    pickle.dump([fea_clf], f)

with open("acc_fea.pkl", "rb") as f:
    print(pickle.load(f))

with open("fea.pkl", "rb") as f:
    print(pickle.load(f))

[0.9296703296703297, 0.6666666666666666]
[BaggingClassifier(estimator=DecisionTreeClassifier(), max_features=2,
                  max_samples=0.5, n_estimators=30)]


In [10]:
# Features Ranking

estimators = fea_clf.estimators_
estimators_features = fea_clf.estimators_features_
acc_results_4 = []
for clf, features in zip(estimators, estimators_features):
    features_names = X.columns[features].to_list()
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_results_4.append((acc_train, acc_test, features_names))

df = pd.DataFrame(acc_results_4, columns=["acc_train", "acc_test", "features"])
df = df.sort_values(by=["acc_test", "acc_train"], ascending=False)

with open("acc_fea_rank.pkl", "wb") as f:
    pickle.dump(df, f)

with open("acc_fea_rank.pkl", "rb") as f:
    data = pickle.load(f)

data

Unnamed: 0,acc_train,acc_test,features
25,1.0,0.692982,"[mean texture, mean symmetry]"
1,1.0,0.684211,"[mean texture, mean symmetry]"
8,1.0,0.684211,"[mean texture, mean symmetry]"
10,1.0,0.684211,"[mean texture, mean symmetry]"
11,1.0,0.684211,"[mean texture, mean symmetry]"
14,1.0,0.684211,"[mean texture, mean symmetry]"
16,1.0,0.684211,"[mean texture, mean symmetry]"
29,1.0,0.684211,"[mean texture, mean symmetry]"
3,1.0,0.675439,"[mean texture, mean symmetry]"
5,1.0,0.675439,"[mean texture, mean symmetry]"
