In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

X = data_breast_cancer['data'][['mean texture', 'mean symmetry']]
y = data_breast_cancer['target']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
# Voting Classifier

tree_clf = DecisionTreeClassifier()
reg_log = LogisticRegression()
knn_clf = KNeighborsClassifier()

voting_clf_hard = VotingClassifier(
    estimators=[('tc', tree_clf),
                ('rl', reg_log), 
                ('kc', knn_clf)],
    voting='hard'
)
voting_clf_soft = VotingClassifier(
    estimators=[('tc', tree_clf),
                ('rl', reg_log), 
                ('kc', knn_clf)],
    voting='soft'
)

models = {"DecisionTreeClassifier" : tree_clf, "LogisticRegression" :  reg_log, "KNeighborsClassifier" :  knn_clf, "VotingClassifierHard" : voting_clf_hard, "VotingClassifierSoft" :  voting_clf_soft}
acc_results = []

for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_results.append((acc_train, acc_test))
    print(f'==== {name} ==== \ntrain data: {acc_train}, \ntest data: {acc_test}\n')

==== DecisionTreeClassifier ==== 
train data: 1.0, 
test data: 0.6491228070175439

==== LogisticRegression ==== 
train data: 0.7208791208791209, 
test data: 0.7105263157894737

==== KNeighborsClassifier ==== 
train data: 0.7824175824175824, 
test data: 0.6754385964912281

==== VotingClassifierHard ==== 
train data: 0.8571428571428571, 
test data: 0.7105263157894737

==== VotingClassifierSoft ==== 
train data: 0.9714285714285714, 
test data: 0.6842105263157895



In [5]:
with open("acc_vote.pkl", "wb") as f:
    pickle.dump(acc_results, f)
    
with open("vote.pkl", "wb") as f:
    pickle.dump(list(models.values()), f)

with open("acc_vote.pkl", "rb") as f:
    print(pickle.load(f))
    
with open("vote.pkl", "rb") as f:
    print(pickle.load(f))

[(1.0, 0.6491228070175439), (0.7208791208791209, 0.7105263157894737), (0.7824175824175824, 0.6754385964912281), (0.8571428571428571, 0.7105263157894737), (0.9714285714285714, 0.6842105263157895)]
[DecisionTreeClassifier(), LogisticRegression(), KNeighborsClassifier(), VotingClassifier(estimators=[('tc', DecisionTreeClassifier()),
                             ('rl', LogisticRegression()),
                             ('kc', KNeighborsClassifier())]), VotingClassifier(estimators=[('tc', DecisionTreeClassifier()),
                             ('rl', LogisticRegression()),
                             ('kc', KNeighborsClassifier())],
                 voting='soft')]


In [6]:
# Bagging and Pasting

estimators_amount = 30
base_clf = DecisionTreeClassifier()

bag_clf = BaggingClassifier(base_clf, n_estimators=estimators_amount, bootstrap=True)
bag_clf_half = BaggingClassifier(base_clf, n_estimators=estimators_amount, max_samples=0.5, bootstrap=True)
past_clf = BaggingClassifier(base_clf, n_estimators=estimators_amount, bootstrap=False)
past_clf_half = BaggingClassifier(base_clf, n_estimators=estimators_amount, max_samples=0.5, bootstrap=False)
rnd_forest_clf = RandomForestClassifier(n_estimators=estimators_amount)
adaboost = AdaBoostClassifier(n_estimators=estimators_amount)
gradientboost = GradientBoostingClassifier(n_estimators=estimators_amount)

models = {"BaggingClassifier" : bag_clf, "BaggingClassifier_half_samples" : bag_clf_half, "PastingClassifier" : past_clf, "PastingClassifier_half_samples" : past_clf_half, 
          "RandomForest" : rnd_forest_clf, "AdaBoostClassifier" : adaboost, "GradientBoostingClassifier" : gradientboost}
acc_results_2 = []

for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_results_2.append((acc_train, acc_test))
    print(f'==== {name} ==== \ntrain data: {acc_train}, \ntest data: {acc_test}\n')

==== BaggingClassifier ==== 
train data: 0.9956043956043956, 
test data: 0.6491228070175439

==== BaggingClassifier_half_samples ==== 
train data: 0.9252747252747253, 
test data: 0.6929824561403509

==== PastingClassifier ==== 
train data: 1.0, 
test data: 0.6578947368421053

==== PastingClassifier_half_samples ==== 
train data: 0.9758241758241758, 
test data: 0.6929824561403509

==== RandomForest ==== 
train data: 0.9978021978021978, 
test data: 0.6666666666666666

==== AdaBoostClassifier ==== 
train data: 0.778021978021978, 
test data: 0.7105263157894737

==== GradientBoostingClassifier ==== 
train data: 0.8373626373626374, 
test data: 0.7280701754385965



In [7]:
with open("acc_bag.pkl", "wb") as f:
    pickle.dump(acc_results_2, f)

with open("bag.pkl", "wb") as f:
    pickle.dump(list(models.values()), f)

with open("acc_bag.pkl", "rb") as f:
    print(pickle.load(f))

with open("bag.pkl", "rb") as f:
    print(pickle.load(f))

[(0.9956043956043956, 0.6491228070175439), (0.9252747252747253, 0.6929824561403509), (1.0, 0.6578947368421053), (0.9758241758241758, 0.6929824561403509), (0.9978021978021978, 0.6666666666666666), (0.778021978021978, 0.7105263157894737), (0.8373626373626374, 0.7280701754385965)]
[BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(),
                  n_estimators=30), BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(),
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [8]:
# Sampling

X = data_breast_cancer['data']
y = data_breast_cancer['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

fea_clf = BaggingClassifier(n_estimators=estimators_amount, bootstrap=True, max_features=2, bootstrap_features=False, max_samples=0.5)

fea_clf.fit(X_train, y_train)
y_pred_train = fea_clf.predict(X_train)
y_pred_test = fea_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
acc_results_3 = [acc_train, acc_test]
print(f'==== BaggingClassifierWithSampling ==== \ntrain data: {acc_train}, \ntest data: {acc_test}\n')

==== BaggingClassifierWithSampling ==== 
train data: 0.9956043956043956, 
test data: 0.9210526315789473



In [9]:
with open("acc_fea.pkl", "wb") as f:
    pickle.dump(acc_results_3, f)

with open("fea.pkl", "wb") as f:
    pickle.dump([fea_clf], f)

with open("acc_fea.pkl", "rb") as f:
    print(pickle.load(f))

with open("fea.pkl", "rb") as f:
    print(pickle.load(f))

[0.9956043956043956, 0.9210526315789473]
[BaggingClassifier(max_features=2, max_samples=0.5, n_estimators=30)]


In [10]:
# Features Ranking

estimators = fea_clf.estimators_
estimators_features = fea_clf.estimators_features_
acc_results_4 = []
for clf, features in zip(estimators, estimators_features):
    features_names = X.columns[features].to_list()
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_results_4.append((acc_train, acc_test, features_names))

df = pd.DataFrame(acc_results_4, columns=["acc_train", "acc_test", "features"])
df = df.sort_values(by=["acc_test", "acc_train"], ascending=False)

with open("acc_fea_rank.pkl", "wb") as f:
    pickle.dump(df, f)

with open("acc_fea_rank.pkl", "rb") as f:
    data = pickle.load(f)

data

Unnamed: 0,acc_train,acc_test,features
9,1.0,0.964912,"[mean concavity, worst fractal dimension]"
27,1.0,0.964912,"[worst concave points, worst symmetry]"
16,1.0,0.95614,"[worst radius, worst area]"
18,1.0,0.95614,"[worst smoothness, mean compactness]"
20,1.0,0.95614,"[mean concavity, worst smoothness]"
25,1.0,0.95614,"[worst fractal dimension, concave points error]"
2,1.0,0.947368,"[mean texture, mean area]"
5,1.0,0.947368,"[mean area, fractal dimension error]"
8,1.0,0.947368,"[concavity error, worst symmetry]"
15,1.0,0.947368,"[worst concave points, area error]"
