In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load scikit-learn's breast cancer dataset; as_frame=True yields pandas
# objects, so the feature columns can be selected by name below.
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

# Hold out 20% of the samples for testing. Only two columns are used as
# features. The seed is fixed so that the split (and therefore every accuracy
# derived from it) is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data_breast_cancer['data'][['mean texture', 'mean symmetry']],
    data_breast_cancer['target'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pickle

# Base learners taking part in the vote.
decision_tree_clf = DecisionTreeClassifier()
# NOTE(review): despite its name this is an SGD-trained linear model with
# log loss, not sklearn's LogisticRegression (which is imported above but
# unused) -- confirm this is intended.
logistic_regression_clf = SGDClassifier(loss='log_loss')
knn_clf = KNeighborsClassifier()

# (label, estimator) pairs shared by both voting schemes; each ensemble gets
# its own copy of the list.
base_estimators = [
    ('dt', decision_tree_clf),
    ('lr', logistic_regression_clf),
    ('knn', knn_clf),
]

# Hard voting: majority of predicted labels. Soft voting: averaged class
# probabilities.
ensemble_hard = VotingClassifier(estimators=list(base_estimators), voting='hard')
ensemble_soft = VotingClassifier(estimators=list(base_estimators), voting='soft')

ensemble_hard.fit(X_train, y_train)
ensemble_soft.fit(X_train, y_train)

In [None]:
# (train accuracy, test accuracy) for each model, in the order:
# decision tree, SGD/log-loss, kNN, hard-voting ensemble, soft-voting ensemble.
list_of_accuracy = []

# Fit every stand-alone classifier exactly once and score that single fitted
# model on both splits. Refitting between the two scores would compare two
# different models for the randomised learners (SGD shuffling, tree
# tie-breaking) and doubles the training cost.
for clf in (decision_tree_clf, logistic_regression_clf, knn_clf):
    clf.fit(X_train, y_train)
    list_of_accuracy.append((accuracy_score(y_train, clf.predict(X_train)),
                             accuracy_score(y_test, clf.predict(X_test))))

# The voting ensembles were already fitted in the previous cell; only score them.
for voter in (ensemble_hard, ensemble_soft):
    list_of_accuracy.append((accuracy_score(y_train, voter.predict(X_train)),
                             accuracy_score(y_test, voter.predict(X_test))))

for pair in list_of_accuracy:
    print(f'train: {pair[0]}, test: {pair[1]}')

# Persist the list of (train, test) accuracy tuples.
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(list_of_accuracy, f)

train: 1.0, test: 0.6754385964912281
train: 0.6703296703296703, test: 0.6052631578947368
train: 0.7736263736263737, test: 0.7192982456140351
train: 0.8219780219780219, test: 0.7105263157894737
train: 0.8395604395604396, test: 0.6929824561403509


In [None]:
# Classifiers in the same order as the accuracy tuples stored in acc_vote.pkl.
list_of_clf = [
    decision_tree_clf,
    logistic_regression_clf,
    knn_clf,
    ensemble_hard,
    ensemble_soft,
]

# Serialise the whole list into a single pickle file.
with open('vote.pkl', 'wb') as out_file:
    pickle.dump(list_of_clf, out_file)

In [None]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# Base learner passed to the bagging, pasting and AdaBoost ensembles below.
decision_tree = DecisionTreeClassifier()

# Every ensemble uses 30 estimators. The models are fitted in the same order
# as they are declared; fit() returns the estimator itself, so construction
# and training are chained.

# Bagging on the full sample size and on 50% of the samples per estimator.
bagging = BaggingClassifier(decision_tree, n_estimators=30).fit(X_train, y_train)
bagging_50 = BaggingClassifier(decision_tree, max_samples=0.5, n_estimators=30).fit(X_train, y_train)

# Pasting: same as above but with bootstrap disabled.
pasting = BaggingClassifier(decision_tree, bootstrap=False, n_estimators=30).fit(X_train, y_train)
pasting_50 = BaggingClassifier(decision_tree, bootstrap=False, max_samples=0.5, n_estimators=30).fit(X_train, y_train)

random_forest = RandomForestClassifier(n_estimators=30).fit(X_train, y_train)

adaboost = AdaBoostClassifier(decision_tree, n_estimators=30).fit(X_train, y_train)

# Left as a bare final expression so the notebook still displays the fitted model.
gradient_boosting = GradientBoostingClassifier(n_estimators=30)
gradient_boosting.fit(X_train, y_train)

In [None]:
# One (train accuracy, test accuracy) tuple per ensemble, in this order:
# bagging, bagging 50%, pasting, pasting 50%, random forest, AdaBoost,
# gradient boosting.
list_of_bagging = [
    (
        accuracy_score(y_train, model.predict(X_train)),
        accuracy_score(y_test, model.predict(X_test)),
    )
    for model in (
        bagging,
        bagging_50,
        pasting,
        pasting_50,
        random_forest,
        adaboost,
        gradient_boosting,
    )
]

for pair in list_of_bagging:
    print(f'train: {pair[0]}, test: {pair[1]}')

# Persist the accuracy tuples.
with open('acc_bag.pkl', 'wb') as out_file:
    pickle.dump(list_of_bagging, out_file)

train: 0.9956043956043956, test: 0.7105263157894737
train: 0.9230769230769231, test: 0.7280701754385965
train: 1.0, test: 0.6842105263157895
train: 0.9472527472527472, test: 0.6929824561403509
train: 1.0, test: 0.6929824561403509
train: 1.0, test: 0.6842105263157895
train: 0.8395604395604396, test: 0.7982456140350878


In [None]:
# Fitted ensembles in the same order as the accuracy tuples in acc_bag.pkl.
list_of_bag = [
    bagging,
    bagging_50,
    pasting,
    pasting_50,
    random_forest,
    adaboost,
    gradient_boosting,
]

# Serialise the whole list into a single pickle file.
with open('bag.pkl', 'wb') as out_file:
    pickle.dump(list_of_bag, out_file)

In [None]:
# Base learner for the feature-sampling ensemble.
tree = DecisionTreeClassifier()

# 30 trees; each is trained on half of the samples and on 2 features, with
# both samples and features drawn with replacement.
# NOTE: this rebinds the name `bagging` used earlier for the plain bagging model.
bagging = BaggingClassifier(
    tree,
    n_estimators=30,
    max_samples=0.5,
    bootstrap=True,
    max_features=2,
    bootstrap_features=True,
)

bagging.fit(X_train, y_train)

In [None]:
# acc_fea.pkl is the accuracy companion of fea.pkl, which stores the
# feature-sampling bagging ensemble. The scores therefore have to come from
# `bagging` itself, not from a separately fitted stand-alone tree.
# The variable keeps its historical name; it holds [train accuracy, test accuracy].
list_of_tree_acc = [
    accuracy_score(y_train, bagging.predict(X_train)),
    accuracy_score(y_test, bagging.predict(X_test)),
]

with open('acc_fea.pkl', 'wb') as f:
    pickle.dump(list_of_tree_acc, f)

In [None]:
# Store the feature-sampling ensemble wrapped in a one-element list.
with open('fea.pkl', 'wb') as out_file:
    pickle.dump([bagging], out_file)

In [None]:
# Rank the individual estimators of the feature-sampling ensemble by accuracy.
estimators = bagging.estimators_
estimators_features = bagging.estimators_features_

test_acc = []
train_acc = []
features_names = []

# The loop variable is deliberately not called `tree`, so the stand-alone
# `tree` classifier defined earlier is not overwritten when this cell runs.
for estimator, feature_idx in zip(estimators, estimators_features):
    # Each sub-estimator only saw its own feature subset, so it is given the
    # same columns here. Plain arrays are passed because bagging fits its
    # sub-estimators on arrays; a DataFrame would raise sklearn's
    # "X has feature names" warning.
    train_acc.append(accuracy_score(y_train, estimator.predict(X_train.iloc[:, feature_idx].to_numpy())))
    test_acc.append(accuracy_score(y_test, estimator.predict(X_test.iloc[:, feature_idx].to_numpy())))
    # Store the column names (not the positional indices) used by this estimator.
    features_names.append(list(X_train.columns[feature_idx]))

# Best estimators first: by train accuracy, ties broken by test accuracy.
# Sorting returns a new frame instead of mutating in place, so re-running the
# cell is idempotent.
df = pd.DataFrame(
    {"train: ": train_acc, "test: ": test_acc, "features: ": features_names}
).sort_values(by=['train: ', 'test: '], ascending=False)

df.to_pickle('acc_fea_rank.pkl')

