In [73]:
import pandas as pd
from sklearn import datasets
# Load the scikit-learn breast-cancer dataset; as_frame=True is requested so
# the cells below can work with `.data` / `.target` as pandas objects.
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [74]:
# Feature matrix: only two of the dataset's columns are used in this notebook.
X = data_breast_cancer.data.loc[:, ["mean texture", "mean symmetry"]]
X.head()

Unnamed: 0,mean texture,mean symmetry
0,10.38,0.2419
1,17.77,0.1812
2,21.25,0.2069
3,20.38,0.2597
4,14.34,0.1809


In [75]:
# Target labels that go with X (the Bunch exposes the same object by key).
y = data_breast_cancer["target"]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [76]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The seed is fixed so that the same
# rows land in the train/test sets on every Restart & Run All; without it the
# split (and every accuracy computed from it) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [77]:
# Sanity check: number of rows in the training and test splits.
print(X_train.shape[0], X_test.shape[0])

455 114


In [78]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Fitted classifiers are appended to this list by the cells that follow.
clf_list = list()

In [79]:
# Baseline 1: decision tree with default hyper-parameters.
tree_clf = DecisionTreeClassifier().fit(X_train, y_train)

# Predict on both splits, then score them as a (train, test) accuracy pair.
y_pred_train, y_pred_test = tree_clf.predict(X_train), tree_clf.predict(X_test)
acc_train, acc_test = (
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_test, y_pred_test),
)
tree_clf_acc = (acc_train, acc_test)

clf_list.append(tree_clf)
print(tree_clf_acc)

(1.0, 0.6491228070175439)


In [80]:
# Baseline 2: k-nearest neighbours with default hyper-parameters.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)

# Predict on the training split first, then on the test split.
y_pred_train, y_pred_test = (
    knn_clf.predict(features) for features in (X_train, X_test)
)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
knn_clf_acc = acc_train, acc_test

clf_list.append(knn_clf)
print(knn_clf_acc)

(0.7670329670329671, 0.6578947368421053)


In [81]:
# Baseline 3: logistic regression with default hyper-parameters.
lr_clf = LogisticRegression().fit(X_train, y_train)

# (train accuracy, test accuracy) for this model.
y_pred_train = lr_clf.predict(X_train)
acc_train = accuracy_score(y_train, y_pred_train)
y_pred_test = lr_clf.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)
lr_acc = (acc_train, acc_test)

clf_list.append(lr_clf)
print(lr_acc)

(0.7076923076923077, 0.6491228070175439)


In [82]:
from sklearn.ensemble import VotingClassifier

In [83]:
# Hard-voting ensemble over the three baseline classifiers.
voting_clf = VotingClassifier(
    voting="hard",
    estimators=[
        ("tr", tree_clf),
        ("knn", knn_clf),
        ("lr", lr_clf),
    ],
)

In [84]:
# Fit the hard-voting ensemble on the training split; as the cell's last
# expression, the returned estimator's repr is displayed.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())])

In [85]:
# Show the hard-voting ensemble's predicted labels for the training split.
print(voting_clf.predict(X_train))

[0 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 0 0 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1
 1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1
 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 0 0 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 1 1 0
 0 0 0 0 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0
 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1 0 1 1 0 1 1
 1 0 1 1 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 1 0 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1
 1 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 0 0 0 1 0 1 1 1 1 0
 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 0 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 1 1 1 0 1 1 1 1 1
 1 0 1 1 1 0 0 1 1 1 0]


In [86]:
# Score the hard-voting ensemble on both splits and keep it with the others.
y_pred_train, y_pred_test = voting_clf.predict(X_train), voting_clf.predict(X_test)
hard_acc_train, hard_acc_test = (
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_test, y_pred_test),
)
hard_acc = (hard_acc_train, hard_acc_test)

clf_list.append(voting_clf)
print(hard_acc)

(0.8483516483516483, 0.6578947368421053)


In [87]:
# Soft-voting ensemble over the same three classifiers. This rebinds
# `voting_clf`; the hard-voting instance stays reachable through clf_list.
voting_clf = VotingClassifier(
    voting="soft",
    estimators=[
        ("tr", tree_clf),
        ("knn", knn_clf),
        ("lr", lr_clf),
    ],
)

In [88]:
# Fit the soft-voting ensemble on the training split; as the cell's last
# expression, the returned estimator's repr is displayed.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())],
                 voting='soft')

In [89]:
# Show the soft-voting ensemble's predicted labels for the training split.
print(voting_clf.predict(X_train))

[0 0 0 1 0 1 1 1 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1
 1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1
 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0
 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1 1 0 0 1 0
 1 0 0 0 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1
 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 0
 0 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1
 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 1 1 0 1 1 1 0 0 1 1 1 0 0 1 0 1 1 1 0 1 1 1
 1 0 1 1 1 1 0 0 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 1 0
 1 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 0 0 0 1 1 1 1 1 0 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 0 0 0 1 1 1 1 0 1 1 1 0 1
 1 0 1 1 1 0 0 1 1 1 0]


In [90]:
# Score the soft-voting ensemble on both splits and keep it with the others.
y_pred_train = voting_clf.predict(X_train)
soft_acc_train = accuracy_score(y_train, y_pred_train)
y_pred_test = voting_clf.predict(X_test)
soft_acc_test = accuracy_score(y_test, y_pred_test)
soft_acc = soft_acc_train, soft_acc_test

clf_list.append(voting_clf)
print(soft_acc)

(0.9604395604395605, 0.6666666666666666)


In [91]:
# (train, test) accuracy pairs for the voting experiment.
# NOTE(review): the order here is tree, logistic regression, kNN, whereas
# clf_list was appended as tree, kNN, logistic regression — confirm which
# order the consumers of the two pickles expect.
acc_list = [
    tree_clf_acc,
    lr_acc,
    knn_clf_acc,
    hard_acc,
    soft_acc,
]
print(acc_list)

[(1.0, 0.6491228070175439), (0.7076923076923077, 0.6491228070175439), (0.7670329670329671, 0.6578947368421053), (0.8483516483516483, 0.6578947368421053), (0.9604395604395605, 0.6666666666666666)]


In [92]:
import pickle
# Persist the accuracy pairs. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open("acc_vote.pkl", "wb") as open_file:
    pickle.dump(acc_list, open_file)

In [93]:
# Round-trip check: read the pickle back and show its contents. The context
# manager closes the file even if unpickling fails.
# Only unpickle files you trust — pickle.load can execute arbitrary code.
with open("acc_vote.pkl", "rb") as open_file:
    loaded_list = pickle.load(open_file)
print(loaded_list)

[(1.0, 0.6491228070175439), (0.7076923076923077, 0.6491228070175439), (0.7670329670329671, 0.6578947368421053), (0.8483516483516483, 0.6578947368421053), (0.9604395604395605, 0.6666666666666666)]


In [94]:
# Show the classifiers collected so far, in the order they were appended.
print(clf_list)

[DecisionTreeClassifier(), KNeighborsClassifier(), LogisticRegression(), VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())]), VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())],
                 voting='soft')]


In [95]:
# Persist the fitted classifiers. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open("vote.pkl", "wb") as open_file:
    pickle.dump(clf_list, open_file)

In [96]:
# Round-trip check: read the classifier pickle back and show its contents.
# The context manager closes the file even if unpickling fails.
# Only unpickle files you trust — pickle.load can execute arbitrary code.
with open("vote.pkl", "rb") as open_file:
    loaded_list = pickle.load(open_file)
print(loaded_list)

[DecisionTreeClassifier(), KNeighborsClassifier(), LogisticRegression(), VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())]), VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())],
                 voting='soft')]


In [97]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [98]:
# Bagging ensemble of 30 decision trees, otherwise default settings.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30).fit(
    X_train, y_train
)

# (train accuracy, test accuracy) for this ensemble.
y_pred_train, y_pred_test = bag_clf.predict(X_train), bag_clf.predict(X_test)
acc_train, acc_test = (
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_test, y_pred_test),
)
bag_acc = (acc_train, acc_test)
print(bag_acc)

(0.9956043956043956, 0.7280701754385965)


In [100]:
# Bagging ensemble of 30 decision trees with max_samples=0.5.
bag_clf_50 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=0.5,
)
bag_clf_50.fit(X_train, y_train)

# (train accuracy, test accuracy) for this ensemble.
y_pred_train, y_pred_test = (
    bag_clf_50.predict(features) for features in (X_train, X_test)
)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
bag_acc_50 = acc_train, acc_test
print(bag_acc_50)

(0.9120879120879121, 0.7280701754385965)


In [102]:
# Pasting variant: 30 decision trees with bootstrap=False (sampling without
# replacement); max_samples is left at its default.
past_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    bootstrap=False,
).fit(X_train, y_train)

# (train accuracy, test accuracy) for this ensemble.
y_pred_train = past_clf.predict(X_train)
acc_train = accuracy_score(y_train, y_pred_train)
y_pred_test = past_clf.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)
past_acc = (acc_train, acc_test)
print(past_acc)

(1.0, 0.6403508771929824)


In [103]:
# Pasting variant with max_samples=0.5 and bootstrap=False (sampling without
# replacement), again over 30 decision trees.
past_clf_50 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=0.5,
    bootstrap=False,
)
past_clf_50.fit(X_train, y_train)

# (train accuracy, test accuracy) for this ensemble.
y_pred_train, y_pred_test = past_clf_50.predict(X_train), past_clf_50.predict(X_test)
acc_train, acc_test = (
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_test, y_pred_test),
)
past_acc_50 = acc_train, acc_test
print(past_acc_50)

(0.9648351648351648, 0.7192982456140351)


In [104]:
# Random forest with 30 trees.
rf_clf = RandomForestClassifier(n_estimators=30).fit(X_train, y_train)

# (train accuracy, test accuracy) for the forest.
y_pred_train, y_pred_test = (
    rf_clf.predict(features) for features in (X_train, X_test)
)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
rf_acc = (acc_train, acc_test)
print(rf_acc)

(0.9978021978021978, 0.7368421052631579)


In [105]:
# AdaBoost with 30 estimators.
ada_clf = AdaBoostClassifier(n_estimators=30).fit(X_train, y_train)

# (train accuracy, test accuracy) for the boosted model.
y_pred_train = ada_clf.predict(X_train)
acc_train = accuracy_score(y_train, y_pred_train)
y_pred_test = ada_clf.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)
ada_acc = acc_train, acc_test
print(ada_acc)

(0.8087912087912088, 0.7368421052631579)


In [106]:
# Gradient boosting with 30 estimators.
gradient_clf = GradientBoostingClassifier(n_estimators=30).fit(X_train, y_train)

# (train accuracy, test accuracy) for the boosted model.
y_pred_train, y_pred_test = (
    gradient_clf.predict(X_train),
    gradient_clf.predict(X_test),
)
acc_train, acc_test = (
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_test, y_pred_test),
)
gradient_acc = (acc_train, acc_test)
print(gradient_acc)

(0.8153846153846154, 0.7456140350877193)


In [107]:
# (train, test) accuracy pairs for the bagging / pasting / forest / boosting
# experiments, in the order the models were fitted above.
acc_list2 = [
    bag_acc,
    bag_acc_50,
    past_acc,
    past_acc_50,
    rf_acc,
    ada_acc,
    gradient_acc,
]
print(acc_list2)

[(0.9956043956043956, 0.7280701754385965), (0.9120879120879121, 0.7280701754385965), (1.0, 0.6403508771929824), (0.9648351648351648, 0.7192982456140351), (0.9978021978021978, 0.7368421052631579), (0.8087912087912088, 0.7368421052631579), (0.8153846153846154, 0.7456140350877193)]


In [109]:
# Persist the accuracy pairs. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open("acc_bag.pkl", "wb") as open_file:
    pickle.dump(acc_list2, open_file)

In [110]:
# Round-trip check: read the pickle back and show its contents. The context
# manager closes the file even if unpickling fails.
# Only unpickle files you trust — pickle.load can execute arbitrary code.
with open("acc_bag.pkl", "rb") as open_file:
    loaded_list = pickle.load(open_file)
print(loaded_list)

[(0.9956043956043956, 0.7280701754385965), (0.9120879120879121, 0.7280701754385965), (1.0, 0.6403508771929824), (0.9648351648351648, 0.7192982456140351), (0.9978021978021978, 0.7368421052631579), (0.8087912087912088, 0.7368421052631579), (0.8153846153846154, 0.7456140350877193)]
