In [2]:
import pandas as pd
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [3]:
X = data_breast_cancer.data[["mean texture","mean symmetry"]]
X.head()

Unnamed: 0,mean texture,mean symmetry
0,10.38,0.2419
1,17.77,0.1812
2,21.25,0.2069
3,20.38,0.2597
4,14.34,0.1809


In [4]:
y = data_breast_cancer.target
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
print(len(X_train), len(X_test))

455 114


In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
clf_list = []

In [8]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
y_pred_train = tree_clf.predict(X_train)
y_pred_test = tree_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
tree_clf_acc = (acc_train, acc_test)
print(tree_clf_acc)
clf_list.append(tree_clf)

(1.0, 0.6578947368421053)


In [9]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
y_pred_train = knn_clf.predict(X_train)
y_pred_test = knn_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
knn_clf_acc = (acc_train, acc_test)
print(knn_clf_acc)
clf_list.append(knn_clf)

(0.7714285714285715, 0.6578947368421053)


In [10]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
y_pred_train = lr_clf.predict(X_train)
y_pred_test = lr_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
lr_acc = (acc_train, acc_test)
print(lr_acc)
clf_list.append(lr_clf)

(0.6989010989010989, 0.7105263157894737)


In [11]:
from sklearn.ensemble import VotingClassifier

In [12]:
voting_clf = VotingClassifier(estimators=[("tr", tree_clf),
                                          ("knn", knn_clf),
                                          ("lr", lr_clf)], voting="hard")

In [13]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())])

In [14]:
print(voting_clf.predict(X_train))

[1 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1
 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 0 1
 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1
 0 1 1 0 1 1 1 0 1 0 1 0 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1
 0 1 1 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0
 1 0 0 0 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0
 0 1 0 0 0 0 1 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1
 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 0 0
 0 1 1 0 1 0 1 1 0 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1
 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 0 0 1 0 1 0 0 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1]


In [15]:
y_pred_train = voting_clf.predict(X_train)
y_pred_test = voting_clf.predict(X_test)
hard_acc_train = accuracy_score(y_train, y_pred_train)
hard_acc_test = accuracy_score(y_test, y_pred_test)
hard_acc = (hard_acc_train, hard_acc_test)
print(hard_acc)
clf_list.append(voting_clf)

(0.8461538461538461, 0.6842105263157895)


In [16]:
voting_clf = VotingClassifier(estimators=[("tr", tree_clf),
                                          ("knn", knn_clf),
                                          ("lr", lr_clf)], voting="soft")

In [17]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())],
                 voting='soft')

In [18]:
print(voting_clf.predict(X_train))

[1 0 1 0 0 1 1 0 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1
 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 0 1
 0 1 0 1 1 1 0 1 1 0 1 0 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1
 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 0 0 0
 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1
 0 1 1 1 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1
 1 0 0 0 0 1 0 0 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 0
 0 1 0 0 0 0 1 1 1 0 1 1 1 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1
 1 1 1 0 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 0 0
 0 1 1 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 0 0 1 1 1 0
 0 1 1 0 1 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1 0 1 1 1 0 1 1 1 1
 0 1 0 0 1 0 1 1 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 1 0 1 0 0 1 1 1 0 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1]


In [19]:
y_pred_train = voting_clf.predict(X_train)
y_pred_test = voting_clf.predict(X_test)
soft_acc_train = accuracy_score(y_train, y_pred_train)
soft_acc_test = accuracy_score(y_test, y_pred_test)
soft_acc = (soft_acc_train, soft_acc_test)
print(soft_acc)
clf_list.append(voting_clf)

(0.9626373626373627, 0.6842105263157895)


In [20]:
acc_list = [tree_clf_acc, lr_acc, knn_clf_acc, hard_acc, soft_acc]
print(acc_list)

[(1.0, 0.6578947368421053), (0.6989010989010989, 0.7105263157894737), (0.7714285714285715, 0.6578947368421053), (0.8461538461538461, 0.6842105263157895), (0.9626373626373627, 0.6842105263157895)]


In [21]:
import pickle
open_file = open("acc_vote.pkl", "wb")
pickle.dump(acc_list, open_file)
open_file.close()

In [22]:
open_file = open("acc_vote.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[(1.0, 0.6578947368421053), (0.6989010989010989, 0.7105263157894737), (0.7714285714285715, 0.6578947368421053), (0.8461538461538461, 0.6842105263157895), (0.9626373626373627, 0.6842105263157895)]


In [23]:
print(clf_list)

[DecisionTreeClassifier(), KNeighborsClassifier(), LogisticRegression(), VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())]), VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())],
                 voting='soft')]


In [24]:
open_file = open("vote.pkl", "wb")
pickle.dump(clf_list, open_file)
open_file.close()

In [25]:
open_file = open("vote.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[DecisionTreeClassifier(), KNeighborsClassifier(), LogisticRegression(), VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())]), VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression())],
                 voting='soft')]


In [26]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
clf_list2 = []

In [27]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30)
bag_clf.fit(X_train, y_train)
y_pred_train = bag_clf.predict(X_train)
y_pred_test = bag_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
bag_acc = (acc_train, acc_test)
print(bag_acc)
clf_list2.append(bag_clf)

(0.9956043956043956, 0.7368421052631579)


In [28]:
bag_clf_50 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5)
bag_clf_50.fit(X_train, y_train)
y_pred_train = bag_clf_50.predict(X_train)
y_pred_test = bag_clf_50.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
bag_acc_50 = (acc_train, acc_test)
print(bag_acc_50)
clf_list2.append(bag_clf_50)

(0.9076923076923077, 0.7368421052631579)


In [29]:
past_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False)
past_clf.fit(X_train, y_train)
y_pred_train = past_clf.predict(X_train)
y_pred_test = past_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
past_acc = (acc_train, acc_test)
print(past_acc)
clf_list2.append(past_clf)

(1.0, 0.6403508771929824)


In [30]:
past_clf_50 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, bootstrap=False)
past_clf_50.fit(X_train, y_train)
y_pred_train = past_clf_50.predict(X_train)
y_pred_test = past_clf_50.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
past_acc_50 = (acc_train, acc_test)
print(past_acc_50)
clf_list2.append(past_clf_50)

(0.9736263736263736, 0.7105263157894737)


In [31]:
rf_clf = RandomForestClassifier(n_estimators=30)
rf_clf.fit(X_train, y_train)
y_pred_train = rf_clf.predict(X_train)
y_pred_test = rf_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
rf_acc = (acc_train, acc_test)
print(rf_acc)
clf_list2.append(rf_clf)

(0.9978021978021978, 0.7192982456140351)


In [32]:
ada_clf = AdaBoostClassifier(n_estimators=30)
ada_clf.fit(X_train, y_train)
y_pred_train = ada_clf.predict(X_train)
y_pred_test = ada_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
ada_acc = (acc_train, acc_test)
print(ada_acc)
clf_list2.append(ada_clf)

(0.7912087912087912, 0.7807017543859649)


In [33]:
gradient_clf = GradientBoostingClassifier(n_estimators=30)
gradient_clf.fit(X_train, y_train)
y_pred_train = gradient_clf.predict(X_train)
y_pred_test = gradient_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
gradient_acc = (acc_train, acc_test)
print(gradient_acc)
clf_list2.append(gradient_clf)

(0.8263736263736263, 0.7719298245614035)


In [34]:
acc_list2 = [bag_acc, bag_acc_50, past_acc, past_acc_50, rf_acc, ada_acc, gradient_acc]
print(acc_list2)

[(0.9956043956043956, 0.7368421052631579), (0.9076923076923077, 0.7368421052631579), (1.0, 0.6403508771929824), (0.9736263736263736, 0.7105263157894737), (0.9978021978021978, 0.7192982456140351), (0.7912087912087912, 0.7807017543859649), (0.8263736263736263, 0.7719298245614035)]


In [35]:
open_file = open("acc_bag.pkl", "wb")
pickle.dump(acc_list2, open_file)
open_file.close()

In [36]:
open_file = open("acc_bag.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[(0.9956043956043956, 0.7368421052631579), (0.9076923076923077, 0.7368421052631579), (1.0, 0.6403508771929824), (0.9736263736263736, 0.7105263157894737), (0.9978021978021978, 0.7192982456140351), (0.7912087912087912, 0.7807017543859649), (0.8263736263736263, 0.7719298245614035)]


In [37]:
print(clf_list2)

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [38]:
open_file = open("bag.pkl", "wb")
pickle.dump(clf_list2, open_file)
open_file.close()

In [39]:
open_file = open("bag.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [40]:
X = data_breast_cancer.data

In [41]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [44]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators = 30,
                            bootstrap = True, bootstrap_features = False,
                            max_samples = 0.5, max_features = 2)
bag_clf.fit(X_train, y_train)
y_pred_train = bag_clf.predict(X_train)
y_pred_test = bag_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
bag_acc_list = [acc_train, acc_test]
print(bag_acc_list)

[0.9956043956043956, 0.9298245614035088]


In [45]:
open_file = open("acc_fea.pkl", "wb")
pickle.dump(bag_acc_list, open_file)
open_file.close()

In [46]:
open_file = open("acc_fea.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[0.9956043956043956, 0.9298245614035088]


In [49]:
clf_list = [bag_clf]
print(clf_list)

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=2,
                  max_samples=0.5, n_estimators=30)]


In [50]:
open_file = open("fea.pkl", "wb")
pickle.dump(clf_list, open_file)
open_file.close()

In [51]:
open_file = open("fea.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=2,
                  max_samples=0.5, n_estimators=30)]


In [72]:
df = []
for estimator, features in zip(bag_clf.estimators_, bag_clf.estimators_features_):
    print(features)
    y_pred_train = estimator.predict(X_train.iloc[:, features])
    y_pred_test = estimator.predict(X_test.iloc[:, features])
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    features_names = [X.columns[features[0]], X.columns[features[1]]]
    row = [acc_train, acc_test, features_names]
    print(row)
    df.append(row)
df = pd.DataFrame(df, columns = ['train_acc', 'test_acc', 'features_names'])

[17  4]
[0.8065934065934066, 0.631578947368421, ['concave points error', 'mean smoothness']]
[ 1 28]
[0.8153846153846154, 0.6754385964912281, ['mean texture', 'worst symmetry']]
[1 0]
[0.9054945054945055, 0.8245614035087719, ['mean texture', 'mean radius']]
[26 28]
[0.9010989010989011, 0.8070175438596491, ['worst concavity', 'worst symmetry']]
[2 5]
[0.9208791208791208, 0.8508771929824561, ['mean perimeter', 'mean compactness']]
[14 12]
[0.843956043956044, 0.7807017543859649, ['smoothness error', 'perimeter error']]
[28  4]
[0.7758241758241758, 0.6052631578947368, ['worst symmetry', 'mean smoothness']]
[9 5]
[0.8725274725274725, 0.8508771929824561, ['mean fractal dimension', 'mean compactness']]
[22 23]
[0.9406593406593406, 0.8859649122807017, ['worst perimeter', 'worst area']]
[29  8]
[0.7802197802197802, 0.6842105263157895, ['worst fractal dimension', 'mean symmetry']]
[28 26]
[0.8703296703296703, 0.8508771929824561, ['worst symmetry', 'worst concavity']]
[26 20]
[0.9494505494505494,

In [82]:
df.sort_values(['train_acc', 'test_acc'], ascending=[False, False], inplace=True)

In [83]:
df

Unnamed: 0,train_acc,test_acc,features_names
25,0.958242,0.894737,"[mean compactness, worst area]"
13,0.949451,0.938596,"[worst radius, mean texture]"
11,0.949451,0.877193,"[worst concavity, worst radius]"
29,0.945055,0.868421,"[mean concavity, mean area]"
8,0.940659,0.885965,"[worst perimeter, worst area]"
23,0.934066,0.903509,"[mean area, worst radius]"
22,0.931868,0.885965,"[worst smoothness, worst concave points]"
17,0.931868,0.877193,"[mean concave points, mean perimeter]"
28,0.927473,0.877193,"[concavity error, worst concave points]"
27,0.923077,0.877193,"[mean area, mean concavity]"


In [85]:
open_file = open("acc_fea_rank.pkl", "wb")
pickle.dump(df, open_file)
open_file.close()

In [None]:
open_file = open("fea.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)