In [28]:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np


In [29]:
# Load the cancer dataset
cancer = load_breast_cancer()

In [30]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=42)


In [31]:
# Define the 4 machine learning models
model1 = DecisionTreeClassifier()
model2 = RandomForestClassifier(n_estimators=10)
model3 = KNeighborsClassifier(n_neighbors=3)
model4 = SVC(kernel='linear', C=1, gamma='auto')

In [32]:
# Train the models on the training set
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)
model4.fit(X_train, y_train)

In [33]:
for model in [model1, model2, model3, model4]:
    scores = cross_val_score(model, X_test, y_test, cv=5)
    print("Accuracy of %s: %0.2f%%" % (type(model).__name__, scores.mean() * 100))


Accuracy of DecisionTreeClassifier: 92.92%
Accuracy of RandomForestClassifier: 93.83%
Accuracy of KNeighborsClassifier: 95.65%
Accuracy of SVC: 96.44%


In [34]:
# Apply cross-validation score on the entire dataset
kf = KFold(n_splits=5, shuffle=True)
X = X_train
y = y_train
meta_train = np.zeros((X.shape[0], 4))

In [35]:
for train_index, test_index in kf.split(X):
    X_train_kf, X_test_kf = X[train_index], X[test_index]
    y_train_kf, y_test_kf = y[train_index], y[test_index]
    model1.fit(X_train_kf, y_train_kf)
    model2.fit(X_train_kf, y_train_kf)
    model3.fit(X_train_kf, y_train_kf)
    model4.fit(X_train_kf, y_train_kf)
    meta_train[test_index, 0] = model1.predict(X_test_kf)
    meta_train[test_index, 1] = model2.predict(X_test_kf)
    meta_train[test_index, 2] = model3.predict(X_test_kf)
    meta_train[test_index, 3] = model4.predict(X_test_kf)

In [36]:
# Define the meta-model for stacking
meta_model = DecisionTreeClassifier()

In [37]:
# Train the meta-model on the meta features
meta_model.fit(meta_train, y)

In [38]:
# Make predictions on the testing set using the stacked model
meta_test = np.column_stack((model1.predict(X_test), model2.predict(X_test), model3.predict(X_test), model4.predict(X_test)))
y_pred = meta_model.predict(meta_test)

In [39]:
# Compute the accuracy of the stacked model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of stacked model: %.2f%%" % (accuracy * 100))

Accuracy of stacked model: 98.25%


In [40]:
# Print GT and prediction
import pandas as pd
df = pd.DataFrame({"Ground Truth": y_test, "Prediction": y_pred})
print(df)


     Ground Truth  Prediction
0               1           1
1               0           0
2               0           0
3               1           1
4               1           1
..            ...         ...
109             1           1
110             0           0
111             1           1
112             1           1
113             0           0

[114 rows x 2 columns]
