In [1]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', as_frame=False)
print(mnist.data.shape, mnist.target.shape)

  warn(


(70000, 784) (70000,)


In [2]:
from sklearn.model_selection import train_test_split

X, y = mnist.data, mnist.target

X_train, X_t, y_train, y_t = train_test_split(X, y, train_size=50000, random_state=42)

X_test, X_val, y_test, y_val = train_test_split(X_t, y_t, train_size=10000, random_state=42)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape)

(50000, 784) (50000,) (10000, 784) (10000,) (10000, 784) (10000,)


In [3]:
from sklearn.ensemble import RandomForestClassifier

r_forest_clf = RandomForestClassifier(n_estimators = 100, random_state=42)
r_forest_clf.fit(X_train[:8000], y_train[:8000])

In [4]:
from sklearn.svm import SVC

svc_clf = SVC(kernel="rbf", random_state=42, probability=True)
svc_clf.fit(X_train[:8000], y_train[:8000])

In [5]:
from sklearn.ensemble import ExtraTreesClassifier

ex_trees_clf = ExtraTreesClassifier(n_estimators = 100, random_state=42)
ex_trees_clf.fit(X_train[:8000], y_train[:8000])

In [6]:
from sklearn.metrics import accuracy_score

r_f_y_pred = r_forest_clf.predict(X_val)
svc_y_pred = svc_clf.predict(X_val)

ex_trees_y_pred = ex_trees_clf.predict(X_val)

print(f"random tree acc: {accuracy_score(y_val, r_f_y_pred)},svc acc: {accuracy_score(y_val, svc_y_pred)}, extra trees acc: {accuracy_score(y_val, ex_trees_y_pred)} ")

random tree acc: 0.9453,svc acc: 0.9593, extra trees acc: 0.9507 


In [7]:
import scipy
import numpy as np

In [8]:
def hard_ensemble(clfs, X):
  preds = np.ndarray((X.shape[0], len(clfs)))
  print(preds.shape)
  for i, clf in enumerate(clfs):
    y_pred_ = clf.predict(X) 
    preds[:, i] = y_pred_
  
  y_pred = scipy.stats.mode(preds, axis=1, nan_policy='propagate', keepdims=False).mode
  return(y_pred)


y_hard_pred = hard_ensemble([r_forest_clf, svc_clf, ex_trees_clf], X_val)

print(f"ensemble acc: ", accuracy_score(np.array(y_val, dtype="float"), y_hard_pred))

(10000, 3)
ensemble acc:  0.954


In [9]:
def soft_ensemble(clfs, X, n_classes):
  preds = np.ndarray(( X.shape[0], n_classes,  len(clfs)))
  # print(preds.shape)
  clf_probas = []
  for i, clf in enumerate(clfs):
    y_pred_ = clf.predict_proba(X) #np.stack(np.hsplit(clf.predict_proba(X).T, X.shape[0])) #clf.predict(X) 
    # print(np.stack(np.hsplit(clf.predict_proba(X).T, X.shape[0])).shape)
    # print(clf.predict_proba(X).T)
    clf_probas.append(clf.predict_proba(X).T)
    preds[:,:, i] = y_pred_
  
  # print(preds)
  # print(np.argmax(np.average(preds, axis=2), axis=1))
  #print(np.argmax(np.average(np.concatenate(clf_probas, axis=1), axis=1)))
  return np.argmax(np.average(preds, axis=2), axis=1)


y_soft_pred = soft_ensemble([r_forest_clf, svc_clf, ex_trees_clf], X_val, 10)

print(f"ensemble acc: ", accuracy_score(np.array(y_val, dtype="float"), y_soft_pred))

ensemble acc:  0.9577


In [10]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators = 100, random_state=42)),
        ('svc', SVC(kernel="rbf", random_state=42, probability=True)),
        ('etr', ExtraTreesClassifier(n_estimators = 100, random_state=42))
    ]
)
voting_clf.fit(X_train[:8000], y_train[:8000])

In [11]:
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", accuracy_score(np.array(y_val, dtype="float"), clf.predict(X_val)))

rf = 0.9453
svc = 0.9593
etr = 0.9507


In [12]:
y_sk_voting_pred = voting_clf.predict(X_val)

print(f"ensemble acc: ", accuracy_score(y_val, y_sk_voting_pred))

ensemble acc:  0.954


In [13]:
from sklearn.ensemble import VotingClassifier

voting_soft_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators = 100, random_state=42)),
        ('svc', SVC(kernel="rbf", random_state=42, probability=True)),
        ('etr', ExtraTreesClassifier(n_estimators = 100, random_state=42))
    ], voting="soft"
)
voting_soft_clf.fit(X_train[:8000], y_train[:8000])

In [14]:
for name, clf in voting_soft_clf.named_estimators_.items():
    print(name, "=", accuracy_score(np.array(y_val, dtype="float"), clf.predict(X_val)))

rf = 0.9453
svc = 0.9593
etr = 0.9507


In [15]:
y_sk_soft_voting_pred = voting_soft_clf.predict(X_val)

print(f"ensemble acc: ", accuracy_score(y_val, y_sk_soft_voting_pred))

ensemble acc:  0.9577


In [21]:

class Blender():
  def __init__(self, clfs, main_clf):
    self.clfs = clfs
    self.main_clf = main_clf

  def fit(self, X, y):
    y_preds = []
    for name, clf in self.clfs:
      y_preds.append(clf.predict(X))

    X_labels = np.stack(y_preds).T
    self.main_clf.fit(X_labels, y)

  def predict(self, X):
    y_preds = []
    for name, clf in self.clfs:
      y_preds.append(clf.predict(X))

    X_labels = np.stack(y_preds).T
    return self.main_clf.predict(X_labels)



In [27]:
from sklearn.linear_model import LogisticRegression

lr_clf = RandomForestClassifier(random_state=43)

blender = Blender([*voting_soft_clf.named_estimators_.items()], lr_clf)
blender.fit(X_val, y_val)

In [28]:
y_test_pred_blender = blender.predict(X_test)

In [30]:

print(f"blender acc: ", accuracy_score(y_test, y_test_pred_blender))

blender acc:  0.9509


In [32]:
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators = 100, random_state=42)),
        ('svc', SVC(kernel="rbf", random_state=42, probability=True)),
        ('etr', ExtraTreesClassifier(n_estimators = 100, random_state=42))
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5  # number of cross-validation folds
)
stacking_clf.fit(X_train[:8000], y_train[:8000])

In [33]:
y_test_pred_stacking = stacking_clf.predict(X_test)

print(f"blender acc: ", accuracy_score(y_test, y_test_pred_stacking))

blender acc:  0.9599
