In [64]:
import pandas as pd
import numpy as np

In [65]:
from sklearn.datasets import fetch_openml

In [66]:
data = fetch_openml('mnist_784', version=1)

In [67]:
data.data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
X_train,X_val,Y_train,Y_val = train_test_split(data.data,data.target,test_size = 10000,random_state = 42)
# X_train_val,X_val,Y_train_val,Y_val = train_test_split(data.data,data.target,test_size = 10000,random_state = 42)

In [70]:
from sklearn.svm import LinearSVC
svm_clf = LinearSVC()

In [71]:
svm_clf.fit(X_train,Y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [72]:
from sklearn.ensemble import RandomForestClassifier
rand_clf = RandomForestClassifier()
rand_clf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [73]:
from sklearn.ensemble import ExtraTreesClassifier
extra_clf = ExtraTreesClassifier()
extra_clf.fit(X_train,Y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

Validating Individually

In [74]:
from sklearn.metrics import accuracy_score
ypred_svm = svm_clf.predict(X_val)
ypred_rand = rand_clf.predict(X_val)
ypred_extra = extra_clf.predict(X_val)

In [75]:
print("-----------------------------------------------------")
print("Accuracy score")
print("-----------------------------------------------------")
print("SVM           Random forest           Extra trees")
print("-----------------------------------------------------")
print(f'{accuracy_score(Y_val,ypred_svm)}           {accuracy_score(Y_val,ypred_rand)}                 {accuracy_score(Y_val,ypred_extra)}  ')
print("-----------------------------------------------------")



-----------------------------------------------------
Accuracy score
-----------------------------------------------------
SVM           Random forest           Extra trees
-----------------------------------------------------
0.867           0.9671                 0.97  
-----------------------------------------------------


Ensembling them

In [76]:
from sklearn.ensemble import VotingClassifier

In [77]:
voting_clf = VotingClassifier(estimators=[
                        ("svm",svm_clf),
                        ("Random Forest" , rand_clf),
                        ("Extra Trees" , extra_clf)
] ,voting = 'hard')

In [78]:
voting_clf.fit(X_train,Y_train)



VotingClassifier(estimators=[('svm',
                              LinearSVC(C=1.0, class_weight=None, dual=True,
                                        fit_intercept=True, intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, tol=0.0001,
                                        verbose=0)),
                             ('Random Forest',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_fe...
                                                   criterion='gini',
   

In [79]:
ypred = voting_clf.predict(X_val)

In [80]:
accuracy_score(Y_val,ypred)

0.966

In [81]:
voting_clf.estimators_

[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
           verbose=0),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None

Stacking technique

In [82]:
def createdata(data):
  svm_pred = svm_clf.predict(data)
  random_pred = rand_clf.predict(data)
  extra_pred = extra_clf.predict(data)
  return np.stack((svm_pred , random_pred, extra_pred), axis=1)


In [83]:
new_train = createdata(X_val)
new_train

array([['8', '8', '8'],
       ['4', '4', '4'],
       ['5', '8', '8'],
       ...,
       ['2', '3', '3'],
       ['3', '8', '8'],
       ['3', '3', '3']], dtype=object)

In [84]:
new_rand = RandomForestClassifier()
new_rand.fit(new_train,Y_val)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [85]:
new_ypred = new_rand.predict(new_train)
accuracy_score(Y_val,new_ypred)

0.9754

**Testing**

In [86]:
X_trains,X_test,Y_trains,Y_test = train_test_split(data.data,data.target,test_size = 10000)

In [87]:
X_test.shape

(10000, 784)

In [88]:
test = createdata(X_test)
test

array([['7', '7', '7'],
       ['4', '7', '7'],
       ['1', '1', '1'],
       ...,
       ['4', '4', '4'],
       ['3', '3', '3'],
       ['2', '2', '2']], dtype=object)

In [89]:
test.shape

(10000, 3)

In [90]:
p = new_rand.predict(test)

In [91]:
accuracy_score(Y_test ,p)

0.9958