# Getting MNIST dataset
Once I obtain the dataset, I use train_test_split to split the dataset to train and test.

In [1]:
from sklearn.datasets import fetch_openml

# This library contains images of 28x28 pixels.
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(mnist['data'], mnist['target'], test_size=0.15, random_state=42)


# 0. Developing and Training: Decision Tree, Random Forest, and KNN
Here I import the 3 models, fit them to the training set, then use a for statement to iterate through the 3 models and prints the accuracy score.

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=10)
knn_clf.fit(X_train, y_train)

rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)

tree_clf = DecisionTreeClassifier(max_depth=10)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10)

In [4]:
# Accuracy Score for above models
from sklearn.metrics import accuracy_score

for clf in (knn_clf, rnd_clf, tree_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

KNeighborsClassifier 0.9656190476190476
RandomForestClassifier 0.9667619047619047
DecisionTreeClassifier 0.8553333333333333


# Developing and Training: Hard and Soft Voting Classifiers
Using the two forms of voting (soft and hard) and use the three previous models, then use the accuracy score stated above.

In [5]:
from sklearn.ensemble import VotingClassifier

voting_hard_clf = VotingClassifier(
    estimators=[('knn', knn_clf), ('rnd', rnd_clf), ('tree', tree_clf)],
    voting='hard'
)

voting_soft_clf = VotingClassifier(
    estimators=[('knn', knn_clf), ('rnd', rnd_clf), ('tree', tree_clf)],
    voting='soft'
)

In [11]:
# Accuracy score for voting models
from sklearn.metrics import accuracy_score

for clf in (voting_hard_clf, voting_soft_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

TypeError: unsupported operand type(s) for +: 'VotingClassifier' and 'str'

# 2. Developing and Training: Decision Tree with Bagging and Pasting, and Random Forest
Using bagging and pasting(for pasting, set bootstrap=False) and use the DecisionTree Classifier in the model. Also use the RandomForest classifier model to fit the data. Then use the Accuracy score method to produce the accuracy score.

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=1
)
bag_clf.fit(X_train, y_train)

pas_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=False, n_jobs=1
)
pas_clf.fit(X_train, y_train)

rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)

RandomForestClassifier()

In [8]:
# Accuracy score for above classifier
from sklearn.metrics import accuracy_score

for clf in (bag_clf, pas_clf, rnd_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

BaggingClassifier 0.8472380952380952
BaggingClassifier 0.844
RandomForestClassifier 0.9672380952380952


# 3. Developing AdaBoost classifier and finding optimized using GridSearchCV

In [10]:
# AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

grid_search_ada = GridSearchCV(ada_clf, param_grid=parameters, scoring = 'roc_auc')
grid_search_ada

GridSearchCV(estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                                          learning_rate=0.5, n_estimators=200,
                                          random_state=42),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
             scoring='roc_auc')