# Getting MNIST dataset
Once I obtain the dataset, I use train_test_split to split the dataset to train and test.

In [12]:
from sklearn.datasets import fetch_openml

# This library contains images of 28x28 pixels.
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(mnist['data'], mnist['target'], test_size=0.15, random_state=42)


# 0. Developing and Training: Decision Tree, Random Forest, and KNN
Here I import the 3 models, fit them to the training set, then use a for statement to iterate through the 3 models and prints the accuracy score. <br>
KNeighbors - K is the number of nearest neighbors. The number of neighbors is the deciding factor.<br>
RandomForest - Large number of individual decision trees that operate as an ensemble<br>
DecisionTree - A Decision Tree<br> 
<br>
RandomForestClassifier operates the best out of the three classifiers.

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=10)
knn_clf.fit(X_train, y_train)

rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)

tree_clf = DecisionTreeClassifier(max_depth=10)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10)

In [15]:
# Accuracy Score for above models
from sklearn.metrics import accuracy_score

for clf in (knn_clf, rnd_clf, tree_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

KNeighborsClassifier 0.9656190476190476
RandomForestClassifier 0.9666666666666667
DecisionTreeClassifier 0.8538095238095238


# Developing and Training: Hard and Soft Voting Classifiers
Using the two forms of voting (soft and hard) and use the three previous models, then use the accuracy score stated above.<br>
Hard voting - Picking the predictor with the highest number of votes<br>
Soft voting - Combining the probabiltiies of each prediction in the models and picking the prediction with the highest total probability<br>
<br>
Both Voting Classifiers performed very closely both at 96%.

In [21]:
# Hard Voting
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

voting_hard_clf = VotingClassifier(
    estimators=[('knn', knn_clf), ('rnd', rnd_clf), ('tree', tree_clf)],
    voting='hard'
)

voting_hard_clf.fit(X_train, y_train)
y_pred = voting_hard_clf.predict(X_test)
print(voting_hard_clf.__class__.__name__, accuracy_score(y_test, y_pred))

VotingClassifier 0.9643809523809523


In [22]:
# Soft Voting
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

voting_soft_clf = VotingClassifier(
    estimators=[('knn', knn_clf), ('rnd', rnd_clf), ('tree', tree_clf)],
    voting='soft'
)

voting_soft_clf.fit(X_train, y_train)
y_pred = voting_soft_clf.predict(X_test)
print(voting_soft_clf.__class__.__name__, accuracy_score(y_test, y_pred))

VotingClassifier 0.9602857142857143


# 2. Developing and Training: Decision Tree with Bagging and Pasting, and Random Forest
Using bagging and pasting(for pasting, set bootstrap=False) and use the DecisionTree Classifier in the model. Also use the RandomForest classifier model to fit the data. Then use the Accuracy score method to produce the accuracy score. <br>
Bagging Classifier with Bagging - Through diversity of training sets passed to each predictor in the model. instances replaced after being drawn by a classifier <br>
Bagging Classifier with Pasting - does not replace instances after being drawn<br>
<br>
Random Forest out performed the Bagging classifiers

In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=1
)
bag_clf.fit(X_train, y_train)

pas_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=False, n_jobs=1
)
pas_clf.fit(X_train, y_train)

rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)

RandomForestClassifier()

In [19]:
# Accuracy score for above classifier
from sklearn.metrics import accuracy_score

for clf in (bag_clf, pas_clf, rnd_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

BaggingClassifier 0.8475238095238096
BaggingClassifier 0.8445714285714285
RandomForestClassifier 0.9665714285714285


# 3. Developing AdaBoost classifier and finding optimized using GridSearchCV
The below parameters was gathered from https://stackoverflow.com/questions/32210569/using-gridsearchcv-with-adaboost-and-decisiontreeclassifier. I was not able to find much information at all regarding GridSearchCV (from slides, book, or searching GridSearchCV). The only information I could gather was from StackOverflow. base_estimator__max_depths and base_estimator__min_samples_leaf accesses the base estimator hyperparameters (DecisionTree Classifier). n_estimators and learning_rate tunes the AdaBoos Classifier.

In [28]:
# AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=None, random_state=42), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)

ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
print(ada_clf.__class__.__name__, accuracy_score(y_test, y_pred))

AdaBoostClassifier 0.872


In [30]:
parameters = {
    'base_estimator__max_depth': [i for i in range (2, 11, 2)],
    'base_estimator__min_samples_leaf': [5, 10],
    'n_estimators': [10, 50, 250, 1000],
    'learning_rate': [0.01, 0.1]}

grid_search_ada = GridSearchCV(ada_clf, param_grid=parameters, scoring = 'roc_auc')

grid_search_ada.fit(X_train, y_train)
y_pred = grid_search_ada.predict(X_test)
print(grid_search_ada.__class__.__name__, accuracy_score(y_test, y_pred))

Traceback (most recent call last):
  File "C:\Users\William\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\William\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\William\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 349, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported



KeyboardInterrupt: 