In [23]:
from sklearn.datasets import fetch_openml
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearnex import patch_sklearn
from sklearn.linear_model import SGDClassifier

In [5]:
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [6]:
mnist.DESCR

"**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  \n**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  \n**Please cite**:  \n\nThe MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  \n\nIt is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 

In [7]:
mnist.data.shape

(70000, 784)

In [8]:
x,y = mnist["data"], mnist["target"]

In [9]:
y = y.astype(int)

In [10]:
x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, test_size=10000, random_state=42, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=10000, random_state=42, stratify=y_train_val)

In [11]:
# Applying Standard Scaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

In [17]:
# Random Forest algorithm
random_forest = RandomForestClassifier(n_estimators=30, random_state=42)
random_forest.fit(x_train, y_train)

y_pred_rf_for_val_set = random_forest.predict(x_val)
print(accuracy_score(y_val, y_pred_rf_for_val_set))

0.9649


In [12]:
# Decision Tree algorithm with GridSearchCV(We will find the best hyperparameters)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
decision_tree = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(decision_tree, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

print("En iyi parametreler : ", grid_search.best_params_)
print("En iyi cross-validation doğruluk oranı : ", grid_search.best_score_)

best_decision_tree_model = grid_search.best_estimator_
print("Validation Set Doğruluğu:", best_decision_tree_model.score(x_val, y_val))

En iyi parametreler :  {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 5, 'min_samples_split': 2}
En iyi cross-validation doğruluk oranı :  0.87094
Validation Set Doğruluğu: 0.8776


In [15]:
# SVM algorithm
SVM = SVC(kernel='linear', probability=True)
SVM.fit(x_train, y_train)

y_pred_svm_for_val_set = SVM.predict(x_val)
print(f"Accuracy score of the SVM with validation set is : {accuracy_score(y_val, y_pred_svm_for_val_set)}")

Accuracy score of the SVM with validation set is : 0.9263


In [39]:
# Voting Classifiers with hard and soft voting
def find_voting_classifier(votingType, x_test, y_test):
    voting_classifier = VotingClassifier(
            estimators=[('rf', random_forest), ('dt', best_decision_tree_model), ('svm', SVM)],
            voting=votingType
        )
    voting_model = voting_classifier.fit(x_train, y_train)
    voting_prediction = voting_model.predict(x_test)
    accuracy = accuracy_score(y_test, voting_prediction)
    print(f"Accuracy score of the {votingType} voting classifier : {accuracy}")

In [41]:
print("For validation set,")
votingTypes = {"hard", "soft"}
for votingType in votingTypes:
    find_voting_classifier(votingType, x_val, y_val)

For validation set,
Accuracy score of the hard voting classifier : 0.9569
Accuracy score of the soft voting classifier : 0.9539


In [43]:
# The results are better than the individual models
print("For test set")

models = {
    "Random Forest": random_forest,
    "Decision Tree": best_decision_tree_model,
    "SVM": SVM
}
#Normal Learning
for name, model in models.items():
    prediction = model.predict(x_test)
    accuracy = accuracy_score(y_test, prediction)
    print(f"Accuracy score of the {name} model: {accuracy}")
 #Ensemble Learning   
for votingType in votingTypes:
    find_voting_classifier(votingType, x_test, y_test)


For test set
Accuracy score of the Random Forest model: 0.9615
Accuracy score of the Decision Tree model: 0.8763
Accuracy score of the SVM model: 0.9226
Accuracy score of the hard voting classifier : 0.9523
Accuracy score of the soft voting classifier : 0.9525
