Imports

In [None]:
import cv2
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

Fetching data and data exploration

In [None]:
# Fetch MNIST from OpenML (served from the local cache when available) and
# show the dataset's own description text.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    cache=True,
    as_frame=False,
)
print(mnist.DESCR)

In [None]:
# Feature matrix and label vector; the labels are cast to unsigned 8-bit ints
# so they can be used as numeric class ids.
X = mnist.data
y = mnist.target.astype(np.uint8)

In [None]:
# DataFrame views of the raw data, used only for the exploration cells below.
# The target frame keeps the labels exactly as delivered (not the uint8 cast in y).
df_data = pd.DataFrame(mnist.data)
df_target = pd.DataFrame(mnist.target)

In [None]:
# Structural summary of the pixel frame (row count, column dtypes, memory use).
df_data.info()

In [None]:
# Structural summary of the label frame (row count, dtype, memory use).
df_target.info()

In [None]:
# Descriptive statistics for the pixel columns.
df_data.describe()

In [None]:
# Descriptive statistics for the label column.
df_target.describe()

In [None]:
# Print how many samples each digit class (0-9) has.
# df_target still holds the labels as strings, so every digit is compared in
# its string form.
for digit in range(10):
    print(df_target[df_target == str(digit)].count())

In [None]:
# Display the first sample: its flat pixel row is folded back into a 28x28 grid.
some_digit = X[0]
some_digit_image = np.reshape(some_digit, (28, 28))

plt.imshow(some_digit_image, cmap="binary", interpolation="nearest")
plt.axis("off")
plt.show()

Each picture is 28x28 pixels, i.e. 784 features, and every feature value lies between 0 (white) and 255 (black).

Train, validation and test data splitting, and initial model testing

In [None]:
# The first 60,000 samples form the training set; everything after them is
# kept aside as the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# Hold out 15% of the training data as a validation set (seeded so the split
# is the same on every run).
X_train_new, X_val, y_train_new, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42
)

In [None]:
def model_eval(dic, X_train, y_train, X_val, y_val):
    """Fit each model on the training data and print its validation score.

    Parameters
    ----------
    dic : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train, y_train
        Data every model is fitted on.
    X_val, y_val
        Data every fitted model is scored on.

    Returns
    -------
    None
        One line per model is printed: ``"<name> score: <score>"``.
    """
    for label, estimator in dic.items():
        estimator.fit(X_train, y_train)
        val_score = estimator.score(X_val, y_val)
        print(f"{label} score: {val_score}")

In [None]:
# Baseline candidates, all with default hyper-parameters. The estimators that
# accept a seed get a fixed one so the comparison is repeatable.
models = {}
models['random_forest_clf'] = RandomForestClassifier(random_state=42)
models['extra_trees_clf'] = ExtraTreesClassifier(random_state=42)
models['svc'] = SVC(random_state=42)
models['KNeighborsClassifier'] = KNeighborsClassifier()
models['DecisionTreeClassifier'] = DecisionTreeClassifier(random_state=42)

# Fit each candidate on the reduced training set and report its validation score.
model_eval(
    dic=models,
    X_train=X_train_new,
    y_train=y_train_new,
    X_val=X_val,
    y_val=y_val,
)

Model fine tuning with GridSearchCV 

In [None]:
# Hyper-parameter search for ExtraTreesClassifier (one of the models that
# scored above 95%).
# The search is cross-validated on X_train_new only: X_val is used afterwards
# to compare the tuned models, so it must not take part in selecting the
# hyper-parameters.
# NOTE(review): every list below holds a single value, so only one candidate
# is evaluated - presumably narrowed down from an earlier, wider search.
param_grid = {
    'n_estimators': [300],
    'criterion': ['gini'],
    'max_depth': [None],
    'min_samples_split': [4],
    'min_samples_leaf': [1],
}
# Seeded so the randomised tree construction gives the same result on re-run.
clf = ExtraTreesClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid)
grid_search.fit(X_train_new, y_train_new)
grid_search.best_params_

In [None]:
# Hyper-parameter search for KNeighborsClassifier (one of the models that
# scored above 95%).
# The search is cross-validated on X_train_new only: X_val is used afterwards
# to compare the tuned models, so it must not take part in selecting the
# hyper-parameters.
# NOTE(review): every list below holds a single value, so only one candidate
# is evaluated - presumably narrowed down from an earlier, wider search.
param_grid = {
    'n_neighbors': [5],
    'weights': ['distance'],
    'algorithm': ['auto'],
    'leaf_size': [20],
}
clf = KNeighborsClassifier()
grid_search = GridSearchCV(clf, param_grid)
grid_search.fit(X_train_new, y_train_new)
grid_search.best_params_


In [None]:
# Hyper-parameter search for SVC (one of the models that scored above 95%).
# The search is cross-validated on X_train_new only: X_val is used afterwards
# to compare the tuned models, so it must not take part in selecting the
# hyper-parameters.
# NOTE(review): every list below holds a single value, so only one candidate
# is evaluated - presumably narrowed down from an earlier, wider search.
param_grid = {
    'C': [5.0],
    'kernel': ['rbf'],
    'gamma': ['scale'],
}
clf = SVC()
# 3 folds instead of the default to limit the number of fits.
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X_train_new, y_train_new)
grid_search.best_params_


In [None]:
# Hyper-parameter search for RandomForestClassifier (one of the models that
# scored above 95%).
# The search is cross-validated on X_train_new only: X_val is used afterwards
# to compare the tuned models, so it must not take part in selecting the
# hyper-parameters.
# NOTE(review): every list below holds a single value, so only one candidate
# is evaluated - presumably narrowed down from an earlier, wider search.
param_grid = {
    'n_estimators': [300],
    'criterion': ['gini'],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
}
# Seeded so the randomised forest construction gives the same result on re-run.
clf = RandomForestClassifier(random_state=42)
# 3 folds instead of the default to limit the number of fits.
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X_train_new, y_train_new)
grid_search.best_params_


Model testing after model tuning.

In [None]:
# Train the tuned SVC on the reduced training set and evaluate it on the
# validation set.
svc_clf = SVC(C=5.0, kernel='rbf', gamma='scale', random_state=42)
svc_clf.fit(X_train_new, y_train_new)

y_val_pred = svc_clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

# confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns; label the axes so the plot can be read on its own.
cm = confusion_matrix(y_val, y_val_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(title='SVC - validation set', xlabel='Predicted digit', ylabel='True digit')
plt.show()

In [None]:
# Train the tuned random forest on the reduced training set and evaluate it
# on the validation set.
random_forest_clf = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
random_forest_clf.fit(X_train_new, y_train_new)

y_val_pred = random_forest_clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

# confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns; label the axes so the plot can be read on its own.
cm = confusion_matrix(y_val, y_val_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(title='Random forest - validation set', xlabel='Predicted digit', ylabel='True digit')
plt.show()

In [None]:
# Train the tuned extra-trees model on the reduced training set and evaluate
# it on the validation set.
extra_trees_clf = ExtraTreesClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=None,
    min_samples_split=4,
    min_samples_leaf=1,
    random_state=42,
)
extra_trees_clf.fit(X_train_new, y_train_new)

y_val_pred = extra_trees_clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

# confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns; label the axes so the plot can be read on its own.
cm = confusion_matrix(y_val, y_val_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(title='Extra trees - validation set', xlabel='Predicted digit', ylabel='True digit')
plt.show()

In [None]:
# Train the tuned k-nearest-neighbours model on the reduced training set and
# evaluate it on the validation set.
kneighbors_clf = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='auto',
    leaf_size=20,
)
kneighbors_clf.fit(X_train_new, y_train_new)

y_val_pred = kneighbors_clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

# confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns; label the axes so the plot can be read on its own.
cm = confusion_matrix(y_val, y_val_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(title='K-nearest neighbours - validation set', xlabel='Predicted digit', ylabel='True digit')
plt.show()

Last model test with test data

In [None]:
# Final check: score the SVC on the test set that was kept aside.
y_test_pred = svc_clf.predict(X_test)
print(classification_report(y_test, y_test_pred))

# confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns; label the axes so the plot can be read on its own.
cm = confusion_matrix(y_test, y_test_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(title='SVC - test set', xlabel='Predicted digit', ylabel='True digit')
plt.show()

Importing, modifying and testing own pictures

In [None]:
# Load a hand-written digit from disk, invert it and shrink it to 28x28 pixels.
# NOTE(review): this is an absolute, machine-specific path - point it at your
# own copy of the image before running.
file = 'c:/Users/leona/EC-Data science/Machine Learning/machine_learning_ds23-main/machine_learning_ds23-main/kunskapskontroll_2/Siffror/nio.jpg'
image = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
# cv2.imread does not raise on a missing or unreadable file, it returns None;
# fail here with a clear message instead of a cryptic OpenCV error below.
if image is None:
    raise FileNotFoundError(f"Could not read image file: {file}")

# Invert the grey levels - presumably so the digit/background polarity matches
# the training images (TODO confirm against the MNIST plot above).
image = cv2.bitwise_not(image)

# INTER_AREA is OpenCV's recommended interpolation for shrinking an image;
# bilinear sampling can drop thin pen strokes when the reduction is large.
image_resize = cv2.resize(image, (28, 28), interpolation=cv2.INTER_AREA)

# Show the inverted image at its original resolution.
plt.imshow(image, cmap=mpl.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()
image_resize.shape

In [None]:
# Preview the downscaled 28x28 image that will be handed to the classifier.
preview = np.reshape(image_resize, (28, 28))
plt.imshow(preview, cmap="binary", interpolation="nearest")
plt.axis("off")
plt.show()

In [None]:
# Flatten the 28x28 image into a single row of 784 features, the layout the
# classifier was trained on.
# reshape is used instead of np.resize: np.resize silently repeats or
# truncates the data when the element count does not match, whereas reshape
# raises an error.
resized_image = image_resize.reshape(1, 784)
resized_image.shape

In [None]:
# Let the trained SVC classify the custom image and display its answer.
predicted_digit = svc_clf.predict(resized_image)
predicted_digit