<a href="https://colab.research.google.com/github/VicLopes/Computer-Intelligence/blob/main/DecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries & MNIST

In [None]:
import numpy as np
import pandas as pd # I/O de arquivos CSV
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import fetch_openml

# Data Preprocessing

In [None]:
mnist = fetch_openml('mnist_784', version=1)
 
x = mnist["data"] 
y = mnist["target"]
test_x = []
print(x.shape)


(70000, 784)


In [None]:
for row in x:
  image = list(row) # Organizes images into a list
  image = np.array(image) / 255
  test_x.append(image)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.85, random_state=0)
test_x = np.array(test_x)
classifier = DecisionTreeClassifier()
classifier = classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

0.8109411764705883


# Classification

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Parameter distribution configuration
param_dist = {"max_depth": randint(3,80),
              "min_samples_leaf": randint(1, 50),
              "criterion": ["gini", "entropy"]}

# Initiates a Decision Tree Classifier
tree = DecisionTreeClassifier()
# Instantiate RandomizedSearch object, tree_random
tree_random = RandomizedSearchCV(tree, param_dist)
# Fit it on the data
tree_random.fit(x, y)

# Print the tuned parameters and score
print("Best Decision Tree Parameters post-tuning: {}".format(tree_random.best_params_))
print("Best score was {}".format(tree_random.best_score_))

Best Decision Tree Parameters post-tuning: {'criterion': 'gini', 'max_depth': 45, 'min_samples_leaf': 10}
Best score was 0.8684714285714286


Metrics using parameters from snippet above

In [None]:
from sklearn import metrics

y_pred_random = tree_random.predict(x_test)

print("Mean Absolute Error:\n\t", metrics.mean_absolute_error(y_test, y_pred_random))
print("\nMean Squared Error:\n\t", metrics.mean_squared_error(y_test, y_pred_random))
print("\nRoot Mean Squared Error:\n\t", np.sqrt(metrics.mean_squared_error(y_test, y_pred_random)))
print("\nConfusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred_random))
print("\n\nAccuracy Score:\n", metrics.accuracy_score(y_test, y_pred_random))
print("\n\nClassification Report:\n", metrics.classification_report(y_test, y_pred_random))

Mean Absolute Error:
	 0.27260504201680674

Mean Squared Error:
	 1.3018151260504203

Root Mean Squared Error:
	 1.1409711328734047

Confusion Matrix:
 [[5644    3   44   33   13   39   45   12   41   21]
 [   3 6540   29   23   14   17   10   24   31   12]
 [  48   42 5473   64   52   50   40   57   98   33]
 [  29   25  119 5488   32  130   20   49   87   65]
 [  15   14   62   19 5406   27   41   36   54  136]
 [  69   26   47  174   39 4816   71   27   67   53]
 [  60   27   64   21   58   79 5416    8   71   22]
 [  19   33   66   36   57   28    5 5804   44   92]
 [  37   56  102   95   71   94   57   26 5146   93]
 [  39   19   71   71  175   71   20  107   72 5270]]


Accuracy Score:
 0.9244201680672269


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.95      5895
           1       0.96      0.98      0.97      6703
           2       0.90      0.92      0.91      5957
           3       0.91      0.91   

For comparison purposes, here's how it'd look if RandomSearch wasn't used

In [None]:
classifier = DecisionTreeClassifier(criterion='gini', max_depth=45, min_samples_leaf=10)
classifier = classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print("Mean Absolute Error:\n\t", metrics.mean_absolute_error(y_test, y_pred))
print("\nMean Squared Error:\n\t", metrics.mean_squared_error(y_test, y_pred))
print("\nRoot Mean Squared Error:\n\t", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("\nConfusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
print("\n\nAccuracy Score:\n", metrics.accuracy_score(y_test, y_pred))
print("\n\nClassification Report:\n", metrics.classification_report(y_test, y_pred))

Mean Absolute Error:
	 0.7060840336134454

Mean Squared Error:
	 3.312705882352941

Root Mean Squared Error:
	 1.8200840316735216

Confusion Matrix:
 [[5152    7   49  108   76  156   81   82  167   17]
 [   2 6205  105   87   56   24   18   79  109   18]
 [ 145  180 4486  193  139  114  187  191  256   66]
 [ 170   79  239 4437   78  369  100  140  242  190]
 [  36   30  100   86 4716  122  151  111  191  267]
 [ 131  146  146  378   89 3888  193   91  194  133]
 [ 144   58  213   95  118  165 4810   35  175   13]
 [  63   55  163  109  118   98   17 5276   80  205]
 [ 111  234  142  294  115  214  106   81 4262  218]
 [  81   25   48  165  401  183   47  343  155 4467]]


Accuracy Score:
 0.8016638655462185


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86      5895
           1       0.88      0.93      0.90      6703
           2       0.79      0.75      0.77      5957
           3       0.75      0.73     