#### Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

#### Load the Dataset

In [2]:
# Load scikit-learn's bundled handwritten-digits dataset; the returned object
# exposes the feature matrix as `.data` and the class labels as `.target`.
digits = load_digits()

#### Split the data into training and test sets

In [3]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,    # features (X)
    digits.target,  # labels (y)
    test_size=0.2,
    random_state=42,
)

#### Train and Test Sets Explanation
- `X_train` and `y_train` are used to train the random forest model.
- `X_test` and `y_test` are used to evaluate the model's performance on unseen data, to see how well the model generalizes.

#### Random Forest Classifier

In [4]:
# Baseline forest with default hyperparameters; the fixed seed makes training reproducible.
# This unfitted estimator is the one passed to GridSearchCV further down.
random_forest_classifier = RandomForestClassifier(random_state=42)

In [5]:
# Instantiate a forest purely to inspect its hyperparameter settings
clf = RandomForestClassifier(random_state=42)

# Mapping of hyperparameter name -> current value
params = clf.get_params()

# List each setting on its own line as "name: value"
for name in params:
    print(f"{name}: {params[name]}")

bootstrap: True
ccp_alpha: 0.0
class_weight: None
criterion: gini
max_depth: None
max_features: sqrt
max_leaf_nodes: None
max_samples: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 2
min_weight_fraction_leaf: 0.0
n_estimators: 100
n_jobs: None
oob_score: False
random_state: 42
verbose: 0
warm_start: False


#### Max Depth Parameter
`max_depth` controls how deep each decision tree in the Random Forest is allowed to grow. Limiting the depth helps prevent overfitting and can improve generalization, which makes the model more balanced.

In [6]:
# Candidate tree depths to compare (None = no depth limit on the trees)
param_grid = {"max_depth": [None, 10, 20, 30, 40]}

# 5-fold cross-validated search that scores each candidate by accuracy
grid_search = GridSearchCV(
    estimator=random_forest_classifier,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)

# Run the search on the training split only, so the test set stays unseen
grid_search.fit(X_train, y_train)

# Report the depth that the search ranked best
best_max_depth = grid_search.best_params_["max_depth"]
print(f"Best max_depth: {best_max_depth}")

Best max_depth: None


#### Max Depth (None) value
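A value of `None` places no limit on tree depth, which enables the model to capture complex patterns in the data. By not restricting the tree depth, the model can adapt to the patterns and variations in scikit-learn's handwritten digits dataset (`load_digits`, 8×8 images — a smaller dataset than MNIST). Cross-validation selected this setting, which indicates that unrestricted depth generalizes at least as well as the depth-limited alternatives tried here and performs well on both training and test data.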

In [7]:
# Fit a fresh forest on the training split using the depth chosen by the grid search
best_random_forest_classifier = RandomForestClassifier(
    random_state=42,
    max_depth=best_max_depth,
).fit(X_train, y_train)

# Predict labels for the held-out test samples
y_pred = best_random_forest_classifier.predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with best max_depth: {accuracy}")

Accuracy with best max_depth: 0.9722222222222222


#### Confusion Matrix

In [8]:
# Class labels, in the order used for the matrix rows and columns
digit_labels = list(range(10))

# Rows are true digits, columns are predicted digits. Passing `labels` pins the
# row/column order to `digit_labels`, so row i always corresponds to
# digit_labels[i] even if some digit never appears in y_test or y_pred
# (without it, the order is inferred from the labels actually present).
conf_matrix = confusion_matrix(y_test, y_pred, labels=digit_labels)

print("Confusion Matrix:\n")

# Print one row per true digit, prefixed with that digit
for digit, row in zip(digit_labels, conf_matrix):
    print(f"Digit {digit}: {row}")

Confusion Matrix:

Digit 0: [32  0  0  0  1  0  0  0  0  0]
Digit 1: [ 0 28  0  0  0  0  0  0  0  0]
Digit 2: [ 0  0 33  0  0  0  0  0  0  0]
Digit 3: [ 0  0  0 32  0  1  0  0  1  0]
Digit 4: [ 0  0  0  0 46  0  0  0  0  0]
Digit 5: [ 0  0  0  0  0 45  1  0  0  1]
Digit 6: [ 0  0  0  0  0  1 34  0  0  0]
Digit 7: [ 0  0  0  0  0  0  0 33  0  1]
Digit 8: [ 0  1  0  0  0  0  0  0 29  0]
Digit 9: [ 0  0  0  0  0  1  0  1  0 38]


####  Classes that struggle the most

- Digits 3, 5 and 9 struggle the most: each has 2 test samples that were misclassified.
- Digits 0, 6, 7 and 8 each have 1 test sample that was misclassified; digits 1, 2 and 4 were classified without error.

#### Report on (Accuracy, Precision, Recall, and f1-score)

In [9]:
# Overall accuracy, plus precision, recall and F1 each computed with average="macro"
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)

# Report: header followed by one metric per line
print("Report:\n")
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)

Report:

Accuracy: 0.9722222222222222
Precision: 0.9740424119023985
Recall: 0.9727003722185199
F1-Score: 0.9732067700933176
