# MNIST 1 - Random Forest

In [20]:
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from keras.utils.np_utils import to_categorical

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

### Einlesen und erste Analyse der Daten

In [3]:
train = pd.read_csv('resources/mnist_train.csv')

In [4]:
train.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 359.3 MB


### Um Zeit zu sparen, betrachten wir nur ein Viertel der Daten. Möchte man bessere Ergebnisse erhalten, kann man diesen Block auslassen und den vollständigen Datensatz verwenden.

In [6]:
np.random.seed(10)

remove_n = 45000
drop_indices = np.random.choice(train.index, remove_n, replace=False)
train_subset = train.drop(drop_indices)

In [7]:
train_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 2 to 59995
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 90.0 MB


### Aufbereiten der Daten: Wir trennen in abhängige (vorherzusagenden Zahl) und unabhängige Variablen (Pixelwerte), skalieren die Pixelwerte und encoden die labels.

In [9]:
Y_train = train_subset["label"]
X_train = train_subset.drop(labels = ["label"],axis = 1) 

In [10]:
Y_train.value_counts()

1    1713
7    1550
8    1518
9    1515
3    1511
0    1504
4    1487
6    1472
2    1429
5    1301
Name: label, dtype: int64

In [11]:
X_train = X_train / 255.0

In [12]:
Y_train = to_categorical(Y_train, num_classes = 10)

### Aufteilen in Test- und Validierungsdaten

In [13]:
images_train, images_val, labels_train, labels_val = train_test_split(X_train, Y_train, test_size = 0.2, random_state=42)

In [14]:
images_train.shape

(12000, 784)

### Wir verwenden wie im letzten Klassifikationsbeispiel (Halloween) einen RandomForest (mit GridSearchCV).

In [21]:
param_grid = [
    {'n_estimators': [35, 40, 45, 50, 55, 60], 'max_depth': [3, 5, 7, 9]}
]

In [23]:
random_forest = RandomForestClassifier()
grid_search = GridSearchCV(forest, param_grid, cv=5, scoring='neg_mean_squared_log_error')
grid_search.fit(images_train, labels_train)

grid_search.best_params_

{'max_depth': 9, 'n_estimators': 55}

In [24]:
forest = grid_search.best_estimator_

In [25]:
forest.fit(images_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=55, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Evaluieren des Models

In [28]:
class ModelEvaluator:

    def evaluate_model(self, forest, input, labels, limit):
        predictions = forest.predict(input)
        some_predictions = predictions[:limit]
        some_labels = list(labels)[:limit]

        differences = np.array(some_predictions) - np.array(some_labels)
        
        count_incorrectly_correctly_classified = []
        for difference in differences:
            if difference.any() != 0:
                count_incorrectly_correctly_classified.append(difference)
        percentage_correctly_classified = (limit - len(count_incorrectly_correctly_classified)) / limit

        return some_predictions, percentage_correctly_classified

In [27]:
evaluator = ModelEvaluator()
predicted, percentage_correct = evaluator.evaluate_model(forest, images_val, labels_val, 3000)
print("Validierungssatz :" + str(percentage_correct))

predicted_test, percentage_correct_test = evaluator.evaluate_model(forest, images_train, labels_train, 3000)
print("Trainingssatz :" + str(percentage_correct_test))

Validierungssatz :0.7283333333333334
Trainingssatz :0.827
