# Introduction

# Setup
Let us import the required modules.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sys
import os
import joblib

import src.data_split as ds
# import project.src.visualization as viz
import src.evaluation as eva

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Load Data
Note that the dataset is already split into Train-Test sets.

In [2]:
# Load the pre-split dataset from ../data via the project's TrainTestSplit helper.
# NOTE(review): later cells read x_train / y_train / x_test / y_test from this object;
# the expected file layout inside ../data is defined by src.data_split — confirm there.
data_split = ds.TrainTestSplit.from_csv_directory(dir_path="../data")

In [3]:
# Summarize the training features: row/column counts, dtypes and memory footprint.
data_split.x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 719 entries, pixel13 to pixel780
dtypes: float64(719)
memory usage: 329.1 MB


In [5]:
# Sanity check: the label vector should have one entry per training row.
data_split.y_train.shape

(60000,)

# Random Forest Classifier Model

In [15]:
# Automatic hyperparameter tuning: exhaustive grid search with 5-fold
# cross-validation over the candidate values listed in `properties`.
rfclsf = RandomForestClassifier(random_state=28)  # fixed seed -> reproducible forests
properties = {
    "n_estimators": list(range(50, 201, 50)),  # 50, 100, 150, 200
    # Further hyperparameters that can be enabled for a wider (slower) search:
    # "min_samples_leaf": [x for x in range(50,201,50)],
    # "max_leaf_nodes": [x for x in range(2,11,2)],
    # "max_features": ["sqrt"],
    # "criterion": ["gini","entropy"],
}

# The scorer must be given by its registered name "accuracy". "accuracy_score" is the
# name of the metric *function* in sklearn.metrics, not a valid scoring string, and
# GridSearchCV rejects it with an error when fit() is called.
tuned_rfclsf = GridSearchCV(
    rfclsf,
    properties,
    scoring="accuracy",
    cv=5,
    return_train_score=True,  # required for the mean_train_score column of cv_results_
    verbose=4,
    n_jobs=-1,  # run the candidate fits on all available cores
)
tuned_rfclsf.fit(data_split.x_train, data_split.y_train)

print("Best Score: {:.3f}".format(tuned_rfclsf.best_score_))
print("Best Params: ", tuned_rfclsf.best_params_)

In [16]:
# Save the fitted grid-search object so the slow tuning step does not have to be re-run.
# makedirs(exist_ok=True) creates any missing parent directories and replaces the
# check-then-create pattern, which can fail if the directory appears in between.
os.makedirs('../../tuned_models', exist_ok=True)
joblib.dump(tuned_rfclsf, '../../tuned_models/randforest_tuning_results.pkl')

In [17]:
# Reload previously saved tuning results (lets you skip the grid-search cell).
# NOTE: joblib.load is pickle-based and can execute arbitrary code on load — only
# open files that were produced by this notebook or another trusted source.
tuned_rfclsf = joblib.load("../../tuned_models/randforest_tuning_results.pkl")

In [18]:
# Show the raw cross-validation results, then wrap them in a DataFrame
# (one row per parameter combination) for the tables and plots below.
print(
    tuned_rfclsf.cv_results_
)
tuned_rfclsf_results = pd.DataFrame(data=tuned_rfclsf.cv_results_)

In [19]:
# Side-by-side view of cross-validated vs. training score for each n_estimators value;
# a large gap between the two columns points to overfitting.
tuned_rfclsf_results.loc[
    :, ["param_n_estimators", "mean_test_score", "mean_train_score"]
]

In [20]:
# Mean cross-validated score as a function of the number of trees.
sns.set(
    rc={"figure.figsize": (12, 8)},
)
sns.lineplot(
    x="param_n_estimators",
    y="mean_test_score",
    data=tuned_rfclsf_results,
)

In [21]:
# Mean cross-validated score as a function of min_samples_leaf.
# cv_results_ only contains a "param_<name>" column for hyperparameters that were part
# of the searched grid. min_samples_leaf is currently commented out of the grid, so
# plotting it unconditionally fails on the missing column; only plot when it exists.
if "param_min_samples_leaf" in tuned_rfclsf_results.columns:
    sns.set(rc={"figure.figsize": (12, 8)})
    sns.lineplot(data=tuned_rfclsf_results, x="param_min_samples_leaf", y="mean_test_score")
else:
    print("min_samples_leaf was not part of the tuned parameter grid; nothing to plot.")

In [22]:
# Mean cross-validated score per candidate, in the order the grid search evaluated them
# (the DataFrame index is the candidate number).
sns.set(
    rc={"figure.figsize": (12, 8)},
)
sns.lineplot(
    x=tuned_rfclsf_results.index,
    y="mean_test_score",
    data=tuned_rfclsf_results,
)

In [23]:
# best_estimator_ / best_score_ are attributes of the fitted GridSearchCV object
# (tuned_rfclsf), not of the cv_results_ DataFrame (tuned_rfclsf_results), which
# would raise AttributeError here.
print(tuned_rfclsf.best_estimator_)
print(tuned_rfclsf.best_score_)

In [24]:
# Accuracy of the tuned model on the data it was trained on. GridSearchCV.predict
# delegates to the best estimator found by the search.
# The value was previously stored under the misleading name `test_acc`, although it
# is computed on the training split.
train_acc = accuracy_score(y_true=data_split.y_train,
                           y_pred=tuned_rfclsf.predict(data_split.x_train))
test_acc = train_acc  # legacy alias kept in case later cells still read `test_acc`
print("Train Accuracy: {}".format(train_acc))

In [6]:
# Final model: a fresh forest built from the best parameters found by the grid search.
# random_state matches the seed used for the tuned estimator so the refit is
# reproducible across kernel restarts; without it every run grows a different forest.
forest_clsf = RandomForestClassifier(random_state=28, **tuned_rfclsf.best_params_)
# forest_clsf = RandomForestClassifier(n_estimators=20)
forest_clsf.fit(data_split.x_train, data_split.y_train)

RandomForestClassifier(n_estimators=20)

In [7]:
# Predict on both splits with the fitted forest and wrap each prediction set,
# together with its ground-truth labels, in a project Evaluation object.

# -- training split --
rf_train_pred = forest_clsf.predict(data_split.x_train)
rf_train_eval = eva.Evaluation(
    y_real=data_split.y_train,
    y_pred=rf_train_pred,
)

# -- test split --
rf_test_pred = forest_clsf.predict(data_split.x_test)
rf_test_eval = eva.Evaluation(
    y_real=data_split.y_test,
    y_pred=rf_test_pred,
)

In [8]:
# Print the evaluation report for each split under its own heading.
for split_heading, split_eval in (
    ("Training:", rf_train_eval),
    ("Testing:", rf_test_eval),
):
    print(split_heading)
    split_eval.print_eval()

Training:
--------------Model Evaluations:--------------
Accuracy score: 0.9999166666666667

Testing:
--------------Model Evaluations:--------------
Accuracy score: 0.959



In [52]:
# NOTE(review): disabled cell — would bundle the fitted forest with its train/test
# evaluations via eva.EvaluatedModel; re-enable or delete once it is decided whether
# the sections below need `rf_model`.
# rf_model = eva.EvaluatedModel(forest_clsf, train_eval=rf_train_eval, test_eval=rf_test_eval)

# Performance Visualization

### Setup