In [None]:
# explore random forest tree depth effect on performance
from numpy import mean
from numpy import std
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
from sklearn.datasets 				import make_classification
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedStratifiedKFold.html
from sklearn.model_selection 	import cross_val_score
from sklearn.model_selection 	import RepeatedStratifiedKFold
#	https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble 				import RandomForestClassifier

from matplotlib import pyplot


##	Use repeated stratified K-Fold cross-validation to evaluate the model
#		https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedStratifiedKFold.html
def accuray_by_K_Fold(model, features, outputs, *, n_splits=5, n_repeats=10,
		random_state=463001, scoring='accuracy', n_jobs=-1):
	"""Score ``model`` with repeated stratified K-Fold cross-validation.

	The defaults reproduce the original fixed configuration (5 folds,
	10 repeats, seed 463001, accuracy scoring, all CPU cores), so existing
	three-argument calls behave exactly as before.

	Args:
		model: The estimator object used to fit the data.
		features: The data to fit.
		outputs: The target variable to predict.
		n_splits: Number of folds. Must be at least 2.
		n_repeats: Number of times the cross-validator is repeated.
		random_state: Seed controlling the random states of each repetition,
			so the splits are the same on every run.
		scoring: Scorer name or scorer callable passed to cross_val_score.
		n_jobs: Number of jobs to run in parallel (-1 uses all processors).

	Returns:
		The array of scores produced by cross_val_score, one entry per
		fold of every repetition.
	"""
	# Determines the cross-validation splitting strategy.
	#	https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedStratifiedKFold.html
	rskf = RepeatedStratifiedKFold(
			n_splits=n_splits,
			n_repeats=n_repeats,
			random_state=random_state,
			)

	# Evaluate the model on every split and hand the scores back to the caller.
	#	https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
	return cross_val_score(
			model,
			features,
			outputs,
			scoring=scoring,
			cv=rskf,
			n_jobs=n_jobs,
			)

##	create random data array using make_classification from sklearn
##
#		Replace with the real data
##
#	https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
#	features:		The data to fit. Can be for example a list, or an array.
#	outputs:		The target variable to try to predict in the case of supervised learning.
# Synthetic stand-in dataset: 500 rows x 20 columns, seeded so that every run
# works on exactly the same samples.
features, outputs = make_classification(
	n_samples=500,		# rows in the generated dataset
	n_features=20,		# total columns per row
	n_informative=12,	# columns that carry signal about the class
	n_redundant=2,		# columns derived from the informative ones
	random_state=3,		# fixed seed for dataset generation
)

##	create one random forest model per candidate max_depth value
#	https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#
#	max_depth:	The maximum depth of the tree. 
#							If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
##
#	change "max_depth"
#	to		 "max_features", "max_samples", or "n_estimators" to evaluate other useful random forest parameters
##
# Largest max_depth to evaluate; one forest is built for every depth 1..depths.
depths = 10
# range() excludes its stop value, so depths + 1 is needed to include the
# deepest model. The fixed random_state makes each forest reproducible from
# run to run, in line with the seeded dataset and cross-validation splitter.
random_forest_models = {
	n: RandomForestClassifier(max_depth=n, random_state=463001)
	for n in range(1, depths + 1)
}


# Cross-validate every candidate model and keep its scores for the plot.
accuracy, names = [], []
for name, model in random_forest_models.items():
	# cross-validation scores for this max_depth setting
	result = accuray_by_K_Fold(model, features, outputs)
	names.append(name)
	accuracy.append(result)

	# report progress as soon as each model has been scored
	print(f"max_depth = {name} mean = {mean(result):.2f} std = {std(result):.2f}")
 
# Create the visualization: one box per max_depth value, drawn from its
# cross-validation scores, with the arithmetic mean marked on each box.
#	https://matplotlib.org/3.5.0/api/_as_gen/matplotlib.pyplot.boxplot.html
# NOTE: matplotlib >= 3.9 renames the `labels` argument to `tick_labels`;
# `labels` is kept here for compatibility with the 3.5 API referenced above.
pyplot.boxplot(accuracy, labels=names, showmeans=True)
pyplot.show()

max_depth = 1 mean = 0.78 std = 0.04
max_depth = 2 mean = 0.81 std = 0.04
max_depth = 3 mean = 0.83 std = 0.04
