In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Load the cleaned dataset; the CSV's unnamed first column ('Unnamed: 0')
# becomes the DataFrame index.
data = (
    pd.read_csv('Cleaned_data.csv')
    .set_index('Unnamed: 0')
)

In [None]:
# Preview the first four rows of the loaded frame.
data.head(n=4)

In [None]:
# Feature matrix: every column except the target ('현재값') and 'Reward'.
# (A commented-out alternative of this cell dropped only 'Agent' instead.)
train_data = data.drop(columns=['현재값', 'Reward'])

In [None]:
# Regression target: the '현재값' column.
# (A commented-out alternative of this cell used 'Agent' as the target.)
target_data = data.loc[:, '현재값']

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    target_data,
    test_size=0.2,
    random_state=30,
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid: 2 * 4 * 3 * 3 = 72 combinations, each evaluated with
# 3-fold cross-validation on the training split.
params = dict(
    n_estimators=[10, 100],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Seeded base regressor for the search. No `scoring` is passed, so the search
# ranks candidates with the estimator's own default score.
rf_clf = RandomForestRegressor(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(x_train, y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
# NOTE: the search was run on a regressor without an explicit `scoring`, so
# best_score_ is the regressor's default score (R^2 per the scikit-learn
# docs), not a classification accuracy, despite the label printed below.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print(f'최고 예측 정확도: {grid_cv.best_score_:.4f}')

In [None]:
# Final model used for evaluation, feature importances and tree export.
# The hyper-parameters are hard-coded here; presumably they were copied from
# the grid-search output -- verify against grid_cv.best_params_.
# random_state is fixed (same seed as the grid-search estimator) so that the
# reported MSE, the feature importances and the individual trees in
# estimators_ are reproducible across kernel restarts.
rf_clf1 = RandomForestRegressor(n_estimators = 100,
                                max_depth = 12,
                                min_samples_leaf = 8,
                                min_samples_split = 8,
                                random_state = 0,
                                n_jobs = -1)

In [None]:
from sklearn.metrics import mean_squared_error

# Fit the final forest on the training split, predict the held-out rows and
# report the test-set mean squared error.
pred = rf_clf1.fit(x_train, y_train).predict(x_test)
print(f'Mean Squared Error: {mean_squared_error(y_test, pred):.4f}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index = train_data.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.show()

In [None]:
# export_graphviz was used without ever being imported in this notebook.
from sklearn.tree import export_graphviz

# Pick a single fitted tree out of the forest to visualise.
estimator = rf_clf1.estimators_[3]

# Write the tree as Graphviz DOT source. The output is DOT text, not an
# image, so it is saved as 'tree.dot' -- the file name the `dot` conversion
# cell reads. `class_names` is intentionally not passed: it only applies to
# classifiers, and this tree is a regressor whose targets are continuous.
export_graphviz(estimator, out_file='tree.dot',
                feature_names = list(x_train.columns),
                max_depth = 10, # maximum depth of the tree to draw
                precision = 3, # number of decimal places shown in node values
                filled = True, # colour the nodes by their value
                rounded = True, # draw node boxes with rounded corners
               )

In [None]:
# Display the kernel's current working directory; the relative file names
# used in this notebook are resolved against it.
os.getcwd()

In [None]:
# Convert the generated .dot file to a .png with the Graphviz `dot` CLI.
# check_call (instead of call) raises CalledProcessError when `dot` exits
# with a non-zero status, so a failed conversion is reported here rather
# than being silently ignored before the image is loaded below.
from subprocess import check_call
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'decistion-tree.png', '-Gdpi=600'])

# Display the rendered .png directly in the notebook.
from IPython.display import Image
Image(filename = 'decistion-tree.png')