# In [None]:
# Package to implement Decision Tree Model
import pandas as pd             # Pandas
import streamlit as st          # Streamlit
import matplotlib.pyplot as plt # Matplotlib

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Package for data partitioning
from sklearn.model_selection import train_test_split

# Package to calculate the R^2 score
from sklearn.metrics import r2_score
import time
from sklearn import tree
from sklearn import metrics

# DATA LOADING AND PARTITIONING
# Read the dataset, take 'cnt' as the regression target and a fixed subset of
# columns as predictors.
bike_data_train = pd.read_csv('bike.csv')
output = bike_data_train['cnt']
features = bike_data_train[['season', 'mnth', 'holiday', 'weekday', 'workingday',
                            'weathersit', 'temp', 'atemp', 'hum', 'windspeed']]
X = features
y = output

# One-hot encode the 'weathersit' column.
# `get_dummies(columns=...)` expects column *names*; handing it the column's
# values (the Series itself) makes pandas look those values up as labels and
# raise a KeyError, so pass the Series' name wrapped in a list instead.
cat_var = X['weathersit']
X_encoded = pd.get_dummies(X, columns=[cat_var.name])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X_encoded, y, test_size=0.2, random_state=1
)

# DT REGRESSOR CONFIGURATION
# Base estimator whose hyper-parameters are tuned by the grid search below.
bk_r = DecisionTreeRegressor(random_state=1)

# Exhaustive grid: 20 depths x 9 split sizes x 12 leaf sizes = 2160 candidates.
# NOTE(review): with min_samples_leaf >= 3 a tree may never reach depth 30+,
# in which case the max_depth axis only multiplies the search cost - confirm
# that this range is intended.
hyper_params = {
    'max_depth': list(range(30, 50)),
    'min_samples_split': list(range(3, 12)),
    'min_samples_leaf': list(range(3, 15)),
}

# Shuffled 5-fold CV. The seed makes the fold assignment (and therefore the
# selected hyper-parameters) reproducible, consistent with the seeded
# estimator and train/test split.
fold_r = KFold(n_splits=5, shuffle=True, random_state=1)

# Grid search scored by R^2, using every available core.
model_cv = GridSearchCV(estimator=bk_r,
                        param_grid=hyper_params,
                        scoring='r2',
                        cv=fold_r,
                        verbose=1,
                        n_jobs=-1)

# MODEL FITTING
# Time the grid search with a monotonic high-resolution clock; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
model_cv.fit(train_X, train_y)
stop = time.perf_counter()
print(f"Training time: {stop - start}s")

# Per-candidate cross-validation results as a DataFrame for later inspection.
cv_results = pd.DataFrame(model_cv.cv_results_)

# Best mean cross-validated score (R^2, per the search's `scoring`) and the
# parameter combination that achieved it.
print('Initial score: ', model_cv.best_score_)
print('Initial parameters: ', model_cv.best_params_)

# Estimator with the best parameters, as exposed by the fitted search.
bestClassTree = model_cv.best_estimator_
print(bestClassTree)

# Predictions of the fitted search on the training and held-out sets.
y_pred_train = model_cv.predict(train_X)
y_pred = model_cv.predict(test_X)


# TRAINED TREE VISUALIZATION
bestClassTree1 = model_cv.best_estimator_
print(bestClassTree1)

fig = plt.figure(figsize=(25, 20))
# Draw the fitted tree onto the current figure.
# - feature_names is passed as a plain list because some scikit-learn
#   releases validate this parameter and reject a pandas Index.
# - class_names is omitted: it only applies to classifiers, and this
#   estimator is a DecisionTreeRegressor.
a = tree.plot_tree(decision_tree=bestClassTree1,
                   feature_names=list(train_X.columns),
                   filled=True)
# FEATURE IMPORTANCE PLOT
# (Dedented to module level: the statements were indented without any
# enclosing block, which is an IndentationError in a script.)
importance = bestClassTree.feature_importances_

# Pair each training column with its importance, most important first.
feature_imp = pd.DataFrame(list(zip(train_X.columns, importance)),
                           columns=['Feature', 'Importance'])
feature_imp = feature_imp.sort_values('Importance', ascending=False).reset_index(drop=True)

# Only plot features with a non-zero importance.
feature_imp_nonzero = feature_imp[feature_imp['Importance'] != 0.0]

plt.figure(figsize=(10, 5), dpi=100)
plt.barh(feature_imp_nonzero['Feature'], feature_imp_nonzero['Importance'],
         color=['orange', 'blue'])

plt.xlabel("Importance")
plt.ylabel("Input Feature")
plt.title("Feature Importance")