## Importing the libraries

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [63]:
dataset = pd.read_csv("ANZ_customers_salary.csv")
X = dataset.iloc[:, [2, 4]].values
y = dataset.iloc[:, 3].values

## Encoding categorical variable

In [64]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse=False), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

## Splitting the dataset into training set and test set

In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Decision Tree Regression model on the training set

In [66]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth = 15, random_state = 0)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=15, random_state=0)

## Predicting the test set results

In [67]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1928.97  983.36]
 [2718.51 3517.61]
 [ 866.23 1630.36]
 [3903.73  862.95]
 [3195.01 2218.73]
 [2687.85 1225.33]
 [ 761.33 3026.95]
 [3649.51 1541.12]
 [3308.61 1952.29]
 [1099.7  1640.83]
 [4863.62 1396.23]
 [3676.94 2485.37]
 [ 866.23 2588.01]
 [ 725.32 2308.67]
 [ 970.47 3785.78]
 [4216.04 2101.51]
 [4863.62 1916.51]
 [1272.47 1436.98]
 [1013.67 2538.68]
 [1757.81 3231.26]]


## Applying K-fold Cross Validation

In [68]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(regressor, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: -152.83 %
Standard Deviation: 184.05 %


## Applying Grid Search to find the best parameters

In [69]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(regressor,
                  param_grid = {'max_depth': range(1, 11),
                                'min_samples_split': range(10, 60, 10)},
                  cv=5,
                  n_jobs=1,
                  scoring='neg_mean_squared_error')

gs.fit(X_train, y_train)

best_accuracy = gs.best_score_
best_parameters = gs.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: -224746796.57 %
Best Parameters: {'max_depth': 1, 'min_samples_split': 10}
