# Decision Tree Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

### Loading the dataset

In [12]:
# Location of the zipped CSV holding the dataset with encoded location columns.
filepath = '../data/dataset_with_encoded_location.zip'

# pandas decompresses the archive on the fly, so no manual unzip step is needed.
df = pd.read_csv(
    filepath_or_buffer=filepath,
    compression='zip',
)

# Preview the first five rows as the cell output.
df.head()

Unnamed: 0,bath,balcony,price,House_size,new_total_sqft,L1,L2,L3,L4,L5,...,L7,L8,L9,L10,L11,L12,L13,L14,L15,L16
0,2,3,62,3,1440,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1,95,3,1521,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,51,2,1200,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,63,3,1310,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,2,70,3,1800,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Splitting the data into Features and Target

In [13]:
# The target is the `price` column; every other column of `df` is kept as a feature.
target_column = 'price'
y = df[target_column]
X = df.drop(columns=[target_column])

X

Unnamed: 0,bath,balcony,House_size,new_total_sqft,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15,L16
0,2,3,3,1440,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3,1,3,1521,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,1,2,1200,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,1,3,1310,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,2,3,1800,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6991,2,2,2,1050,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6992,2,2,2,1262,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6993,2,1,3,1345,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6994,3,3,3,1715,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


### Splitting the data into training and testing sets

In [14]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Tuning the Hyper-Parameters in the DecisionTreeRegressor

In [15]:
# Hyper-parameter grid explored exhaustively by the search (4 * 3 * 3 = 36 candidates).
param_grid = {
    'max_depth': [3, 5, 7, 9],           # maximum depth of the tree
    'min_samples_split': [2, 5, 10],     # minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]        # minimum number of samples required to be at a leaf node
}

# Seed the estimator. scikit-learn permutes the features at every split, so
# without a fixed random_state equally good splits can be resolved differently
# from run to run, and the selected parameters / reported metrics would not be
# reproducible after a kernel restart.
model = DecisionTreeRegressor(random_state=42)

# 5-fold cross-validated grid search. No `scoring` is given, so candidates are
# ranked by the estimator's own score method (R^2 for regressors).
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

# Parameter combination with the best mean cross-validated score.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# With the default refit=True this estimator has been refit on all of X_train
# using the best parameters.
best_model = grid_search.best_estimator_

# Predict once and reuse the result for every metric.
y_pred = best_model.predict(X_test)

# Evaluate the best model on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')


Best Parameters: {'max_depth': 9, 'min_samples_leaf': 4, 'min_samples_split': 10}
Mean Squared Error: 33.32166297030955
Root Mean Squared Error: 5.772491920332982
R-squared: 0.9594230015882613


### Conclusion:

Mean Squared Error (MSE): The MSE value of approximately 33.32 is the average squared difference between the actual and predicted values on the test set. Because the errors are squared, it is expressed in squared units of the target variable and penalizes large errors more heavily than small ones.

Root Mean Squared Error (RMSE): The RMSE value of approximately 5.77 indicates that the model's predictions typically deviate from the actual values by about 5.77 units. Since RMSE is in the same units as the target variable, it provides a more interpretable measure of the magnitude of error than the MSE; note that it weights large errors more heavily than a plain average of absolute errors would.

R-squared (R²): The R² value of approximately 0.959 indicates that the model explains approximately 95.9% of the variance in the target variable on the held-out test set. This high R² value suggests that the model captures most of the variability in the data.