In [37]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv(r'../cleaned/output_encoded.csv')
df.head()

Unnamed: 0,fuel_type,mileage [km],model,name,price [pln],seller_type,year,engine_capacity [cm3],horsepower [km],is_gearbox_automatic,region
0,178162.801039,62000,101613.193302,85505.357299,63000,90852.118289,2019,1368,145,0,80689.855237
1,178162.801039,56800,101613.193302,133323.671059,99000,90852.118289,2018,1368,165,0,128266.815789
2,178162.801039,59000,101613.193302,85505.357299,71000,90852.118289,2018,1368,165,0,215862.601274
3,178162.801039,80089,101613.193302,85505.357299,59500,90852.118289,2016,1368,164,0,128266.815789
4,178162.801039,33000,101613.193302,85505.357299,70000,265742.16686,2021,1368,145,0,104437.423222


In [29]:
y = df['price [pln]']
X = df.drop(columns=['price [pln]'])
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
tree_model = DecisionTreeRegressor(random_state = 42)
tree_model.fit(X_train, y_train)

In [40]:
tree_model.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 42,
 'splitter': 'best'}

In [41]:
tree_model.get_depth()

21

In [42]:
tree_model.get_n_leaves()

np.int64(498)

In [43]:
path = tree_model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

models = []
for ccp_alpha in ccp_alphas:
    reg = DecisionTreeRegressor(random_state=42, ccp_alpha=ccp_alpha)
    reg.fit(X_train, y_train)
    models.append(reg)

rmse_values = [np.sqrt(mean_squared_error(y_test, model.predict(X_test))) for model in models]

best_alpha_idx = np.argmin(rmse_values)
best_model = models[best_alpha_idx]
best_alpha = ccp_alphas[best_alpha_idx]

print(best_alpha)
print(rmse_values[best_alpha_idx])

35370555.103148766
103118.44634057104


In [44]:
final_tree_model = DecisionTreeRegressor(random_state = 42, ccp_alpha=best_alpha)
final_tree_model.fit(X_train, y_train)

In [45]:
final_tree_model.get_n_leaves()

np.int64(30)

In [47]:
final_tree_model.get_depth()

8

In [48]:
final_tree_model.score(X_test, y_test)

0.8685078733955598