In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Location of the cleaned car dataset. Set the CAR_DATASET_PATH environment
# variable to run this notebook on another machine; when it is unset the
# original local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "CAR_DATASET_PATH",
    r'D:\PYTON PROGRAMMING\PYTHON FILES\Machine Learning Projects\CAR PRICE PREDICTION\cleaned_car_dataset.csv',
)
df = pd.read_csv(DATA_PATH)

In [4]:
# List the column names of the loaded frame before selecting model inputs.
df.columns

Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'year', 'Category',
       'Leather interior', 'Engine volume', 'Mileage', 'Cylinders', 'Doors',
       'Wheel', 'Color', 'Airbags', 'Engine Type', 'Gear box type_Automatic',
       'Gear box type_Manual', 'Gear box type_Tiptronic',
       'Gear box type_Variator', 'Drive wheels_FWD', 'Drive wheels_RWD',
       'Fuel_Diesel', 'Fuel_Hybrid', 'Fuel_Hydrogen', 'Fuel_LPG',
       'Fuel_Petrol', 'Fuel_Plug-in Hybrid'],
      dtype='object')

In [5]:
# Columns deliberately left out of the modelling frame.
excluded_cols = {'ID', 'Levy', 'Manufacturer', 'Model', 'Doors', 'Category', 'Color'}

# Keep every other column in its original order; 'Price' (the target) stays
# in here and is separated from the features at split time.
req_col = [col for col in df.columns if col not in excluded_cols]
req_col

['Price',
 'year',
 'Leather interior',
 'Engine volume',
 'Mileage',
 'Cylinders',
 'Wheel',
 'Airbags',
 'Engine Type',
 'Gear box type_Automatic',
 'Gear box type_Manual',
 'Gear box type_Tiptronic',
 'Gear box type_Variator',
 'Drive wheels_FWD',
 'Drive wheels_RWD',
 'Fuel_Diesel',
 'Fuel_Hybrid',
 'Fuel_Hydrogen',
 'Fuel_LPG',
 'Fuel_Petrol',
 'Fuel_Plug-in Hybrid']

In [6]:
# Work on an independent copy restricted to the selected columns, so later
# steps never operate on a slice of the originally loaded frame.
copy_df = df.loc[:, req_col].copy()

In [7]:
# Separate the predictors from the target, then hold out 20% of the rows for
# evaluation. The fixed random_state keeps the split reproducible.
# No model is fitted in this cell.
features = copy_df.drop(columns='Price')
target = copy_df['Price']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Base estimator whose hyperparameters are tuned below (seeded for repeatability).
dt = DecisionTreeRegressor(random_state=42)

# Candidate values per hyperparameter; the search tries every combination
# (5 * 4 * 4 * 3 = 240 settings).
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    max_features=[None, 'sqrt', 'log2'],
)

# 5-fold cross-validated grid search. The scorer is the *negated* MSE, so the
# reported best score is <= 0 and the value closest to zero is the winner.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters: {'max_depth': 15, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best Score: -177834536.57459965


In [9]:
# Take the best estimator found by the grid search and score it on the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# The metric functions are imported in the notebook's top import cell.
# MAE and RMSE are expressed in the same units as Price; R² is unitless.
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))


MAE: 5963.717596927663
RMSE: 10792.525418206393
R² Score: 0.5909599550514478


In [10]:
# Score the same model on the rows it was trained on; comparing this with the
# test RMSE shows how much the tree overfits.
y_train_pred = best_model.predict(x_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

print("Train RMSE:", train_rmse)


Train RMSE: 9865.548739097889


In [11]:
# Summary statistics of the target, to put the size of the reported errors in context.
copy_df.loc[:, 'Price'].describe()

count     19237.000000
mean      19409.015621
std       19039.694583
min        1000.000000
25%        9408.000000
50%       15053.000000
75%       23521.000000
max      872946.000000
Name: Price, dtype: float64