In [118]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

from xgboost import XGBRegressor

In [119]:
# Read the processed dataset from the repository-relative data directory.
df = pd.read_csv(filepath_or_buffer='../../../data/processed/full_data.csv')

In [120]:
# Some rows carry the literal string 'eléctrico' in the `year` column, which
# cannot be converted to a number; discard those rows first.
# `.copy()` yields an independent frame so the column assignment below does not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
df = df[df['year'] != 'eléctrico'].copy()

# Cast through float before int, as the original did — presumably because the
# remaining values are float-formatted and a direct int cast would fail
# (TODO confirm against the raw CSV).
df['year'] = df['year'].astype(float).astype(int)

In [121]:
# Inspect column dtypes to confirm `year` is now an integer column.
df.dtypes

car_type       object
year            int64
price         float64
km            float64
prediccion     object
color          object
dtype: object

In [122]:
# Display the cleaned frame (pandas truncates the rendering to head and tail).
df

Unnamed: 0,car_type,year,price,km,prediccion,color
0,BMW - SERIE 1,2021,19490.0,85977.0,neutral,tan
1,AUDI - A1,2017,16490.0,48836.0,neutral,red
2,BMW - Z4,2022,40790.0,54486.0,neutral,blue
3,BMW - SERIE 1,2020,21590.0,61215.0,neutral,grey
4,BMW - SERIE 1,2022,23990.0,93881.0,neutral,grey
...,...,...,...,...,...,...
2698,CUPRA - LEON,2025,34305.0,0.0,bueno,grey
2699,CUPRA - LEON,2025,28885.0,0.0,bueno,black
2700,CUPRA - FORMENTOR,2025,49400.0,3500.0,neutral,grey
2701,CUPRA - LEON,2025,44600.0,0.0,bueno,black


In [123]:
# Give each categorical column its own LabelEncoder. Refitting a single encoder
# on the second column would discard the first column's classes, making it
# impossible to map colour codes back to labels later.
color_encoder = LabelEncoder()
car_type_encoder = LabelEncoder()
df['car_color_encoded'] = color_encoder.fit_transform(df['color'])
df['car_type_encoded'] = car_type_encoder.fit_transform(df['car_type'])

# Keep the original name bound to an encoder fitted on `car_type`, which is the
# state `LE` was left in previously.
LE = car_type_encoder

# One-hot encode `prediccion`. The prefix 'estado_' combined with pandas'
# default '_' separator produces the 'estado__<value>' column names.
estados = pd.get_dummies(df['prediccion'], prefix='estado_')
df = pd.concat([df, estados], axis=1)

df

Unnamed: 0,car_type,year,price,km,prediccion,color,car_color_encoded,car_type_encoded,estado__bueno,estado__malo,estado__neutral
0,BMW - SERIE 1,2021,19490.0,85977.0,neutral,tan,12,13,False,False,True
1,AUDI - A1,2017,16490.0,48836.0,neutral,red,10,0,False,False,True
2,BMW - Z4,2022,40790.0,54486.0,neutral,blue,2,26,False,False,True
3,BMW - SERIE 1,2020,21590.0,61215.0,neutral,grey,6,13,False,False,True
4,BMW - SERIE 1,2022,23990.0,93881.0,neutral,grey,6,13,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
2698,CUPRA - LEON,2025,34305.0,0.0,bueno,grey,6,30,True,False,False
2699,CUPRA - LEON,2025,28885.0,0.0,bueno,black,1,30,True,False,False
2700,CUPRA - FORMENTOR,2025,49400.0,3500.0,neutral,grey,6,29,False,False,True
2701,CUPRA - LEON,2025,44600.0,0.0,bueno,black,1,30,True,False,False


In [124]:
# The raw categorical columns are no longer needed once their encoded
# counterparts are present; keep only the numeric/boolean columns.
df = df.drop(columns=['car_type', 'color', 'prediccion'])
df

Unnamed: 0,year,price,km,car_color_encoded,car_type_encoded,estado__bueno,estado__malo,estado__neutral
0,2021,19490.0,85977.0,12,13,False,False,True
1,2017,16490.0,48836.0,10,0,False,False,True
2,2022,40790.0,54486.0,2,26,False,False,True
3,2020,21590.0,61215.0,6,13,False,False,True
4,2022,23990.0,93881.0,6,13,False,False,True
...,...,...,...,...,...,...,...,...
2698,2025,34305.0,0.0,6,30,True,False,False
2699,2025,28885.0,0.0,1,30,True,False,False
2700,2025,49400.0,3500.0,6,29,False,False,True
2701,2025,44600.0,0.0,1,30,True,False,False


In [125]:
# Shuffle every row with a fixed seed and renumber the index from zero.
df_shuffled = (
    df
    .sample(frac=1, random_state=689)
    .reset_index(drop=True)
)

In [126]:
# Target is the `price` column; features are all remaining columns.
y = df_shuffled['price']
X = df_shuffled.drop('price', axis=1)

In [127]:
# Hold out 20% of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=689,
)

In [128]:
# Display the training target to sanity-check its values and length.
y_train

344       5000.0
1225     20500.0
2353    109000.0
2668     26290.0
1354     24990.0
          ...   
941      40000.0
374      19000.0
2213     24399.0
514      42990.0
1528     10499.0
Name: price, Length: 2161, dtype: float64

In [129]:
# Base random-forest regressor; the seed fixes its internal randomness.
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter candidates explored for the random forest.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    criterion=['squared_error', 'poisson'],
)

In [130]:
# Exhaustive 3-fold cross-validated search over `param_grid_rf`, scored with
# negated RMSE and run on all available cores.
grid_search_rf = GridSearchCV(
    rf,
    param_grid_rf,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# The scorer is negated RMSE, so the sign is flipped when reporting.
print("Best params RF:", grid_search_rf.best_params_)
print("Best RMSE RF:", -grid_search_rf.best_score_)

Best params RF: {'criterion': 'poisson', 'max_depth': None, 'n_estimators': 50}
Best RMSE RF: 10638.413355031667


# XGBoost

In [131]:
# Base XGBoost regressor with a squared-error objective and a fixed seed.
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyper-parameter candidates explored for XGBoost.
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

In [132]:
# Exhaustive 3-fold cross-validated search over `param_grid_xgb`, scored with
# negated RMSE and run on all available cores.
grid_search_xgb = GridSearchCV(
    xgb_model,
    param_grid_xgb,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# The scorer is negated RMSE, so the sign is flipped when reporting.
print("Best params XGB:", grid_search_xgb.best_params_)
print("Best RMSE XGB:", -grid_search_xgb.best_score_)

Best params XGB: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Best RMSE XGB: 10031.009090491852
