In [1]:
import pandas as pd
import numpy as np

In [40]:
import joblib as jb

In [2]:
import lightgbm as lgb

In [3]:
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.metrics import mean_squared_error, mean_absolute_error # for calculating the cost function

In [13]:
from sklearn.model_selection import GridSearchCV

In [4]:
# Parquet file that the next cell loads as the modelling dataset.
# NOTE(review): absolute, machine-specific Windows path; the notebook only runs
# unchanged on a machine with this exact directory layout.
path = (
    r"D:\0_Respaldo\0_Proyectos_2024\ML_proyects\KagelX\KaggelX_Challenge"
    r"\data\transform_data\transform_data.parquet"
)

In [5]:
# Read the parquet file at `path` into a DataFrame.
data = pd.read_parquet(path=path)

In [7]:
# Work on an independent (deep) copy so the loaded frame `data` stays untouched.
df = data.copy(deep=True)

In [8]:
# Separate the regression target from the predictors.
y = df['price']                 # target column
x = df.drop(columns=['price'])  # features: every column except the target

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=28,
)

In [38]:
# Hyperparameters for the LightGBM regressor, collected in one mapping so they
# can be read and edited in a single place. `lambda_l2` and `n_estimators`
# equal the best values printed by the grid search further down the notebook.
lgbm_params = {
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 10,
    'min_data_in_leaf': 100,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.6,
    'lambda_l2': 0.5,
    'n_estimators': 100,
}
model = lgb.LGBMRegressor(**lgbm_params)

# Fit on the training split only; the test split is kept for evaluation.
model.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001790 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 903
[LightGBM] [Info] Number of data points in the train set: 43418, number of used features: 10
[LightGBM] [Info] Start training from score 39128.271017


#### Hyperparameter tuning

In [12]:
# Full search space: 10 parameters with 3 candidate values each, i.e.
# 3**10 = 59,049 combinations. The GridSearchCV cell in this notebook is given
# the smaller `param_grid_1` instead of this grid.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[31, 50, 100],
    max_depth=[-1, 10, 20],
    min_data_in_leaf=[20, 50, 100],
    feature_fraction=[0.6, 0.8, 1.0],
    bagging_fraction=[0.6, 0.8, 1.0],
    bagging_freq=[0, 5, 10],
    lambda_l1=[0, 0.1, 0.5],
    lambda_l2=[0, 0.1, 0.5],
    n_estimators=[100, 200, 500],
)

In [34]:
# Reduced search space: L1/L2 regularisation strengths and the number of
# boosting rounds only (3 * 3 * 3 = 27 candidates).
param_grid_1 = dict(
    lambda_l1=[0, 0.1, 0.5],
    lambda_l2=[0, 0.1, 0.5],
    n_estimators=[100, 200, 500],
)

In [35]:
# Exhaustive search over `param_grid_1`, starting from the settings of `model`.
# Candidates are compared by negated MSE (higher is better) under 3-fold
# cross-validation; n_jobs=-1 asks for all available processors.
grid_search = GridSearchCV(
    model,
    param_grid_1,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

In [36]:
# Run the cross-validated search on the training split only.
grid_search.fit(X=x_train, y=y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 903
[LightGBM] [Info] Number of data points in the train set: 43418, number of used features: 10
[LightGBM] [Info] Start training from score 39128.271017


In [37]:
# Report the parameter combination with the best mean cross-validation score.
print(f"Best hiperparameters {grid_search.best_params_}")

Best hiperparameters {'lambda_l1': 0, 'lambda_l2': 0.5, 'n_estimators': 100}


In [39]:
# Score the fitted `model` on the held-out test split.
y_pred = model.predict(x_test)

# Root mean squared error, kept to three decimal places.
mse = mean_squared_error(y_test, y_pred)
rmse = float(f"{np.sqrt(mse):.3f}")
print("\nRMSE: ", rmse)

# Mean absolute error, printed at full precision.
mae = mean_absolute_error(y_test, y_pred)
print("\nMAE: ", mae)


RMSE:  68893.543

MAE:  16895.24062552243


#### Export model

In [42]:
# Persist the fitted `model` with joblib; the call returns the list of files written.
# NOTE(review): absolute, machine-specific Windows path, and the pickle is written
# into the same folder as the input data (data\transform_data).
jb.dump(
    value=model,
    filename=r"D:\0_Respaldo\0_Proyectos_2024\ML_proyects\KagelX\KaggelX_Challenge\data\transform_data\ml.pkl",
)

['D:\\0_Respaldo\\0_Proyectos_2024\\ML_proyects\\KagelX\\KaggelX_Challenge\\data\\transform_data\\ml.pkl']