In [8]:
import pandas as pd

In [9]:
# Load the dataset CSV into a DataFrame; the path is resolved relative to the
# current working directory.
df = pd.read_csv(
    filepath_or_buffer="2023-06-25_data_cleaned_enhanced_data_no_outliers.csv",
)

In [11]:
from sklearn.model_selection import train_test_split

# Remove the non-informative columns before modelling.
data = df.drop(["Unnamed: 0", "Unnamed: 0.1", "url_id"], axis="columns")

# The "Celková cena" column is the target; every remaining column is a feature.
y = data["Celková cena"]
X = data.drop("Celková cena", axis="columns")

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display the shapes of the two feature splits.
X_train.shape, X_test.shape


((12748, 157), (3188, 157))

In [17]:
import xgboost as xgb

# Initialize the XGBoost regressor with a squared-error objective.
# `random_state` is the scikit-learn-style seed argument of XGBRegressor and
# replaces the legacy `seed` keyword.
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Fit the model without the "coords" column. A later cell drops "coords" from
# X_train / X_test before predicting, so training on it here would leave the
# model with a feature set that no longer matches X_test when the notebook is
# run top to bottom. errors="ignore" keeps this line working when the column
# has already been removed.
model.fit(X_train.drop(columns=["coords"], errors="ignore"), y_train)


In [14]:
# Drop the "coords" column from both splits.
# errors="ignore" makes the cell safe to re-run: without it, a second
# execution raises KeyError because "coords" has already been removed.
X_train = X_train.drop(columns=["coords"], errors="ignore")
X_test = X_test.drop(columns=["coords"], errors="ignore")

# Check the shape of the updated datasets
X_train.shape, X_test.shape


((12748, 156), (3188, 156))

In [18]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Generate predictions for the held-out test rows.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error between the true and
# predicted target values.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse


771546.2963755457

In [20]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned hyperparameter (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.1, 0.3],
    min_child_weight=[1, 3, 5],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# negative mean squared error; n_jobs=-1 requests all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    n_jobs=-1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [22]:
# Best hyperparameter combination found by the grid search above.
best_parameters = grid_search.best_params_
print(best_parameters)


{'learning_rate': 0.3, 'max_depth': 6, 'min_child_weight': 3}


In [21]:
from sklearn.model_selection import RandomizedSearchCV

# Value pools the random search samples from.
param_dist = dict(
    max_depth=range(3, 10),
    learning_rate=[0.01, 0.05, 0.1, 0.3, 0.5],
    min_child_weight=range(1, 6),
)

# Sample 10 parameter settings, each evaluated with 3-fold cross-validation
# and scored by negative mean squared error; random_state fixes which
# settings are drawn.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split.
random_search.fit(X_train, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [23]:
# Wider grid than the first search: 8 depths x 8 learning rates x 6 child
# weights = 384 combinations.
expanded_param_grid = dict(
    max_depth=range(3, 11),
    learning_rate=[0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5],
    min_child_weight=range(1, 7),
)

# Exhaustive 3-fold cross-validated search over the expanded grid, scored by
# negative mean squared error.
expanded_grid_search = GridSearchCV(
    estimator=model,
    param_grid=expanded_param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    n_jobs=-1,
)

# Run the search on the training split.
expanded_grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 384 candidates, totalling 1152 fits


In [24]:
# Get the best parameters found by the expanded grid search.
# This cell previously read `grid_search.best_params_`, which only repeats the
# result of the first, smaller grid; re-run the cell to refresh its output.
best_parameters = expanded_grid_search.best_params_

print(best_parameters)


{'learning_rate': 0.3, 'max_depth': 6, 'min_child_weight': 3}
