In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np


In [20]:
data = pd.read_csv("test_2023-08-13_data_cleaned_distances_no_outliers.csv")
data.head()

Unnamed: 0,url_id,Celková cena,Podlaží,Užitná plocha,Parkování,Garáž,Latitude,Longitude,Terasa,Balkón,...,Distance_to_Olomouc,Distance_to_Pardubice,Distance_to_Plzeň,Distance_to_Ústí_nad_Labem,Distance_to_Zlín,Distance_to_Moravskoslezský,Distance_to_Prague_North,Distance_to_Prague_West,Distance_to_Prague_South,Distance_to_Prague_East
0,2059674700,2857000.0,1.0,47,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
1,2765366348,2163000.0,3.0,29,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
2,3099862092,1956000.0,3.0,26,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
3,886318156,2100000.0,2.0,27,1,1,50.203105,14.279582,1,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
4,1892951116,2569000.0,3.0,37,1,1,50.203105,14.279582,1,1,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746


In [21]:
# Drop the target variable and unnecessary columns
X = data.drop(['Celková cena', 'Latitude', 'Longitude'], axis=1)

# 'Celková cena' is our target variable
y = data['Celková cena']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
# Create a Linear Regression model
lr = LinearRegression()

# Train the model
lr.fit(X_train, y_train)

# Make predictions
lr_predictions = lr.predict(X_test)

# Calculate RMSE
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_predictions))

lr_rmse


1313684.2312443806

In [23]:
# Create a Decision Tree Regressor model
dt = DecisionTreeRegressor(random_state=42)

# Train the model
dt.fit(X_train, y_train)

# Make predictions
dt_predictions = dt.predict(X_test)

# Calculate RMSE
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_predictions))

dt_rmse


1439003.8963174832

In [24]:
# Create a Random Forest Regressor model
rf = RandomForestRegressor(random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
rf_predictions = rf.predict(X_test)

# Calculate RMSE
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))

rf_rmse


1036455.9829323374

In [25]:
# Create a Gradient Boosting Regressor model
gb = GradientBoostingRegressor(random_state=42)

# Train the model
gb.fit(X_train, y_train)

# Make predictions
gb_predictions = gb.predict(X_test)

# Calculate RMSE
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_predictions))

gb_rmse


1109256.4449931649

In [10]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a base model
rf = RandomForestRegressor(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

best_params


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  14.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  14.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  14.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  12.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  12.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  27.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  27.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  28.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  11.7s
[CV] END max_dep

KeyboardInterrupt: 

In [None]:
# Create a Random Forest Regressor model with the best parameters
rf_best = RandomForestRegressor(
    max_depth=None, 
    min_samples_leaf=2, 
    min_samples_split=2, 
    n_estimators=100, 
    random_state=42
)

# Train the model
rf_best.fit(X_train, y_train)

# Make predictions
rf_best_predictions = rf_best.predict(X_test)

# Calculate RMSE
rf_best_rmse = np.sqrt(mean_squared_error(y_test, rf_best_predictions))

rf_best_rmse


913091.4868734049