In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
data_date = "2023-06-25"

In [3]:
data = pd.read_csv(data_date + "_data_cleaned_location_enhanced.csv")
data.head()

Unnamed: 0,url_id,Celková cena,Podlaží,Užitná plocha,Sklep,Parkování,Výtah,Latitude,Longitude,Terasa,...,Obecná kriminalita,Hospodářská kriminalita,Loupeže,Vloupání do bytů a rodinných domů,Znásilnění,Vraždy,Rodinné domy (Kč za m2),Byty (Kč za m2),Teoretická cena,Total_foreigners
0,722035788,6832000.0,2.0,48.0,1,1,1,50.049548,14.462156,0,...,35056,3040,255,984,153,24,96498,100727,4834896.0,345307
1,2418832972,6361000.0,1.0,53.0,0,1,0,50.296096,16.359198,0,...,4955,565,26,142,53,9,37856,41035,2174855.0,38806
2,3930260556,2214000.0,5.0,35.0,1,1,1,50.723614,15.189542,0,...,6551,687,67,183,56,2,39631,39361,1377635.0,42622
3,3756315724,5643000.0,4.0,68.0,1,0,1,50.051077,14.298411,0,...,35056,3040,255,984,153,24,96498,100727,6849436.0,345307
4,3955312204,5390000.0,4.0,38.0,1,0,1,50.049265,14.43826,0,...,35056,3040,255,984,153,24,96498,100727,3827626.0,345307


In [6]:
# Drop the target variable and unnecessary columns
X = data.drop(['Celková cena', 'Latitude', 'Longitude', 'Region'], axis=1)

# 'Celková cena' is our target variable
y = data['Celková cena']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Create a Linear Regression model
lr = LinearRegression()

# Train the model
lr.fit(X_train, y_train)

# Make predictions
lr_predictions = lr.predict(X_test)

# Calculate MAE
lr_mae = mean_absolute_error(y_test, lr_predictions)

lr_mae

654873.9596592018

In [11]:
# Create a Decision Tree Regressor model
dt = DecisionTreeRegressor(random_state=42)

# Train the model
dt.fit(X_train, y_train)

# Make predictions
dt_predictions = dt.predict(X_test)


# Calculate MAE
dt_mae = mean_absolute_error(y_test, dt_predictions)

dt_mae


711322.6466836735

In [12]:
# Create a Random Forest Regressor model
rf = RandomForestRegressor(random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
rf_predictions = rf.predict(X_test)

# Calculate MAE
rf_mae = mean_absolute_error(y_test, rf_predictions)

rf_mae


525816.3523596938

In [13]:
# Create a Gradient Boosting Regressor model
gb = GradientBoostingRegressor(random_state=42)

# Train the model
gb.fit(X_train, y_train)

# Make predictions
gb_predictions = gb.predict(X_test)

# Calculate MAE
gb_mae = mean_absolute_error(y_test, gb_predictions)

gb_mae


588408.5373018135

In [14]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a base model
rf = RandomForestRegressor(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

best_params


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  14.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  14.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  14.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  12.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  12.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  27.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  27.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  28.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  11.1s
[CV] END max_dep

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

In [16]:
# Create a Random Forest Regressor model with the best parameters
rf_best = RandomForestRegressor(
    max_depth=None, 
    min_samples_leaf=1, 
    min_samples_split=2, 
    n_estimators=200, 
    random_state=42
)

# Train the model
rf_best.fit(X_train, y_train)

# Make predictions
rf_best_predictions = rf_best.predict(X_test)

# Calculate MAE
rf_best_mae = mean_absolute_error(y_test, rf_best_predictions)

rf_best_mae


524522.2632653061