In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
pd.set_option('display.max_rows', None)

In [69]:
data = pd.read_csv("../processed_data/data.csv", index_col=0)

# Assuming the real data is similar in structure
# Data preprocessing
X = data.drop(columns=['price', 'id', 'source', 'coordinates',
                       'latitude', 'longitude'
                      ])  # Assuming 'id' and 'source' exist in the real dataset
y = data['price']

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initializing models
linear_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42)
boosting_model = XGBRegressor(random_state=42)

# Training the models
linear_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)
boosting_model.fit(X_train_scaled, y_train)

# Making predictions
linear_preds = linear_model.predict(X_test_scaled)
rf_preds = rf_model.predict(X_test_scaled)
boosting_preds = boosting_model.predict(X_test_scaled)

# Evaluating the models
linear_rmse = mean_squared_error(y_test, linear_preds, squared=False)
rf_rmse = mean_squared_error(y_test, rf_preds, squared=False)
boosting_rmse = mean_squared_error(y_test, boosting_preds, squared=False)

linear_r2 = r2_score(y_test, linear_preds)
rf_r2 = r2_score(y_test, rf_preds)
boosting_r2 = r2_score(y_test, boosting_preds)

(linear_rmse, rf_rmse, boosting_rmse), (linear_r2, rf_r2, boosting_r2)


((18073.398124542942, 2113.186480458743, 25737.419988921207),
 (-174.53123356608208, -1.3996639164922269, -354.96294655741013))

In [8]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
import numpy as np

# Assuming X_train_scaled and y_train are your training data and labels

# Convert the DataFrame to PyTorch tensors
X_tensor = torch.tensor(X_train_scaled.astype(np.float32))
y_tensor = torch.tensor(y_train.values.astype(np.float32))

# Creating a dataset and dataloader for batch processing
dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define a simple feedforward neural network for regression
class RealEstatePricePredictor(nn.Module):

    def __init__(self):
        super(RealEstatePricePredictor, self).__init__()
        self.fc1 = nn.Linear(X_train_scaled.shape[1], 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, 256)
        self.fc5 = nn.Linear(256, 256)
        self.fc6 = nn.Linear(256, 256)
        self.fc7 = nn.Linear(256, 256)
        self.fc8 = nn.Linear(256, 64)
        self.fc9 = nn.Linear(64, 32)
        self.fc10 = nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = F.relu(self.fc8(x))
        x = F.relu(self.fc9(x))
        x = self.fc10(x)  # No activation function as it's a regression problem
        return x

# Initialize the model
model = RealEstatePricePredictor()

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Model training
num_epochs = 1000  # This can be adjusted
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1, 1))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# The model is now trained. You can use it to make predictions on your test set.
# Note: In a real scenario, you would also want to include validation logic.


Epoch [1/1000], Loss: 145154.4844
Epoch [2/1000], Loss: 664010.5000
Epoch [3/1000], Loss: 444363.8125
Epoch [4/1000], Loss: 947882.1875
Epoch [5/1000], Loss: 8658767.0000
Epoch [6/1000], Loss: 775801.6250
Epoch [7/1000], Loss: 1625240.8750
Epoch [8/1000], Loss: 19467932.0000
Epoch [9/1000], Loss: 533303.3750
Epoch [10/1000], Loss: 11713.7549
Epoch [11/1000], Loss: 533840.3750
Epoch [12/1000], Loss: 1123494.3750
Epoch [13/1000], Loss: 5481773.0000
Epoch [14/1000], Loss: 4045249.2500
Epoch [15/1000], Loss: 8193923.0000
Epoch [16/1000], Loss: 6217691.0000
Epoch [17/1000], Loss: 15887507.0000
Epoch [18/1000], Loss: 10298054.0000
Epoch [19/1000], Loss: 41385336.0000
Epoch [20/1000], Loss: 2981432.0000
Epoch [21/1000], Loss: 362983.3125
Epoch [22/1000], Loss: 732913.7500
Epoch [23/1000], Loss: 229295.7812
Epoch [24/1000], Loss: 125499.2891
Epoch [25/1000], Loss: 275351.5312
Epoch [26/1000], Loss: 184446.8906
Epoch [27/1000], Loss: 68129.0312
Epoch [28/1000], Loss: 888071.1250
Epoch [29/1000]

Epoch [233/1000], Loss: 50994.7031
Epoch [234/1000], Loss: 528192.1875
Epoch [235/1000], Loss: 225623.8438
Epoch [236/1000], Loss: 90039.7734
Epoch [237/1000], Loss: 214813.6875
Epoch [238/1000], Loss: 2414129.2500
Epoch [239/1000], Loss: 934.8276
Epoch [240/1000], Loss: 53056.3320
Epoch [241/1000], Loss: 721331.7500
Epoch [242/1000], Loss: 58862.2109
Epoch [243/1000], Loss: 99688.1953
Epoch [244/1000], Loss: 10779.6709
Epoch [245/1000], Loss: 22756.1934
Epoch [246/1000], Loss: 1019045.8750
Epoch [247/1000], Loss: 1711.4967
Epoch [248/1000], Loss: 602388.5625
Epoch [249/1000], Loss: 4414.3633
Epoch [250/1000], Loss: 72545.2656
Epoch [251/1000], Loss: 340487.9062
Epoch [252/1000], Loss: 323818.2500
Epoch [253/1000], Loss: 5101.1538
Epoch [254/1000], Loss: 31657.0625
Epoch [255/1000], Loss: 2099213.0000
Epoch [256/1000], Loss: 588553.6250
Epoch [257/1000], Loss: 40703.1133
Epoch [258/1000], Loss: 687268.5000
Epoch [259/1000], Loss: 151143.7344
Epoch [260/1000], Loss: 42267.2852
Epoch [26

Epoch [464/1000], Loss: 8393686.0000
Epoch [465/1000], Loss: 492518.5000
Epoch [466/1000], Loss: 940052.3750
Epoch [467/1000], Loss: 541061.5000
Epoch [468/1000], Loss: 3084235.7500
Epoch [469/1000], Loss: 124249.3672
Epoch [470/1000], Loss: 112672.3281
Epoch [471/1000], Loss: 22169.2910
Epoch [472/1000], Loss: 736850.3750
Epoch [473/1000], Loss: 469372.8438
Epoch [474/1000], Loss: 177083.1562
Epoch [475/1000], Loss: 787005.0625
Epoch [476/1000], Loss: 1266222.7500
Epoch [477/1000], Loss: 38078.6719
Epoch [478/1000], Loss: 35493.9258
Epoch [479/1000], Loss: 1075742.2500
Epoch [480/1000], Loss: 672621.4375
Epoch [481/1000], Loss: 4012146.0000
Epoch [482/1000], Loss: 94785.9609
Epoch [483/1000], Loss: 332339.1562
Epoch [484/1000], Loss: 11591.1299
Epoch [485/1000], Loss: 419565.5000
Epoch [486/1000], Loss: 612041.5000
Epoch [487/1000], Loss: 364868.5000
Epoch [488/1000], Loss: 4615138.5000
Epoch [489/1000], Loss: 170328.9062
Epoch [490/1000], Loss: 66069.1406
Epoch [491/1000], Loss: 1770

Epoch [697/1000], Loss: 187123.4062
Epoch [698/1000], Loss: 284695.9688
Epoch [699/1000], Loss: 104329.3125
Epoch [700/1000], Loss: 3094814.5000
Epoch [701/1000], Loss: 1812940.0000
Epoch [702/1000], Loss: 3247630.0000
Epoch [703/1000], Loss: 42090.5898
Epoch [704/1000], Loss: 355237.2500
Epoch [705/1000], Loss: 89874.3594
Epoch [706/1000], Loss: 4544476.5000
Epoch [707/1000], Loss: 582666.8750
Epoch [708/1000], Loss: 2312560.2500
Epoch [709/1000], Loss: 19525.3203
Epoch [710/1000], Loss: 414402.7500
Epoch [711/1000], Loss: 910551.6250
Epoch [712/1000], Loss: 794718.2500
Epoch [713/1000], Loss: 706898.1250
Epoch [714/1000], Loss: 252965.6562
Epoch [715/1000], Loss: 61163.4727
Epoch [716/1000], Loss: 97969.2266
Epoch [717/1000], Loss: 237931.2656
Epoch [718/1000], Loss: 3962909.5000
Epoch [719/1000], Loss: 21973.5742
Epoch [720/1000], Loss: 370675.5625
Epoch [721/1000], Loss: 417795.5938
Epoch [722/1000], Loss: 8738.5977
Epoch [723/1000], Loss: 255274.6094
Epoch [724/1000], Loss: 21897.

Epoch [930/1000], Loss: 85177.1562
Epoch [931/1000], Loss: 83459.7656
Epoch [932/1000], Loss: 93168.0234
Epoch [933/1000], Loss: 108210.4766
Epoch [934/1000], Loss: 251710.5625
Epoch [935/1000], Loss: 112008.5312
Epoch [936/1000], Loss: 32058.2422
Epoch [937/1000], Loss: 488315.7188
Epoch [938/1000], Loss: 545332.1250
Epoch [939/1000], Loss: 7873.3560
Epoch [940/1000], Loss: 714.6224
Epoch [941/1000], Loss: 5220.7329
Epoch [942/1000], Loss: 509.8379
Epoch [943/1000], Loss: 216911.3438
Epoch [944/1000], Loss: 25669.8906
Epoch [945/1000], Loss: 30019.1152
Epoch [946/1000], Loss: 45572.6602
Epoch [947/1000], Loss: 27252.6406
Epoch [948/1000], Loss: 17961.7910
Epoch [949/1000], Loss: 81190.0781
Epoch [950/1000], Loss: 72013.4766
Epoch [951/1000], Loss: 17603.2520
Epoch [952/1000], Loss: 155452.5000
Epoch [953/1000], Loss: 1031.6392
Epoch [954/1000], Loss: 5330.4258
Epoch [955/1000], Loss: 579.5327
Epoch [956/1000], Loss: 115379.3047
Epoch [957/1000], Loss: 252.6516
Epoch [958/1000], Loss: 

In [9]:
# Assuming X_test_scaled and y_test are your test data and labels

# Convert the test data to PyTorch tensors
X_test_tensor = torch.tensor(X_test_scaled.astype(np.float32))
y_test_tensor = torch.tensor(y_test.values.astype(np.float32))

# Creating a DataLoader for the test data
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32)  # You can adjust the batch size

# Switch the model to evaluation mode
model.eval()

# Initialize a list to store predictions and actual values
test_predictions = []
test_actuals = []

with torch.no_grad():  # Disable gradient computation
    for inputs, targets in test_loader:
        outputs = model(inputs)
        test_predictions.extend(outputs.view(-1).tolist())
        test_actuals.extend(targets.tolist())

# Convert predictions and actuals to numpy arrays for easy handling
test_predictions = np.array(test_predictions)
test_actuals = np.array(test_actuals)

# Calculate Mean Squared Error
test_mse = np.mean((test_predictions - test_actuals) ** 2)
print(f"Test MSE: {test_mse}")

Test MSE: 1958017.0696471846


# Boosting + GridSearch

In [12]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1
)

# Fit GridSearchCV (Assuming X_train_scaled and y_train are already defined)
# Replace 'X_train_scaled' and 'y_train' with your training data variables
grid_search.fit(X_train_scaled, y_train)

# Best parameters
best_params = grid_search.best_params_
best_params


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200; total time=   0.2s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200; total time=   0.2s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n

[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.2s
[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=200; total time=   0.2s
[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=200; total time=   0.2s
[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=200; total time=   0.3s
[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=200; total time=   0.2s
[CV] END gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=200; total time=   0.3s
[CV] END gamma=0, learning_r

[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.1s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.1s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=300; total time=   0.3s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=300; total time=   0.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=300; total time=   0.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=300; total time=   0.2s
[CV] END gamma=0, learning_rate=0.1, 

[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=200; total time=   0.5s
[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=200; total time=   0.5s
[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=200; total time=   0.5s
[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=300; total time=   0.7s
[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=300; total time=   0.7s
[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=300; total time=   0.8s
[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=300; total time=   0.6s
[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=300; total time=   0.7s
[CV] END gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=3, n_estimators=100; total time=   0.2s
[CV] END gamma=0, learning_rate=0.1, 

[CV] END gamma=0, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=300; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=300; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=300; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=300; total time=   0.3s
[CV] END gamma=0, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=300; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.2, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, 

[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=3, n_estimators=300; total time=   0.6s
[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=3, n_estimators=300; total time=   0.5s
[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=3, n_estimators=300; total time=   0.5s
[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=5, n_estimators=100; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=5, n_estimators=100; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=5, n_estimators=100; total time=   0.2s
[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0, learning_rate=0.2, max_depth=7, min_child_weight=5, n_estimators=200; total time=   0.3s
[CV] END gamma=0, learning_rate=0.2, 

[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=300; total time=   0.5s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=300; total time=   0.5s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=300; total time=   0.3s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=200; total time=   0.3s
[CV] END g

[CV] END gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=300; total time=   0.6s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=300; total time=   0.5s
[CV] END gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=300; total time=   0.6s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=200; total time=   0.1s
[CV] END gamma=0

[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=300; total time=   0.4s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=300; total time=   0.3s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=200; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=200; total time=   0.2s
[CV] END gamma=0.1,

[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=100; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=3, min_child_weight=3, n_estimators=200; total time=   0.1s
[CV] END gamma=0.1,

[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=100; total time=   0.4s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=100; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=100; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=100; total time=   0.3s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=200; total time=   0.5s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=200; total time=   0.4s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=200; total time=   0.4s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=200; total time=   0.3s
[CV] END gamma=0.1, learning_rate=0.2, max_depth=7, min_child_weight=1, n_estimators=200; total time=   0.3s
[CV] END gamma=0.1,

[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100; total time=   0.1s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=200; total time=   0.1s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=200; total time=   0.1s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=200; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=200; total time=   0.1s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=200; total time=   0.1s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=300; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=300; total time=   0.2s
[CV] END g

[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=200; total time=   0.5s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=200; total time=   0.5s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=200; total time=   0.6s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=200; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=200; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=300; total time=   0.6s
[CV] END gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=300; total time=   0.6s
[CV] END g

[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=300; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=300; total time=   0.4s
[CV] END gamma=0.2,

[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200; total time=   0.5s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=300; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=300; total time=   0.6s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=300; total time=   0.6s
[CV] END gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=300; total time=   0.5s
[CV] END gamma=0.2,

[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=300; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=300; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=300; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=300; total time=   0.3s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=300; total time=   0.3s
[CV] END gamma=0.2,

{'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 7,
 'min_child_weight': 5,
 'n_estimators': 100}

In [19]:
boosting = XGBRegressor(**best_params)
boosting.fit(X_train_scaled, y_train)
boosting.score(X_test_scaled, y_test)

-3.4895712620850325

In [48]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Extensive grid search parameters for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [False]
}

# Initialize GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=3
)

# Fit GridSearchCV (Assuming X_train_scaled and y_train are already defined)
# Replace 'X_train_scaled' and 'y_train' with your training data variables
grid_search_rf.fit(X_train_scaled, y_train)

# Best parameters
best_params_rf = grid_search_rf.best_params_
best_params_rf



Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV 1/3] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-2607319.289 total time=   1.8s
[CV 2/3] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-500177593.377 total time=   0.5s
[CV 3/3] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-4350763.986 total time=   0.5s
[CV 1/3] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=-2479225.531 total time=   1.0s
[CV 2/3] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=-500173406.093 total time=   1.0s
[CV 3/3] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=-4359991.030 total time=   1.2s
[CV 1/3] END bootstrap=False, max_depth=None, min_samples_leaf=1, min

[CV 3/3] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=-64157761.188 total time=   0.4s
[CV 1/3] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=-32891319.607 total time=   0.8s
[CV 2/3] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=-500239116.237 total time=   0.7s
[CV 3/3] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=-64157549.429 total time=   0.8s
[CV 1/3] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=300;, score=-32890648.615 total time=   1.2s
[CV 2/3] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=300;, score=-500240665.884 total time=   1.1s
[CV 3/3] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=300;, score=-64157153.778 to

[CV 3/3] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=-112334111.748 total time=   0.7s
[CV 1/3] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=-1834119.130 total time=   1.0s
[CV 2/3] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=-500029332.197 total time=   1.0s
[CV 3/3] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=-112332432.698 total time=   1.0s
[CV 1/3] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=-33018901.935 total time=   0.3s
[CV 2/3] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=-500040552.249 total time=   0.4s
[CV 3/3] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=-112372913.606 total time=   

[CV 1/3] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=-33528683.600 total time=   0.5s
[CV 2/3] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=-500122340.563 total time=   0.5s
[CV 3/3] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=-112487830.295 total time=   0.5s
[CV 1/3] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=-33540607.820 total time=   1.0s
[CV 2/3] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=-500121317.781 total time=   0.9s
[CV 3/3] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=-112485896.690 total time=   0.9s
[CV 1/3] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=300;, score=-33536804.187 total time=   

[CV 1/3] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=-32891319.607 total time=   0.8s
[CV 2/3] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=-500239466.233 total time=   0.7s
[CV 3/3] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=-64144219.869 total time=   0.8s
[CV 1/3] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=300;, score=-32890648.615 total time=   1.2s
[CV 2/3] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=300;, score=-500240060.196 total time=   1.1s
[CV 3/3] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=300;, score=-64143831.390 total time=   1.2s
[CV 1/3] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100;, score=-32850028.674 total time=   0

[CV 2/3] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=-499176989.036 total time=   1.3s
[CV 3/3] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=-112573810.293 total time=   1.4s
[CV 1/3] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=100;, score=-33239781.028 total time=   0.4s
[CV 2/3] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=100;, score=-499176345.424 total time=   0.4s
[CV 3/3] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=100;, score=-112534989.827 total time=   0.4s
[CV 1/3] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=-33239493.070 total time=   0.9s
[CV 2/3] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=-499176700.268 total ti

{'bootstrap': False,
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

In [54]:
print(best_params_rf)
"{'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}"

{'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


"{'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}"

In [55]:
best_random_forest = RandomForestRegressor(**best_params_rf,  n_jobs = -1)

In [56]:
best_random_forest.fit(X_train_scaled, y_train)

In [57]:
d = abs(best_random_forest.predict(X_test_scaled) - y_test)
sum(d) / len(d)

1025.4414807604182

On average, it fails to predict the correct price for 741 dollars

In [70]:
df = pd.DataFrame(zip(X.columns, best_random_forest.feature_importances_))
df.sort_values(by = 1, ascending = False)

Unnamed: 0,0,1
30,davtashen_cemetery,0.2135148
43,avan,0.05321378
41,gergin_subway,0.04520917
40,gorcarain_subway,0.04278841
39,sasunci_subway,0.04085038
4,bathroom_count,0.03994187
13,building_type_Monolit,0.03990418
29,central_cemetery,0.03725767
28,nork_cemetery,0.03722436
44,massiv,0.03461025
