In [14]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [16]:
data = pd.read_csv("../processed_data/data_no_map.csv")

# Assuming the real data is similar in structure
# Data preprocessing
X = data.drop(columns=['price', 'id', 'source'])  # Assuming 'id' and 'source' exist in the real dataset
y = data['price']

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initializing models
linear_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42)
boosting_model = XGBRegressor(random_state=42)

# Training the models
linear_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)
boosting_model.fit(X_train_scaled, y_train)

# Making predictions
linear_preds = linear_model.predict(X_test_scaled)
rf_preds = rf_model.predict(X_test_scaled)
boosting_preds = boosting_model.predict(X_test_scaled)

# Evaluating the models
linear_rmse = mean_squared_error(y_test, linear_preds, squared=False)
rf_rmse = mean_squared_error(y_test, rf_preds, squared=False)
boosting_rmse = mean_squared_error(y_test, boosting_preds, squared=False)

linear_r2 = r2_score(y_test, linear_preds)
rf_r2 = r2_score(y_test, rf_preds)
boosting_r2 = r2_score(y_test, boosting_preds)

(linear_rmse, rf_rmse, boosting_rmse), (linear_r2, rf_r2, boosting_r2)


((1834.884196503025, 6949.328247357532, 933.7036687593801),
 (-0.6691320046988336, -22.94192223234815, 0.5677933002399522))

In [28]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
import numpy as np

# Assuming X_train_scaled and y_train are your training data and labels

# Convert the DataFrame to PyTorch tensors
X_tensor = torch.tensor(X_train_scaled.astype(np.float32))
y_tensor = torch.tensor(y_train.values.astype(np.float32))

# Creating a dataset and dataloader for batch processing
dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define a simple feedforward neural network for regression
class RealEstatePricePredictor(nn.Module):

    def __init__(self):
        super(RealEstatePricePredictor, self).__init__()
        self.fc1 = nn.Linear(X_train_scaled.shape[1], 256)
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, 1000)
        self.fc4 = nn.Linear(1000, 1500)
        self.fc5 = nn.Linear(1500, 1000)
        self.fc6 = nn.Linear(1000, 512)
        self.fc7 = nn.Linear(512, 256)
        self.fc8 = nn.Linear(256, 64)
        self.fc9 = nn.Linear(64, 32)
        self.fc10 = nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = F.relu(self.fc8(x))
        x = F.relu(self.fc9(x))
        x = self.fc10(x)  # No activation function as it's a regression problem
        return x

# Initialize the model
model = RealEstatePricePredictor()

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Model training
num_epochs = 1000  # This can be adjusted
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1, 1))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# The model is now trained. You can use it to make predictions on your test set.
# Note: In a real scenario, you would also want to include validation logic.


Epoch [1/1000], Loss: 6114637.5000
Epoch [2/1000], Loss: 2430870.7500
Epoch [3/1000], Loss: 4874899.5000
Epoch [4/1000], Loss: 1946789.6250
Epoch [5/1000], Loss: 142929.5469
Epoch [6/1000], Loss: 123904.4297
Epoch [7/1000], Loss: 15370.0439
Epoch [8/1000], Loss: 3469290.0000
Epoch [9/1000], Loss: 1331783.0000
Epoch [10/1000], Loss: 46300908.0000
Epoch [11/1000], Loss: 317239.1562
Epoch [12/1000], Loss: 53155404.0000
Epoch [13/1000], Loss: 71705.1641
Epoch [14/1000], Loss: 75088.9766
Epoch [15/1000], Loss: 6400798.5000
Epoch [16/1000], Loss: 3203453.2500
Epoch [17/1000], Loss: 7414937.5000
Epoch [18/1000], Loss: 1516581.7500
Epoch [19/1000], Loss: 370620.2500
Epoch [20/1000], Loss: 2254344.0000
Epoch [21/1000], Loss: 4746284.5000
Epoch [22/1000], Loss: 2811570.0000
Epoch [23/1000], Loss: 1267662.1250
Epoch [24/1000], Loss: 519696.3125
Epoch [25/1000], Loss: 5949407.0000
Epoch [26/1000], Loss: 395498.5000
Epoch [27/1000], Loss: 576034.1250
Epoch [28/1000], Loss: 8378244.5000
Epoch [29/10

Epoch [231/1000], Loss: 30690.5742
Epoch [232/1000], Loss: 54593.9297
Epoch [233/1000], Loss: 837.8311
Epoch [234/1000], Loss: 31410.4668
Epoch [235/1000], Loss: 315617.0938
Epoch [236/1000], Loss: 1062102.8750
Epoch [237/1000], Loss: 1304.9186
Epoch [238/1000], Loss: 427067.1875
Epoch [239/1000], Loss: 277178.2188
Epoch [240/1000], Loss: 220235.8906
Epoch [241/1000], Loss: 2574539.7500
Epoch [242/1000], Loss: 3226926.2500
Epoch [243/1000], Loss: 199045.8125
Epoch [244/1000], Loss: 428348.5312
Epoch [245/1000], Loss: 11373872.0000
Epoch [246/1000], Loss: 572981.3750
Epoch [247/1000], Loss: 370374.7500
Epoch [248/1000], Loss: 215655.4844
Epoch [249/1000], Loss: 49853.9180
Epoch [250/1000], Loss: 89667.4922
Epoch [251/1000], Loss: 467130.1875
Epoch [252/1000], Loss: 4948871.5000
Epoch [253/1000], Loss: 46675.5156
Epoch [254/1000], Loss: 44098368.0000
Epoch [255/1000], Loss: 775360.2500
Epoch [256/1000], Loss: 186578.0156
Epoch [257/1000], Loss: 451704.7500
Epoch [258/1000], Loss: 1194072

Epoch [460/1000], Loss: 958979.3125
Epoch [461/1000], Loss: 694900.5000
Epoch [462/1000], Loss: 627.7129
Epoch [463/1000], Loss: 115786.1797
Epoch [464/1000], Loss: 661114.6875
Epoch [465/1000], Loss: 64717.5703
Epoch [466/1000], Loss: 14653.6465
Epoch [467/1000], Loss: 812816.2500
Epoch [468/1000], Loss: 334775.8125
Epoch [469/1000], Loss: 1891626.8750
Epoch [470/1000], Loss: 328361.4688
Epoch [471/1000], Loss: 2342200.7500
Epoch [472/1000], Loss: 1756985.2500
Epoch [473/1000], Loss: 362470.5938
Epoch [474/1000], Loss: 111871.7969
Epoch [475/1000], Loss: 156414.2656
Epoch [476/1000], Loss: 395907.9375
Epoch [477/1000], Loss: 391373.0312
Epoch [478/1000], Loss: 199687.4531
Epoch [479/1000], Loss: 2619.8352
Epoch [480/1000], Loss: 211312.3750
Epoch [481/1000], Loss: 51834.0391
Epoch [482/1000], Loss: 619827.1250
Epoch [483/1000], Loss: 262082.5000
Epoch [484/1000], Loss: 512059.3750
Epoch [485/1000], Loss: 26354.2637
Epoch [486/1000], Loss: 1758157.5000
Epoch [487/1000], Loss: 120701.96

Epoch [689/1000], Loss: 125262.0781
Epoch [690/1000], Loss: 1954380.7500
Epoch [691/1000], Loss: 680062.9375
Epoch [692/1000], Loss: 389886.2188
Epoch [693/1000], Loss: 553478.6250
Epoch [694/1000], Loss: 10735932.0000
Epoch [695/1000], Loss: 20779696.0000
Epoch [696/1000], Loss: 288107.1250
Epoch [697/1000], Loss: 447227.2188
Epoch [698/1000], Loss: 4395137.0000
Epoch [699/1000], Loss: 299064.0000
Epoch [700/1000], Loss: 88008.6953
Epoch [701/1000], Loss: 1429303.7500
Epoch [702/1000], Loss: 1039.0234
Epoch [703/1000], Loss: 51863.5000
Epoch [704/1000], Loss: 20254.2383


KeyboardInterrupt: 

In [29]:
# Assuming X_test_scaled and y_test are your test data and labels

# Convert the test data to PyTorch tensors
X_test_tensor = torch.tensor(X_test_scaled.astype(np.float32))
y_test_tensor = torch.tensor(y_test.values.astype(np.float32))

# Creating a DataLoader for the test data
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32)  # You can adjust the batch size

# Switch the model to evaluation mode
model.eval()

# Initialize a list to store predictions and actual values
test_predictions = []
test_actuals = []

with torch.no_grad():  # Disable gradient computation
    for inputs, targets in test_loader:
        outputs = model(inputs)
        test_predictions.extend(outputs.view(-1).tolist())
        test_actuals.extend(targets.tolist())

# Convert predictions and actuals to numpy arrays for easy handling
test_predictions = np.array(test_predictions)
test_actuals = np.array(test_actuals)

# Calculate Mean Squared Error
test_mse = np.mean((test_predictions - test_actuals) ** 2)
print(f"Test MSE: {test_mse}")

Test MSE: 1864859.9425879607
