In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
TARGET = "price"

# Load the raw table (replace the path as needed), append the natural log of
# the target as `log_price`, then drop the raw target together with the
# id/date/zipcode columns so none of them reach the feature matrix.
df = (
    pd.read_csv("train_cdc.csv")
    .assign(log_price=lambda frame: np.log(frame[TARGET]))
    .drop(columns=[TARGET, "id", "date", "zipcode"])
)


In [None]:
# Feature columns grouped by how they are preprocessed.
#
# "number" selects every numeric dtype rather than only int64/float64, so
# columns stored as e.g. int32 or float32 are not silently left out of the
# model. The target is filtered out by name, which (unlike list.remove) does
# not raise if the target column was not selected in the first place.
num_cols = [
    col
    for col in df.select_dtypes(include="number").columns
    if col != "log_price"
]

# Treat both plain object columns and pandas categoricals as categorical.
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)

Numerical columns: ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
Categorical columns: []


In [None]:
# Separate the log-price target from the model inputs.
y = df["log_price"].to_numpy()
X = df.drop(columns="log_price")

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Numeric columns are standardised; categorical columns are one-hot encoded
# with handle_unknown="ignore" so categories not seen during fit do not raise
# at transform time.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Fit the preprocessing on the training rows only, then reuse the fitted
# transformers on the validation rows so no validation statistics leak in.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# Number of columns after preprocessing = input width of the network.
input_dim = X_train_proc.shape[1]

In [None]:
from scipy.sparse import issparse

class TabularDataset(Dataset):
    """In-memory dataset pairing a feature matrix with a target vector.

    Both inputs are converted to float32 tensors once, at construction time.
    A SciPy sparse feature matrix is densified first.

    Args:
        X: feature matrix (dense array-like or SciPy sparse matrix).
        y: targets, one per row of ``X``.
    """

    def __init__(self, X, y):
        # Sparse matrices cannot be handed to torch.tensor directly.
        dense_features = X.toarray() if issparse(X) else X

        self.X = torch.tensor(dense_features, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples, taken from the target tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [None]:
BATCH_SIZE = 64

train_ds = TabularDataset(X_train_proc, y_train)
val_ds = TabularDataset(X_val_proc, y_val)

# The network uses BatchNorm1d, which cannot compute batch statistics from a
# single sample in training mode. If the training set size leaves exactly one
# row over, drop that trailing batch; otherwise keep every row.
drop_single_row_tail = len(train_ds) % BATCH_SIZE == 1

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=drop_single_row_tail,
)
# Validation runs in eval mode, so batch size and order do not affect results.
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class TabularFCNet(nn.Module):
    """Fully connected regression network for tabular features.

    Every hidden stage is ``Linear -> BatchNorm1d -> ReLU -> Dropout`` and the
    stack ends in a single-unit linear head. With the default arguments the
    layer order, and therefore the ``state_dict`` keys, match the original
    fixed 512 -> 256 -> 1 architecture.

    Args:
        input_dim: number of input features per row.
        hidden_dims: width of each hidden stage, in order.
        dropout: dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, hidden_dims=(512, 256), dropout=0.3):
        super().__init__()

        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            in_features = width

        # Regression head: one output value per row.
        layers.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to a ``(batch,)`` prediction vector."""
        # Drop the trailing size-1 dimension so predictions line up with 1-D targets.
        return self.net(x).squeeze(1)

In [None]:
# Seed PyTorch's global RNG before anything draws from it. Weight
# initialisation, dropout masks and the shuffled batch order all depend on it,
# so without a seed every run trains a different model even though the
# train/validation split is fixed. Re-running this cell and then the training
# cell restarts from the same state.
# NOTE: some GPU kernels may still be non-deterministic; this pins the seed only.
SEED = 42
torch.manual_seed(SEED)

model = TabularFCNet(input_dim).to(device)
criterion = nn.MSELoss()  # mean squared error on the log-price target
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
EPOCHS = 200

# Validation metrics fluctuate a lot from epoch to epoch, so remember the best
# epoch (lowest validation RMSE) instead of trusting whatever the last one was.
best_rmse = float("inf")
best_epoch = 0
best_state = None
best_val_preds = None

for epoch in range(EPOCHS):
    # ---------- Training ----------
    model.train()
    train_losses = []   # per-batch mean losses
    batch_sizes = []    # rows per batch, used to weight the epoch average

    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(Xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        batch_sizes.append(yb.size(0))

    # Each entry is a batch mean, so weight by batch size; otherwise a short
    # final batch counts as much as a full one. This figure is measured in
    # train mode (dropout active, batch statistics in BatchNorm), so it is not
    # directly comparable with the eval-mode validation error below.
    if batch_sizes:
        train_mse = np.average(train_losses, weights=batch_sizes)
    else:
        train_mse = float("nan")

    # ---------- Validation ----------
    model.eval()
    val_preds, val_true = [], []

    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            preds = model(Xb)

            val_preds.append(preds.cpu().numpy())
            val_true.append(yb.cpu().numpy())

    val_preds = np.concatenate(val_preds)
    val_true  = np.concatenate(val_true)

    rmse = np.sqrt(mean_squared_error(val_true, val_preds))
    r2   = r2_score(val_true, val_preds)

    if rmse < best_rmse:
        best_rmse = rmse
        best_epoch = epoch + 1
        # Clone the tensors: state_dict() returns references to the live
        # parameters, which later optimizer steps would otherwise overwrite.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        best_val_preds = val_preds

    print(
        f"Epoch [{epoch+1}/{EPOCHS}] | "
        f"Train MSE: {train_mse:.4f} | "
        f"Val RMSE (log): {rmse:.4f} | "
        f"Val R²: {r2:.4f}"
    )

# Leave `model` and `val_preds` describing the best epoch rather than the last
# one, so any later evaluation refers to the weights that are actually kept.
# The epoch was chosen on this same validation set, so the resulting metrics
# are optimistic; a separate test set is needed for an unbiased estimate.
if best_state is not None:
    model.load_state_dict(best_state)
    val_preds = best_val_preds
    print(
        f"Restored best weights from epoch {best_epoch} "
        f"(Val RMSE (log): {best_rmse:.4f})"
    )

Epoch [1/200] | Train MSE: 35.5511 | Val RMSE (log): 0.7284 | Val R²: -0.9228
Epoch [2/200] | Train MSE: 1.4627 | Val RMSE (log): 0.6469 | Val R²: -0.5163
Epoch [3/200] | Train MSE: 1.3132 | Val RMSE (log): 0.6242 | Val R²: -0.4119
Epoch [4/200] | Train MSE: 1.1663 | Val RMSE (log): 0.6051 | Val R²: -0.3266
Epoch [5/200] | Train MSE: 1.1501 | Val RMSE (log): 0.5385 | Val R²: -0.0509
Epoch [6/200] | Train MSE: 1.0993 | Val RMSE (log): 0.4644 | Val R²: 0.2185
Epoch [7/200] | Train MSE: 1.0448 | Val RMSE (log): 0.5081 | Val R²: 0.0644
Epoch [8/200] | Train MSE: 0.9765 | Val RMSE (log): 0.4685 | Val R²: 0.2047
Epoch [9/200] | Train MSE: 0.9533 | Val RMSE (log): 0.5399 | Val R²: -0.0561
Epoch [10/200] | Train MSE: 0.9499 | Val RMSE (log): 0.5255 | Val R²: -0.0006
Epoch [11/200] | Train MSE: 0.9227 | Val RMSE (log): 0.5189 | Val R²: 0.0244
Epoch [12/200] | Train MSE: 0.8941 | Val RMSE (log): 0.3877 | Val R²: 0.4552
Epoch [13/200] | Train MSE: 0.8711 | Val RMSE (log): 0.4424 | Val R²: 0.2907


In [None]:
# The model was trained on log(price); exponentiate to report metrics in the
# original price scale.
val_price_pred = np.exp(val_preds)
val_price_true = np.exp(val_true)

r2_price = r2_score(val_price_true, val_price_pred)
mse_price = mean_squared_error(val_price_true, val_price_pred)
rmse_price = np.sqrt(mse_price)

print("Final RMSE (price):", rmse_price)
print("Final R² (price):", r2_price)

Final RMSE (price): 129204.18370935207
Final R² (price): 0.8669701814651489
