<a href="https://colab.research.google.com/github/WinC3/SDSS-Datathon-2025/blob/main/SDSS2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install torch torchvision torchaudio pytorch-tabnet scikit-learn pandas numpy

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)

In [30]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from pytorch_tabnet.tab_model import TabNetRegressor
from geopy.distance import geodesic

In [6]:
from google.colab import files
uploaded = files.upload()

Saving pruned-estate-data.csv to pruned-estate-data.csv


In [53]:
# Load dataset (Upload or use Google Drive)
df = pd.read_csv("pruned-estate-data.csv")  # Replace with actual dataset

# Drop columns that contain 'N/A' (NaN) in any row
df.dropna(axis=1, how='any', inplace=True)

# Drop ID column (not useful for ML)
df.drop(columns=["id_"], inplace=True, errors='ignore')

# Convert "DEN" from "Yes"/"No" to 1/0
if "DEN" in df.columns:
    df["DEN"] = df["DEN"].map({"Yes": 1, "No": 0})

# Convert "size" from "0-499 sqft" format to separate min and max columns
def extract_size_range(size_str):
    if isinstance(size_str, str) and "-" in size_str:
        size_range = size_str.split(" sqft")[0].split("-")  # Remove "sqft" and split range
        return int(size_range[0]), int(size_range[1])
    return np.nan, np.nan  # Handle unexpected formats

if "size" in df.columns:
    df["size_min"], df["size_max"] = zip(*df["size"].apply(extract_size_range))
    df.drop(columns=["size"], inplace=True)  # Drop original 'size' column

# Drop rows where size couldn't be converted (if any)
df.dropna(subset=["size_min", "size_max"], inplace=True)

# One-hot encode categorical features (exposure & ward)
if "exposure" in df.columns:
    df = pd.get_dummies(df, columns=["exposure"], prefix="exposure")
if "ward" in df.columns:
    df = pd.get_dummies(df, columns=["ward"], drop_first=True)
if "parking" in df.columns:
    df = pd.get_dummies(df, columns=["parking"], drop_first=True)

# Define features and target
X = df.drop(columns=["price"]).values  # Features
y = df["price"].values  # Target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle Missing Values in X_train and X_test
imputer = SimpleImputer(strategy="median")  # Use median for robustness
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Normalize y_train and y_test (TARGET VARIABLE - PRICE)
y_mean = y_train.mean()
y_std = y_train.std()
y_train = (y_train - y_mean) / y_std  # Normalize target variable
y_test = (y_test - y_mean) / y_std  # Use same mean & std from training set


In [54]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [50]:
# Define a stable feedforward neural network
class RealEstateNN(nn.Module):
    def __init__(self, input_size):
        super(RealEstateNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.LeakyReLU(negative_slope=0.01)  # Avoid dead neurons

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Initialize model
input_size = X_train.shape[1]
model = RealEstateNN(input_size)

In [55]:
# Improved model
class RealEstateNN(nn.Module):
    def __init__(self, input_size):
        super(ImprovedRealEstateNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 1)
        self.relu = nn.LeakyReLU(negative_slope=0.01)
        self.dropout = nn.Dropout(0.3)  # Dropout for regularization

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)
        return x


In [56]:
# Define loss function and optimizer
criterion = nn.L1Loss()  # Use MAE instead of SmoothL1Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Early stopping parameters
patience = 50  # Number of epochs with no improvement before stopping
best_loss = float("inf")
epochs_without_improvement = 0

# Training loop with early stopping
epochs = 700
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    loss.backward()
    optimizer.step()

    # Early stopping logic
    if loss.item() < best_loss:
        best_loss = loss.item()
        epochs_without_improvement = 0  # Reset counter
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print(f"Early stopping at epoch {epoch+1}, best loss: {best_loss:.4f}")
        break  # Stop training

    if (epoch+1) % 50 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")



Epoch [50/700], Loss: 0.0908
Epoch [100/700], Loss: 0.0881
Epoch [150/700], Loss: 0.0848
Epoch [200/700], Loss: 0.0871
Epoch [250/700], Loss: 0.0792
Early stopping at epoch 300, best loss: 0.0792


In [57]:
# Set model to evaluation mode
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    y_pred = y_pred.numpy().flatten()  # Convert tensor to NumPy array

# Convert predictions back to original price scale
y_pred_original = (y_pred * y_std) + y_mean
y_test_original = (y_test * y_std) + y_mean

# Calculate Mean Absolute Error (MAE)
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test_original, y_pred_original)
print(f"Fixed Test MAE: ${mae:,.2f}")  # Expected ~ $10K - $100K



Fixed Test MAE: $133,960.67


In [58]:
!pip install xgboost



In [74]:
from xgboost import XGBRegressor
# Train XGBoost model
model = XGBRegressor(
    n_estimators=500,  # Number of trees (increase if needed)
    learning_rate=0.05,  # Step size (reduce if unstable)
    max_depth=2,  # Depth of each tree (increase if underfitting)
    subsample=0.6,  # Use 80% of data per tree (reduces overfitting)
    colsample_bytree=0.7,  # Use 80% of features per tree
    random_state=42
)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Convert predictions back to original price scale
y_pred_original = (y_pred * y_std) + y_mean
y_test_original = (y_test * y_std) + y_mean

# Calculate MAE
mae = mean_absolute_error(y_test_original, y_pred_original)
print(f"XGBoost Test MAE: ${mae:,.2f}")

mape = np.mean(np.abs((y_test_original - y_pred_original) / y_test_original)) * 100
print(f"XGBoost Test MAPE: {mape:.2f}%")

XGBoost Test MAE: $108,179.92
XGBoost Test MAPE: 11.84%
