In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from data import ZillowDataProcessor

In [2]:
# NOTE: change to local file storage
processor = ZillowDataProcessor(
    '/mnt/c/CS431/properties_2016.csv',
    '/mnt/c/CS431/properties_2017.csv',
    '/mnt/c/CS431/train_2016_v2.csv',
    '/mnt/c/CS431/train_2017.csv'
)

processor.prepare()
processed_df = processor.get_processed_data()

📦 Loading data...
🧹 Cleaning property datasets...
🛑 Dropping 24 columns from Properties 2016 (>80.0% missing)
🛑 Dropping 24 columns from Properties 2017 (>80.0% missing)

🔗 Merging training data with property data
✅ Train 2016 shape: (90275, 20)
✅ Train 2017 shape: (77613, 20)
✂️ Filtering outliers in logerror...
✅ Data ready for modeling!


In [3]:
print(processed_df.columns)
print(processed_df.dtypes)

Index(['parcelid', 'logerror', 'transactiondate', 'bathroomcnt', 'bedroomcnt',
       'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'fips',
       'garagecarcnt', 'garagetotalsqft', 'lotsizesquarefeet',
       'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidzip', 'roomcnt', 'yearbuilt',
       'numberofstories', 'taxamount'],
      dtype='object')
parcelid                          int64
logerror                        float64
transactiondate                   int64
bathroomcnt                     float64
bedroomcnt                      float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
fips                            float64
garagecarcnt                    float64
garagetotalsqft                 float64
lotsizesquarefeet               float64
propertylandusetypeid           float64
rawcensustractandblock          float64
regionidcity                    float64
regionidcounty             

In [4]:
features = processed_df.drop(columns='logerror')
label = processed_df['logerror'].values

In [5]:
scaler = StandardScaler()
features = scaler.fit_transform(features)

Linear Regression

In [6]:
from linear_reggression import LinearModel
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
linear_model = LinearModel(features, label)

In [8]:
y_pred = linear_model.model.predict(linear_model.X_test)

In [9]:
r2 = r2_score(linear_model.y_test, y_pred)
mse = mean_squared_error(linear_model.y_test, y_pred)
print(f'R2 Score: {r2}')
print(f'Mean Squared Error: {mse}')

R2 Score: 0.007323446591417593
Mean Squared Error: 0.006734180661252891


LSTM

In [10]:
from LSTM import LSTMModel
from create_dataset import CreateDataset
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [18]:
# hyperparameters
batch_size = 64
input_size = 19
output_size = 1
hidden_size = 128
num_layers = 2
dropout_rate = 0.3
learning_rate = 0.001
seq_length = 10

In [23]:
X_train, X_temp, y_train, y_temp = train_test_split(features, label, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

train_dataset = CreateDataset(X_train, y_train, seq_length)
val_dataset = CreateDataset(X_val, y_val, seq_length)
test_dataset = CreateDataset(X_test, y_test, seq_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model = LSTMModel(input_size, hidden_size, num_layers, dropout_rate, output_size).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.00001)

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

cuda
Epoch 1/10, Loss: 0.0067
Epoch 2/10, Loss: 0.0067
Epoch 3/10, Loss: 0.0067
Epoch 4/10, Loss: 0.0067
Epoch 5/10, Loss: 0.0067
Epoch 6/10, Loss: 0.0066
Epoch 7/10, Loss: 0.0066
Epoch 8/10, Loss: 0.0066
Epoch 9/10, Loss: 0.0066
Epoch 10/10, Loss: 0.0066


In [None]:
model.eval()
preds = []
actuals = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        preds.append(model(X_batch.to(device)).cpu().numpy())
        actuals.append(y_batch.numpy())

preds = np.concatenate(preds)
actuals = np.concatenate(actuals)

preds = np.array(preds).flatten()
actuals = np.array(actuals).flatten()

mse = mean_squared_error(actuals, preds)
r2 = r2_score(actuals, preds)

print(f'R2 Score: {r2}')
print(f'Mean Squared Error: {mse}')

R2 Score: 0.0070732831954956055
Mean Squared Error: 0.006735253147780895
