## Exploring forecasting various home price metric with pytorch
see: 
- https://github.com/jdb78/pytorch-forecasting
- https://pytorch-forecasting.readthedocs.io/en/latest/tutorials/stallion.html 


In [65]:
# imports for training
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

# import dataset, network to train and metric to optimize
from pytorch_forecasting import (
    Baseline,
    TimeSeriesDataSet,
    TemporalFusionTransformer,
    QuantileLoss,
)
from pytorch_forecasting.data import TimeSeriesDataSet, GroupNormalizer, NaNLabelEncoder
from capston_db_conn import db_conn
import torch

conn = db_conn()
# print(conn)

# standard imports
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")  # avoid printing out absolute paths

In [66]:
rf_sql = """
select
*
from redfin_county_full
"""
# NOTE: add filters on SQL query once determined they are needed

redfin = pd.read_sql(rf_sql, con=conn, parse_dates=["period_start", "period_end"])
# conn.close()


In [67]:
redfin = redfin.dropna(subset=["median_sale_price"])
redfin.head()


Unnamed: 0,county_fips,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,...,price_drops,price_drops_mom,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region,parent_metro_region_metro_code,last_updated,county
0,42129,2015-02-01,2015-02-28,30,county,5,2425,f,"Westmoreland County, PA",,...,0.066093,0.007854,0.000304,0.053571,0.053571,0.053571,"Pittsburgh, PA",38300.0,2022-01-09 14:29:56,Westmoreland County
1,51057,2012-08-01,2012-08-31,30,county,5,2964,f,"Essex County, VA",,...,,,,0.0,0.0,0.0,Virginia nonmetropolitan area,,2022-01-09 14:29:56,Essex County
2,21203,2021-01-01,2021-01-31,30,county,5,1211,f,"Rockcastle County, KY",,...,,,,0.0,0.0,0.0,Kentucky nonmetropolitan area,,2022-01-09 14:29:56,Rockcastle County
3,34039,2021-08-01,2021-08-31,30,county,5,1910,f,"Union County, NJ",,...,0.111111,-0.04798,-0.146032,0.0,-0.142857,-0.166667,"Newark, NJ",35084.0,2022-01-09 14:29:56,Union County
4,37063,2019-11-01,2019-11-30,30,county,5,2038,f,"Durham County, NC",,...,0.189723,-0.091415,-0.018972,0.128571,-0.117805,-0.077778,"Durham, NC",20500.0,2022-01-09 14:29:56,Durham County


In [68]:
cols = [
    "region",
    #"county_fips",
    "period_end",
    "property_type",
    #"property_type_id",
    "median_sale_price",
    "median_list_price",
    "median_ppsf",
    "median_list_ppsf",
    "homes_sold",
    "pending_sales",
    "new_listings",
    "inventory",
    "months_of_supply",
    "median_dom",
    "avg_sale_to_list",
    "sold_above_list",
    #"price_drops",
    "off_market_in_two_weeks",
]

In [69]:
# load data: this is pandas dataframe with at least a column for
# * the target (what you want to predict)
# * the timeseries ID (which should be a unique string to identify each timeseries)
# * the time of the observation (which should be a monotonically increasing integer)
data = redfin[cols]

# add time index
data["time_idx"] = data["period_end"].dt.year * 12 + data["period_end"].dt.month
data["time_idx"] -= data["time_idx"].min()
data["month"] = data["period_end"].dt.month.astype(str).astype("category")

In [70]:
data = data.dropna()
data.head()

Unnamed: 0,region,period_end,property_type,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,months_of_supply,median_dom,avg_sale_to_list,sold_above_list,off_market_in_two_weeks,time_idx,month
1,"Essex County, VA",2012-08-31,All Residential,104750.0,159925.0,74.496753,102.758612,4.0,1.0,10.0,51.0,12.8,152.0,0.926464,0.0,0.0,7,8
2,"Rockcastle County, KY",2021-01-31,Single Family Residential,123500.0,216950.0,93.489583,101.938339,6.0,5.0,4.0,28.0,4.7,51.0,0.948879,0.0,0.0,108,1
3,"Union County, NJ",2021-08-31,Condo/Co-op,347500.0,449949.5,267.456359,291.364705,6.0,8.0,14.0,45.0,7.5,45.0,0.989805,0.166667,0.0,115,8
4,"Durham County, NC",2019-11-30,Townhouse,255500.0,267635.0,148.613518,147.895603,89.0,70.0,63.0,253.0,2.8,47.0,0.995308,0.292135,0.128571,94,11
5,"Duval County, FL",2017-12-31,Condo/Co-op,132000.0,189500.0,105.970149,116.0,137.0,88.0,117.0,341.0,2.5,37.0,0.964373,0.109489,0.227273,71,12


In [71]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457071 entries, 1 to 563120
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   region                   457071 non-null  object        
 1   period_end               457071 non-null  datetime64[ns]
 2   property_type            457071 non-null  object        
 3   median_sale_price        457071 non-null  float64       
 4   median_list_price        457071 non-null  float64       
 5   median_ppsf              457071 non-null  float64       
 6   median_list_ppsf         457071 non-null  float64       
 7   homes_sold               457071 non-null  float64       
 8   pending_sales            457071 non-null  float64       
 9   new_listings             457071 non-null  float64       
 10  inventory                457071 non-null  float64       
 11  months_of_supply         457071 non-null  float64       
 12  median_dom      

In [85]:
# define the dataset, i.e. add metadata to pandas dataframe for the model to understand it
max_encoder_length = 36
max_prediction_length = 12
training_cutoff = data["time_idx"].max() - max_prediction_length  # day for cutoff

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",  # column name of time of observation
    target="median_sale_price",  # column name of target to predict
    group_ids=["region", "property_type"],  # column name(s) for timeseries IDs
    max_encoder_length=max_encoder_length,  # how much history to use
    max_prediction_length=max_prediction_length,  # how far to predict into future
    # covariates static for a timeseries ID
    static_categoricals=["region","property_type"],
    static_reals=[],
    # covariates known and unknown in the future to inform prediction
    time_varying_known_categoricals=[],
    time_varying_known_reals=[], # this is where tax rate would go
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "median_sale_price",
        "median_list_price",
        "median_ppsf",
        "median_list_ppsf",
        "homes_sold",
        "pending_sales",
        "new_listings",
        "inventory",
        "months_of_supply",
        "median_dom",
        "avg_sale_to_list",
        "sold_above_list",
        "off_market_in_two_weeks",
    ],
    allow_missing_timesteps=True,
    categorical_encoders={"region":NaNLabelEncoder(add_nan=True)}
)
    # target_normalizer=GroupNormalizer(
    #     groups=["agency", "sku"], transformation="softplus"
    # ),  # use softplus and normalize by group
    # add_relative_time_idx=True,
    # add_target_scales=True,
    # add_encoder_length=True,
# )


AssertionError: Time difference between steps has been idenfied as larger than 1 - set allow_missing_timesteps=True

In [82]:
training.get_parameters() 

{'time_idx': 'time_idx',
 'target': 'median_sale_price',
 'group_ids': ['region', 'property_type'],
 'weight': None,
 'max_encoder_length': 36,
 'min_encoder_length': 36,
 'min_prediction_idx': 0,
 'min_prediction_length': 12,
 'max_prediction_length': 12,
 'static_categoricals': ['region', 'property_type'],
 'static_reals': [],
 'time_varying_known_categoricals': [],
 'time_varying_known_reals': [],
 'time_varying_unknown_categoricals': [],
 'time_varying_unknown_reals': ['median_sale_price',
  'median_list_price',
  'median_ppsf',
  'median_list_ppsf',
  'homes_sold',
  'pending_sales',
  'new_listings',
  'inventory',
  'months_of_supply',
  'median_dom',
  'avg_sale_to_list',
  'sold_above_list',
  'off_market_in_two_weeks'],
 'variable_groups': {},
 'constant_fill_strategy': {},
 'allow_missing_timesteps': True,
 'lags': {},
 'add_relative_time_idx': False,
 'add_target_scales': False,
 'add_encoder_length': False,
 'target_normalizer': EncoderNormalizer(transformation='log'),
 'c

In [83]:
# create validation dataset using the same normalization techniques as for the training dataset
validation = TimeSeriesDataSet.from_dataset(
    training,
    data,
    predict=True,
    min_prediction_idx=training.index.time.max() + 1,
    stop_randomization=True,
)

In [84]:
# convert datasets to dataloaders for training
batch_size = 128
train_dataloader = training.to_dataloader(
    train=True, batch_size=batch_size, num_workers=2
)
val_dataloader = validation.to_dataloader(
    train=False, batch_size=batch_size, num_workers=2
)

In [None]:
# calculate baseline mean absolute error, i.e. predict next value as the last available value from the history
actuals = torch.cat([y for x, (y, weight) in iter(val_dataloader)])
baseline_predictions = Baseline().predict(val_dataloader)
(actuals - baseline_predictions).abs().mean().item()



53901.91015625

In [None]:
# create PyTorch Lighning Trainer with early stopping
early_stop_callback = EarlyStopping(
    monitor="val_loss", min_delta=1e-4, patience=1, verbose=False, mode="min"
)
lr_logger = LearningRateMonitor()
trainer = pl.Trainer(
    max_epochs=100,
    gpus=0,  # run on CPU, if on multiple GPUs, use accelerator="ddp"
    gradient_clip_val=0.1,
    limit_train_batches=30,  # 30 batches per epoch
    callbacks=[lr_logger, early_stop_callback],
    logger=TensorBoardLogger("lightning_logs"),
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# define network to train - the architecture is mostly inferred from the dataset,
# so that only a few hyperparameters have to be set by the user
tft = TemporalFusionTransformer.from_dataset(
    # dataset
    training,
    # architecture hyperparameters
    hidden_size=32,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=16,
    # loss metric to optimize
    loss=QuantileLoss(),
    # logging frequency
    log_interval=2,
    # optimizer parameters
    learning_rate=0.03,
    reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

Number of parameters in network: 136.0k


In [None]:
# find the optimal learning rate
res = trainer.tuner.lr_find(
    "county_fips",
    tft, 
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    early_stop_threshold=1000.0,
    max_lr=0.3,
)
# and plot the result - always visually confirm that the suggested learning rate makes sense
print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(show=True, suggest=True)
fig.show()


TypeError: lr_find() got multiple values for argument 'train_dataloaders'

In [None]:
# fit the model on the data - redefine the model with the correct learning rate if necessary
trainer.fit(
    tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader,
)


IndexError: list index out of range