In [1]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import StaticCovariatesTransformer, Scaler, InvertibleMapper
from sklearn.ensemble import ExtraTreesRegressor
from darts.models import RegressionModel
from sklearn.metrics import mean_squared_error
import math

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
new_df = pd.read_csv("data/State_Processed.csv")

In [4]:
# ------------- 0. Setup DataFrame ------------------------------------------------
# df is your DataFrame with columns: 'year','month','state', 'median_listing_price', ...others...
# Ensure month/day produce a proper date (use first of month)
df = new_df.copy()
df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str).str.zfill(2) + '-01')
# sort
df = df.sort_values(['state','date']).reset_index(drop=True)

In [5]:
df = df.drop(columns=['state_id'])

In [6]:
# ------------- 1. Create per-state TimeSeries ----------------------------------
# We'll build a dict: one grouped TimeSeries object per state (Darts supports grouping)
grouped_ts = {}
grouped_static = {}   # optional static covariates per state

for state, g in df.groupby('state'):
    # create a TimeSeries with monthly frequency
    ts = TimeSeries.from_dataframe(
    g,
    time_col="date",
    value_cols="median_listing_price",
    freq="MS",
    )
    grouped_ts[state] = ts
    
    # optionally create a small static vector for the state (if columns exist)
    # use first row of the group for static features
    # static_cols = ['Region', 'Division']  # example static columns — adapt to your df
    # static_row = g[static_cols].iloc[0].to_dict()
    # grouped_static[state] = static_row

In [7]:
# ------------- 2. Create past and/or future covariates --------------------------
# Choose covariates you consider useful and that exist per (state, month)
# Example: use active_listing_count and pending_ratio as past covariates (monthly history)
past_cov_ts = {}
future_cov_ts = {}
past_cov_cols = ['median_listing_price_mm',
    'median_listing_price_yy', 'active_listing_count',
    'active_listing_count_mm', 'active_listing_count_yy',
    'median_days_on_market', 'median_days_on_market_mm',
    'median_days_on_market_yy', 'new_listing_count', 'new_listing_count_mm',
    'new_listing_count_yy', 'price_increased_count',
    'price_increased_count_mm', 'price_increased_count_yy',
    'price_increased_share', 'price_increased_share_mm',
    'price_increased_share_yy', 'price_reduced_count',
    'price_reduced_count_mm', 'price_reduced_count_yy',
    'price_reduced_share', 'price_reduced_share_mm',
    'price_reduced_share_yy', 'pending_listing_count',
    'pending_listing_count_mm', 'pending_listing_count_yy',
    'median_listing_price_per_square_foot',
    'median_listing_price_per_square_foot_mm',
    'median_listing_price_per_square_foot_yy', 'median_square_feet',
    'median_square_feet_mm', 'median_square_feet_yy',
    'average_listing_price', 'average_listing_price_mm',
    'average_listing_price_yy', 'total_listing_count',
    'total_listing_count_mm', 'total_listing_count_yy', 'pending_ratio',
    'pending_ratio_mm', 'pending_ratio_yy']
future_cov_cols = ['month','year']  # calendar features known ahead

for state, g in df.groupby('state'):
    # Past covariates as a multivariate TimeSeries
    if all(c in g.columns for c in past_cov_cols):
        past_cov_ts[state] = TimeSeries.from_dataframe(g, time_col='date', value_cols=past_cov_cols, freq='MS')
    else:
        past_cov_ts[state] = None

    if all(c in g.columns for c in future_cov_cols):
        future_cov_ts[state] = TimeSeries.from_dataframe(g, time_col='date', value_cols=future_cov_cols, freq='MS')
    else:
        future_cov_ts[state] = None

In [8]:
# ------------- 3. Build and fit preprocessing pipeline (per-state or global) ----
# We'll use a per-state pipeline (keeps scaling per-state) or reuse one global pipeline if preferred.
pipeline_dict = {}
ts_transformed = {}

for state in grouped_ts:
    log_transformer = InvertibleMapper(np.log1p, np.expm1)   # log1p for target, invertible
    scaler = Scaler()
    pipe = Pipeline([log_transformer, scaler])
    # fit_transform expects a TimeSeries (or list); we pass the one series
    transformed = pipe.fit_transform(grouped_ts[state])
    pipeline_dict[state] = pipe
    ts_transformed[state] = transformed

In [9]:
# ------------- 4. Prepare for training: create dictionaries for Darts model -------
# Darts LightGBMModel can be fit on a list (or sequence) of series; collect them
series_list = [ts_transformed[s] for s in ts_transformed]
past_cov_list = [past_cov_ts[s] for s in ts_transformed]
future_cov_list = [future_cov_ts[s] for s in ts_transformed]

In [10]:
# ------------- 5. Train/validation split ---------------------------------------
# Example: hold out the last month for validation for each state
# We'll do a simple split: train on all but last month, validate on last month
n_predict = 1  # next month
train_series = []
val_series = []
train_pasts = []
val_pasts = []
train_futures = []
val_futures = []
test_futures = []

for s in ts_transformed:
    ts = ts_transformed[s]
    # if len(ts) < 24:
    #     # skip too-short series (optional) or handle differently
    #     continue
    train = ts[:-n_predict]
    val = ts[-n_predict:]  # last month
    train_series.append(train)
    val_series.append(val)
    # same slicing for covariates if present
    if past_cov_ts[s] is not None:
        train_pasts.append(past_cov_ts[s][:-n_predict])
        val_pasts.append(past_cov_ts[s][-n_predict:])
    else:
        train_pasts.append(None)
        val_pasts.append(None)
    if future_cov_ts[s] is not None:
        train_futures.append(future_cov_ts[s][:-n_predict])
        val_futures.append(future_cov_ts[s][-n_predict:])
        test_futures.append(future_cov_ts[s])  # for final test prediction
    else:
        train_futures.append(None)
        val_futures.append(None)

In [11]:
# # ------------- 6. Instantiate and fit LightGBMModel -----------------------------
# # Choose lags in months (e.g., last 12 months)
# lags = 12
# lags_past_covariates = list(range(-12,0))   # previous 12 months of past covariates
# lags_future_covariates = list(range(1, n_predict+1))  # months ahead (just 1)

# extra_model = RegressionModel(
#     lags=lags,
#     lags_past_covariates=lags_past_covariates,
#     lags_future_covariates=lags_future_covariates,
#     model=ExtraTreesRegressor(n_estimators=1000, random_state=42),
#     output_chunk_length=n_predict
# )

# # Fit on the training series list (global model trained across all states)
# extra_model.fit(series=train_series, past_covariates=train_pasts, future_covariates=train_futures)

In [12]:
from darts.models import BlockRNNModel
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler, InvertibleMapper
import numpy as np

In [16]:
import torch

# detect GPU
use_gpu = torch.cuda.is_available()
if use_gpu:
    # use one GPU (change devices to -1 or a list to use more)
    pl_trainer_kwargs = {"accelerator": "gpu", "devices": 1, "auto_select_gpus": True}
else:
    pl_trainer_kwargs = {"accelerator": "cpu"}

In [18]:
# Hyperparameters
input_chunk_length = 12   # use the past 12 months
output_chunk_length = 1   # predict 1 month ahead
n_epochs = 100            # training epochs
hidden_dim = 32           # size of LSTM hidden units
n_rnn_layers = 2          # number of stacked LSTM layers
dropout = 0.1             # dropout for regularization
model_type = "LSTM"       # could also be "GRU" or "RNN"

# Instantiate model
block_rnn = BlockRNNModel(
    input_chunk_length = input_chunk_length,
    output_chunk_length = output_chunk_length,
    model = model_type,
    hidden_dim = hidden_dim,
    n_rnn_layers = n_rnn_layers,
    dropout = dropout,
    # Optional: output_chunk_shift if you want to leave a gap so the model doesn't peek right up to the target
    output_chunk_shift = 0,
    # Loss function, optimizer, etc.
    optimizer_kwargs = {"lr": 1e-3},
    # Use GPU if available
    pl_trainer_kwargs=pl_trainer_kwargs  
)

# Fit model on training set (list of states)
block_rnn.fit(
    series = train_series, 
    past_covariates = train_pasts, 
    verbose = True,
    epochs = n_epochs
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type             | Params | Mode 
-------------------------------------------------------------
0 | criterion       | MSELoss          | 0      | train
1 | train_criterion | MSELoss          | 0      | train
2 | val_criterion   | MSELoss          | 0      | train
3 | train_metrics   | MetricCollection | 0      | train
4 | val_metrics     | MetricCollection | 0      | train
5 | rnn             | LSTM             | 18.2 K | train
6 | fc              | Sequential       | 33     | train
-------------------------------------------------------------
18.2 K    Trainable params
0         Non-trainable params
18.2 K    Total params
0.073     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


BlockRNNModel(output_chunk_shift=0, model=LSTM, hidden_dim=32, n_rnn_layers=2, hidden_fc_sizes=None, dropout=0.1, activation=ReLU, input_chunk_length=12, output_chunk_length=1, optimizer_kwargs={'lr': 0.001}, pl_trainer_kwargs={'accelerator': 'cpu'})

In [19]:
# Predict 1 month ahead for validation
preds = block_rnn.predict(
    n = 1,
    series = train_series,
    past_covariates = train_pasts
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
from darts import TimeSeries

# Extend each state's future covariates separately
for i, ts in enumerate(test_futures):
    if ts is None:
        continue
    last_date = ts.end_time()
    future_ext = pd.date_range(last_date + pd.offsets.MonthBegin(1), periods=n_predict, freq="MS")
    
    extra_covs = pd.DataFrame({
        "year": future_ext.year,
        "month": future_ext.month,
    }, index=future_ext)
    
    extra_covs_ts = TimeSeries.from_dataframe(extra_covs)
    test_futures[i] = ts.append(extra_covs_ts)

In [20]:
# ------------- 7. Validate (predict the held-out month) -------------------------
# preds = extra_model.predict(
#     n=n_predict,
#     series=train_series,
#     past_covariates=train_pasts,
#     future_covariates=test_futures  # includes July covariates
# )
# preds is a list/sequence of TimeSeries (one per input series)
# invert transforms per state and compute metrics
import pandas as pd
y_true = []
y_hat = []

for i, sname in enumerate(ts_transformed):
    # preds[i] corresponds to series_list order; be careful with alignment
    pred_ts = preds[i]
    # invert transform using the state's pipeline
    inv = pipeline_dict[sname].inverse_transform(pred_ts)
    # extract scalar value
    y_hat.append(inv.values()[-1].item())   # predicted next-month price
    # true next-month value (from val_series)
    true_val = val_series[i]
    true_inv = pipeline_dict[sname].inverse_transform(true_val)
    y_true.append(true_inv.values()[-1].item())

In [21]:
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_true = np.array(y_true)
y_hat = np.array(y_hat)

rmse = math.sqrt(mean_squared_error(y_true, y_hat))
rmsle = math.sqrt(((np.log1p(np.maximum(0, y_hat)) - np.log1p(np.maximum(0, y_true)))**2).mean())
mae = mean_absolute_error(y_true, y_hat)
mape = np.mean(np.abs((y_true - y_hat) / y_true)) * 100
r2 = r2_score(y_true, y_hat)

print(f"Validation RMSE: {rmse:.4f}, RMSLE: {rmsle:.4f}, "
      f"MAE: {mae:.4f}, MAPE: {mape:.2f}%, R²: {r2:.4f}")

Validation RMSE: 88059.3393, RMSLE: 0.2097, MAE: 80310.0417, MAPE: 17.84%, R²: 0.5991


# Predictions on AUG

In [None]:
# ------------- 5. Train/validation split ---------------------------------------
# Example: hold out the last month for validation for each state
# We'll do a simple split: train on all but last month, validate on last month

train_series = []
train_pasts = []
train_futures = []
test_futures = []

for s in ts_transformed:
    ts = ts_transformed[s]
    train = ts
    train_series.append(train)
    # same slicing for covariates if present
    if past_cov_ts[s] is not None:
        train_pasts.append(past_cov_ts[s])
    else:
        train_pasts.append(None)
    if future_cov_ts[s] is not None:
        train_futures.append(future_cov_ts[s])
        test_futures.append(future_cov_ts[s])
    else:
        train_futures.append(None)

In [None]:
# ------------- 6. Instantiate and fit LightGBMModel -----------------------------
# Choose lags in months (e.g., last 12 months)
lags = 12
lags_past_covariates = list(range(-12,0))   # previous 12 months of past covariates
lags_future_covariates = list(range(1, n_predict+1))  # months ahead (just 1)

extra_model = RegressionModel(
    lags=lags,
    lags_past_covariates=lags_past_covariates,
    lags_future_covariates=lags_future_covariates,
    model=ExtraTreesRegressor(n_estimators=1000, random_state=42),
    output_chunk_length=n_predict
)

# Fit on the training series list (global model trained across all states)
extra_model.fit(series=train_series, past_covariates=train_pasts, future_covariates=train_futures)



RegressionModel(lags=12, lags_past_covariates=[-12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1], lags_future_covariates=[1], output_chunk_length=1, output_chunk_shift=0, add_encoders=None, model=ExtraTreesRegressor(n_estimators=1000, random_state=42), multi_models=True, use_static_covariates=True, random_state=None)

In [None]:
from darts import TimeSeries

# Extend each state's future covariates separately
for i, ts in enumerate(test_futures):
    if ts is None:
        continue
    last_date = ts.end_time()
    future_ext = pd.date_range(last_date + pd.offsets.MonthBegin(1), periods=n_predict, freq="MS")
    
    extra_covs = pd.DataFrame({
        "year": future_ext.year,
        "month": future_ext.month,
    }, index=future_ext)
    
    extra_covs_ts = TimeSeries.from_dataframe(extra_covs)
    test_futures[i] = ts.append(extra_covs_ts)

In [None]:
from darts import TimeSeries

# Extend each state's future covariates separately
for i, ts in enumerate(test_futures):
    if ts is None:
        continue
    last_date = ts.end_time()
    future_ext = pd.date_range(last_date + pd.offsets.MonthBegin(1), periods=n_predict, freq="MS")
    
    extra_covs = pd.DataFrame({
        "year": future_ext.year,
        "month": future_ext.month,
    }, index=future_ext)
    
    extra_covs_ts = TimeSeries.from_dataframe(extra_covs)
    test_futures[i] = ts.append(extra_covs_ts)

In [None]:
test_real = pd.read_csv("data\RDC_Inventory_Core_Metrics_State.csv")

In [None]:
test_real = test_real.sort_values(['state']).reset_index(drop=True)
test_real = test_real['median_listing_price'].values

In [None]:
# ------------- 7. Validate (predict the held-out month) -------------------------
preds = extra_model.predict(
    n=n_predict,
    series=train_series,
    past_covariates=train_pasts,
    future_covariates=test_futures  # includes July covariates
)
# preds is a list/sequence of TimeSeries (one per input series)
# invert transforms per state and compute metrics
import pandas as pd
y_true = []
y_hat = []

for i, sname in enumerate(ts_transformed):
    # preds[i] corresponds to series_list order; be careful with alignment
    pred_ts = preds[i]
    # invert transform using the state's pipeline
    inv = pipeline_dict[sname].inverse_transform(pred_ts)
    # extract scalar value
    y_hat.append(inv.values()[-1].item())   # predicted next-month price
    # true next-month value (from val_series)
    true_val = val_series[i]
    true_inv = pipeline_dict[sname].inverse_transform(true_val)
    y_true.append(true_inv.values()[-1].item())

In [None]:
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_true = np.array(test_real)
y_hat = np.array(y_hat)

rmse = math.sqrt(mean_squared_error(y_true, y_hat))
rmsle = math.sqrt(((np.log1p(np.maximum(0, y_hat)) - np.log1p(np.maximum(0, y_true)))**2).mean())
mae = mean_absolute_error(y_true, y_hat)
mape = np.mean(np.abs((y_true - y_hat) / y_true)) * 100
r2 = r2_score(y_true, y_hat)

print(f"Validation RMSE: {rmse:.4f}, RMSLE: {rmsle:.4f}, "
      f"MAE: {mae:.4f}, MAPE: {mape:.2f}%, R²: {r2:.4f}")

Validation RMSE: 7012.6984, RMSLE: 0.0139, MAE: 4831.6669, MAPE: 1.06%, R²: 0.9973
