In [62]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import StaticCovariatesTransformer, Scaler, InvertibleMapper
from darts.models import LightGBMModel
from sklearn.metrics import mean_squared_error
import math

In [63]:
import mlflow
from pathlib import Path

def get_project_root():
    return Path().resolve().parent.parent

# set mlflow tracking uri
mlflow.set_tracking_uri(uri=(get_project_root() / 'AI-ML-Services' / 'forecasting_engine' / 'mlruns').as_uri())

In [64]:
# experiment_id = mlflow.create_experiment("RealEstate_Price_Prediction", tags={
#     "topic":"experiment-management",
#     "version": "v1"
# })

# print(f"Experiment created with ID: {experiment_id}")

In [65]:
# We set the experiment to the one we created earlier.
mlflow.set_experiment(experiment_name="RealEstate_Price_Prediction")

<Experiment: artifact_location='file:///C:/1%20Disk%20D/Mora%20Academic/Sem%205/Data%20Science%20and%20Engineering%20Project/AI-ML-Services/forecasting_engine/mlruns/571155472720974951', creation_time=1757264864212, experiment_id='571155472720974951', last_update_time=1757264864212, lifecycle_stage='active', name='RealEstate_Price_Prediction', tags={'topic': 'experiment-management', 'version': 'v1'}>

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
new_df = pd.read_csv("data/State_Processed.csv")

In [7]:
df = new_df.copy()
df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str).str.zfill(2) + '-01')
# sort
df = df.sort_values(['state','date']).reset_index(drop=True)

In [8]:
df = df.drop(columns=['state_id'])

In [9]:
grouped_ts = {}
grouped_static = {}   # optional static covariates per state

for state, g in df.groupby('state'):
    # create a TimeSeries with monthly frequency
    ts = TimeSeries.from_dataframe(
    g,
    time_col="date",
    value_cols="median_listing_price",
    freq="MS",
    )
    grouped_ts[state] = ts

In [10]:
past_cov_ts = {}
future_cov_ts = {}
past_cov_cols = ['median_listing_price_mm',
    'median_listing_price_yy', 'active_listing_count',
    'active_listing_count_mm', 'active_listing_count_yy',
    'median_days_on_market', 'median_days_on_market_mm',
    'median_days_on_market_yy', 'new_listing_count', 'new_listing_count_mm',
    'new_listing_count_yy', 'price_increased_count',
    'price_increased_count_mm', 'price_increased_count_yy',
    'price_increased_share', 'price_increased_share_mm',
    'price_increased_share_yy', 'price_reduced_count',
    'price_reduced_count_mm', 'price_reduced_count_yy',
    'price_reduced_share', 'price_reduced_share_mm',
    'price_reduced_share_yy', 'pending_listing_count',
    'pending_listing_count_mm', 'pending_listing_count_yy',
    'median_listing_price_per_square_foot',
    'median_listing_price_per_square_foot_mm',
    'median_listing_price_per_square_foot_yy', 'median_square_feet',
    'median_square_feet_mm', 'median_square_feet_yy',
    'average_listing_price', 'average_listing_price_mm',
    'average_listing_price_yy', 'total_listing_count',
    'total_listing_count_mm', 'total_listing_count_yy', 'pending_ratio',
    'pending_ratio_mm', 'pending_ratio_yy']
future_cov_cols = ['month','year']  # calendar features known ahead

for state, g in df.groupby('state'):
    # Past covariates as a multivariate TimeSeries
    if all(c in g.columns for c in past_cov_cols):
        past_cov_ts[state] = TimeSeries.from_dataframe(g, time_col='date', value_cols=past_cov_cols, freq='MS')
    else:
        past_cov_ts[state] = None

    if all(c in g.columns for c in future_cov_cols):
        future_cov_ts[state] = TimeSeries.from_dataframe(g, time_col='date', value_cols=future_cov_cols, freq='MS')
    else:
        future_cov_ts[state] = None

In [11]:
pipeline_dict = {}
ts_transformed = {}

for state in grouped_ts:
    log_transformer = InvertibleMapper(np.log1p, np.expm1)   # log1p for target, invertible
    scaler = Scaler()
    pipe = Pipeline([log_transformer, scaler])
    # fit_transform expects a TimeSeries (or list); we pass the one series
    transformed = pipe.fit_transform(grouped_ts[state])
    pipeline_dict[state] = pipe
    ts_transformed[state] = transformed

In [12]:
series_list = [ts_transformed[s] for s in ts_transformed]
past_cov_list = [past_cov_ts[s] for s in ts_transformed]
future_cov_list = [future_cov_ts[s] for s in ts_transformed]

In [13]:
n_predict = 3
train_series = []
val_series = []
train_pasts = []
val_pasts = []
train_futures = []
val_futures = []
test_futures = []

for s in ts_transformed:
    ts = ts_transformed[s]
    # if len(ts) < 24:
    #     # skip too-short series (optional) or handle differently
    #     continue
    train = ts[:-n_predict]
    val = ts[-n_predict:]
    train_series.append(train)
    val_series.append(val)
    # same slicing for covariates if present
    if past_cov_ts[s] is not None:
        train_pasts.append(past_cov_ts[s][:-n_predict])
        val_pasts.append(past_cov_ts[s][-n_predict:])
    else:
        train_pasts.append(None)
        val_pasts.append(None)
    if future_cov_ts[s] is not None:
        train_futures.append(future_cov_ts[s][:-n_predict])
        val_futures.append(future_cov_ts[s][-n_predict:])
        test_futures.append(future_cov_ts[s])  # for final test prediction
    else:
        train_futures.append(None)
        val_futures.append(None)

In [57]:
from darts import TimeSeries

# Extend each state's future covariates separately
for i, ts in enumerate(test_futures):
    if ts is None:
        continue
    last_date = ts.end_time()
    future_ext = pd.date_range(last_date + pd.offsets.MonthBegin(1), periods=n_predict, freq="MS")
    
    extra_covs = pd.DataFrame({
        "year": future_ext.year,
        "month": future_ext.month,
    }, index=future_ext)
    
    extra_covs_ts = TimeSeries.from_dataframe(extra_covs)
    test_futures[i] = ts.append(extra_covs_ts)

In [None]:
import mlflow
import mlflow.lightgbm
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

with mlflow.start_run(run_name="LightGBM_Darts_Model"):

    # Log model hyperparameters
    mlflow.log_params({
        "lags": 12,
        "lags_past_covariates": list(range(-24, 0)),
        "lags_future_covariates": list(range(1, 2)),
        "output_chunk_length": n_predict,
        "random_state": 42
    })

    lgbm_model = LightGBMModel(
        lags=12,
        lags_past_covariates=list(range(-24, 0)),
        lags_future_covariates=list(range(1, 2)),
        output_chunk_length=n_predict,
        random_state=42
    )

    lgbm_model.fit(
        series=train_series,
        past_covariates=train_pasts,
        future_covariates=train_futures
    )

    preds = lgbm_model.predict(
        n=n_predict,
        series=train_series,
        past_covariates=train_pasts,
        future_covariates=test_futures
    )

    y_true, y_hat = [], []
    for i, sname in enumerate(ts_transformed):
        pred_ts = preds[i]
        inv = pipeline_dict[sname].inverse_transform(pred_ts)
        y_hat.append(inv.values()[-1].item())

        true_val = val_series[i]
        true_inv = pipeline_dict[sname].inverse_transform(true_val)
        y_true.append(true_inv.values()[-1].item())

    y_true = np.array(y_true)
    y_hat = np.array(y_hat)

    rmse = math.sqrt(mean_squared_error(y_true, y_hat))
    rmsle = math.sqrt(((np.log1p(np.maximum(0, y_hat)) - np.log1p(np.maximum(0, y_true)))**2).mean())
    mae = mean_absolute_error(y_true, y_hat)
    mape = np.mean(np.abs((y_true - y_hat) / y_true)) * 100
    r2 = r2_score(y_true, y_hat)

    print(f"Validation RMSE: {rmse:.4f}, RMSLE: {rmsle:.4f}, "
          f"MAE: {mae:.4f}, MAPE: {mape:.2f}%, R²: {r2:.4f}")

    mlflow.log_metrics({
        "RMSE": rmse,
        "RMSLE": rmsle,
        "MAE": mae,
        "MAPE": mape,
        "R2": r2
    })

    # Log trained model
    mlflow.lightgbm.log_model(lgbm_model.model, artifact_path="darts_lgbm_model")

# End MLflow run
mlflow.end_run()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250261
[LightGBM] [Info] Number of data points in the train set: 3621, number of used features: 998
[LightGBM] [Info] Start training from score 0.642702




Validation RMSE: 4945.9197, RMSLE: 0.0092, MAE: 3632.5093, MAPE: 0.75%, R²: 0.9987




In [None]:
lags = 12
lags_past_covariates = list(range(-24,0))   # previous 24 months of past covariates
lags_future_covariates = list(range(1, 2))  # months ahead 

lgbm_model = LightGBMModel(
    lags=lags,
    lags_past_covariates=lags_past_covariates,
    lags_future_covariates=lags_future_covariates,
    output_chunk_length=n_predict,
    random_state=42
)

# Fit on the training series list (global model trained across all states)
lgbm_model.fit(series=train_series, past_covariates=train_pasts, future_covariates=train_futures)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250298
[LightGBM] [Info] Number of data points in the train set: 3417, number of used features: 1002
[LightGBM] [Info] Start training from score 0.628580
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250298
[LightGBM] [Info] Number of data points in the train set: 3417, number of used features: 1002
[LightGBM] [Info] Start training from score 0.636690
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018723 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250298
[LightGBM] [Info] Number of data points in the train set: 3417, number of used features: 1002
[LightGBM] [Info] 

LightGBMModel(lags=12, lags_past_covariates=[-24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1], lags_future_covariates=[1, 2, 3], output_chunk_length=3, output_chunk_shift=0, add_encoders=None, likelihood=None, quantiles=None, random_state=42, multi_models=True, use_static_covariates=True, categorical_past_covariates=None, categorical_future_covariates=None, categorical_static_covariates=None)

In [41]:
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def calculate_metrics(y_true, y_hat):
    y_true = np.array(y_true)
    y_hat = np.array(y_hat)

    rmse = math.sqrt(mean_squared_error(y_true, y_hat))
    rmsle = math.sqrt(((np.log1p(np.maximum(0, y_hat)) - np.log1p(np.maximum(0, y_true)))**2).mean())
    mae = mean_absolute_error(y_true, y_hat)
    mape = np.mean(np.abs((y_true - y_hat) / y_true)) * 100
    r2 = r2_score(y_true, y_hat)

    return rmse, rmsle, mae, mape, r2

In [58]:
preds = lgbm_model.predict(
    n=n_predict,
    series=train_series,
    past_covariates=train_pasts,
    future_covariates=test_futures  # includes July covariates
)
# preds is a list/sequence of TimeSeries (one per input series)
# invert transforms per state and compute metrics
import pandas as pd
y_true = []
y_hat = []

for j in range (n_predict):
    for i, sname in enumerate(ts_transformed):
        # preds[i] corresponds to series_list order; be careful with alignment
        pred_ts = preds[i][j]
        # invert transform using the state's pipeline
        inv = pipeline_dict[sname].inverse_transform(pred_ts)
        # extract scalar value
        y_hat.append(inv.values()[-1].item())   # predicted next-month price
        # true next-month value (from val_series)
        true_val = val_series[i][j]
        true_inv = pipeline_dict[sname].inverse_transform(true_val)
        y_true.append(true_inv.values()[-1].item())

    rmse, rmsle, mae, mape, r2 = calculate_metrics(y_true, y_hat)
    print(f"Step {j+1} Validation RMSE: {rmse:.4f}, RMSLE: {rmsle:.4f}, "
              f"MAE: {mae:.4f}, MAPE: {mape:.2f}%, R²: {r2:.4f}")
    y_hat = []
    y_true = []

Step 1 Validation RMSE: 4458.3532, RMSLE: 0.0103, MAE: 3543.1254, MAPE: 0.82%, R²: 0.9990
Step 2 Validation RMSE: 8425.9086, RMSLE: 0.0188, MAE: 6672.5978, MAPE: 1.52%, R²: 0.9964
Step 3 Validation RMSE: 10510.6731, RMSLE: 0.0218, MAE: 8299.3399, MAPE: 1.82%, R²: 0.9943




# Predictions on AUG

In [246]:
train_series = []
train_pasts = []
train_futures = []
test_futures = []

for s in ts_transformed:
    ts = ts_transformed[s]
    train = ts
    train_series.append(train)
    # same slicing for covariates if present
    if past_cov_ts[s] is not None:
        train_pasts.append(past_cov_ts[s])
    else:
        train_pasts.append(None)
    if future_cov_ts[s] is not None:
        train_futures.append(future_cov_ts[s])
        test_futures.append(future_cov_ts[s])
    else:
        train_futures.append(None)

In [247]:
lags = 12
lags_past_covariates = list(range(-24,0))   # previous 24 months of past covariates
lags_future_covariates = list(range(1, n_predict+1))  # months ahead (just 1)

lgbm_model = LightGBMModel(
    lags=lags,
    lags_past_covariates=lags_past_covariates,
    lags_future_covariates=lags_future_covariates,
    output_chunk_length=n_predict,
    random_state=42
)

# Fit on the training series list (global model trained across all states)
lgbm_model.fit(series=train_series, past_covariates=train_pasts, future_covariates=train_futures)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250262
[LightGBM] [Info] Number of data points in the train set: 3672, number of used features: 998
[LightGBM] [Info] Start training from score 0.646784


LightGBMModel(lags=12, lags_past_covariates=[-24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1], lags_future_covariates=[1], output_chunk_length=1, output_chunk_shift=0, add_encoders=None, likelihood=None, quantiles=None, random_state=42, multi_models=True, use_static_covariates=True, categorical_past_covariates=None, categorical_future_covariates=None, categorical_static_covariates=None)

In [248]:
from darts import TimeSeries

# Extend each state's future covariates separately
for i, ts in enumerate(test_futures):
    if ts is None:
        continue
    last_date = ts.end_time()
    future_ext = pd.date_range(last_date + pd.offsets.MonthBegin(1), periods=n_predict, freq="MS")
    
    extra_covs = pd.DataFrame({
        "year": future_ext.year,
        "month": future_ext.month,
    }, index=future_ext)
    
    extra_covs_ts = TimeSeries.from_dataframe(extra_covs)
    test_futures[i] = ts.append(extra_covs_ts)

In [249]:
from darts import TimeSeries

# Extend each state's future covariates separately
for i, ts in enumerate(test_futures):
    if ts is None:
        continue
    last_date = ts.end_time()
    future_ext = pd.date_range(last_date + pd.offsets.MonthBegin(1), periods=n_predict, freq="MS")
    
    extra_covs = pd.DataFrame({
        "year": future_ext.year,
        "month": future_ext.month,
    }, index=future_ext)
    
    extra_covs_ts = TimeSeries.from_dataframe(extra_covs)
    test_futures[i] = ts.append(extra_covs_ts)

In [250]:
test_real = pd.read_csv("data\RDC_Inventory_Core_Metrics_State.csv")

In [251]:
test_real = test_real.sort_values(['state']).reset_index(drop=True)
test_real = test_real['median_listing_price'].values

In [252]:
preds = lgbm_model.predict(
    n=n_predict,
    series=train_series,
    past_covariates=train_pasts,
    future_covariates=test_futures  # includes July covariates
)
# preds is a list/sequence of TimeSeries (one per input series)
# invert transforms per state and compute metrics
import pandas as pd
y_true = []
y_hat = []

for i, sname in enumerate(ts_transformed):
    # preds[i] corresponds to series_list order; be careful with alignment
    pred_ts = preds[i]
    # invert transform using the state's pipeline
    inv = pipeline_dict[sname].inverse_transform(pred_ts)
    # extract scalar value
    y_hat.append(inv.values()[-1].item())   # predicted next-month price
    # true next-month value (from val_series)
    true_val = val_series[i]
    true_inv = pipeline_dict[sname].inverse_transform(true_val)
    y_true.append(true_inv.values()[-1].item())



In [253]:
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_true = np.array(test_real)
y_hat = np.array(y_hat)

rmse = math.sqrt(mean_squared_error(y_true, y_hat))
rmsle = math.sqrt(((np.log1p(np.maximum(0, y_hat)) - np.log1p(np.maximum(0, y_true)))**2).mean())
mae = mean_absolute_error(y_true, y_hat)
mape = np.mean(np.abs((y_true - y_hat) / y_true)) * 100
r2 = r2_score(y_true, y_hat)

print(f"Validation RMSE: {rmse:.4f}, RMSLE: {rmsle:.4f}, "
      f"MAE: {mae:.4f}, MAPE: {mape:.2f}%, R²: {r2:.4f}")

Validation RMSE: 6524.4285, RMSLE: 0.0126, MAE: 4475.2012, MAPE: 0.97%, R²: 0.9977
