## Import Libraries

In [132]:
import importlib, sys

import vars
importlib.reload(vars)

<module 'vars' from 'c:\\GitHub\\realtor-analysis\\vars.py'>

In [182]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier, DMatrix
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import warnings
warnings.simplefilter('ignore', category=pd.errors.SettingWithCopyWarning)

## Load and View Data

In [7]:
supply = pd.read_csv(vars.supply_data_path)
print(f"Read {len(supply)} records under Supply Dataset")
print('columns:')
print(supply.columns)

demand = pd.read_csv(vars.demand_data_path)
print(f"Read {len(demand)} records under Demand Dataset")
print('columns:')
print(demand.columns)

Read 3085041 records under Supply Dataset
columns:
Index(['month_date_yyyymm', 'postal_code', 'zip_name', 'median_listing_price',
       'median_listing_price_mm', 'median_listing_price_yy',
       'active_listing_count', 'active_listing_count_mm',
       'active_listing_count_yy', 'median_days_on_market',
       'median_days_on_market_mm', 'median_days_on_market_yy',
       'new_listing_count', 'new_listing_count_mm', 'new_listing_count_yy',
       'price_increased_count', 'price_increased_count_mm',
       'price_increased_count_yy', 'price_increased_share',
       'price_increased_share_mm', 'price_increased_share_yy',
       'price_reduced_count', 'price_reduced_count_mm',
       'price_reduced_count_yy', 'price_reduced_share',
       'price_reduced_share_mm', 'price_reduced_share_yy',
       'pending_listing_count', 'pending_listing_count_mm',
       'pending_listing_count_yy', 'median_listing_price_per_square_foot',
       'median_listing_price_per_square_foot_mm',
       'median

In [8]:
supply[['postal_code', 'zip_name']]

Unnamed: 0,postal_code,zip_name
0,56662,"outing, mn"
1,24531,"chatham, va"
2,26505,"morgantown, wv"
3,23838,"chesterfield, va"
4,38115,"memphis, tn"
...,...,...
3085036,7826,"branchville, nj"
3085037,29307,"spartanburg, sc"
3085038,84620,"aurora, ut"
3085039,60517,"woodridge, il"


In [23]:
zip2county = pd.read_excel(vars.zip2county)
print(zip2county)

print(f"Unique Zips (Supply): {supply['postal_code'].nunique()}")
print(f"Unique Zips (Demand): {demand['postal_code'].nunique()}")
print(f"Unique Zips (Crosswalk): {zip2county['zip'].nunique()}")
print(f"Unique County Codes (Crosswalk): {zip2county['county'].nunique()}")

# Check if there are any zips in supply and demand that are not in the crosswalk
supply_zips = set(supply['postal_code'].unique())
demand_zips = set(demand['postal_code'].unique())
crosswalk_zips = set(zip2county['zip'].unique())
missing_in_crosswalk = (supply_zips | demand_zips) - crosswalk_zips
if missing_in_crosswalk:
    print(f"Zips in supply/demand not in crosswalk: {len(missing_in_crosswalk)}")
else:
    print("All zips in supply and demand are covered in the crosswalk.")


         zip  county  res_ratio  bus_ratio  oth_ratio  tot_ratio
0        606   72093   1.000000   1.000000   1.000000   1.000000
1        617   72017   0.969103   0.998428   1.000000   0.971027
2        617   72054   0.030897   0.001572   0.000000   0.028973
3        674   72145   0.004759   0.017114   0.022523   0.006106
4        674   72091   0.995241   0.982886   0.977477   0.993894
...      ...     ...        ...        ...        ...        ...
53813  99688    2170   1.000000   1.000000   1.000000   1.000000
53814  99705    2090   1.000000   1.000000   1.000000   1.000000
53815  99694    2170   0.000000   0.000000   1.000000   1.000000
53816  99659    2180   0.000000   1.000000   1.000000   1.000000
53817  99680    2050   0.000000   0.000000   1.000000   1.000000

[53818 rows x 6 columns]
Unique Zips (Supply): 34104
Unique Zips (Demand): 18777
Unique Zips (Crosswalk): 39461
Unique County Codes (Crosswalk): 3227
Zips in supply/demand not in crosswalk: 33


In [91]:
# Clean up dataframes such that we only keep records where zip codes are in all three: supply, demand, and crosswalk.
zip_intersection = supply_zips & demand_zips & crosswalk_zips
sup_df = supply[supply['postal_code'].isin(zip_intersection)].copy()
dem_df = demand[demand['postal_code'].isin(zip_intersection)].copy()

# Drop Overlapping columns in dem
join_keys = ['month_date_yyyymm', 'postal_code']
overlap = [c for c in dem_df.columns if c in sup_df.columns and c not in join_keys]

# Drop them from demand side before merge
dem_df_nodup = dem_df.drop(columns=overlap)

# Decision Point: Lets make the decision to only keep zip codes and months that are present in both supply and demand.
df = sup_df.merge(dem_df_nodup, 'inner', on=join_keys)

print(f"After cleaning, Supply records: {len(sup_df)}, Demand records: {len(dem_df)}")
print(f"Merged records: {len(df)}, columns: {len(df.columns)}")
print(list(df.columns))

After cleaning, Supply records: 2033177, Demand records: 1192128
Merged records: 1192128, columns: 60
['month_date_yyyymm', 'postal_code', 'zip_name', 'median_listing_price', 'median_listing_price_mm', 'median_listing_price_yy', 'active_listing_count', 'active_listing_count_mm', 'active_listing_count_yy', 'median_days_on_market', 'median_days_on_market_mm', 'median_days_on_market_yy', 'new_listing_count', 'new_listing_count_mm', 'new_listing_count_yy', 'price_increased_count', 'price_increased_count_mm', 'price_increased_count_yy', 'price_increased_share', 'price_increased_share_mm', 'price_increased_share_yy', 'price_reduced_count', 'price_reduced_count_mm', 'price_reduced_count_yy', 'price_reduced_share', 'price_reduced_share_mm', 'price_reduced_share_yy', 'pending_listing_count', 'pending_listing_count_mm', 'pending_listing_count_yy', 'median_listing_price_per_square_foot', 'median_listing_price_per_square_foot_mm', 'median_listing_price_per_square_foot_yy', 'median_square_feet', 'm

In [64]:
print(df['month_date_yyyymm'].unique())
print(df['month_date_yyyymm'].nunique())

# Group by postal_code and count months
counts = df.groupby('postal_code')['month_date_yyyymm'].nunique().reset_index()

# Rename for clarity
counts = counts.rename(columns={'month_date_yyyymm': 'month_count'})

# Now group by month_count and count how often each occurs
distribution = counts['month_count'].value_counts().reset_index()
distribution.columns = ['month_count', 'num_postal_codes']
distribution = distribution.sort_values('month_count', ascending=False)

print(distribution)

[202507 202506 202505 202504 202503 202502 202501 202412 202411 202410
 202409 202408 202407 202406 202405 202404 202403 202402 202401 202312
 202311 202310 202309 202308 202307 202306 202305 202304 202303 202302
 202301 202212 202211 202210 202209 202208 202207 202206 202205 202204
 202203 202202 202201 202112 202111 202110 202109 202108 202107 202106
 202105 202104 202103 202102 202101 202012 202011 202010 202009 202008
 202007 202006 202005 202004 202003 202002 202001 201912 201911 201910
 201909 201908 201907 201906 201905 201904 201903 201902 201901 201812
 201811 201810 201809 201808 201807 201806 201805 201804 201803 201802
 201801 201712 201711 201710 201709 201708]
96
    month_count  num_postal_codes
0            96              5245
1            95               583
2            94               456
5            93               380
6            92               324
..          ...               ...
15            5               186
10            4               256
8       

In [107]:
# Anchor and required 36-month window [2025-07, 2025-06, ..., 2022-08]
anchor = pd.Period(vars.start_date, freq='M')
last_date = pd.Period(vars.end_date, freq='M')
required_months = {anchor - i for i in range(vars.check_window_months)}  # set for fast membership

# Add a Period month column
df2 = df.copy()
df2['month'] = pd.to_datetime(df['month_date_yyyymm'], format='%Y%m').dt.to_period('M')

# Keep only rows in the required 36-month window
df_win = df2[df2['month'].isin(required_months)]

# Count distinct months per postal_code within the window
counts = df_win.groupby('postal_code')['month'].nunique()

# Postal codes with all vars.min_records months present (consecutive by construction of the window)
valid_postals = counts[counts >= vars.min_acceptable_records].index

# Final result: only those postal_codes and only those 36 months
final_df = (
    df2[
        (df2['postal_code'].isin(valid_postals)) &
        (df2['month'] >= last_date)]
    .sort_values(['postal_code', 'month'], ascending=[True, False])
)

# quick summary
print(f"Len of result: {len(final_df)}")
print(valid_postals)
print(f"Length of 92620: {len(final_df[final_df['postal_code'] == 92620])}")
print(f"Length of 99645: {len(final_df[final_df['postal_code'] == 99645])}")
summary = counts.value_counts().sort_index(ascending=False)
print(summary)  # how many postal_codes have 36, 35, ..., months present in the window


Len of result: 857566
Index([ 1056,  1085,  1095,  1109,  1201,  1230,  1247,  1267,  1331,  1420,
       ...
       99645, 99654, 99669, 99688, 99705, 99709, 99712, 99737, 99801, 99901],
      dtype='int64', name='postal_code', length=9297)
Length of 92620: 94
Length of 99645: 96
month
36    7796
35     513
34     395
33     330
32     263
31     259
30     216
29     211
28     185
27     177
26     155
25     180
24     158
23     137
22     141
21     144
20     166
19     136
18     148
17     138
16     132
15     164
14     151
13     174
12     159
11     159
10     143
9      168
8      210
7      173
6      215
5      224
4      263
3      349
2      434
1      522
Name: count, dtype: int64


In [94]:
# Lets get a sense for how many zip-codes we have thrown out after the cleaning process from both the supply and demand.
print(f"Unique Zips (Supply): {supply['postal_code'].nunique()}")
print(f"Unique Zips (Demand): {demand['postal_code'].nunique()}")
print(f"Unique Zips (Full DF): {final_df['postal_code'].nunique()}")
print(f"DF Columns: {final_df.columns}")

Unique Zips (Supply): 34104
Unique Zips (Demand): 18777
Unique Zips (Full DF): 9297
DF Columns: Index(['month_date_yyyymm', 'postal_code', 'zip_name', 'median_listing_price',
       'median_listing_price_mm', 'median_listing_price_yy',
       'active_listing_count', 'active_listing_count_mm',
       'active_listing_count_yy', 'median_days_on_market',
       'median_days_on_market_mm', 'median_days_on_market_yy',
       'new_listing_count', 'new_listing_count_mm', 'new_listing_count_yy',
       'price_increased_count', 'price_increased_count_mm',
       'price_increased_count_yy', 'price_increased_share',
       'price_increased_share_mm', 'price_increased_share_yy',
       'price_reduced_count', 'price_reduced_count_mm',
       'price_reduced_count_yy', 'price_reduced_share',
       'price_reduced_share_mm', 'price_reduced_share_yy',
       'pending_listing_count', 'pending_listing_count_mm',
       'pending_listing_count_yy', 'median_listing_price_per_square_foot',
       'median_list

In [54]:
# Experimenting with grab_records function build
zips = [92620]
counties = zip2county[zip2county['zip'].isin(zips)]['county'].unique()
all_zips = zip2county[zip2county['county'].isin(counties)]['zip'].unique()
Z = list(set(all_zips).intersection(zip_intersection))
postal_df = df[df['postal_code'].isin(Z)]
postal_df.loc[:, 'original_zip'] = postal_df['postal_code'].isin(zips).astype(int)
for z in Z:
    postal_df.loc[:, f'zip_{z}'] = (postal_df['postal_code'] == z).astype(int)
print(counties, len(Z), len(postal_df), len(postal_df.columns))
print(postal_df.columns.to_list())

[6059] 86 7271 157
['month_date_yyyymm', 'postal_code', 'zip_name_sup', 'median_listing_price_sup', 'median_listing_price_mm_sup', 'median_listing_price_yy_sup', 'active_listing_count', 'active_listing_count_mm', 'active_listing_count_yy', 'median_days_on_market_sup', 'median_days_on_market_mm_sup', 'median_days_on_market_yy_sup', 'new_listing_count', 'new_listing_count_mm', 'new_listing_count_yy', 'price_increased_count', 'price_increased_count_mm', 'price_increased_count_yy', 'price_increased_share', 'price_increased_share_mm', 'price_increased_share_yy', 'price_reduced_count', 'price_reduced_count_mm', 'price_reduced_count_yy', 'price_reduced_share', 'price_reduced_share_mm', 'price_reduced_share_yy', 'pending_listing_count', 'pending_listing_count_mm', 'pending_listing_count_yy', 'median_listing_price_per_square_foot', 'median_listing_price_per_square_foot_mm', 'median_listing_price_per_square_foot_yy', 'median_square_feet', 'median_square_feet_mm', 'median_square_feet_yy', 'averag

In [None]:
def grab_county_records(df, zips, zip2county=zip2county):
    counties = zip2county[zip2county['zip'].isin(zips)]['county'].unique()
    all_zips = zip2county[zip2county['county'].isin(counties)]['zip'].unique()
    Z = list(set(all_zips).intersection(zip_intersection))
    postal_df = df[df['postal_code'].isin(Z)]
    postal_df.loc[:, 'original_zip'] = postal_df['postal_code'].isin(zips).astype(int)
    for z in Z:
        postal_df.loc[:, f'zip_{z}'] = (postal_df['postal_code'] == z).astype(int)
    return postal_df

In [190]:
def grab_training_records(final_df, zip, outcome_thresholds=vars.outcome_thresholds):
    postal_df = final_df[final_df['postal_code'] == zip].reset_index(0).drop(columns=['index'])
    training_sets = {i: [] for i in range(1, vars.max_months_predicted + 1)}
    outcome_sets = {i: [] for i in range(1, vars.max_months_predicted + 1)}
    appended = {i: 0 for i in range(1, vars.max_months_predicted + 1)}

    outcome_trend = postal_df[['month']].copy().reset_index(drop=True).iloc[:-1]

    price_after = postal_df['median_listing_price'].iloc[:-1].to_numpy()
    price_before = postal_df['median_listing_price'].iloc[1:].to_numpy()
    price_ratio = (price_after - price_before) / price_before
    outcome_trend['price_change'] = price_ratio

    outcome_trend['days_on_market'] = postal_df['median_days_on_market'].iloc[:-1].to_numpy()

    count_after = postal_df['page_view_count_per_property_vs_us'].iloc[:-1].to_numpy()
    count_before = postal_df['page_view_count_per_property_vs_us'].iloc[1:].to_numpy()
    count_ratio = (count_after - count_before) / count_before
    outcome_trend['view_count_change'] = count_ratio

    for i in range(len(postal_df)):
        for t in training_sets:
            j = i + t
            if j >= len(postal_df):
                continue

            # Get how many months the two records are separted by
            months_sep = postal_df.loc[i, 'month'] - postal_df.loc[j, 'month']
            months_int = months_sep.n
            if (months_int > vars.max_months_predicted):
                continue
            elif months_int < 1:
                raise AssertionError(f"Somehow months_int < 1 for {i}, {j}, {months_int}")

            # Calculate Price Outcomes
            price_before = postal_df.loc[i, 'median_listing_price']
            price_after = postal_df.loc[j, 'median_listing_price']
            price_ratio = (price_after - price_before) / price_before

            high_risk_price = int(price_ratio <= outcome_thresholds['high_risk']['price_change'])
            strong_market_price = int(price_ratio >= outcome_thresholds['strong_market']['price_change'])

            # Calculate Day on Market Outcomes
            dom_after = postal_df.loc[j, 'median_days_on_market']

            high_risk_dom = int(dom_after >= outcome_thresholds['high_risk']['days_on_market'])
            strong_market_dom = int(dom_after <= outcome_thresholds['strong_market']['days_on_market'])

            # Calculate View Count Outcomes
            count_before = postal_df.loc[i, 'page_view_count_per_property_vs_us']
            count_after = postal_df.loc[j, 'page_view_count_per_property_vs_us']
            count_ratio = (count_after - count_before) / count_before

            high_risk_count = int(count_ratio <= outcome_thresholds['high_risk']['view_count_change'])
            strong_market_count = int(count_ratio >= outcome_thresholds['strong_market']['view_count_change'])

            # Append to training and outcome sets
            training_sets[months_int].append(postal_df.loc[j, vars.features].to_numpy())
            outcome_sets[months_int].append([high_risk_price, strong_market_price,
                                             high_risk_dom, strong_market_dom,
                                             high_risk_count, strong_market_count])
            appended[months_int] += 1
    
    inference_point = np.array([postal_df.loc[0, vars.features].to_numpy()])
    for t in training_sets:
        training_sets[t] = np.array(training_sets[t])
        outcome_sets[t] = np.array(outcome_sets[t])

    return training_sets, outcome_sets, outcome_trend, inference_point

        

In [191]:
zip = 92620
training_sets, outcome_sets, outcome_trend, inference_point = grab_training_records(final_df, zip)
for t in training_sets:
    print(t, len(training_sets[t]), len(outcome_sets[t]), len(training_sets[t][0]), len(outcome_sets[t][0]))
    

1 91 91 56 6
2 90 90 56 6
3 89 89 56 6
4 88 88 56 6
5 87 87 56 6
6 86 86 56 6
7 85 85 56 6
8 84 84 56 6
9 83 83 56 6
10 82 82 56 6
11 81 81 56 6
12 80 80 56 6


In [195]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

def forecast_12_fourier(outcome_trend, horizon=12, K=3, period=12):
    df = outcome_trend.copy()

    # --- normalize month to Timestamp at month-start ---
    m = df['month']
    if pd.api.types.is_period_dtype(m):
        # Coerce to monthly periods and convert to timestamps (start of month)
        df['month'] = pd.PeriodIndex(m, freq='M').to_timestamp()  # <-- no 'MS' here
    elif pd.api.types.is_datetime64_any_dtype(m):
        df['month'] = m.dt.to_period('M').dt.to_timestamp()
    else:
        df['month'] = pd.to_datetime(m, errors='coerce').dt.to_period('M').dt.to_timestamp()

    df = df.sort_values('month').reset_index(drop=True)

    # time index
    t = np.arange(len(df))
    t_future = np.arange(len(df), len(df) + horizon)

    def fourier_basis(tt, K, period):
        X = [np.ones_like(tt), tt]  # intercept + linear trend
        for k in range(1, K + 1):
            X.append(np.sin(2 * np.pi * k * tt / period))
            X.append(np.cos(2 * np.pi * k * tt / period))
        return np.column_stack(X)

    X  = fourier_basis(t, K, period)
    Xf = fourier_basis(t_future, K, period)

    target_cols = ['price_change', 'days_on_market', 'view_count_change']
    preds = {}

    for col in target_cols:
        y = df[col].astype(float).to_numpy()
        model = Ridge(alpha=1.0)
        model.fit(X, y)
        yhat = model.predict(Xf)
        if col == 'days_on_market':
            yhat = np.clip(yhat, 0, None)
        preds[col] = yhat

    future_months = pd.date_range(df['month'].iloc[-1] + pd.offsets.MonthBegin(1),
                                  periods=horizon, freq='MS')
    out = pd.DataFrame({'month': future_months})
    for col in target_cols:
        out[col] = preds[col]
    return out


# usage:
future_12 = forecast_12_fourier(outcome_trend, horizon=12, K=3)
print(future_12)

        month  price_change  days_on_market  view_count_change
0  2025-08-01      0.030263       29.602907          -0.084367
1  2025-09-01      0.001926       33.215659          -0.034709
2  2025-10-01     -0.012744       37.183797           0.023847
3  2025-11-01      0.005887       38.465034           0.044645
4  2025-12-01      0.009730       41.411805           0.064530
5  2026-01-01     -0.008759       46.373143           0.093796
6  2026-02-01      0.004418       45.154970           0.066239
7  2026-03-01      0.049048       35.601929          -0.027098
8  2026-04-01      0.055040       27.949649          -0.087720
9  2026-05-01      0.011627       28.870069          -0.067316
10 2026-06-01     -0.008129       31.754612          -0.044343
11 2026-07-01      0.019416       30.368441          -0.072361


  if pd.api.types.is_period_dtype(m):


In [196]:
import numpy as np
import pandas as pd

# ---------- metrics ----------
def rmse(y, yhat):
    return float(np.sqrt(np.mean((y - yhat) ** 2)))

def smape(y, yhat, eps=1e-8):
    num = np.abs(y - yhat)
    den = (np.abs(y) + np.abs(yhat) + eps) / 2.0
    return float(100 * np.mean(num / den))  # percent

def mase(y_true, y_pred, y_train, m=12):
    # scale by seasonal naive MAE on the *training* period
    if len(y_train) <= m:
        return np.nan
    denom = np.mean(np.abs(y_train[m:] - y_train[:-m]))
    if denom == 0:
        return np.nan
    return float(np.mean(np.abs(y_true - y_pred)) / denom)

# ---------- month normalization ----------
def _normalize_month_index(df):
    d = df.copy()
    m = d['month']
    if pd.api.types.is_period_dtype(m):
        d['month'] = pd.PeriodIndex(m, freq='M').to_timestamp()
    elif pd.api.types.is_datetime64_any_dtype(m):
        d['month'] = m.dt.to_period('M').dt.to_timestamp()
    else:
        d['month'] = pd.to_datetime(m, errors='coerce').dt.to_period('M').dt.to_timestamp()
    d = d.sort_values('month').set_index('month')
    return d

# ---------- backtester ----------
def backtest_forecaster(outcome_trend, forecaster, horizon=12, seasonality=12, min_train=36, step=1):
    """
    forecaster: function(train_df, horizon) -> DataFrame with 'month' and same numeric columns
    """
    df = _normalize_month_index(outcome_trend)
    cols = [c for c in df.columns if c != 'month']  # index now holds month
    n = len(df)
    cut_starts = list(range(min_train-1, n - horizon, step))  # index of last training obs

    # collectors
    per_h_metrics = {c: {h+1: [] for h in range(horizon)} for c in cols}
    overall = {c: {"RMSE": [], "sMAPE": [], "MASE": [], "RMSE_baseline": [], "sMAPE_baseline": [], "MASE_baseline": []} for c in cols}

    for cut in cut_starts:
        train = df.iloc[:cut+1].copy()
        future_idx = pd.date_range(train.index[-1] + pd.offsets.MonthBegin(1), periods=horizon, freq='MS')

        # model forecast
        fcast = forecaster(train.reset_index().rename(columns={'index': 'month'}), horizon=horizon)
        fcast = _normalize_month_index(fcast).reindex(future_idx)

        # actuals
        truth = df.iloc[cut+1:cut+1+horizon].reindex(future_idx)

        # seasonal naive baseline: y_{t+h} = y_{t+h-12}
        baseline = df[cols].shift(seasonality).iloc[cut+1:cut+1+horizon].reindex(future_idx)

        # compute metrics per column
        for c in cols:
            y_true = truth[c].astype(float).to_numpy()
            y_hat  = fcast[c].astype(float).to_numpy()
            y_base = baseline[c].astype(float).to_numpy()

            mask_model = ~np.isnan(y_true) & ~np.isnan(y_hat)
            mask_base  = ~np.isnan(y_true) & ~np.isnan(y_base)

            if mask_model.any():
                overall[c]["RMSE"].append(rmse(y_true[mask_model], y_hat[mask_model]))
                overall[c]["sMAPE"].append(smape(y_true[mask_model], y_hat[mask_model]))
                # MASE needs training history of this column
                overall[c]["MASE"].append(mase(y_true[mask_model], y_hat[mask_model], train[c].astype(float).to_numpy(), m=seasonality))
                # per-horizon
                for h in range(horizon):
                    if not np.isnan(y_true[h]) and not np.isnan(y_hat[h]):
                        per_h_metrics[c][h+1].append(abs(y_true[h]-y_hat[h]))

            if mask_base.any():
                overall[c]["RMSE_baseline"].append(rmse(y_true[mask_base], y_base[mask_base]))
                overall[c]["sMAPE_baseline"].append(smape(y_true[mask_base], y_base[mask_base]))
                overall[c]["MASE_baseline"].append(mase(y_true[mask_base], y_base[mask_base], train[c].astype(float).to_numpy(), m=seasonality))

    # aggregate
    rows = []
    for c in cols:
        if overall[c]["RMSE"]:
            rmse_m   = np.mean(overall[c]["RMSE"])
            rmse_b   = np.mean(overall[c]["RMSE_baseline"]) if overall[c]["RMSE_baseline"] else np.nan
            smape_m  = np.mean(overall[c]["sMAPE"])
            smape_b  = np.mean(overall[c]["sMAPE_baseline"]) if overall[c]["sMAPE_baseline"] else np.nan
            mase_m   = np.nanmean(overall[c]["MASE"])
            mase_b   = np.nanmean(overall[c]["MASE_baseline"]) if overall[c]["MASE_baseline"] else np.nan
            rows.append({
                "series": c,
                "RMSE": rmse_m,
                "RMSE_baseline": rmse_b,
                "RMSE_improvement_%": (1 - rmse_m/rmse_b)*100 if rmse_b and rmse_b>0 else np.nan,
                "sMAPE_%": smape_m,
                "sMAPE_baseline_%": smape_b,
                "sMAPE_improvement_%": (1 - smape_m/smape_b)*100 if smape_b and smape_b>0 else np.nan,
                "MASE": mase_m,               # <1 means better than seasonal naive
                "MASE_baseline": mase_b       # ~1 for seasonal naive
            })

    summary = pd.DataFrame(rows).sort_values("series")

    # Optional: mean absolute error by forecast horizon (useful to see decay)
    by_horizon = {}
    for c in cols:
        by_horizon[c] = pd.DataFrame({
            "horizon": list(per_h_metrics[c].keys()),
            "mean_abs_error": [np.mean(per_h_metrics[c][h]) if per_h_metrics[c][h] else np.nan
                               for h in per_h_metrics[c]]
        })

    return summary, by_horizon

# your forecaster (wraps the function you already have)
def fourier_forecaster(train_df, horizon):
    return forecast_12_fourier(train_df, horizon=horizon, K=3, period=12)

summary, by_h = backtest_forecaster(outcome_trend, forecaster=fourier_forecaster,
                                    horizon=12, seasonality=12, min_train=36, step=1)
print(summary)                 # overall metrics vs seasonal naive
print(by_h['price_change'])    # error by horizon for a specific series


  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.

              series       RMSE  RMSE_baseline  RMSE_improvement_%  \
1     days_on_market  16.345059      19.283580           15.238462   
0       price_change   0.069273       0.083718           17.253702   
2  view_count_change   0.140987       0.233121           39.522080   

      sMAPE_%  sMAPE_baseline_%  sMAPE_improvement_%      MASE  MASE_baseline  
1   39.446466         45.972967            14.196388  1.123391       1.200344  
0  141.916400        154.604815             8.206999  1.055244       1.379221  
2  130.571404        156.289588            16.455469  0.908872       1.492491  
    horizon  mean_abs_error
0         1        0.048155
1         2        0.046980
2         3        0.046670
3         4        0.049525
4         5        0.050771
5         6        0.051534
6         7        0.047543
7         8        0.048029
8         9        0.045594
9        10        0.046935
10       11        0.046089
11       12        0.046103


  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):


In [197]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

def forecast_12_xgb(outcome_trend, horizon=12, use_gpu=False, max_lag=12,
                    add_calendar=True, params=None, nonneg_cols=("days_on_market",)):
    """
    Forecast next `horizon` months for each numeric column except 'month'
    using XGBoost with lag/rolling features (univariate per series).

    Parameters
    ----------
    outcome_trend : pd.DataFrame
        Must contain a 'month' column and ≥1 numeric series columns.
    horizon : int
        Number of months to forecast.
    use_gpu : bool
        If True, try to use GPU (falls back to CPU if unavailable).
    max_lag : int
        Maximum number of monthly lags to include (auto-reduced if history is short).
    add_calendar : bool
        If True, add month-of-year sin/cos features.
    params : dict or None
        Extra/override XGBRegressor params.
    nonneg_cols : tuple of str
        Columns to clip at [0, ∞) after prediction.

    Returns
    -------
    pd.DataFrame with 'month' and one column per input series (forecasts).
    """

    def _normalize_month_index(df):
        d = df.copy()
        m = d["month"]
        if pd.api.types.is_period_dtype(m):
            d["month"] = pd.PeriodIndex(m, freq="M").to_timestamp()
        elif pd.api.types.is_datetime64_any_dtype(m):
            d["month"] = m.dt.to_period("M").dt.to_timestamp()
        else:
            d["month"] = pd.to_datetime(m, errors="coerce").dt.to_period("M").dt.to_timestamp()
        d = d.sort_values("month").set_index("month")
        return d

    df = _normalize_month_index(outcome_trend)

    # choose numeric cols except 'month'
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not num_cols:
        raise ValueError("No numeric columns to forecast. Ensure you have numeric series columns in addition to 'month'.")

    # base XGB params
    xgb_params = dict(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=0,
        objective="reg:squarederror",
    )

    # GPU toggle
    if use_gpu:
        try:
            import xgboost as xgb
            from packaging.version import Version
            if Version(xgb.__version__) >= Version("2.0.0"):
                xgb_params.update(tree_method="hist", device="cuda")
            else:
                xgb_params.update(tree_method="gpu_hist", predictor="gpu_predictor")
        except Exception:
            # fall back silently to CPU
            pass

    if params:
        xgb_params.update(params)

    # helper to build supervised frame for one series
    def _make_supervised(series: pd.Series, L: int):
        s = series.astype(float).copy()

        parts = []
        # lags 1..L
        for l in range(1, L + 1):
            parts.append(s.shift(l).rename(f"lag_{l}"))

        # rolling means based on prior values
        parts.append(s.shift(1).rolling(3, min_periods=1).mean().rename("roll3"))
        parts.append(s.shift(1).rolling(6, min_periods=1).mean().rename("roll6"))
        parts.append(s.shift(1).rolling(12, min_periods=1).mean().rename("roll12"))

        if add_calendar:
            # month-of-year sin/cos (align with current timestamp rows)
            idx = s.index
            month_num = idx.month.values  # 1..12
            # convert to series aligned with index
            mo = pd.Series(month_num, index=idx)
            parts.append(np.sin(2 * np.pi * mo / 12).rename("mo_sin"))
            parts.append(np.cos(2 * np.pi * mo / 12).rename("mo_cos"))

        X = pd.concat(parts, axis=1)
        y = s
        data = pd.concat([X, y.rename("y")], axis=1).dropna()
        return data.drop(columns=["y"]), data["y"]

    forecasts = {}
    last_month = df.index[-1]
    future_idx = pd.date_range(last_month + pd.offsets.MonthBegin(1), periods=horizon, freq="MS")

    for col in num_cols:
        s = df[col].astype(float)
        if len(s) < 3:
            # too short; repeat last value
            forecasts[col] = pd.Series([float(s.iloc[-1])] * horizon, index=future_idx)
            continue

        L = max(1, min(max_lag, len(s) - 1))
        X, y = _make_supervised(s, L)

        # keep feature order for recursive steps
        feature_order = X.columns.tolist()

        model = XGBRegressor(**xgb_params)
        model.fit(X.values, y.values)

        # recursive multi-step forecast
        hist = s.copy()
        preds = []
        for step in range(horizon):
            t_next = future_idx[step]

            feats = {}
            # lags
            for l in range(1, L + 1):
                feats[f"lag_{l}"] = float(hist.iloc[-l]) if len(hist) >= l else float(hist.iloc[-1])

            # rolling (use available history)
            feats["roll3"] = float(hist.iloc[-3:].mean()) if len(hist) >= 3 else float(hist.mean())
            feats["roll6"] = float(hist.iloc[-6:].mean()) if len(hist) >= 6 else float(hist.mean())
            feats["roll12"] = float(hist.iloc[-12:].mean()) if len(hist) >= 12 else float(hist.mean())

            if add_calendar:
                mo = t_next.month  # 1..12
                feats["mo_sin"] = float(np.sin(2 * np.pi * mo / 12))
                feats["mo_cos"] = float(np.cos(2 * np.pi * mo / 12))

            # align to training feature order
            x_next = np.array([feats[k] for k in feature_order], dtype=float).reshape(1, -1)
            yhat = float(model.predict(x_next)[0])

            if col in nonneg_cols:
                yhat = max(0.0, yhat)

            preds.append(yhat)
            hist.loc[t_next] = yhat  # extend history for next step

        forecasts[col] = pd.Series(preds, index=future_idx)

    out = pd.DataFrame({"month": future_idx})
    for col in num_cols:
        out[col] = forecasts[col].values
    return out

#--- Usage ---
future_12_xgb = forecast_12_xgb(outcome_trend, horizon=12, use_gpu=True)
print(future_12_xgb)


  if pd.api.types.is_period_dtype(m):


        month  price_change  days_on_market  view_count_change
0  2025-08-01      0.019507       49.288494          -0.015636
1  2025-09-01     -0.027931       46.054058           0.003950
2  2025-10-01     -0.000581       45.142601          -0.019385
3  2025-11-01     -0.016734       60.664890           0.075709
4  2025-12-01     -0.024842       66.730011           0.022362
5  2026-01-01      0.021792       58.692070          -0.021615
6  2026-02-01      0.008189       46.484100          -0.021890
7  2026-03-01      0.001586       37.830471          -0.005836
8  2026-04-01     -0.006501       36.096016          -0.047094
9  2026-05-01     -0.006828       40.076237          -0.066617
10 2026-06-01      0.038745       46.827927          -0.063030
11 2026-07-01      0.047669       48.941154          -0.041157


In [198]:
# ---- Wrapper so your existing backtest_forecaster can call XGB ----
def make_xgb_forecaster(use_gpu=True, max_lag=12, add_calendar=True, params=None,
                        nonneg_cols=("days_on_market",)):
    """
    Returns a callable forecaster(train_df, horizon) -> forecast df,
    which is exactly what backtest_forecaster expects.
    """
    def _xgb_forecaster(train_df, horizon):
        return forecast_12_xgb(
            train_df,
            horizon=horizon,
            use_gpu=use_gpu,
            max_lag=max_lag,
            add_calendar=add_calendar,
            params=params,
            nonneg_cols=nonneg_cols,
        )
    return _xgb_forecaster

# ---- Example usage with your existing backtest_forecaster ----
xgb_fcst = make_xgb_forecaster(
    use_gpu=True,
    max_lag=12,
    params={"n_estimators": 600, "max_depth": 5, "learning_rate": 0.05}
)
summary_xgb, by_h_xgb = backtest_forecaster(
    outcome_trend,
    forecaster=xgb_fcst,
    horizon=12,
    seasonality=12,
    min_train=36,
    step=1
)
print(summary_xgb)


  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.types.is_period_dtype(m):
  if pd.api.

              series       RMSE  RMSE_baseline  RMSE_improvement_%  \
1     days_on_market  16.077661      19.283580           16.625121   
0       price_change   0.077784       0.083718            7.087992   
2  view_count_change   0.155686       0.233121           33.216397   

      sMAPE_%  sMAPE_baseline_%  sMAPE_improvement_%      MASE  MASE_baseline  
1   40.719116         45.972967            11.428132  1.035579       1.200344  
0  156.035517        154.604815            -0.925393  1.201933       1.379221  
2  145.932220        156.289588             6.627037  1.008859       1.492491  


  if pd.api.types.is_period_dtype(m):


In [156]:
class XGBDegenerateClassifier(XGBClassifier):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.is_degenerate_ = None
        self.degenerate_class_ = None

    def fit_degen(self, X, y=None, **kwargs):
        # If all target values are the same, skip training
        if (sum(y) == len(y)) or (sum(y) == 0):
            self.is_degenerate_ = True
            self.degenerate_class_ = y[0]
        else:
            self.is_degenerate_ = False
            self.fit(X, y, **kwargs)
        return self

    def predict_degen(self, X):
        if self.is_degenerate_:
            return np.full((X.shape[0],), self.degenerate_class_)
        else:
            return self.predict(X)

    def predict_proba_degen(self, X):
        if self.is_degenerate_:
            proba = np.zeros((X.shape[0], 2))
            proba[:, self.degenerate_class_] = 1.0
            return proba
        else:
            return self.predict_proba(X)

In [None]:
class XGBDegenerateMultiClassifier(XGBClassifier):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.degenerate_classes_ = {}

    def fit_degen(self, X, y=None, **kwargs):
        # Check each class in the multi-label setting
        for class_idx in range(y.shape[1]):
            y_class = y[:, class_idx]
            if (sum(y_class) == len(y_class)) or (sum(y_class) == 0):
                self.degenerate_classes_[class_idx] = y_class[0]
            else:
                self.degenerate_classes_[class_idx] = None

        # If any class is not degenerate, fit the model
        if any(v is None for v in self.degenerate_classes_.values()):
            self.fit(X, y, **kwargs)
        return self

    def predict_degen(self, X):
        if all(v is not None for v in self.degenerate_classes_.values()):
            # All classes are degenerate
            predictions = np.array([v for v in self.degenerate_classes_.values()])
            return np.tile(predictions, (X.shape[0], 1))
        else:
            return self.predict(X)

    def predict_proba_degen(self, X):
        if all(v is not None for v in self.degenerate_classes_.values()):
            # All classes are degenerate
            proba = np.zeros((X.shape[0], len(self.degenerate_classes_), 2))
            for class_idx, cls in self.degenerate_classes_.items():
                proba[:, class_idx, cls] = 1.0
            return proba
        else:
            return self.predict_proba(X)

In [None]:
def train_and_predict(final_df, zip):
    results, predictions, importances = [], [], []
    training_sets, outcome_sets, inference_point = grab_training_records(final_df, zip)

    for t in training_sets:
        print(f"\nTraining model for {t} months prediction with {len(training_sets[t])} records")
        
        predicted = [[] for _ in range(len(outcome_sets[t][0]))]
        for i in range(len(training_sets[t])):
            train_idx = np.ones((len(training_sets[t]),), dtype=bool)
            train_idx[i] = False
            X_train, X_val = training_sets[t][train_idx], training_sets[t][~train_idx]
            
            for outcome in range(len(outcome_sets[t][i])):
                y_train, y_val = outcome_sets[t][train_idx][:, outcome], outcome_sets[t][~train_idx][:, outcome]
                num_positives = np.sum(y_train)
                
                model = XGBDegenerateClassifier(eval_metric='logloss')
                #print(f"  Outcome {vars.outcomes[outcome]}: Training on {len(y_train)} records, count of Positives: {num_positives}")
                model.fit_degen(X_train, y_train)
                preds = model.predict_proba_degen(X_val)
                predicted[outcome].append(preds[0, 1])

        predicted = np.array(predicted)

        for outcome in range(len(outcome_sets[t][0])):
            y_true = outcome_sets[t][:, outcome]
            if np.all(y_true == 0) or np.all(y_true == 1):
                auc_score = None  # undefined
            else:
                auc_score = np.round(roc_auc_score(y_true, predicted[outcome]), 4)
            print(f"Months Out: {t}, Outcome {vars.outcomes[outcome]}, AUC: {auc_score}")
            results.append([t, vars.outcomes[outcome], auc_score])

            # Final Prediction
            model = XGBDegenerateClassifier(eval_metric='logloss')
            model.fit_degen(training_sets[t], outcome_sets[t][:, outcome])
            
            if np.all(y_true == 0) or np.all(y_true == 1):
                importances.append([t, vars.outcomes[outcome], None])
            else:
                importances.append([t, vars.outcomes[outcome], model.feature_importances_])
            final_pred = model.predict_proba_degen(inference_point)
            predictions.append([t, vars.outcomes[outcome], final_pred[0, 1]])
            print(f"Months Out: {t}, Outcome {vars.outcomes[outcome]}, Final Prediction: {final_pred[0, 1]}")
    
    return predicted, results, predictions, importances
    
    

In [None]:
def train_and_predict_multiclass(final_df, zip):
    results, predictions, importances = [], [], []
    training_sets, outcome_sets, inference_point = grab_training_records(final_df, zip)

    for t in training_sets:
        print(f"\nTraining model for {t} months prediction with {len(training_sets[t])} records")
        
        predicted = [[] for _ in range(len(outcome_sets[t][0]))]
        for i in range(len(training_sets[t])):
            train_idx = np.ones((len(training_sets[t]),), dtype=bool)
            train_idx[i] = False
            X_train, X_val = training_sets[t][train_idx], training_sets[t][~train_idx]
            
            for o in range(len(outcome_sets[t][i])//2):
                o1 = 2*o
                o2 = 2*o + 1
                y_train = np.zeros((len(outcome_sets[t]), 3), dtype=int)
                y_train[outcome_sets[t][:, o1] == 1, 0] = 1  # high risk
                y_train[outcome_sets[t][:, o2] == 1, 2] = 1  # strong market
                y_train[(y_train[:, 0] == 0) & (y_train[:, 2] == 0), 1] = 1  # neutral
                y_val = np.zeros((1, 3), dtype=int)
                if outcome_sets[t][i, o1] == 1:
                    y_val[0, 0] = 1
                elif outcome_sets[t][i, o2] == 1:
                    y_val[0, 2] = 1
                else:
                    y_val[0, 1] = 1
                
                num_high_risk = y_train[:, 0].sum()
                num_strong_market = y_train[:, 2].sum()

                model = XGBDegenerateMultiClassifier(eval_metric='mlogloss')
                model.fit_degen(X_train, y_train)
                preds = model.predict_proba_degen(X_val)
                
            for outcome in range(len(outcome_sets[t][i])):
                y_train, y_val = outcome_sets[t][train_idx][:, outcome], outcome_sets[t][~train_idx][:, outcome]
                num_positives = np.sum(y_train)
                
                model = XGBDegenerateClassifier(eval_metric='logloss')
                #print(f"  Outcome {vars.outcomes[outcome]}: Training on {len(y_train)} records, count of Positives: {num_positives}")
                model.fit_degen(X_train, y_train)
                preds = model.predict_proba_degen(X_val)
                predicted[outcome].append(preds[0, 1])

        predicted = np.array(predicted)

        for outcome in range(len(outcome_sets[t][0])):
            y_true = outcome_sets[t][:, outcome]
            if np.all(y_true == 0) or np.all(y_true == 1):
                auc_score = None  # undefined
            else:
                auc_score = np.round(roc_auc_score(y_true, predicted[outcome]), 4)
            print(f"Months Out: {t}, Outcome {vars.outcomes[outcome]}, AUC: {auc_score}")
            results.append([t, vars.outcomes[outcome], auc_score])

            # Final Prediction
            model = XGBDegenerateClassifier(eval_metric='logloss')
            model.fit_degen(training_sets[t], outcome_sets[t][:, outcome])
            
            if np.all(y_true == 0) or np.all(y_true == 1):
                importances.append([t, vars.outcomes[outcome], None])
            else:
                importances.append([t, vars.outcomes[outcome], model.feature_importances_])
            final_pred = model.predict_proba_degen(inference_point)
            predictions.append([t, vars.outcomes[outcome], final_pred[0, 1]])
            print(f"Months Out: {t}, Outcome {vars.outcomes[outcome]}, Final Prediction: {final_pred[0, 1]}")
    
    return predicted, results, predictions, importances
    
    

In [183]:

def _is_degenerate(y):
    return np.all(y == 0) or np.all(y == 1)

def _train_with_dmatrix(X_train, y_train, X_val, y_val, *, use_gpu=True, verbose=False):
    # Choose params (XGBoost ≥ 2.0 shown; for <2.0 use tree_method='gpu_hist', predictor='gpu_predictor')
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "hist",
    }
    if use_gpu:
        params["device"] = "cuda"

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val,   label=y_val)

    evals = [(dtrain, "train"), (dval, "val")]
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=50 if verbose else False,
    )
    return booster

def train_and_predict_dmatrix(final_df, zip, use_gpu=True, verbose=False):
    results, predictions, importances = [], [], []
    training_sets, outcome_sets, inference_point = grab_training_records(final_df, zip)

    for t in training_sets:
        print(f"\nTraining model for {t} months prediction with {len(training_sets[t])} records")
        n_outcomes = outcome_sets[t].shape[1]
        predicted = [[] for _ in range(n_outcomes)]

        # Leave-one-out CV
        n = len(training_sets[t])
        for i in range(n):
            train_idx = np.ones((n,), dtype=bool)
            train_idx[i] = False

            X_train, X_val = training_sets[t][train_idx], training_sets[t][~train_idx]
            for outcome in range(n_outcomes):
                y_train = outcome_sets[t][train_idx][:, outcome]
                y_val   = outcome_sets[t][~train_idx][:, outcome]

                if _is_degenerate(y_train):
                    # Degenerate: probability is the constant class
                    p = float(y_train[0])
                    predicted[outcome].append(p)
                else:
                    booster = _train_with_dmatrix(X_train, y_train, X_val, y_val, use_gpu=use_gpu, verbose=verbose)
                    dval = xgb.DMatrix(X_val)
                    p = booster.predict(dval, iteration_range=(0, booster.best_iteration + 1))[0]
                    predicted[outcome].append(float(p))

        predicted = np.array(predicted)

        # Report AUCs and train final model for inference
        X_all = training_sets[t]
        d_infer = xgb.DMatrix(inference_point)

        for outcome in range(n_outcomes):
            y_true = outcome_sets[t][:, outcome]
            auc_score = None if _is_degenerate(y_true) else np.round(roc_auc_score(y_true, predicted[outcome]), 4)
            print(f"Months Out: {t}, Outcome {vars.outcomes[outcome]}, AUC: {auc_score}")
            results.append([t, vars.outcomes[outcome], auc_score])

            if _is_degenerate(y_true):
                # no model, constant prediction
                importances.append([t, vars.outcomes[outcome], None])
                final_pred = float(y_true[0])
            else:
                # train on all data then predict inference point
                dtrain_all = xgb.DMatrix(X_all, label=y_true)
                booster = xgb.train(
                    {"objective": "binary:logistic", "eval_metric": "logloss", "tree_method": "hist", **({"device": "cuda"} if use_gpu else {})},
                    dtrain_all,
                    num_boost_round= int(1.1 * (getattr(booster, "best_iteration", 100) or 100))  # simple heuristic
                )
                final_pred = float(booster.predict(d_infer)[0])

                # Feature importances (gain)
                # booster.get_score returns a dict {feature_name: importance}; if names are None it uses f0,f1,...
                fmap = booster.get_score(importance_type="gain")
                # turn into a vector ordered by feature index (f0,f1,...)
                if fmap:
                    # Sort by feature index order
                    keys = sorted(fmap.keys(), key=lambda k: int(k[1:]) if k.startswith("f") else 0)
                    import_vec = np.array([fmap[k] for k in keys], dtype=float)
                else:
                    import_vec = None
                importances.append([t, vars.outcomes[outcome], import_vec])

            predictions.append([t, vars.outcomes[outcome], final_pred])
            print(f"Months Out: {t}, Outcome {vars.outcomes[outcome]}, Final Prediction: {final_pred}")

    return predicted, results, predictions, importances


In [185]:
zip = 92620
predicted, results, predictions, importances = train_and_predict_dmatrix(final_df, zip)


Training model for 1 months prediction with 91 records


KeyboardInterrupt: 

In [188]:
zip = 92620
predicted, results, predictions, importances = train_and_predict(final_df, zip)


Training model for 1 months prediction with 91 records
Months Out: 1, Outcome high_risk_price_change, AUC: 0.603
Months Out: 1, Outcome high_risk_price_change, Final Prediction: 0.037774939090013504
Months Out: 1, Outcome strong_market_price_change, AUC: 0.3801
Months Out: 1, Outcome strong_market_price_change, Final Prediction: 0.038898736238479614
Months Out: 1, Outcome high_risk_days_on_market, AUC: None
Months Out: 1, Outcome high_risk_days_on_market, Final Prediction: 0.0
Months Out: 1, Outcome strong_market_days_on_market, AUC: 1.0
Months Out: 1, Outcome strong_market_days_on_market, Final Prediction: 0.01665658690035343
Months Out: 1, Outcome high_risk_view_count_change, AUC: 0.7851
Months Out: 1, Outcome high_risk_view_count_change, Final Prediction: 0.7286840677261353
Months Out: 1, Outcome strong_market_view_count_change, AUC: 0.602
Months Out: 1, Outcome strong_market_view_count_change, Final Prediction: 0.007675907574594021

Training model for 2 months prediction with 90 r