# China Real Estate Demand Prediction - Modeling

This notebook builds classic ML baselines (linear models with ridge/lasso regularization), a Gaussian-style baseline, and state-of-the-art gradient-boosted tree models (XGBoost, LightGBM, CatBoost). Models are evaluated with the competition score on a time-ordered hold-forward split, and the notebook plots RMSE/MAPE curves and writes a submission file.



In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import sys

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt

sys.path.append(str(Path('..').resolve().parent))
from src.data import DatasetPaths, load_all_training_tables, load_test, split_month_sector, prepare_train_target, explode_test_id
from src.features import build_time_lagged_features, join_static_sector_features
from src.models import competition_score, build_linear_pipeline

# Project root: the same directory that was appended to sys.path above so
# that the `src` package can be imported.
ROOT = str(Path('..').resolve().parent)
paths = DatasetPaths(root_dir=ROOT)

# Load all training tables; this section only uses 'new_house_transactions'.
train = load_all_training_tables(paths)

target_wide, sector_index = prepare_train_target(train['new_house_transactions'])

# Build the supervised dataset from lag/rolling features, ordered by time and
# then sector so that time-based masks select contiguous row ranges.
lag_feats = build_time_lagged_features(train['new_house_transactions'])
lag_feats = lag_feats.sort_values(['time', 'sector_id'])

# Reshape the wide target into long form: one row per (sector_id, time).
# NOTE(review): assumes unstack() yields index levels named 'level_0' (sector)
# and 'time' -- confirm against prepare_train_target.
y_long = target_wide.unstack().reset_index(name='y')
y_long = y_long.rename(columns={'level_0': 'sector_id'})

df = lag_feats.merge(y_long, on=['time', 'sector_id'], how='left')

# Model inputs are every lag_* / roll_* column.
feature_cols = [c for c in df.columns if c.startswith(('lag_', 'roll_'))]

# Drop rows with NaN features (not enough history for the lag window) and rows
# whose target is missing: the left merge keeps feature rows that have no
# matching target, and a NaN label would break fitting and scoring.
df_model = df.dropna(subset=feature_cols + ['y']).copy()

X = df_model[feature_cols]
y = df_model['y']

# Baseline: Ridge regression swept over a log-spaced grid of alpha values.
alphas = np.logspace(-3, 2, 10)

# Simple hold-forward split for speed: train on time <= train_end_time and
# validate on everything after it (the last 12 months, per the original note).
# The split does not depend on alpha, so it is built once outside the loop.
train_end_time = 54
mask_train = df_model['time'] <= train_end_time
X_tr, y_tr = X[mask_train], y[mask_train]
X_va, y_va = X[~mask_train], y[~mask_train]

# Floor the MAPE denominator so zero targets do not divide by zero.
mape_denom = np.maximum(y_va.values, 1e-12)

results = []
for a in alphas:
    pipe = build_linear_pipeline(alpha=a, kind='ridge')
    pipe.fit(X_tr, y_tr)
    yhat = pipe.predict(X_va)
    # competition_score returns a mapping with at least 'score' and 'good_rate'.
    sc = competition_score(y_va.values, yhat)
    results.append({
        'alpha': a,
        'score': sc['score'],
        'good_rate': sc['good_rate'],
        'rmse': np.sqrt(mean_squared_error(y_va, yhat)),
        'mape': np.mean(np.abs((y_va.values - yhat) / mape_denom)),
    })

res_df = pd.DataFrame(results)
# Per-alpha curves (same order as `alphas`) consumed by the plotting code.
rmse_curve = res_df['rmse'].tolist()
mape_curve = res_df['mape'].tolist()
print(res_df.sort_values('score', ascending=False).head())

# Side-by-side validation curves for the Ridge sweep: RMSE on the left and
# MAPE on the right, both against alpha on a logarithmic x-axis.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (ax[0], rmse_curve, 'RMSE vs alpha (Ridge)', {}),
    (ax[1], mape_curve, 'MAPE vs alpha (Ridge)', {'color': 'orange'}),
)
for axis, curve, title, line_kwargs in panels:
    axis.plot(alphas, curve, marker='o', **line_kwargs)
    axis.set_xscale('log')
    axis.set_title(title)
plt.show()



In [None]:
# Train the best Ridge on all available past months and generate a submission.
best_alpha = res_df.sort_values('score', ascending=False).iloc[0]['alpha']
print('Best alpha:', best_alpha)

pipe = build_linear_pipeline(alpha=float(best_alpha), kind='ridge')
pipe.fit(X, y)

# `paths`, `train` and the src.* helpers are already in scope from the first
# cell, so they are neither re-imported nor reloaded from disk here.
test_df = load_test(paths)

# Build the test design matrix from lag/rolling features computed on the
# training data only (features are needed for times 67..78).
lag_feats_full = build_time_lagged_features(train['new_house_transactions'])
lag_feats_full = lag_feats_full.sort_values(['time', 'sector_id'])

# Keep only feature rows whose time falls inside the test horizon.
test_exploded = explode_test_id(test_df)
lag_test = lag_feats_full[lag_feats_full['time'].isin(test_exploded['time'].unique())]

# Right join so every test id is kept; a test row with no matching feature row
# gets NaN in all feature columns.
lag_test = lag_test.merge(
    test_exploded[['time', 'sector', 'sector_id', 'id']],
    on=['time', 'sector_id'],
    how='right',
)

# Rows with NA (insufficient lag history) -> fill 0 as a conservative default.
X_test = lag_test[feature_cols].fillna(0)

y_pred_test = pipe.predict(X_test)

submission = lag_test[['id']].copy()
submission['new_house_transaction_amount'] = y_pred_test

# Restore the exact row order of test.csv. Sorting by 'id' would order the ids
# by value rather than by their position in the file, so merge onto the
# original id column instead.
submission = test_df[['id']].merge(submission, on='id', how='left')
submission.to_csv('submission.csv', index=False)
print('Saved submission.csv with', len(submission), 'rows')



In [None]:
# LightGBM/XGBoost/CatBoost with simple hyper sweeps and hold-forward validation
# (each library is optional and skipped when it cannot be imported).
import warnings
warnings.filterwarnings('ignore')

# Best-effort imports: any failure simply disables the corresponding sweep.
try:
    import lightgbm as lgb
    has_lgb = True
except Exception:
    has_lgb = False

try:
    import xgboost as xgb
    has_xgb = True
except Exception:
    has_xgb = False

try:
    from catboost import CatBoostRegressor
    has_cat = True
except Exception:
    has_cat = False

# One dict per fitted configuration: model name, params and validation metrics.
advanced_results = []

# Same train/validation split as the Ridge baseline (train on time <= 54).
mask_train = df_model['time'] <= 54
X_tr, y_tr = X[mask_train], y[mask_train]
X_va, y_va = X[~mask_train], y[~mask_train]


def _validation_metrics(y_pred):
    """Return competition score, RMSE and MAPE of ``y_pred`` on the validation split."""
    y_true = y_va.values
    return {
        'score': competition_score(y_true, y_pred)['score'],
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        # Denominator is floored so zero targets do not divide by zero.
        'mape': np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1e-12))),
    }


def _print_leaderboard(model_name):
    """Print the top configurations of ``model_name``, best score first."""
    board = pd.DataFrame([r for r in advanced_results if r['model'] == model_name])
    print(board.sort_values('score', ascending=False).head())


if has_lgb:
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_va, label=y_va, reference=dtrain)
    lgb_params_grid = [
        {'num_leaves': nl, 'learning_rate': lr, 'min_data_in_leaf': 50, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'objective': 'regression', 'metric': 'rmse', 'seed': 42}
        for nl in [15, 31, 63]
        for lr in [0.05, 0.1]
    ]
    lgb_rmse_curve, lgb_mape_curve = [], []
    for p in lgb_params_grid:
        # Early stopping and log silencing go through callbacks: the
        # `early_stopping_rounds` / `verbose_eval` keyword arguments of
        # lgb.train were removed in LightGBM 4.0.
        booster = lgb.train(
            p,
            dtrain,
            num_boost_round=1000,
            valid_sets=[dvalid],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=0),
            ],
        )
        yhat = booster.predict(X_va, num_iteration=booster.best_iteration)
        metrics = _validation_metrics(yhat)
        # best_iteration is recorded so a final refit can size its number of
        # boosting rounds from it.
        advanced_results.append({'model': 'lightgbm', 'params': p, **metrics, 'best_iteration': booster.best_iteration})
        lgb_rmse_curve.append(metrics['rmse'])
        lgb_mape_curve.append(metrics['mape'])
    _print_leaderboard('lightgbm')

if has_xgb:
    xgb_rmse_curve, xgb_mape_curve = [], []
    for max_depth in [4, 6, 8]:
        for lr in [0.05, 0.1]:
            model = xgb.XGBRegressor(max_depth=max_depth, learning_rate=lr, n_estimators=1000, subsample=0.8, colsample_bytree=0.8, objective='reg:squarederror', random_state=42, tree_method='hist')
            # eval_set is passed for monitoring only; no early stopping is
            # configured here.
            model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
            yhat = model.predict(X_va)
            metrics = _validation_metrics(yhat)
            advanced_results.append({'model': 'xgboost', 'params': {'max_depth': max_depth, 'learning_rate': lr}, **metrics})
            xgb_rmse_curve.append(metrics['rmse'])
            xgb_mape_curve.append(metrics['mape'])
    _print_leaderboard('xgboost')

if has_cat:
    cat_rmse_curve, cat_mape_curve = [], []
    for depth in [4, 6, 8]:
        model = CatBoostRegressor(depth=depth, learning_rate=0.1, loss_function='RMSE', random_seed=42, iterations=2000, verbose=False)
        model.fit(X_tr, y_tr, eval_set=(X_va, y_va))
        yhat = model.predict(X_va)
        metrics = _validation_metrics(yhat)
        advanced_results.append({'model': 'catboost', 'params': {'depth': depth}, **metrics})
        cat_rmse_curve.append(metrics['rmse'])
        cat_mape_curve.append(metrics['mape'])
    _print_leaderboard('catboost')

# Overall leaderboard, shown as the cell output. When no boosting library is
# available the frame is empty and has no 'score' column, so skip the sort.
advanced_df = pd.DataFrame(advanced_results)
advanced_df.sort_values('score', ascending=False).head() if len(advanced_df) else advanced_df



In [None]:
# Recursive forecasting over the test horizon using the selected model.
# Choose the best model by score from advanced_results if any; otherwise use
# the Ridge pipeline `pipe` fitted earlier.
best_overall = 'ridge'
if 'advanced_results' in globals() and len(advanced_results) > 0:
    best_overall = max(advanced_results, key=lambda d: d['score'])['model']
print('Selected model for submission:', best_overall)


def _best_entry(model_name):
    """Return the highest-scoring advanced_results entry for ``model_name``."""
    return max(
        (r for r in advanced_results if r['model'] == model_name),
        key=lambda d: d['score'],
    )


def _fit_submission_predictor(model_name):
    """Fit the selected model once on all of (X, y) and return its predict callable.

    The training data does not change between forecast steps, so the model is
    fitted a single time here instead of being refit at every step.
    Unknown names fall back to the already-fitted Ridge pipeline.
    """
    if model_name == 'lightgbm':
        best_lgb = _best_entry('lightgbm')
        # Use 1.2x the recorded best iteration; fall back to 200 when the
        # sweep did not record one.
        best_iter = best_lgb.get('best_iteration') or 200
        booster = lgb.train(
            best_lgb['params'],
            lgb.Dataset(X, label=y),
            num_boost_round=max(1, int(1.2 * best_iter)),
        )
        return booster.predict
    if model_name == 'xgboost':
        best_xgb = _best_entry('xgboost')
        model = xgb.XGBRegressor(**best_xgb['params'], n_estimators=1000, subsample=0.8, colsample_bytree=0.8, objective='reg:squarederror', random_state=42, tree_method='hist')
        model.fit(X, y, verbose=False)
        return model.predict
    if model_name == 'catboost':
        best_cat = _best_entry('catboost')
        model = CatBoostRegressor(depth=best_cat['params']['depth'], learning_rate=0.1, loss_function='RMSE', random_seed=42, iterations=2000, verbose=False)
        model.fit(X, y)
        return model.predict
    return pipe.predict


predict_fn = _fit_submission_predictor(best_overall)

# Prepare the base series whose lags are updated as we roll forward.
nht = train['new_house_transactions'].copy()
nht_aug = split_month_sector(nht)

# Lag features over the full training history. Not read by the loop below;
# kept so the name stays available for inspection.
lag_full = build_time_lagged_features(nht)
lag_full = lag_full.sort_values(['time', 'sector_id'])

# Prepare exploded test ids (one row per id with its time and sector_id).
paths = DatasetPaths(root_dir=ROOT)
test_df = load_test(paths)
test_exploded = explode_test_id(test_df)

predictions = []
current_series = nht_aug[['time', 'sector_id', 'amount_new_house_transactions']].copy()

# Walk the test horizon in ascending time so each step can use the previous
# steps' predictions as lag inputs.
for t in sorted(test_exploded['time'].unique()):
    # Recompute lag features from the series known so far. A copy is passed so
    # the helper cannot modify current_series in place.
    # NOTE(review): this relies on build_time_lagged_features emitting rows for
    # time t although current_series holds no observation at t yet; if it does
    # not, lag_t is empty and every feature below is filled with 0 -- confirm
    # against src.features.
    lag_tmp = build_time_lagged_features(current_series.copy())
    lag_t = lag_tmp[lag_tmp['time'] == t]

    step_df = test_exploded[test_exploded['time'] == t][['id', 'sector_id', 'time']].merge(
        lag_t, on=['time', 'sector_id'], how='left'
    )
    X_t = step_df[feature_cols].fillna(0)
    yhat_t = predict_fn(X_t)

    step_df = step_df[['id', 'sector_id', 'time']].copy()
    step_df['new_house_transaction_amount'] = yhat_t
    predictions.append(step_df[['id', 'new_house_transaction_amount']])

    # Append the predictions to the series so the next step can use them in lags.
    update = step_df.rename(columns={'new_house_transaction_amount': 'amount_new_house_transactions'})
    current_series = pd.concat(
        [current_series, update[['time', 'sector_id', 'amount_new_house_transactions']]],
        ignore_index=True,
    )

submission_df = pd.concat(predictions, ignore_index=True)
# Merge back onto the test ids to preserve the exact original row order.
final_submission = test_df[['id']].merge(submission_df, on='id', how='left')
final_submission.to_csv('submission.csv', index=False)
print('Saved recursively-rolled submission.csv with', len(final_submission), 'rows')

