# China Real Estate Demand Prediction - Modeling

This notebook builds classic ML baselines (linear, Ridge/Lasso), a Gaussian-ish baseline, and SOTA tree models (XGBoost/LightGBM/CatBoost). It performs time-series CV scored with the competition metric, plots RMSE/MAPE curves, and produces a submission file.



In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import sys

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt

sys.path.append(str(Path('..').resolve().parent))
from src.data import DatasetPaths, load_all_training_tables, load_test, split_month_sector, prepare_train_target, explode_test_id
from src.features import build_time_lagged_features, join_static_sector_features
from src.models import competition_score, build_linear_pipeline

# Project root: two directory levels above the current working directory.
ROOT = str(Path('..').resolve().parent)
paths = DatasetPaths(root_dir=ROOT)

# Load
train = load_all_training_tables(paths)

target_wide, sector_index = prepare_train_target(train['new_house_transactions'])

# Build supervised dataset from lags
lag_feats = build_time_lagged_features(train['new_house_transactions'])
lag_feats = lag_feats.sort_values(['time', 'sector_id'])

# Align target: reshape the wide target to long form so it can be joined on
# (time, sector_id).
# NOTE(review): assumes target_wide is indexed by 'time' with an unnamed
# per-sector column level (hence 'level_0' after reset_index) - confirm.
y_long = target_wide.unstack().reset_index(name='y')
y_long = y_long.rename(columns={'level_0': 'sector_id'})

df = lag_feats.merge(y_long, on=['time', 'sector_id'], how='left')

# Model inputs are every lag_* / roll_* column.
feature_cols = [c for c in df.columns if c.startswith(('lag_', 'roll_'))]

# Drop rows with NaN features (due to lags). The left merge above can also
# leave rows without a target; those are dropped as well because the
# estimator and the metrics cannot handle a NaN target.
df_model = df.dropna(subset=feature_cols + ['y']).copy()

X = df_model[feature_cols]
y = df_model['y']

# Simple hold-forward split for speed: train until t<=54, validate on last 12
# months. The split does not depend on alpha, so it is built once up front.
mask_train = df_model['time'] <= 54
X_tr, y_tr = X[mask_train], y[mask_train]
X_va, y_va = X[~mask_train], y[~mask_train]

# MAPE is only meaningful where the target is strictly positive: dividing by a
# tiny floor for zero targets would inflate the mean by ~1e12 and swamp the
# curve, so those rows are left out of the MAPE average.
y_va_arr = y_va.values
mape_rows = y_va_arr > 0

# Baseline: Ridge with grid over alpha and plot curves
alphas = np.logspace(-3, 2, 10)
results = []
rmse_curve, mape_curve = [], []
for a in alphas:
    pipe = build_linear_pipeline(alpha=a, kind='ridge')
    pipe.fit(X_tr, y_tr)
    yhat = pipe.predict(X_va)
    # competition_score returns a mapping with (at least) 'score' and 'good_rate'.
    sc = competition_score(y_va_arr, yhat)
    if mape_rows.any():
        mape = np.mean(np.abs(y_va_arr[mape_rows] - yhat[mape_rows]) / y_va_arr[mape_rows])
    else:
        mape = float('nan')
    r = {
        'alpha': a,
        'score': sc['score'],
        'good_rate': sc['good_rate'],
        'rmse': np.sqrt(mean_squared_error(y_va, yhat)),
        'mape': mape,
    }
    results.append(r)
    rmse_curve.append(r['rmse'])
    mape_curve.append(r['mape'])

res_df = pd.DataFrame(results)
print(res_df.sort_values('score', ascending=False).head())

# Plot RMSE/MAPE vs alpha
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].plot(alphas, rmse_curve, marker='o')
ax[0].set_xscale('log')
ax[0].set_title('RMSE vs alpha (Ridge)')
ax[1].plot(alphas, mape_curve, marker='o', color='orange')
ax[1].set_xscale('log')
ax[1].set_title('MAPE vs alpha (Ridge)')
plt.show()



In [None]:
# Train best Ridge on all available past months and generate submission
best_alpha = res_df.sort_values('score', ascending=False).iloc[0]['alpha']
print('Best alpha:', best_alpha)

pipe = build_linear_pipeline(alpha=float(best_alpha), kind='ridge')
pipe.fit(X, y)

# Build test design matrix using last lags and rollings.
# `paths` and `train` from the data-loading cell are reused here rather than
# re-reading the same tables from disk.
test_df = load_test(paths)

# Need features for times 67..78; using lag features built from training data only
lag_feats_full = build_time_lagged_features(train['new_house_transactions'])
lag_feats_full = lag_feats_full.sort_values(['time', 'sector_id'])

# Take only rows with time in test horizon
test_exploded = explode_test_id(test_df)
lag_test = lag_feats_full[lag_feats_full['time'].isin(test_exploded['time'].unique())]

# Merge to align sector_id and time. The right join keeps every test row even
# when no lag features exist for it (those feature cells become NaN).
lag_test = lag_test.merge(test_exploded[['time','sector','sector_id','id']], on=['time','sector_id'], how='right')

X_test = lag_test[feature_cols]
# Rows with NA (insufficient lag history) -> fill 0 as conservative.
# Report how many rows are affected so that an (almost) all-zero design matrix
# does not go unnoticed.
n_missing = int(X_test.isna().any(axis=1).sum())
if n_missing:
    print(f'Warning: {n_missing} of {len(X_test)} test rows have missing lag/rolling features; filling with 0')
X_test = X_test.fillna(0)

# A linear model can extrapolate below zero; clip at zero.
# NOTE(review): assumes transaction amounts are never negative - confirm.
y_pred_test = np.maximum(pipe.predict(X_test), 0)

submission = lag_test[['id']].copy()
submission['new_house_transaction_amount'] = y_pred_test

# Ensure row order follows test.csv: sorting by the id value would order the
# ids by value (lexicographically for string ids), not by file position, so
# the ids are re-aligned to the order in which the test frame lists them.
# NOTE(review): assumes explode_test_id keeps the row order of test.csv - confirm.
submission = test_exploded[['id']].merge(submission, on='id', how='left')
submission.to_csv('submission.csv', index=False)
print('Saved submission.csv with', len(submission), 'rows')

