# 02 – Training (K-fold & Time-split)

In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from src.data_prep import build_preprocessor
from src.features import add_engineered_features
from src.modeling import get_models
from src.evaluation import metrics_df
import os

# --- Reproducibility --------------------------------------------------
# A single seed is used for NumPy's global RNG, the model factory and the
# K-fold shuffling further down.
seed = 42
np.random.seed(seed)

# --- Data ---------------------------------------------------------------
# Load the cleaned table and run it through the project's feature step.
Df = add_engineered_features(pd.read_csv('../data/processed/clean.csv'))

# The regression target is log(price). Prices below 1 are clipped to 1
# first, so the log of any non-missing price is defined and >= 0.
Df['log_price'] = np.log(Df['price'].clip(lower=1))

# Both the raw price and its log are removed from the feature matrix so
# the target cannot leak into the inputs.
target_cols = ['price', 'log_price']
y = Df['log_price']
X = Df.drop(columns=target_cols)

# --- Estimators ---------------------------------------------------------
# `models` is iterated with `.items()` later, i.e. it is a name -> estimator
# mapping; `pre` is used as the first step of every pipeline.
pre = build_preprocessor()
models = get_models(random_state=seed)

# Accumulates one metrics frame per (protocol, model).
results = []

# K-fold protocol: 5 shuffled folds. Because `random_state` is a fixed
# integer, every call to `kf.split(X)` yields the same partitions, so all
# models are scored on identical train/test folds.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
for name, model in models.items():
    fold_rows = []
    for tr, te in kf.split(X):
        # NOTE(review): `pre` and `model` are the same instances in every
        # fold and are refit in place by `pipe.fit`. That is fine for
        # estimators whose `fit` starts from scratch; if any model uses
        # warm_start / incremental fitting, wrap them in
        # `sklearn.base.clone` here -- confirm against `get_models`.
        pipe = Pipeline([('pre', pre), ('model', model)])
        pipe.fit(X.iloc[tr], y.iloc[tr])
        pred = pipe.predict(X.iloc[te])
        # The target is log(price); exponentiate both sides so the metrics
        # are reported on the original price scale.
        y_true = np.exp(y.iloc[te])
        y_pred = np.exp(pred)
        # `metrics_df` is expected to return a DataFrame (its results are
        # concatenated and `.assign`-ed below).
        fold_rows.append(metrics_df(y_true, y_pred, name))
    # One row block per fold, tagged with the protocol name. `pd` comes
    # from the file-level import; no re-import is needed inside the loop.
    results.append(pd.concat(fold_rows, ignore_index=True).assign(protocol='kfold'))

# Time-split protocol: train on rows whose year is at or below the
# 75th-percentile year, test on strictly later years. Runs only when a
# usable `year` column exists; an all-missing column would make the
# quantile NaN and `int(NaN)` raise, so that case is skipped as well.
if 'year' in Df.columns and Df['year'].notna().any():
    cutoff = int(Df['year'].quantile(0.75))
    # Rows with a missing year compare False on both sides and therefore
    # fall into neither partition.
    tr_idx = Df['year'] <= cutoff
    te_idx = Df['year'] > cutoff
    if tr_idx.any() and te_idx.any():
        for name, model in models.items():
            pipe = Pipeline([('pre', pre), ('model', model)])
            pipe.fit(X[tr_idx], y[tr_idx])
            pred = pipe.predict(X[te_idx])
            # Back-transform from log(price) to the price scale for scoring.
            y_true = np.exp(y[te_idx])
            y_pred = np.exp(pred)
            results.append(metrics_df(y_true, y_pred, name).assign(protocol='time_split', cutoff=cutoff))
    else:
        # Happens e.g. when at least a quarter of the rows share the latest
        # year: the cutoff then equals the maximum year and nothing is
        # strictly later. Fitting/predicting on an empty partition would
        # raise and abort the run before the K-fold results are saved.
        print(f"time_split skipped: cutoff year {cutoff} leaves an empty train or test partition")

# Stack the per-protocol metric frames into one table and persist it.
out = pd.concat(results, ignore_index=True)

# Create the output directory on first run; no error if it already exists.
os.makedirs('../outputs', exist_ok=True)
out.to_csv('../outputs/metrics_summary.csv', index=False)

# Console summary: mean of every numeric column per (protocol, model).
# Assumes `metrics_df` emits a 'model' column -- verify against
# src.evaluation.
summary = out.groupby(['protocol', 'model']).mean(numeric_only=True)
print(summary)