In [81]:
import json
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
# Resolve the project root, i.e. the directory that contains ``data/``.
# The kernel may be started either in the project root itself or one level
# below it (for example in ``notebooks/``); in the latter case the parent
# directory is used. If neither location has ``data/``, the CWD is kept.
print(f"CWD: {os.getcwd()}")

_launch_dir = Path.cwd()
_data_here = _launch_dir.joinpath("data").exists()
_data_above = _launch_dir.parent.joinpath("data").exists()
PROJECT_ROOT = _launch_dir.parent if (not _data_here and _data_above) else _launch_dir

# Report what was resolved so a wrong working directory is obvious right away.
_data_dir = PROJECT_ROOT / "data"
_raw_csv = _data_dir / "raw" / "house_prices_train.csv"
print(f"PROJECT_ROOT: {PROJECT_ROOT}")
print(f"Has data/: {_data_dir.exists()}")
print(f"Has raw file: {_raw_csv.exists()}")


CWD: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/notebooks
PROJECT_ROOT: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price
Has data/: True
Has raw file: True


In [82]:
from pathlib import Path

# --- Inputs ------------------------------------------------------------------
TARGET_COL = "SalePrice"
DATA_PATH = PROJECT_ROOT.joinpath("data", "raw", "house_prices_train.csv")
SPLITS_DIR = PROJECT_ROOT.joinpath("data", "splits")

# --- Trained model artifacts read by this notebook ---------------------------
MODEL_PATH = PROJECT_ROOT.joinpath("models", "model.joblib")
META_PATH = PROJECT_ROOT.joinpath("models", "model_meta.json")

# --- Outputs (reports, tables, figures) --------------------------------------
REPORTS_DIR = PROJECT_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")
TEST_REPORT_PATH = REPORTS_DIR.joinpath("test_metrics.md")
TOP_ERRORS_PATH = REPORTS_DIR.joinpath("test_top_errors.csv")
FIG_TRUE_VS_PRED = FIGURES_DIR.joinpath("test_y_true_vs_y_pred.png")

# Confirm the raw data file is where the configuration expects it.
print(f"DATA_PATH: {DATA_PATH}")
print(f"Exists: {DATA_PATH.exists()}")


DATA_PATH: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/data/raw/house_prices_train.csv
Exists: True


In [83]:
# Quick look at what is actually present in the raw-data folder.
raw_dir = PROJECT_ROOT / "data" / "raw"
print(f"Raw dir: {raw_dir}")
print([entry for entry in raw_dir.glob("*")])


Raw dir: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/data/raw
[PosixPath('/Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/data/raw/house_prices_train.csv')]


In [84]:
import pandas as pd

# Load the raw training table; later cells select rows and columns from `df`.
df = pd.read_csv(DATA_PATH)

# Fail fast with a clear message if the file does not contain the target,
# instead of hitting an opaque KeyError when X/y are built further down.
if TARGET_COL not in df.columns:
    raise KeyError(f"Target column {TARGET_COL!r} not found in {DATA_PATH}")

print(df.shape)
df.head()


(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [85]:
def pick_split_file(splits_dir: Path, candidates: list[str]) -> Path:
    """Return the first existing split file among ``candidates``.

    Args:
        splits_dir: Directory expected to contain the split file.
        candidates: File names to try, in priority order.

    Returns:
        ``splits_dir / name`` for the first ``name`` that is an existing
        regular file.

    Raises:
        FileNotFoundError: If none of the candidate names is a file inside
            ``splits_dir``.
    """
    splits_dir = Path(splits_dir)
    # Materialise once so any iterable can be both scanned and reported.
    names = list(candidates)
    for name in names:
        path = splits_dir / name
        # is_file() rather than exists(): a directory that happens to carry a
        # candidate name is not a loadable split file.
        if path.is_file():
            return path
    raise FileNotFoundError(f"Split file not found in {splits_dir}. Tried: {names}")

# Locate the saved index arrays; several file-name spellings are accepted.
train_idx_path = pick_split_file(
    SPLITS_DIR, ["train_indices.npy", "train_idx.npy", "train_index.npy"]
)
test_idx_path = pick_split_file(
    SPLITS_DIR, ["test_indices.npy", "test_idx.npy", "test_index.npy"]
)

# Load both arrays (train first, then test) and coerce them to integers.
train_idx, test_idx = (
    np.asarray(np.load(idx_file), dtype=int)
    for idx_file in (train_idx_path, test_idx_path)
)

print(f"Train idx file: {train_idx_path}")
print(f"Test idx file: {test_idx_path}")
print(f"Train size: {len(train_idx)} Test size: {len(test_idx)}")

# Sanity check: no index may appear in both splits.
overlap = np.intersect1d(train_idx, test_idx)
if overlap.size:
    raise ValueError("Train/test indices overlap. This should not happen.")

print("OK: splits loaded, no overlap.")


Train idx file: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/data/splits/train_indices.npy
Test idx file: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/data/splits/test_indices.npy
Train size: 1168 Test size: 292
OK: splits loaded, no overlap.


In [86]:
# Split the frame into features and target (target as a float array).
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].to_numpy(dtype=float)

# An empty index array would make min()/max() below fail with an opaque error.
if test_idx.size == 0:
    raise ValueError("test_idx is empty: there is nothing to evaluate.")

# Decide whether indices are positional or label-based.
# Positional selection is only unambiguous for the default 0..n-1 RangeIndex;
# a RangeIndex with another start/step has labels that differ from positions,
# so it is handled through label-based selection instead.
is_range_index = (
    isinstance(df.index, pd.RangeIndex)
    and df.index.start == 0
    and df.index.step == 1
)
positional_ok = (test_idx.min() >= 0) and (test_idx.max() < len(df))

use_iloc = bool(is_range_index and positional_ok)

print("df.index type:", type(df.index))
print("use_iloc:", use_iloc)

if use_iloc:
    X_test = X.iloc[test_idx].copy()
    y_test = y[test_idx].copy()
else:
    # Label-based index selection
    X_test = X.loc[test_idx].copy()
    y_test = df.loc[test_idx, TARGET_COL].to_numpy(dtype=float)

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
X_test.head()


df.index type: <class 'pandas.core.indexes.range.RangeIndex'>
use_iloc: True
X_test shape: (292, 80)
y_test shape: (292,)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
892,893,20,RL,70.0,8414,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2006,WD,Normal
1105,1106,60,RL,98.0,12256,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
413,414,30,RM,56.0,8960,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,,,,0,3,2010,WD,Normal
522,523,50,RM,50.0,5000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,10,2006,WD,Normal
1036,1037,20,RL,89.0,12898,Pave,,IR1,HLS,AllPub,...,0,0,,,,0,9,2009,WD,Normal


In [87]:
# Load the trained model artifact and predict on the held-out test rows.
if not MODEL_PATH.exists():
    raise FileNotFoundError(f"Model not found: {MODEL_PATH}. Run training first.")

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts produced by a trusted training run.
model = joblib.load(MODEL_PATH)
y_pred_test = np.asarray(model.predict(X_test), dtype=float)

# The metrics and plots below pair predictions with y_test element-wise.
if len(y_pred_test) != len(y_test):
    raise ValueError(
        f"Model returned {len(y_pred_test)} predictions for {len(y_test)} test rows."
    )

print("Predictions shape:", y_pred_test.shape)
print("Predictions example:", y_pred_test[:5])

Predictions shape: (292,)
Predictions example: [141958.42729106 307007.34947028 114308.80337314 169083.22214924
 323775.12064351]


In [88]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# `y_pred_test` was already computed (and cast to a float array) right after
# the model was loaded; it is reused here instead of calling model.predict()
# a second time and overwriting that array.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred_test)))
mae = float(mean_absolute_error(y_test, y_pred_test))
r2 = float(r2_score(y_test, y_pred_test))

# Plain-float dict so the metrics can be printed and formatted in the report.
test_metrics = {"rmse": rmse, "mae": mae, "r2": r2}
test_metrics


{'rmse': 26344.8762981074, 'mae': 16246.562687457115, 'r2': 0.9095146036276461}

In [89]:
# Scatter of true vs predicted target values on the test split, saved to disk.

# savefig does not create missing directories, so make sure the target exists.
FIG_TRUE_VS_PRED.parent.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_test, alpha=0.5, s=12)
ax.set_xlabel("y_true")
ax.set_ylabel("y_pred")
ax.set_title("Test: y_true vs y_pred")
fig.tight_layout()
fig.savefig(FIG_TRUE_VS_PRED, dpi=150)
plt.close(fig)  # the figure is only written to disk, not shown inline

print("Saved:", FIG_TRUE_VS_PRED)


Saved: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/reports/figures/test_y_true_vs_y_pred.png


In [90]:
# Produce residual diagnostics for the test predictions under FIGURES_DIR.
# NOTE(review): `save_residual_plots` is neither defined nor imported in the
# visible cells — presumably a project helper; confirm its import so this cell
# works after Restart & Run All.
# The two return values are used further down as path objects
# (`.as_posix()` is called on both when the report is written).
hist_path, resid_scatter_path = save_residual_plots(
    y_true=y_test,
    y_pred=y_pred_test,
    out_dir=FIGURES_DIR,
    prefix="test_residuals",
    bins=40,
)

print("Saved:", hist_path)
print("Saved:", resid_scatter_path)


Saved: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/reports/figures/test_residuals_hist.png
Saved: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/reports/figures/test_residuals_pred_vs_resid.png


In [91]:
# Build the top-errors table for the test split and show its first rows.
# NOTE(review): `save_top_errors` is neither defined nor imported in the
# visible cells — presumably a project helper that writes the table to
# `out_path` and keeps at most `top_n` rows; confirm both against its source.
top_errors_df = save_top_errors(
    X=X_test,
    y_true=y_test,
    y_pred=y_pred_test,
    out_path=TOP_ERRORS_PATH,
    top_n=50,
)

print("Saved:", TOP_ERRORS_PATH)
top_errors_df.head(10)


Saved: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/reports/test_top_errors.csv


Unnamed: 0,y_true,y_pred,residual,abs_error
0,755000.0,556440.992556,198559.007444,198559.007444
1,611657.0,465238.747504,146418.252496,146418.252496
2,253293.0,377643.692562,-124350.692562,124350.692562
3,311500.0,219478.144663,92021.855337,92021.855337
4,285000.0,369289.795967,-84289.795967,84289.795967
5,143000.0,223777.549129,-80777.549129,80777.549129
6,556581.0,486938.485468,69642.514532,69642.514532
7,395000.0,327438.59526,67561.40474,67561.40474
8,301000.0,240395.335192,60604.664808,60604.664808
9,135000.0,195193.230491,-60193.230491,60193.230491


In [92]:
# Compare cross-validation metrics (when a CV summary table exists) with the
# test metrics computed above.
metrics_table_path = REPORTS_DIR / "metrics_models.csv"
if metrics_table_path.exists():
    metrics_df = pd.read_csv(metrics_table_path)
    display(metrics_df)

    # Try to recover the model name from the meta file so that the matching
    # CV row can be located automatically. A broken meta file must not abort
    # the comparison, so read/parse failures are reported and skipped.
    model_name_guess = None
    if META_PATH.exists():
        try:
            meta = json.loads(META_PATH.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            print("Could not read model meta:", exc)
        else:
            # Only a JSON object can carry a name; other top-level types are ignored.
            if isinstance(meta, dict):
                model_name_guess = meta.get("model_name") or meta.get("name") or None

    print("Model name guess from meta:", model_name_guess)

    # If your metrics table contains the row "leader", this will find it
    candidates = ["leader"]
    if model_name_guess:
        candidates.insert(0, model_name_guess)

    # Take the first candidate that matches exactly one row of the table.
    chosen_row = None
    if "model" in metrics_df.columns:
        for name in candidates:
            sub = metrics_df[metrics_df["model"] == name]
            if len(sub) == 1:
                chosen_row = sub.iloc[0]
                print("Found CV row for model:", name)
                break

    if chosen_row is not None:
        print("\nCV summary:")
        print("main_metric:", chosen_row["main_metric"])
        print("main_mean:", chosen_row["main_mean"], "main_std:", chosen_row["main_std"])
        print("secondary_metric:", chosen_row["secondary_metric"])
        print("secondary_mean:", chosen_row["secondary_mean"], "secondary_std:", chosen_row["secondary_std"])

        print("\nTEST metrics:")
        print(test_metrics)
    else:
        print("Could not automatically match a CV row in metrics_models.csv. "
              "Check the 'model' column and pick the right row manually.")
else:
    print("metrics_models.csv not found, skipping CV vs test comparison.")


Unnamed: 0,model,main_metric,main_mean,main_std,secondary_metric,secondary_mean,secondary_std,n_splits
0,HGB_tuned,rmse,28566.109638,4796.254568,mae,17169.066736,1310.557653,5


Model name guess from meta: HGB_tuned
Found CV row for model: HGB_tuned

CV summary:
main_metric: rmse
main_mean: 28566.109638302063 main_std: 4796.254567862704
secondary_metric: mae
secondary_mean: 17169.06673586387 secondary_std: 1310.5576533390956

TEST metrics:
{'rmse': 26344.8762981074, 'mae': 16246.562687457115, 'r2': 0.9095146036276461}


In [94]:
# Optional: include meta JSON in report for reproducibility
meta_block = ""
if META_PATH.exists():
    try:
        meta = json.loads(META_PATH.read_text(encoding="utf-8"))
        meta_block = "\n### Model meta\n```json\n" + json.dumps(meta, indent=2) + "\n```\n"
    except (OSError, ValueError):
        # Only read/decode/parse failures are expected here
        # (json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses);
        # anything else is a real bug and should surface.
        meta_block = "\n### Model meta\n(Found meta file, but failed to parse JSON)\n"

# The report is assembled line by line so that the two trailing spaces after
# the target (a Markdown hard line break) are explicit and cannot be lost to
# an editor that strips trailing whitespace.
report_lines = [
    "# Test metrics",
    "",
    f"**Target:** `{TARGET_COL}`  ",
    f"**Model artifact:** `{MODEL_PATH.as_posix()}`",
    "",
    "## Metrics (test)",
    f"- RMSE: **{test_metrics['rmse']:.4f}**",
    f"- MAE: **{test_metrics['mae']:.4f}**",
    f"- R2: **{test_metrics['r2']:.4f}**",
    "",
    "## Figures",
    f"- y_true vs y_pred: `{FIG_TRUE_VS_PRED.as_posix()}`",
    f"- residual histogram: `{hist_path.as_posix()}`",
    f"- y_pred vs residual: `{resid_scatter_path.as_posix()}`",
    "",
    "## Tables",
    f"- top errors: `{TOP_ERRORS_PATH.as_posix()}`",
    meta_block,
    "",
]
report_text = "\n".join(report_lines)

# write_text does not create missing directories.
TEST_REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
TEST_REPORT_PATH.write_text(report_text, encoding="utf-8")
print("Saved:", TEST_REPORT_PATH)
print("TEST:", test_metrics)


Saved: /Users/uvlazhnitel/Documents/coding/DataScience/p2-house-price/reports/test_metrics.md
TEST: {'rmse': 26344.8762981074, 'mae': 16246.562687457115, 'r2': 0.9095146036276461}
