In [1]:
import os
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt

RANDOM_STATE = 42

# –ù–∞–¥—ë–∂–Ω–æ –æ–ø—Ä–µ–¥–µ–ª—è–µ–º –∫–æ—Ä–µ–Ω—å –ø—Ä–æ–µ–∫—Ç–∞
cwd = Path.cwd()
PROJECT_ROOT = cwd.parent if cwd.name == "notebooks" else cwd

DATA_PATH = PROJECT_ROOT / "data" / "Real_Estate_Tel_Aviv_20_years.csv"
MODELS_DIR = PROJECT_ROOT / "models"
PICS_DIR = PROJECT_ROOT / "pics"

MODELS_DIR.mkdir(exist_ok=True)
PICS_DIR.mkdir(exist_ok=True)

DATA_PATH, MODELS_DIR, PICS_DIR


(PosixPath('/Users/nikitamarshchonok/Desktop/end-to-end ML project/data/Real_Estate_Tel_Aviv_20_years.csv'),
 PosixPath('/Users/nikitamarshchonok/Desktop/end-to-end ML project/models'),
 PosixPath('/Users/nikitamarshchonok/Desktop/end-to-end ML project/pics'))

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv(DATA_PATH)

print("Shape:", df.shape)
display(df.head(3))

# --- –∞–≤—Ç–æ-–ø–æ–∏—Å–∫ target –∫–æ–ª–æ–Ω–∫–∏ (—Ü–µ–Ω—ã) ---
cols_lower = {c: str(c).lower() for c in df.columns}

# –∫–∞–Ω–¥–∏–¥–∞—Ç—ã –ø–æ –∫–ª—é—á–µ–≤—ã–º —Å–ª–æ–≤–∞–º
keywords = ["price", "deal", "amount", "value", "nis", "ils", "‚Ç™", "sale", "sum", "total"]
candidates = [c for c, cl in cols_lower.items() if any(k in cl for k in keywords)]

print("\nCandidates by keywords:")
print(candidates)

target_col = None
# –ø—Ä–∏–æ—Ä–∏—Ç–µ—Ç: —Ä–æ–≤–Ω–æ "price", –ø–æ—Ç–æ–º —Å–æ–¥–µ—Ä–∂–∞—â–∏–µ "price"
for c in df.columns:
    if cols_lower[c] == "price":
        target_col = c
        break
if target_col is None:
    for c in df.columns:
        if "price" in cols_lower[c]:
            target_col = c
            break

# –µ—Å–ª–∏ –Ω–µ –Ω–∞—à–ª–∏ ‚Äî –ø–æ–∫–∞–∂–µ–º —Ç–æ–ø —á–∏—Å–ª–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫ (—á—Ç–æ–±—ã –≤—ã–±—Ä–∞—Ç—å —Ä—É–∫–∞–º–∏)
if target_col is None:
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print("\nNo explicit price found. Numeric columns:")
    print(num_cols[:40])
    print("\nüëâ If you see the price column above, set it manually like:")
    print("target_col = 'YOUR_COLUMN_NAME'")
else:
    print("\n‚úÖ Selected target_col:", target_col)

# –±–∞–∑–æ–≤–∞—è —á–∏—Å—Ç–∫–∞ —Ü–µ–Ω—ã (–µ—Å–ª–∏ –Ω–∞—à–ª–∏)
if target_col is not None:
    df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
    before = len(df)
    df = df[df[target_col].notna() & (df[target_col] > 0)].copy()
    print(f"Target cleaned: {before} -> {len(df)} rows")


Shape: (5901, 40)


Unnamed: 0,subLot,lot,bloc,transactionDate,street,houseNumber,apartmentNumber,price,grossArea,netArea,...,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39
0,5,814,6628,01/12/2020,1,9,5.0,4636021,118.0,118,...,,,,,,,,,,
1,0,814,6628,29/10/2020,1,5,5.0,4559044,118.0,118,...,,,,,,,,,,
2,48,640,6628,15/06/2020,1,48,48.0,7350000,196.0,196,...,,,,,,,,,,



Candidates by keywords:
['price']

‚úÖ Selected target_col: price
Target cleaned: 5901 -> 5896 rows


In [3]:
import json

v2_feats_path = MODELS_DIR / "tel_aviv_feature_cols_v2.json"
print("v2 feats json:", v2_feats_path)

with open(v2_feats_path, "r", encoding="utf-8") as f:
    feat_cols = json.load(f)

print("Features in v2:", len(feat_cols))
print("First 10 feats:", feat_cols[:10])

missing = [c for c in feat_cols if c not in df.columns]
extra_in_df = [c for c in df.columns if c not in feat_cols]

print("\nMissing v2 features in dataset:", missing)
print("Example columns in df:", list(df.columns)[:25])

# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ X/y (–∫–∞–∫ –≤ v2)
if len(missing) == 0:
    X = df[feat_cols].copy()
    y_raw = df["price"].values
    y = np.log1p(y_raw)

    # safety
    X = X.replace([np.inf, -np.inf], np.nan)
    for c in X.columns:
        X[c] = pd.to_numeric(X[c], errors="coerce")

    print("\n‚úÖ X shape:", X.shape)
    print("‚úÖ y shape:", y.shape)
else:
    print("\n‚ùóÔ∏è–ù–∞–¥–æ —Å—Ç—Ä–æ–∏—Ç—å v2-—Ñ–∏—á–∏ —á–µ—Ä–µ–∑ build_tel_aviv_v2_features (–≤ —Å–ª–µ–¥—É—é—â–µ–º —à–∞–≥–µ –¥–∞–º –∫–æ–¥).")


v2 feats json: /Users/nikitamarshchonok/Desktop/end-to-end ML project/models/tel_aviv_feature_cols_v2.json
Features in v2: 16
First 10 feats: ['netArea', 'grossArea', 'rooms', 'floor', 'floors', 'apartmentsInBuilding', 'parking', 'storage', 'roof', 'yard']

Missing v2 features in dataset: ['tx_year', 'tx_month', 'tx_quarter', 'building_age_at_tx', 'floor_ratio']
Example columns in df: ['subLot', 'lot', 'bloc', 'transactionDate', 'street', 'houseNumber', 'apartmentNumber', 'price', 'grossArea', 'netArea', 'rooms', 'ppsm', 'ppr', 'floor', 'floors', 'apartmentsInBuilding', 'parking', 'storage', 'roof', 'yard', 'constructionYear', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24']

‚ùóÔ∏è–ù–∞–¥–æ —Å—Ç—Ä–æ–∏—Ç—å v2-—Ñ–∏—á–∏ —á–µ—Ä–µ–∑ build_tel_aviv_v2_features (–≤ —Å–ª–µ–¥—É—é—â–µ–º —à–∞–≥–µ –¥–∞–º –∫–æ–¥).


In [4]:
import numpy as np
import pandas as pd

df_feat = df.copy()

# 1) transactionDate -> datetime
df_feat["transactionDate"] = pd.to_datetime(df_feat["transactionDate"], errors="coerce")

# 2) –ß–∏—Å–ª–æ–≤—ã–µ –ø–æ–ª—è (–Ω–∞ –≤—Å—è–∫–∏–π)
for col in ["constructionYear", "floor", "floors"]:
    if col in df_feat.columns:
        df_feat[col] = pd.to_numeric(df_feat[col], errors="coerce")

# 3) Derived time features
df_feat["tx_year"] = df_feat["transactionDate"].dt.year
df_feat["tx_month"] = df_feat["transactionDate"].dt.month
df_feat["tx_quarter"] = df_feat["transactionDate"].dt.quarter

# 4) building age at tx
df_feat["building_age_at_tx"] = df_feat["tx_year"] - df_feat["constructionYear"]
# –∑–∞—â–∏—Ç–∞ –æ—Ç –º—É—Å–æ—Ä–∞
df_feat.loc[df_feat["building_age_at_tx"] < 0, "building_age_at_tx"] = np.nan
df_feat.loc[df_feat["building_age_at_tx"] > 200, "building_age_at_tx"] = np.nan

# 5) floor_ratio
den = df_feat["floors"].replace({0: np.nan})
df_feat["floor_ratio"] = df_feat["floor"] / den
df_feat.loc[(df_feat["floor_ratio"] < 0) | (df_feat["floor_ratio"] > 1.5), "floor_ratio"] = np.nan

# 6) –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ —Ç–µ–ø–µ—Ä—å –≤—Å–µ v2 —Ñ–∏—á–∏ –µ—Å—Ç—å
missing_after = [c for c in feat_cols if c not in df_feat.columns]
print("Missing after build:", missing_after)

# 7) –°–æ–±–∏—Ä–∞–µ–º X/y
X = df_feat[feat_cols].copy()
y_raw = df_feat["price"].values
y = np.log1p(y_raw)

# 8) –ë—ã—Å—Ç—Ä—ã–π "–º–µ–¥–∏–∞–Ω–Ω—ã–π" –∏–º–ø—å—é—Ç–µ—Ä –¥–ª—è –æ—Ü–µ–Ω–∫–∏ (–Ω–∞ —Å–ª—É—á–∞–π, –µ—Å–ª–∏ pkl –Ω–µ pipeline)
X_num = X.apply(pd.to_numeric, errors="coerce")
nan_counts = X_num.isna().sum().sort_values(ascending=False)
print("\nNaNs per feature (top):")
print(nan_counts.head(10))

X_ready = X_num.copy()
for c in X_ready.columns:
    med = X_ready[c].median()
    if pd.isna(med):
        med = 0
    X_ready[c] = X_ready[c].fillna(med)

print("\n‚úÖ X_ready shape:", X_ready.shape)
print("‚úÖ y shape:", y.shape)


Missing after build: []

NaNs per feature (top):
building_age_at_tx      4155
tx_year                 3959
tx_month                3959
tx_quarter              3959
storage                 2574
yard                    2541
roof                    2514
parking                 2352
apartmentsInBuilding    1585
grossArea               1243
dtype: int64

‚úÖ X_ready shape: (5896, 16)
‚úÖ y shape: (5896,)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import numpy as np

# –ë–µ—Ä—ë–º "—Å—ã—Ä–æ–π" X —Å NaN (–º–æ–¥–µ–ª—å v2 —á–∞—Å—Ç–æ = pipeline –∏ —Å–∞–º–∞ —É–º–µ–µ—Ç –∏–ºpute)
X_raw = df_feat[feat_cols].copy()
X_raw = X_raw.apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)

# y —É–∂–µ –µ—Å—Ç—å (log1p), y_raw —Ç–æ–∂–µ –µ—Å—Ç—å (price)
# –ù–∞ –≤—Å—è–∫–∏–π —Å–ª—É—á–∞–π:
y_raw = df_feat["price"].values
y = np.log1p(y_raw)

X_train_raw, X_test_raw, y_train, y_test, y_raw_train, y_raw_test = train_test_split(
    X_raw, y, y_raw, test_size=0.2, random_state=RANDOM_STATE
)

v2_model_path = MODELS_DIR / "tel_aviv_real_estate_model_v2.pkl"
print("Loading:", v2_model_path)

v2_model = joblib.load(v2_model_path)

# –ü—ã—Ç–∞–µ–º—Å—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞—Ç—å —á–µ—Ä–µ–∑ v2 pipeline –Ω–∞ X_raw.
# –ï—Å–ª–∏ –≤–¥—Ä—É–≥ –º–æ–¥–µ–ª—å –ù–ï pipeline –∏ —É–ø–∞–¥–µ—Ç –∏–∑-–∑–∞ NaN ‚Äî –∏—Å–ø–æ–ª—å–∑—É–µ–º X_ready (—É–∂–µ –±–µ–∑ NaN).
try:
    pred_log_v2 = v2_model.predict(X_test_raw)
    used = "X_test_raw (with NaNs)"
except Exception as e:
    print("\n‚ö†Ô∏è v2_model failed on X_test_raw, fallback to X_ready.")
    print("Error:", repr(e))
    # –¥–µ–ª–∞–µ–º —Ç–∞–∫–æ–π –∂–µ split –¥–ª—è X_ready
    X_train_ready, X_test_ready = train_test_split(
        X_ready, test_size=0.2, random_state=RANDOM_STATE
    )
    pred_log_v2 = v2_model.predict(X_test_ready)
    used = "X_test_ready (imputed)"

pred_v2 = np.expm1(pred_log_v2)

mae_v2 = mean_absolute_error(y_raw_test, pred_v2)
rmse_v2 = np.sqrt(mean_squared_error(y_raw_test, pred_v2))
r2_v2 = r2_score(y_raw_test, pred_v2)

print("\n‚úÖ Baseline v2 metrics")
print("Used input:", used)
print(f"MAE : {mae_v2:,.0f} NIS")
print(f"RMSE: {rmse_v2:,.0f} NIS")
print(f"R2  : {r2_v2:.4f}")


Loading: /Users/nikitamarshchonok/Desktop/end-to-end ML project/models/tel_aviv_real_estate_model_v2.pkl


TypeError: got an unexpected keyword argument 'squared'