In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib

# ‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
train_df = pd.read_csv("train_cleaned.csv")

print("Shape:", train_df.shape)
print("Columns:", len(train_df.columns))


Shape: (1460, 78)
Columns: 78


In [8]:
selected_features = [
    'OverallQual', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'GarageCars', 'YearBuilt', 'YearRemodAdd', 'FullBath', 'BsmtQual',
    'KitchenQual', 'Neighborhood', 'LotArea', 'Fireplaces'
]
target = 'SalePrice'

X = train_df[selected_features]
y = train_df[target]

# ‚úÖ STEP: Create Price Category Groups
# Convert continuous SalePrice into groups (classification)
bins = [0, 50000, 100000, 150000, 200000, 250000, 500000, 1000000]
labels = ['<50k', '50-100k', '100-150k', '150-200k', '200-250k', '250-500k', '500k+']

X = train_df[selected_features]
y = pd.cut(train_df['SalePrice'], bins=bins, labels=labels)



In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [10]:
from sklearn.preprocessing import OrdinalEncoder

# ‡πÅ‡∏ö‡πà‡∏á‡∏ä‡∏ô‡∏¥‡∏î‡∏Ç‡∏≠‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
numeric_features = [
    'GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'GarageCars', 'YearBuilt', 'YearRemodAdd', 'FullBath', 'LotArea', 'Fireplaces'
]
categorical_features = ['OverallQual', 'BsmtQual', 'KitchenQual', 'Neighborhood']

# ‡∏ï‡∏±‡∏ß‡πÅ‡∏õ‡∏•‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# ‡∏ï‡∏±‡∏ß‡πÅ‡∏õ‡∏•‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏°‡∏ß‡∏î‡∏´‡∏°‡∏π‡πà
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# ‡∏£‡∏ß‡∏°‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡πÉ‡∏ô ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# ‡∏™‡∏£‡πâ‡∏≤‡∏á pipeline ‡∏£‡∏ß‡∏°‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î
model = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])


In [12]:
feature_names = list(pipeline.named_steps['preprocessor'].get_feature_names_out())
print("‡∏à‡∏≥‡∏ô‡∏ß‡∏ô features ‡∏´‡∏•‡∏±‡∏á encode:", len(feature_names))
print("‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á feature:", feature_names[:20])


NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [13]:
model = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

In [14]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [15]:
pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)


In [16]:
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Accuracy: 0.7568493150684932

Classification Report:
               precision    recall  f1-score   support

    100-150k       0.80      0.94      0.86       113
    150-200k       0.73      0.72      0.72        71
    200-250k       0.50      0.59      0.54        34
    250-500k       0.91      0.69      0.78        45
     50-100k       0.86      0.50      0.63        24
       500k+       1.00      0.33      0.50         3
        <50k       0.00      0.00      0.00         2

    accuracy                           0.76       292
   macro avg       0.68      0.54      0.58       292
weighted avg       0.77      0.76      0.75       292



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
joblib.dump(pipeline, "house_price_pipeline_new.joblib")
print("‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÉ‡∏´‡∏°‡πà‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢‡πÅ‡∏•‡πâ‡∏ß")


‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÉ‡∏´‡∏°‡πà‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢‡πÅ‡∏•‡πâ‡∏ß


In [18]:
import joblib
import inspect

def inspect_model(file_path):
    print(f"\nüîé Checking file: {file_path}")
    try:
        model = joblib.load(file_path)
        print("‚úÖ File loaded successfully!")
        print("üì¶ Object type:", type(model))
        
        # ‡∏ñ‡πâ‡∏≤‡πÄ‡∏õ‡πá‡∏ô Pipeline
        if hasattr(model, 'named_steps'):
            print("\nüß± Pipeline Steps:")
            for step_name, step_obj in model.named_steps.items():
                print(f"  - {step_name}: {type(step_obj)}")
            
            # ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ preprocessor
            if 'preprocessor' in model.named_steps:
                pre = model.named_steps['preprocessor']
                if hasattr(pre, 'transformers'):
                    print("\nüìä Transformers in preprocessor:")
                    for name, transformer, cols in pre.transformers:
                        print(f"  ‚Ä¢ {name}: {type(transformer)} (columns: {cols})")
            
            # ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ model
            if 'model' in model.named_steps:
                print("\nü§ñ Model details:")
                mdl = model.named_steps['model']
                print("  - Model type:", type(mdl))
                if hasattr(mdl, 'get_params'):
                    print("  - Model parameters:")
                    for k, v in list(mdl.get_params().items())[:10]:
                        print(f"     {k}: {v}")
        
        # ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà pipeline
        else:
            print("\n‚öôÔ∏è This object is not a Pipeline.")
            if hasattr(model, 'get_params'):
                print("Model parameters:")
                for k, v in list(model.get_params().items())[:10]:
                    print(f"   {k}: {v}")
        
    except Exception as e:
        print("‚ùå Error loading file:", e)


# üîπ ‡πÄ‡∏£‡∏µ‡∏¢‡∏Å‡πÉ‡∏ä‡πâ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ó‡∏±‡πâ‡∏á‡∏™‡∏≠‡∏á‡πÑ‡∏ü‡∏•‡πå
inspect_model("house_price_pipeline.pkl")
inspect_model("house_price_pipeline.joblib")


üîé Checking file: house_price_pipeline.pkl
‚úÖ File loaded successfully!
üì¶ Object type: <class 'sklearn.linear_model._base.LinearRegression'>

‚öôÔ∏è This object is not a Pipeline.
Model parameters:
   copy_X: True
   fit_intercept: True
   n_jobs: None
   positive: False
   tol: 1e-06

üîé Checking file: house_price_pipeline.joblib
‚úÖ File loaded successfully!
üì¶ Object type: <class 'sklearn.pipeline.Pipeline'>

üß± Pipeline Steps:
  - preprocess: <class 'sklearn.compose._column_transformer.ColumnTransformer'>
  - reg: <class 'sklearn.compose._target.TransformedTargetRegressor'>


***Train 10 Model***

In [None]:
# 0) Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib

# 1) Load data
df = pd.read_csv("train_cleaned.csv")

# 2) Define features/target (SalePrice = target)
feature_cols = [
    "OverallQual",
    "TotalBsmtSF",
    "LotArea",
    "GarageCars",
    "Fireplaces",
    "BedroomAbvGr",
    "Neighborhood",
    "GrLivArea",
    "FullBath",
]
target_col = "SalePrice"

X = df[feature_cols].copy()
y = df[target_col].astype(float)

# 3) Enforce dtypes (‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡∏°‡∏≤‡∏Å‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö front-end)
numeric_int_cols = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath"
]
cat_cols = ["Neighborhood"]

# ‡πÅ‡∏õ‡∏•‡∏á numeric ‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç (‡∏Å‡∏±‡∏ô‡∏û‡∏±‡∏á‡∏ñ‡πâ‡∏≤‡∏°‡∏µ‡∏™‡∏ï‡∏£‡∏¥‡∏á‡∏´‡∏•‡∏á‡∏°‡∏≤)
for c in numeric_int_cols:
    X[c] = pd.to_numeric(X[c], errors="coerce")

# Neighborhood ‡πÄ‡∏õ‡πá‡∏ô string
X[cat_cols] = X[cat_cols].astype(str)

# 4) Basic sanity clips (‡∏ä‡πà‡∏ß‡∏¢‡πÉ‡∏´‡πâ dropdown/front-end ‡∏°‡∏µ guardrail)
X["OverallQual"] = X["OverallQual"].clip(1, 10)
X["GarageCars"]  = X["GarageCars"].clip(0, 4)
X["FullBath"]    = X["FullBath"].clip(lower=0)
X["Fireplaces"]  = X["Fireplaces"].clip(lower=0)
X["BedroomAbvGr"]= X["BedroomAbvGr"].clip(lower=0)
X["TotalBsmtSF"] = X["TotalBsmtSF"].clip(lower=0)
X["GrLivArea"]   = X["GrLivArea"].clip(lower=0)
X["LotArea"]     = X["LotArea"].clip(lower=0)

# 5) Preprocessor
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numeric_int_cols),
        ("cat", cat_transformer, cat_cols)
    ]
)

# 6) Model
model = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# 7) 10-fold CV score (R^2)
cv = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring="r2")
print(f"10-fold R^2: mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")

# 8) Hold-out test (optional ‡πÅ‡∏ï‡πà‡∏ä‡πà‡∏ß‡∏¢‡πÄ‡∏ä‡πá‡∏Ñ over/underfit)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
pipeline.fit(X_train, y_train)
pred_train = pipeline.predict(X_train)
pred_test  = pipeline.predict(X_test)

rmse = lambda a,b: np.sqrt(((a-b)**2).mean())

print(f"Train R^2: {r2_score(y_train, pred_train):.3f}")
print(f"Test  R^2: {r2_score(y_test,  pred_test):.3f}")
print(f"MAE: {mean_absolute_error(y_test, pred_test):.2f}")
print(f"RMSE: {rmse(y_test, pred_test):.2f}")

# 9) Fit on all data & save
pipeline.fit(X, y)
joblib.dump(pipeline, "house_price_pipeline_9f_cv10.joblib")
print("‚úÖ Saved: house_price_pipeline_9f_cv10.joblib")

# (Optional) ‡∏î‡∏π‡∏ä‡∏∑‡πà‡∏≠‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏´‡∏•‡∏±‡∏á‡πÅ‡∏õ‡∏•‡∏á‡πÅ‡∏•‡πâ‡∏ß
feat_names = pipeline.named_steps["preprocessor"].get_feature_names_out()
print("Features after transform:", len(feat_names))
print(feat_names[:15])


10-fold R^2: mean=0.793, std=0.095
Train R^2: 0.808
Test  R^2: 0.834
MAE: 21798.35
RMSE: 35685.43
‚úÖ Saved: house_price_pipeline_9f_cv10.joblib
Features after transform: 33
['num__OverallQual' 'num__TotalBsmtSF' 'num__LotArea' 'num__GarageCars'
 'num__Fireplaces' 'num__BedroomAbvGr' 'num__GrLivArea' 'num__FullBath'
 'cat__Neighborhood_Blmngtn' 'cat__Neighborhood_Blueste'
 'cat__Neighborhood_BrDale' 'cat__Neighborhood_BrkSide'
 'cat__Neighborhood_ClearCr' 'cat__Neighborhood_CollgCr'
 'cat__Neighborhood_Crawfor']


In [None]:
import joblib, pickle, pprint, sys
from pathlib import Path

p = Path("house_price_pipeline_9f_cv10.joblib")  # ‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏ü‡∏•‡πå‡∏≠‡∏∑‡πà‡∏ô‡∏ó‡∏µ‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏î‡∏π
obj = joblib.load(p) if p.suffix == ".joblib" else pickle.load(open(p, "rb"))

print("=== ROOT OBJECT ===")
print(type(obj))

# ‡∏ñ‡πâ‡∏≤‡πÄ‡∏õ‡πá‡∏ô Pipeline:
try:
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.base import BaseEstimator
except Exception as e:
    Pipeline = ColumnTransformer = BaseEstimator = None

if Pipeline and isinstance(obj, Pipeline):
    print("\n=== PIPELINE STEPS ===")
    for name, est in obj.steps:
        print(f"- {name}: {type(est).__name__}")

    ct = None
    for name, est in obj.steps:
        if ColumnTransformer and isinstance(est, ColumnTransformer):
            ct = est
            break

    if ct:
        print("\n=== COLUMN TRANSFORMER ===")
        print("remainder:", ct.remainder)
        for name, tr, cols in ct.transformers:
            print(f"* transformer: {name} -> {type(tr).__name__} on {cols}")

        try:
            fno = ct.get_feature_names_out()
            print("\nfeature_names_out (sample):", fno[:50])
            print("total transformed features:", len(fno))
        except Exception as e:
            print("\n(get_feature_names_out) ->", e)

   
    final_est = obj.steps[-1][1]
    print("\n=== FINAL ESTIMATOR ===")
    print(type(final_est).__name__)
    try:
        params = final_est.get_params()
        print("params (subset):")
        for k in list(params)[:30]:
            print("  ", k, "=", params[k])
    except Exception:
        pass

    for attr in ["coef_", "intercept_", "feature_importances_", "n_features_in_"]:
        if hasattr(final_est, attr):
            val = getattr(final_est, attr)
            try:
                size = len(val)
                show = val[:10] if size > 10 else val
                print(f"\n{attr}: (len={size})", show)
            except TypeError:
                print(f"\n{attr}:", val)

else:
    print("\n=== GENERIC OBJECT INFO ===")
    if hasattr(obj, "get_params"):
        print("Sklearn estimator params keys:", list(obj.get_params().keys())[:30])
    elif isinstance(obj, dict):
        print("dict keys:", list(obj.keys()))
    else:
        pprint.pprint(obj)


=== ROOT OBJECT ===
<class 'sklearn.pipeline.Pipeline'>

=== PIPELINE STEPS ===
- preprocessor: ColumnTransformer
- model: RidgeCV

=== COLUMN TRANSFORMER ===
remainder: drop
* transformer: num -> Pipeline on ['OverallQual', 'TotalBsmtSF', 'LotArea', 'GarageCars', 'Fireplaces', 'BedroomAbvGr', 'GrLivArea', 'FullBath']
* transformer: cat -> Pipeline on ['Neighborhood']

feature_names_out (sample): ['num__OverallQual' 'num__TotalBsmtSF' 'num__LotArea' 'num__GarageCars'
 'num__Fireplaces' 'num__BedroomAbvGr' 'num__GrLivArea' 'num__FullBath'
 'cat__Neighborhood_Blmngtn' 'cat__Neighborhood_Blueste'
 'cat__Neighborhood_BrDale' 'cat__Neighborhood_BrkSide'
 'cat__Neighborhood_ClearCr' 'cat__Neighborhood_CollgCr'
 'cat__Neighborhood_Crawfor' 'cat__Neighborhood_Edwards'
 'cat__Neighborhood_Gilbert' 'cat__Neighborhood_IDOTRR'
 'cat__Neighborhood_MeadowV' 'cat__Neighborhood_Mitchel'
 'cat__Neighborhood_NAmes' 'cat__Neighborhood_NPkVill'
 'cat__Neighborhood_NWAmes' 'cat__Neighborhood_NoRidge'
 'cat

In [None]:
# train_house_price.py
# ====================
# ‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏£‡∏≤‡∏Ñ‡∏≤‡∏ö‡πâ‡∏≤‡∏ô (Regression) ‡∏à‡∏≤‡∏Å Kaggle (train_cleaned.csv)
# - ‡πÉ‡∏ä‡πâ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå 9 ‡∏ï‡∏±‡∏ß + ‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢ SalePrice (log-transform)
# - ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ö‡∏ô test split
# - ‡πÄ‡∏ã‡∏ü‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÄ‡∏õ‡πá‡∏ô house_price_pipeline.joblib ‡πÅ‡∏•‡∏∞ features.json
# - ‡∏°‡∏µ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå 3 ‡πÄ‡∏Ñ‡∏™

import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ------------------------
# 1) ‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
# ------------------------
DATA_PATH = "train_cleaned.csv"
assert Path(DATA_PATH).exists(), f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå {DATA_PATH} ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô"

df = pd.read_csv(DATA_PATH)

# ------------------------
# 2) ‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå/‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢
# ------------------------
FEATURES_NUM = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath"
]
FEATURES_CAT = ["Neighborhood"]
FEATURES = FEATURES_NUM + FEATURES_CAT
TARGET = "SalePrice"

# ‡∏Å‡∏£‡∏≠‡∏á‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà target ‡πÄ‡∏õ‡πá‡∏ô NaN ‡∏≠‡∏≠‡∏Å (‡∏Å‡∏±‡∏ô error)
df = df.dropna(subset=[TARGET])

X = df[FEATURES].copy()
y = np.log1p(df[TARGET].astype(float))  # ‡πÅ‡∏õ‡∏•‡∏á log1p ‡∏ä‡πà‡∏ß‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏ñ‡∏µ‡∏¢‡∏£‡∏Ç‡∏≠‡∏á‡∏£‡∏≤‡∏Ñ‡∏≤

# ------------------------
# 3) ‡πÅ‡∏ö‡πà‡∏á train/test
# ------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------
# 4) Preprocessor (impute + encode + scale)
# ------------------------
num_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

pre = ColumnTransformer(transformers=[
    ("num", num_tf, FEATURES_NUM),
    ("cat", cat_tf, FEATURES_CAT),
])

# ------------------------
# 5) ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•
# ------------------------
model = RidgeCV(alphas=(0.1, 1.0, 10.0))

# ------------------------
# 6) Pipeline ‡∏£‡∏ß‡∏°‡∏ó‡∏∏‡∏Å‡∏≠‡∏¢‡πà‡∏≤‡∏á
# ------------------------
pipe = Pipeline(steps=[
    ("preprocessor", pre),
    ("model", model),
])

# ------------------------
# 7) ‡πÄ‡∏ó‡∏£‡∏ô
# ------------------------
pipe.fit(X_train, y_train)

# ------------------------
# 8) ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏• (‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì RMSE ‡πÄ‡∏≠‡∏á‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö sklearn ‡∏£‡∏∏‡πà‡∏ô‡πÄ‡∏Å‡πà‡∏≤)
# ------------------------
# ‡πÅ‡∏õ‡∏•‡∏á‡∏Å‡∏•‡∏±‡∏ö‡∏à‡∏≤‡∏Å log ‚Üí ‡∏™‡πÄ‡∏Å‡∏•‡∏à‡∏£‡∏¥‡∏á
pred_train = np.expm1(pipe.predict(X_train))
true_train = np.expm1(y_train)

pred_test = np.expm1(pipe.predict(X_test))
true_test = np.expm1(y_test)

# MSE/MAE/R2 train
mse_tr  = mean_squared_error(true_train, pred_train)  # ‡πÑ‡∏°‡πà‡∏°‡∏µ squared= ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏Ç‡πâ‡∏≤‡∏Å‡∏±‡∏ô‡πÑ‡∏î‡πâ
rmse_tr = np.sqrt(mse_tr)
mae_tr  = mean_absolute_error(true_train, pred_train)
r2_tr   = r2_score(true_train, pred_train)

# MSE/MAE/R2 test
mse_te  = mean_squared_error(true_test, pred_test)
rmse_te = np.sqrt(mse_te)
mae_te  = mean_absolute_error(true_test, pred_test)
r2_te   = r2_score(true_test, pred_test)

print("=== ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô ===")
print(f"Train RMSE: {rmse_tr:,.2f} | MAE: {mae_tr:,.2f} | R2: {r2_tr:.3f}")
print(f" Test RMSE: {rmse_te:,.2f} | MAE: {mae_te:,.2f} | R2: {r2_te:.3f}")

# ------------------------
# 9) ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏• + ‡πÄ‡∏°‡∏ó‡∏≤‡∏î‡∏≤‡∏ó‡∏≤
# ------------------------
MODEL_PATH = "house_price_pipeline_Train_10.joblib"
META_PATH = "features.json"

joblib.dump(pipe, MODEL_PATH)
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump({"features": FEATURES, "num": FEATURES_NUM, "cat": FEATURES_CAT}, f, ensure_ascii=False, indent=2)

print(f"\n‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡πâ‡∏ß: {MODEL_PATH}")
print(f"‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå: {META_PATH}")

# ------------------------
# 10) ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå (‡∏≠‡∏¥‡∏ô‡∏û‡∏∏‡∏ï‡πÑ‡∏°‡πà‡∏Ñ‡∏£‡∏ö‡∏Å‡πá‡πÑ‡∏î‡πâ)
# ------------------------
def predict_saleprice(input_dict: dict) -> float:
    """
    input_dict: key ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö FEATURES (‡πÉ‡∏™‡πà‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ó‡∏µ‡πà‡∏°‡∏µ‡πÑ‡∏î‡πâ)
      ‡πÄ‡∏ä‡πà‡∏ô {"LotArea": 8500, "OverallQual": 6, "Neighborhood": "CollgCr"}
    """
    row = {f: input_dict.get(f, None) for f in FEATURES}
    X_one = pd.DataFrame([row], columns=FEATURES)
    y_log = pipe.predict(X_one)[0]
    y_hat = float(np.expm1(y_log))
    return round(y_hat, 2)

# ------------------------
# 11) ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå 3 ‡πÄ‡∏Ñ‡∏™ (‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á)
# ------------------------
examples = [
    {"LotArea": 8500, "OverallQual": 6, "Neighborhood": "CollgCr"},
    {"OverallQual": 7, "GrLivArea": 1800, "FullBath": 2, "BedroomAbvGr": 3, "Neighborhood": "NAmes"},
    {"LotArea": 12000, "TotalBsmtSF": 900, "GarageCars": 2, "GrLivArea": 2200, "FullBath": 2},
]

print("\n=== ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á ===")
for i, ex in enumerate(examples, 1):
    print(f"Case {i}: {ex}")
    print("‚Üí Predicted SalePrice:", f"{predict_saleprice(ex):,.2f}", "USD\n")


=== ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô ===
Train RMSE: 34,471.10 | MAE: 19,641.75 | R2: 0.801
 Test RMSE: 29,620.65 | MAE: 19,180.85 | R2: 0.886

‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡πâ‡∏ß: house_price_pipeline.joblib
‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå: features.json

=== ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á ===
Case 1: {'LotArea': 8500, 'OverallQual': 6, 'Neighborhood': 'CollgCr'}
‚Üí Predicted SalePrice: 184,608.38 USD

Case 2: {'OverallQual': 7, 'GrLivArea': 1800, 'FullBath': 2, 'BedroomAbvGr': 3, 'Neighborhood': 'NAmes'}
‚Üí Predicted SalePrice: 194,257.68 USD

Case 3: {'LotArea': 12000, 'TotalBsmtSF': 900, 'GarageCars': 2, 'GrLivArea': 2200, 'FullBath': 2}
‚Üí Predicted SalePrice: 190,951.88 USD



In [21]:
# train_house_price.py
# ====================
# ‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏£‡∏≤‡∏Ñ‡∏≤‡∏ö‡πâ‡∏≤‡∏ô (Regression) ‡∏à‡∏≤‡∏Å Kaggle (train_cleaned.csv)
# - ‡πÉ‡∏ä‡πâ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå 9 ‡∏ï‡∏±‡∏ß + ‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢ SalePrice (log-transform)
# - ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ö‡∏ô test split
# - ‡πÄ‡∏ã‡∏ü‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÄ‡∏õ‡πá‡∏ô house_price_pipeline_Train_10.joblib ‡πÅ‡∏•‡∏∞ features.json
# - ‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡πÅ‡∏ö‡∏ö "‡πÇ‡∏ä‡∏ß‡πå‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏Å‡∏£‡∏≠‡∏Å" ‡πÅ‡∏•‡∏∞ "‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ"

import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ------------------------
# 1) ‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
# ------------------------
DATA_PATH = "train_cleaned.csv"
assert Path(DATA_PATH).exists(), f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå {DATA_PATH} ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô"

df = pd.read_csv(DATA_PATH)

# ------------------------
# 2) ‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå/‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢
# ------------------------
FEATURES_NUM = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath"
]
FEATURES_CAT = ["Neighborhood"]
FEATURES = FEATURES_NUM + FEATURES_CAT
TARGET = "SalePrice"

# ‡∏Å‡∏£‡∏≠‡∏á‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà target ‡πÄ‡∏õ‡πá‡∏ô NaN ‡∏≠‡∏≠‡∏Å (‡∏Å‡∏±‡∏ô error)
df = df.dropna(subset=[TARGET])

X = df[FEATURES].copy()
y = np.log1p(df[TARGET].astype(float))  # ‡πÅ‡∏õ‡∏•‡∏á log1p ‡∏ä‡πà‡∏ß‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏ñ‡∏µ‡∏¢‡∏£‡∏Ç‡∏≠‡∏á‡∏£‡∏≤‡∏Ñ‡∏≤

# ------------------------
# 3) ‡πÅ‡∏ö‡πà‡∏á train/test
# ------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------
# 4) Preprocessor (impute + encode + scale)
# ------------------------
num_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

pre = ColumnTransformer(transformers=[
    ("num", num_tf, FEATURES_NUM),
    ("cat", cat_tf, FEATURES_CAT),
])

# ------------------------
# 5) ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•
# ------------------------
model = RidgeCV(alphas=(0.1, 1.0, 10.0))

# ------------------------
# 6) Pipeline ‡∏£‡∏ß‡∏°‡∏ó‡∏∏‡∏Å‡∏≠‡∏¢‡πà‡∏≤‡∏á
# ------------------------
pipe = Pipeline(steps=[
    ("preprocessor", pre),
    ("model", model),
])

# ------------------------
# 7) ‡πÄ‡∏ó‡∏£‡∏ô
# ------------------------
pipe.fit(X_train, y_train)

# ------------------------
# 8) ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏• (‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì RMSE ‡πÄ‡∏≠‡∏á‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö sklearn ‡∏£‡∏∏‡πà‡∏ô‡πÄ‡∏Å‡πà‡∏≤)
# ------------------------
# ‡πÅ‡∏õ‡∏•‡∏á‡∏Å‡∏•‡∏±‡∏ö‡∏à‡∏≤‡∏Å log ‚Üí ‡∏™‡πÄ‡∏Å‡∏•‡∏à‡∏£‡∏¥‡∏á
pred_train = np.expm1(pipe.predict(X_train))
true_train = np.expm1(y_train)

pred_test = np.expm1(pipe.predict(X_test))
true_test = np.expm1(y_test)

# MSE/MAE/R2 train
mse_tr  = mean_squared_error(true_train, pred_train)  # ‡πÑ‡∏°‡πà‡∏°‡∏µ squared=
rmse_tr = np.sqrt(mse_tr)
mae_tr  = mean_absolute_error(true_train, pred_train)
r2_tr   = r2_score(true_train, pred_train)

# MSE/MAE/R2 test
mse_te  = mean_squared_error(true_test, pred_test)
rmse_te = np.sqrt(mse_te)
mae_te  = mean_absolute_error(true_test, pred_test)
r2_te   = r2_score(true_test, pred_test)

print("=== ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô ===")
print(f"Train RMSE: {rmse_tr:,.2f} | MAE: {mae_tr:,.2f} | R2: {r2_tr:.3f}")
print(f" Test RMSE: {rmse_te:,.2f} | MAE: {mae_te:,.2f} | R2: {r2_te:.3f}")

# ------------------------
# 9) ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏• + ‡πÄ‡∏°‡∏ó‡∏≤‡∏î‡∏≤‡∏ó‡∏≤
# ------------------------
MODEL_PATH = "house_price_pipeline_Train_10.joblib"
META_PATH = "features.json"

joblib.dump(pipe, MODEL_PATH)
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump({"features": FEATURES, "num": FEATURES_NUM, "cat": FEATURES_CAT}, f, ensure_ascii=False, indent=2)

print(f"\n‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡πâ‡∏ß: {MODEL_PATH}")
print(f"‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå: {META_PATH}")

# ------------------------
# 10) Helper: preview ‡∏Ñ‡πà‡∏≤‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå "‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå‡πÅ‡∏•‡πâ‡∏ß"
# ------------------------
def impute_preview(pipe: Pipeline, row_dict: dict) -> dict:
    """
    ‡∏Ñ‡∏∑‡∏ô dict ‡∏Ç‡∏≠‡∏á‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏î‡∏¥‡∏ö‡∏ó‡∏±‡πâ‡∏á 9 ‡∏ï‡∏±‡∏ß '‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå‡πÅ‡∏•‡πâ‡∏ß' (‡∏Å‡πà‡∏≠‡∏ô scale/one-hot)
    """
    row = {f: row_dict.get(f, np.nan) for f in FEATURES}
    X_one = pd.DataFrame([row], columns=FEATURES)

    pre: ColumnTransformer = pipe.named_steps["preprocessor"]
    # ‡πÄ‡∏Ç‡πâ‡∏≤‡∏ñ‡∏∂‡∏á imputer ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô
    num_imp: SimpleImputer = pre.named_transformers_["num"].named_steps["imputer"]
    cat_imp: SimpleImputer = pre.named_transformers_["cat"].named_steps["imputer"]

    # ‡πÄ‡∏ï‡∏¥‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö numeric
    num_stats = pd.Series(num_imp.statistics_, index=FEATURES_NUM) if FEATURES_NUM else pd.Series(dtype=float)
    X_num = X_one[FEATURES_NUM].copy().apply(pd.to_numeric, errors="coerce") if FEATURES_NUM else pd.DataFrame()
    if FEATURES_NUM:
        X_num = X_num.where(~X_num.isna(), num_stats)

    # ‡πÄ‡∏ï‡∏¥‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö categorical
    cat_stats = pd.Series(cat_imp.statistics_, index=FEATURES_CAT) if FEATURES_CAT else pd.Series(dtype=object)
    X_cat = X_one[FEATURES_CAT].copy() if FEATURES_CAT else pd.DataFrame()
    if FEATURES_CAT:
        X_cat = X_cat.where(~X_cat.isna(), cat_stats)

    if FEATURES_NUM and FEATURES_CAT:
        X_imp = pd.concat([X_num, X_cat], axis=1)[FEATURES]
    elif FEATURES_NUM:
        X_imp = X_num[FEATURES]
    else:
        X_imp = X_cat[FEATURES]

    # ‡πÅ‡∏õ‡∏•‡∏á‡∏ä‡∏ô‡∏¥‡∏î‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏û‡∏¥‡∏°‡∏û‡πå‡∏™‡∏ß‡∏¢ ‡πÜ
    out = {}
    for k, v in X_imp.iloc[0].to_dict().items():
        if k in FEATURES_NUM:
            out[k] = None if pd.isna(v) else float(v)
        else:
            out[k] = None if (isinstance(v, float) and pd.isna(v)) else str(v)
    return out

# ------------------------
# 11) ‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå + ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô "‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß/‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ"
# ------------------------
def predict_with_report(input_dict: dict):
    """
    - ‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏£‡∏≤‡∏Ñ‡∏≤ (expm1)
    - ‡πÅ‡∏ö‡πà‡∏á‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡πÄ‡∏õ‡πá‡∏ô:
        features_provided: ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡∏Å‡∏£‡∏≠‡∏Å
        features_filled:   ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡∏Å‡∏£‡∏≠‡∏Å ‡πÅ‡∏•‡πâ‡∏ß‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡∏Ñ‡πà‡∏≤‡πÉ‡∏´‡πâ (imputed)
    - ‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÇ‡∏ä‡∏ß‡πå‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡∏à‡∏£‡∏¥‡∏á‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå
    """
    # ‡∏Ñ‡πà‡∏≤‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå (‡πÑ‡∏ß‡πâ‡πÇ‡∏ä‡∏ß‡πå)
    used_after_impute = impute_preview(pipe, input_dict)

    # ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö predict (‡∏õ‡∏•‡πà‡∏≠‡∏¢‡πÉ‡∏´‡πâ pipeline impute ‡∏≠‡∏µ‡∏Å‡∏ä‡∏±‡πâ‡∏ô‚Äî‡πÑ‡∏°‡πà‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏ú‡∏•)
    row = {f: input_dict.get(f, None) for f in FEATURES}
    X_one = pd.DataFrame([row], columns=FEATURES)
    y_log = pipe.predict(X_one)[0]
    y_hat = float(np.expm1(y_log))

    # ‡∏à‡∏±‡∏î‡πÅ‡∏¢‡∏Å "‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß" vs "‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ"
    provided, filled = {}, {}
    for f in FEATURES:
        if f in input_dict and input_dict[f] is not None:
            provided[f] = used_after_impute[f]  # ‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡∏à‡∏£‡∏¥‡∏á (‡∏≠‡∏≤‡∏à‡∏ñ‡∏π‡∏Å‡πÅ‡∏Ñ‡∏™/‡∏™‡∏∞‡∏≠‡∏≤‡∏î‡πÅ‡∏•‡πâ‡∏ß)
        else:
            filled[f] = used_after_impute[f]     # ‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏£‡∏∞‡∏ö‡∏ö‡∏ï‡πâ‡∏≠‡∏á‡πÄ‡∏ï‡∏¥‡∏°

    return {
        "predicted_price_usd": round(y_hat, 2),
        "features_provided": provided,
        "features_filled": filled
    }

# ------------------------
# 12) ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå 3 ‡πÄ‡∏Ñ‡∏™ (‡∏û‡∏¥‡∏°‡∏û‡πå‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡πÄ‡∏ï‡πá‡∏°)
# ------------------------
examples = [
    {"LotArea": 8500, "OverallQual": 6, "Neighborhood": "CollgCr"},
    {"OverallQual": 7, "GrLivArea": 1800, "FullBath": 2, "BedroomAbvGr": 3, "Neighborhood": "NAmes"},
    {"LotArea": 12000, "TotalBsmtSF": 900, "GarageCars": 2, "GrLivArea": 2200, "FullBath": 2},
]

print("\n=== ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á (‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÅ‡∏¢‡∏Å '‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß' / '‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ') ===")
for i, ex in enumerate(examples, 1):
    out = predict_with_report(ex)
    print(f"Case {i}: input={ex}")
    print(f"‚Üí Predicted SalePrice: {out['predicted_price_usd']:,.2f} USD")

    print("‚Üí Features (‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß):")
    if out["features_provided"]:
        for k, v in out["features_provided"].items():
            print(f"   - {k}: {v}")
    else:
        print("   (‡πÑ‡∏°‡πà‡∏°‡∏µ ‚Äî ‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡πÑ‡∏°‡πà‡∏Å‡∏£‡∏≠‡∏Å‡πÄ‡∏•‡∏¢)")

    print("‚Üí Features (‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ):")
    if out["features_filled"]:
        for k, v in out["features_filled"].items():
            print(f"   - {k}: {v}")
    else:
        print("   (‡πÑ‡∏°‡πà‡∏°‡∏µ ‚Äî ‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡∏Å‡∏£‡∏≠‡∏Å‡∏Ñ‡∏£‡∏ö)")

    print()


=== ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô ===
Train RMSE: 34,471.10 | MAE: 19,641.75 | R2: 0.801
 Test RMSE: 29,620.65 | MAE: 19,180.85 | R2: 0.886

‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡πâ‡∏ß: house_price_pipeline_Train_10.joblib
‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå: features.json

=== ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á (‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÅ‡∏¢‡∏Å '‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß' / '‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ') ===


ValueError: Must specify axis=0 or 1

In [28]:
# train_house_price.py
# ====================
# ‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏£‡∏≤‡∏Ñ‡∏≤‡∏ö‡πâ‡∏≤‡∏ô (Regression) ‡∏à‡∏≤‡∏Å Kaggle (train_cleaned.csv)
# - ‡πÉ‡∏ä‡πâ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå 9 ‡∏ï‡∏±‡∏ß + ‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢ SalePrice (log-transform)
# - ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ö‡∏ô test split
# - ‡πÄ‡∏ã‡∏ü‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÄ‡∏õ‡πá‡∏ô house_price_pipeline_Train_10.joblib ‡πÅ‡∏•‡∏∞ features.json
# - ‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡πÅ‡∏ö‡∏ö "‡πÇ‡∏ä‡∏ß‡πå‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏Å‡∏£‡∏≠‡∏Å" ‡πÅ‡∏•‡∏∞ "‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ (imputed)"
# - ‡πÅ‡∏Å‡πâ‡∏ö‡∏±‡πä‡∏Å pandas: ‡πÉ‡∏ä‡πâ fillna(series) ‡πÅ‡∏ó‡∏ô where(...)

import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from typing import Dict

# ------------------------
# 1) ‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
# ------------------------
DATA_PATH = "train_cleaned.csv"
assert Path(DATA_PATH).exists(), f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå {DATA_PATH} ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô"

df = pd.read_csv(DATA_PATH)

# ------------------------
# 2) ‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå/‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢
# ------------------------
FEATURES_NUM = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath"
]
FEATURES_CAT = ["Neighborhood"]
FEATURES = FEATURES_NUM + FEATURES_CAT
TARGET = "SalePrice"

# ‡∏Å‡∏£‡∏≠‡∏á‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà target ‡πÄ‡∏õ‡πá‡∏ô NaN ‡∏≠‡∏≠‡∏Å (‡∏Å‡∏±‡∏ô error)
df = df.dropna(subset=[TARGET])

X = df[FEATURES].copy()
y = np.log1p(df[TARGET].astype(float))  # ‡πÅ‡∏õ‡∏•‡∏á log1p ‡∏ä‡πà‡∏ß‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏ñ‡∏µ‡∏¢‡∏£‡∏Ç‡∏≠‡∏á‡∏£‡∏≤‡∏Ñ‡∏≤

# ------------------------
# 3) ‡πÅ‡∏ö‡πà‡∏á train/test
# ------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------
# 4) Preprocessor (impute + encode + scale)
# ------------------------
num_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

pre = ColumnTransformer(transformers=[
    ("num", num_tf, FEATURES_NUM),
    ("cat", cat_tf, FEATURES_CAT),
])

# ------------------------
# 5) ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•
# ------------------------
model = RidgeCV(alphas=(0.1, 1.0, 10.0))

# ------------------------
# 6) Pipeline ‡∏£‡∏ß‡∏°‡∏ó‡∏∏‡∏Å‡∏≠‡∏¢‡πà‡∏≤‡∏á
# ------------------------
pipe = Pipeline(steps=[
    ("preprocessor", pre),
    ("model", model),
])

# ------------------------
# 7) ‡πÄ‡∏ó‡∏£‡∏ô
# ------------------------
pipe.fit(X_train, y_train)

# ------------------------
# 8) ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏• (‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì RMSE ‡πÄ‡∏≠‡∏á‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö sklearn ‡∏£‡∏∏‡πà‡∏ô‡πÄ‡∏Å‡πà‡∏≤)
# ------------------------
# ‡πÅ‡∏õ‡∏•‡∏á‡∏Å‡∏•‡∏±‡∏ö‡∏à‡∏≤‡∏Å log ‚Üí ‡∏™‡πÄ‡∏Å‡∏•‡∏à‡∏£‡∏¥‡∏á
pred_train = np.expm1(pipe.predict(X_train))
true_train = np.expm1(y_train)

pred_test = np.expm1(pipe.predict(X_test))
true_test = np.expm1(y_test)

# MSE/MAE/R2 train
mse_tr  = mean_squared_error(true_train, pred_train)  # ‡πÑ‡∏°‡πà‡∏°‡∏µ squared=
rmse_tr = np.sqrt(mse_tr)
mae_tr  = mean_absolute_error(true_train, pred_train)
r2_tr   = r2_score(true_train, pred_train)

# MSE/MAE/R2 test
mse_te  = mean_squared_error(true_test, pred_test)
rmse_te = np.sqrt(mse_te)
mae_te  = mean_absolute_error(true_test, pred_test)
r2_te   = r2_score(true_test, pred_test)

print("=== ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô ===")
print(f"Train RMSE: {rmse_tr:,.2f} | MAE: {mae_tr:,.2f} | R2: {r2_tr:.3f}")
print(f" Test RMSE: {rmse_te:,.2f} | MAE: {mae_te:,.2f} | R2: {r2_te:.3f}")

# ------------------------
# 9) ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏• + ‡πÄ‡∏°‡∏ó‡∏≤‡∏î‡∏≤‡∏ó‡∏≤
# ------------------------
MODEL_PATH = "house_price_pipeline_Train_10.joblib"
META_PATH = "features.json"

joblib.dump(pipe, MODEL_PATH)
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump({"features": FEATURES, "num": FEATURES_NUM, "cat": FEATURES_CAT},
              f, ensure_ascii=False, indent=2)

print(f"\n‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡πâ‡∏ß: {MODEL_PATH}")
print(f"‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå: {META_PATH}")

# ------------------------
# 10) Helper: preview ‡∏Ñ‡πà‡∏≤‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå "‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå‡πÅ‡∏•‡πâ‡∏ß"
# ------------------------
def impute_preview(pipe: Pipeline, row_dict: Dict) -> Dict:
    """
    ‡∏Ñ‡∏∑‡∏ô dict ‡∏Ç‡∏≠‡∏á‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏î‡∏¥‡∏ö‡∏ó‡∏±‡πâ‡∏á 9 ‡∏ï‡∏±‡∏ß '‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå‡πÅ‡∏•‡πâ‡∏ß' (‡∏Å‡πà‡∏≠‡∏ô scale/one-hot)
    ‡πÉ‡∏ä‡πâ fillna(series) ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏à‡∏±‡∏î‡πÅ‡∏ô‡∏ß‡∏ï‡∏≤‡∏°‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏≠‡∏±‡∏ï‡πÇ‡∏ô‡∏°‡∏±‡∏ï‡∏¥ (‡πÄ‡∏•‡∏µ‡πà‡∏¢‡∏á ValueError)
    """
    # ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏° df 1 ‡πÅ‡∏ñ‡∏ß‡∏ï‡∏≤‡∏°‡∏•‡∏≥‡∏î‡∏±‡∏ö FEATURES
    row = {f: row_dict.get(f, np.nan) for f in FEATURES}
    X_one = pd.DataFrame([row], columns=FEATURES)

    # ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏°‡∏µ preprocessor ‡∏Å‡πá‡∏Ñ‡∏∑‡∏ô‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏£‡∏±‡∏ö‡∏°‡∏≤
    pre = pipe.named_steps.get("preprocessor")
    if pre is None:
        return {k: (None if pd.isna(v) else v) for k, v in X_one.iloc[0].to_dict().items()}

    # ‡πÄ‡∏Ç‡πâ‡∏≤‡∏ñ‡∏∂‡∏á imputer ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô
    try:
        num_imp: SimpleImputer = pre.named_transformers_["num"].named_steps["imputer"]
        cat_imp: SimpleImputer = pre.named_transformers_["cat"].named_steps["imputer"]
    except Exception:
        # ‡∏ö‡∏≤‡∏á‡πÄ‡∏ß‡∏≠‡∏£‡πå‡∏ä‡∏±‡∏ô/‡∏ä‡∏∑‡πà‡∏≠‡∏™‡πÄ‡∏ï‡πá‡∏õ‡∏ï‡πà‡∏≤‡∏á‡∏Å‡∏±‡∏ô ‚Üí ‡∏™‡πà‡∏á‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏£‡∏±‡∏ö‡∏°‡∏≤‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÄ‡∏î‡∏µ‡∏¢‡∏ß
        return {k: row_dict.get(k, None) for k in FEATURES}

    # ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå (index ‡∏ï‡∏£‡∏á‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå)
    num_stats = pd.Series(num_imp.statistics_, index=FEATURES_NUM) if FEATURES_NUM else pd.Series(dtype=float)
    cat_stats = pd.Series(cat_imp.statistics_, index=FEATURES_CAT) if FEATURES_CAT else pd.Series(dtype=object)

    # ‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå‡πÅ‡∏ö‡∏ö column-wise ‡∏î‡πâ‡∏ß‡∏¢ fillna(series)
    if FEATURES_NUM:
        X_num = X_one[FEATURES_NUM].apply(pd.to_numeric, errors="coerce")
        X_num = X_num.fillna(num_stats)
    else:
        X_num = pd.DataFrame()

    if FEATURES_CAT:
        X_cat = X_one[FEATURES_CAT].copy()
        X_cat = X_cat.fillna(cat_stats)
    else:
        X_cat = pd.DataFrame()

    # ‡∏£‡∏ß‡∏°‡∏Å‡∏•‡∏±‡∏ö‡∏ï‡∏≤‡∏°‡∏•‡∏≥‡∏î‡∏±‡∏ö FEATURES
    if FEATURES_NUM and FEATURES_CAT:
        X_imp = pd.concat([X_num, X_cat], axis=1)[FEATURES]
    elif FEATURES_NUM:
        X_imp = X_num[FEATURES]
    else:
        X_imp = X_cat[FEATURES]

    # ‡∏ó‡∏≥‡πÉ‡∏´‡πâ‡∏≠‡πà‡∏≤‡∏ô‡∏á‡πà‡∏≤‡∏¢: ‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç‡πÄ‡∏õ‡πá‡∏ô float, ‡∏´‡∏°‡∏ß‡∏î‡∏´‡∏°‡∏π‡πà‡πÄ‡∏õ‡πá‡∏ô string/None
    out = {}
    for k, v in X_imp.iloc[0].to_dict().items():
        if k in FEATURES_NUM:
            out[k] = None if pd.isna(v) else float(v)
        else:
            out[k] = None if (isinstance(v, float) and pd.isna(v)) else str(v)
    return out

# ------------------------
# 11) ‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå + ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô "‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß/‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ"
# ------------------------
def predict_with_report(input_dict: Dict):
    """
    - ‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏£‡∏≤‡∏Ñ‡∏≤ (expm1)
    - ‡πÅ‡∏ö‡πà‡∏á‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡πÄ‡∏õ‡πá‡∏ô:
        features_provided: ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡∏Å‡∏£‡∏≠‡∏Å
        features_filled:   ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡∏Å‡∏£‡∏≠‡∏Å ‡πÅ‡∏•‡πâ‡∏ß‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡∏Ñ‡πà‡∏≤‡πÉ‡∏´‡πâ (imputed)
    - ‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÇ‡∏ä‡∏ß‡πå‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡∏à‡∏£‡∏¥‡∏á‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå
    """
    # ‡∏Ñ‡πà‡∏≤‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå (‡πÑ‡∏ß‡πâ‡πÇ‡∏ä‡∏ß‡πå)
    used_after_impute = impute_preview(pipe, input_dict)

    # ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö predict (‡∏õ‡∏•‡πà‡∏≠‡∏¢‡πÉ‡∏´‡πâ pipeline impute ‡∏≠‡∏µ‡∏Å‡∏ä‡∏±‡πâ‡∏ô ‡πÑ‡∏°‡πà‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏ú‡∏•)
    row = {f: input_dict.get(f, None) for f in FEATURES}
    X_one = pd.DataFrame([row], columns=FEATURES)
    y_log = pipe.predict(X_one)[0]
    y_hat = float(np.expm1(y_log))

    # ‡∏à‡∏±‡∏î‡πÅ‡∏¢‡∏Å "‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß" vs "‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ"
    provided, filled = {}, {}
    for f in FEATURES:
        if f in input_dict and input_dict[f] is not None:
            provided[f] = used_after_impute[f]  # ‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡∏à‡∏£‡∏¥‡∏á
        else:
            filled[f] = used_after_impute[f]     # ‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°

    return {
        "predicted_price_usd": round(y_hat, 2),
        "features_provided": provided,
        "features_filled": filled
    }

# ------------------------
# 12) ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå 3 ‡πÄ‡∏Ñ‡∏™ (‡∏û‡∏¥‡∏°‡∏û‡πå‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡πÄ‡∏ï‡πá‡∏°)
# ------------------------
examples = [
    {"LotArea": 8500},
    {"LotArea": 8500},
    {"LotArea": 8500},]

print("\n=== ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á (‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÅ‡∏¢‡∏Å '‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß' / '‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ') ===")
for i, ex in enumerate(examples, 1):
    out = predict_with_report(ex)
    print(f"Case {i}: input={ex}")
    print(f"‚Üí Predicted SalePrice: {out['predicted_price_usd']:,.2f} USD")

    print("‚Üí Features (‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß):")
    if out["features_provided"]:
        for k, v in out["features_provided"].items():
            print(f"   - {k}: {v}")
    else:
        print("   (‡πÑ‡∏°‡πà‡∏°‡∏µ ‚Äî ‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡πÑ‡∏°‡πà‡∏Å‡∏£‡∏≠‡∏Å‡πÄ‡∏•‡∏¢)")

    print("‚Üí Features (‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ):")
    if out["features_filled"]:
        for k, v in out["features_filled"].items():
            print(f"   - {k}: {v}")
    else:
        print("   (‡πÑ‡∏°‡πà‡∏°‡∏µ ‚Äî ‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡∏Å‡∏£‡∏≠‡∏Å‡∏Ñ‡∏£‡∏ö)")

    print()


=== ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô ===
Train RMSE: 34,471.10 | MAE: 19,641.75 | R2: 0.801
 Test RMSE: 29,620.65 | MAE: 19,180.85 | R2: 0.886

‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡πâ‡∏ß: house_price_pipeline_Train_10.joblib
‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå: features.json

=== ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á (‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÅ‡∏¢‡∏Å '‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß' / '‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ') ===
Case 1: input={'LotArea': 8500}
‚Üí Predicted SalePrice: 169,700.75 USD
‚Üí Features (‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß):
   - LotArea: 8500.0
‚Üí Features (‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ):
   - OverallQual: 6.0
   - TotalBsmtSF: 997.5
   - GarageCars: 2.0
   - Fireplaces: 1.0
   - BedroomAbvGr: 3.0
   - GrLivArea: 1473.0
   - FullBath: 2.0
   - Neighborhood: NAmes

Case 2: input={'LotArea': 8500}
‚Üí Predicted SalePrice: 169,700.75 USD
‚Üí Features (‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß):
   - LotArea: 8500.0
‚Üí Features (‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏

In [25]:
# train_house_price.py
# ====================
# ‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏£‡∏≤‡∏Ñ‡∏≤‡∏ö‡πâ‡∏≤‡∏ô (Regression) ‡∏à‡∏≤‡∏Å Kaggle (train_cleaned.csv)
# - ‡πÉ‡∏ä‡πâ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå 9 ‡∏ï‡∏±‡∏ß + ‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢ SalePrice (log-transform)
# - ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ö‡∏ô test split ‚Üí RMSE/MAE/R¬≤
# - ‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏°‡∏ï‡∏£‡∏¥‡∏Å‡πÅ‡∏ö‡∏ö Classification (Accuracy, F1, Classification Report, Confusion Matrix)
#   ‡πÇ‡∏î‡∏¢ "‡∏à‡∏±‡∏î‡∏ä‡πà‡∏ß‡∏á‡∏£‡∏≤‡∏Ñ‡∏≤ (bins)" ‡πÉ‡∏´‡πâ y_true ‡πÅ‡∏•‡∏∞ y_pred ‡∏à‡∏≤‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏• Regression
# - ‡πÄ‡∏ã‡∏ü‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÄ‡∏õ‡πá‡∏ô house_price_pipeline_Train_10.joblib ‡πÅ‡∏•‡∏∞ features.json
# - ‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡πÅ‡∏ö‡∏ö "‡πÇ‡∏ä‡∏ß‡πå‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏Å‡∏£‡∏≠‡∏Å" ‡πÅ‡∏•‡∏∞ "‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ (imputed)"
# - ‡πÅ‡∏Å‡πâ‡∏ö‡∏±‡πä‡∏Å pandas: ‡πÉ‡∏ä‡πâ fillna(series) ‡πÅ‡∏ó‡∏ô where(...)

import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    classification_report, confusion_matrix, f1_score, accuracy_score
)

# ------------------------
# 1) ‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
# ------------------------
DATA_PATH = "train_cleaned.csv"
assert Path(DATA_PATH).exists(), f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå {DATA_PATH} ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô"

df = pd.read_csv(DATA_PATH)

# ------------------------
# 2) ‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå/‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢
# ------------------------
FEATURES_NUM = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath"
]
FEATURES_CAT = ["Neighborhood"]
FEATURES = FEATURES_NUM + FEATURES_CAT
TARGET = "SalePrice"

# ‡∏Å‡∏£‡∏≠‡∏á‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà target ‡πÄ‡∏õ‡πá‡∏ô NaN ‡∏≠‡∏≠‡∏Å (‡∏Å‡∏±‡∏ô error)
df = df.dropna(subset=[TARGET])

X = df[FEATURES].copy()
y = np.log1p(df[TARGET].astype(float))  # ‡πÅ‡∏õ‡∏•‡∏á log1p ‡∏ä‡πà‡∏ß‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏ñ‡∏µ‡∏¢‡∏£‡∏Ç‡∏≠‡∏á‡∏£‡∏≤‡∏Ñ‡∏≤

# ------------------------
# 3) ‡πÅ‡∏ö‡πà‡∏á train/test
# ------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------
# 4) Preprocessor (impute + encode + scale)
# ------------------------
num_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

pre = ColumnTransformer(transformers=[
    ("num", num_tf, FEATURES_NUM),
    ("cat", cat_tf, FEATURES_CAT),
])

# ------------------------
# 5) ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•
# ------------------------
model = RidgeCV(alphas=(0.1, 1.0, 10.0))

# ------------------------
# 6) Pipeline ‡∏£‡∏ß‡∏°‡∏ó‡∏∏‡∏Å‡∏≠‡∏¢‡πà‡∏≤‡∏á
# ------------------------
pipe = Pipeline(steps=[
    ("preprocessor", pre),
    ("model", model),
])

# ------------------------
# 7) ‡πÄ‡∏ó‡∏£‡∏ô
# ------------------------
pipe.fit(X_train, y_train)

# ------------------------
# 8) ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏• Regression (‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì RMSE ‡πÄ‡∏≠‡∏á‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö sklearn ‡∏£‡∏∏‡πà‡∏ô‡πÄ‡∏Å‡πà‡∏≤)
# ------------------------
# ‡πÅ‡∏õ‡∏•‡∏á‡∏Å‡∏•‡∏±‡∏ö‡∏à‡∏≤‡∏Å log ‚Üí ‡∏™‡πÄ‡∏Å‡∏•‡∏à‡∏£‡∏¥‡∏á
pred_train = np.expm1(pipe.predict(X_train))
true_train = np.expm1(y_train)

pred_test = np.expm1(pipe.predict(X_test))
true_test = np.expm1(y_test)

# MSE/MAE/R2 train
mse_tr  = mean_squared_error(true_train, pred_train)  # ‡πÑ‡∏°‡πà‡∏°‡∏µ squared=
rmse_tr = np.sqrt(mse_tr)
mae_tr  = mean_absolute_error(true_train, pred_train)
r2_tr   = r2_score(true_train, pred_train)

# MSE/MAE/R2 test
mse_te  = mean_squared_error(true_test, pred_test)
rmse_te = np.sqrt(mse_te)
mae_te  = mean_absolute_error(true_test, pred_test)
r2_te   = r2_score(true_test, pred_test)

print("=== Regression metrics ===")
print(f"Train RMSE: {rmse_tr:,.2f} | MAE: {mae_tr:,.2f} | R2: {r2_tr:.3f}")
print(f" Test RMSE: {rmse_te:,.2f} | MAE: {mae_te:,.2f} | R2: {r2_te:.3f}")

# ------------------------
# 8.1) ‡πÄ‡∏°‡∏ï‡∏£‡∏¥‡∏Å‡πÅ‡∏ö‡∏ö Classification ‡∏ö‡∏ô '‡∏ä‡πà‡∏ß‡∏á‡∏£‡∏≤‡∏Ñ‡∏≤ (bins)'
# ------------------------
def price_to_bin(v):
    if v <  50000:  return "<50k"
    if v < 100000:  return "50-100k"
    if v < 150000:  return "100-150k"
    if v < 200000:  return "150-200k"
    if v < 250000:  return "200-250k"
    if v < 500000:  return "250-500k"
    return "500k+"

y_true_bins = pd.Series(true_test).map(price_to_bin)
y_pred_bins = pd.Series(pred_test).map(price_to_bin)

acc  = accuracy_score(y_true_bins, y_pred_bins)
f1_macro    = f1_score(y_true_bins, y_pred_bins, average="macro", zero_division=0)
f1_weighted = f1_score(y_true_bins, y_pred_bins, average="weighted", zero_division=0)

print("\n=== Classification-style metrics on price bins ===")
print(f"Accuracy: {acc:.4f} | F1-macro: {f1_macro:.4f} | F1-weighted: {f1_weighted:.4f}")
print("\nClassification Report:")
print(classification_report(y_true_bins, y_pred_bins, zero_division=0))

labels_order = ["<50k","50-100k","100-150k","150-200k","200-250k","250-500k","500k+"]
cm = confusion_matrix(y_true_bins, y_pred_bins, labels=labels_order)
cm_df = pd.DataFrame(cm, index=[f"T:{l}" for l in labels_order], columns=[f"P:{l}" for l in labels_order])
print("Confusion Matrix (rows=true, cols=pred):")
print(cm_df)

# ------------------------
# 9) ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏• + ‡πÄ‡∏°‡∏ó‡∏≤‡∏î‡∏≤‡∏ó‡∏≤
# ------------------------
MODEL_PATH = "house_price_pipeline_Train_10.joblib"
META_PATH = "features.json"

joblib.dump(pipe, MODEL_PATH)
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump({"features": FEATURES, "num": FEATURES_NUM, "cat": FEATURES_CAT},
              f, ensure_ascii=False, indent=2)

print(f"\n‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡πâ‡∏ß: {MODEL_PATH}")
print(f"‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå: {META_PATH}")

# ------------------------
# 10) Helper: preview ‡∏Ñ‡πà‡∏≤‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå "‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå‡πÅ‡∏•‡πâ‡∏ß"
# ------------------------
def impute_preview(pipe: Pipeline, row_dict: Dict) -> Dict:
    """
    ‡∏Ñ‡∏∑‡∏ô dict ‡∏Ç‡∏≠‡∏á‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏î‡∏¥‡∏ö‡∏ó‡∏±‡πâ‡∏á 9 ‡∏ï‡∏±‡∏ß '‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå‡πÅ‡∏•‡πâ‡∏ß' (‡∏Å‡πà‡∏≠‡∏ô scale/one-hot)
    ‡πÉ‡∏ä‡πâ fillna(series) ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏à‡∏±‡∏î‡πÅ‡∏ô‡∏ß‡∏ï‡∏≤‡∏°‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏≠‡∏±‡∏ï‡πÇ‡∏ô‡∏°‡∏±‡∏ï‡∏¥ (‡πÄ‡∏•‡∏µ‡πà‡∏¢‡∏á ValueError)
    """
    row = {f: row_dict.get(f, np.nan) for f in FEATURES}
    X_one = pd.DataFrame([row], columns=FEATURES)

    pre = pipe.named_steps.get("preprocessor")
    if pre is None:
        return {k: (None if pd.isna(v) else v) for k, v in X_one.iloc[0].to_dict().items()}

    try:
        num_imp: SimpleImputer = pre.named_transformers_["num"].named_steps["imputer"]
        cat_imp: SimpleImputer = pre.named_transformers_["cat"].named_steps["imputer"]
    except Exception:
        return {k: row_dict.get(k, None) for k in FEATURES}

    num_stats = pd.Series(num_imp.statistics_, index=FEATURES_NUM) if FEATURES_NUM else pd.Series(dtype=float)
    cat_stats = pd.Series(cat_imp.statistics_, index=FEATURES_CAT) if FEATURES_CAT else pd.Series(dtype=object)

    if FEATURES_NUM:
        X_num = X_one[FEATURES_NUM].apply(pd.to_numeric, errors="coerce")
        X_num = X_num.fillna(num_stats)
    else:
        X_num = pd.DataFrame()

    if FEATURES_CAT:
        X_cat = X_one[FEATURES_CAT].copy()
        X_cat = X_cat.fillna(cat_stats)
    else:
        X_cat = pd.DataFrame()

    if FEATURES_NUM and FEATURES_CAT:
        X_imp = pd.concat([X_num, X_cat], axis=1)[FEATURES]
    elif FEATURES_NUM:
        X_imp = X_num[FEATURES]
    else:
        X_imp = X_cat[FEATURES]

    out = {}
    for k, v in X_imp.iloc[0].to_dict().items():
        if k in FEATURES_NUM:
            out[k] = None if pd.isna(v) else float(v)
        else:
            out[k] = None if (isinstance(v, float) and pd.isna(v)) else str(v)
    return out

# ------------------------
# 11) ‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå + ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô "‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß/‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ"
# ------------------------
def predict_with_report(input_dict: Dict):
    """
    - ‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏£‡∏≤‡∏Ñ‡∏≤ (expm1)
    - ‡πÅ‡∏ö‡πà‡∏á‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡πÄ‡∏õ‡πá‡∏ô:
        features_provided: ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡∏Å‡∏£‡∏≠‡∏Å
        features_filled:   ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡∏Å‡∏£‡∏≠‡∏Å ‡πÅ‡∏•‡πâ‡∏ß‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡∏Ñ‡πà‡∏≤‡πÉ‡∏´‡πâ (imputed)
    - ‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÇ‡∏ä‡∏ß‡πå‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡∏à‡∏£‡∏¥‡∏á‡∏´‡∏•‡∏±‡∏á‡∏≠‡∏¥‡∏°‡∏û‡∏¥‡∏ß‡∏ï‡πå
    """
    used_after_impute = impute_preview(pipe, input_dict)

    row = {f: input_dict.get(f, None) for f in FEATURES}
    X_one = pd.DataFrame([row], columns=FEATURES)
    y_log = pipe.predict(X_one)[0]
    y_hat = float(np.expm1(y_log))

    provided, filled = {}, {}
    for f in FEATURES:
        if f in input_dict and input_dict[f] is not None:
            provided[f] = used_after_impute[f]
        else:
            filled[f] = used_after_impute[f]

    return {
        "predicted_price_usd": round(y_hat, 2),
        "features_provided": provided,
        "features_filled": filled
    }

# ------------------------
# 12) ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå 3 ‡πÄ‡∏Ñ‡∏™ (‡∏û‡∏¥‡∏°‡∏û‡πå‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡πÄ‡∏ï‡πá‡∏°)
# ------------------------
examples = [
    {"LotArea": 8500, "OverallQual": 6, "Neighborhood": "CollgCr"},
    {"OverallQual": 7, "GrLivArea": 1800, "FullBath": 2, "BedroomAbvGr": 3, "Neighborhood": "NAmes"},
    {"LotArea": 12000, "TotalBsmtSF": 900, "GarageCars": 2, "GrLivArea": 2200, "FullBath": 2},
]

print("\n=== ‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á (‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÅ‡∏¢‡∏Å '‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß' / '‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ') ===")
for i, ex in enumerate(examples, 1):
    out = predict_with_report(ex)
    print(f"Case {i}: input={ex}")
    print(f"‚Üí Predicted SalePrice: {out['predicted_price_usd']:,.2f} USD")

    print("‚Üí Features (‡πÉ‡∏™‡πà‡πÅ‡∏•‡πâ‡∏ß):")
    if out["features_provided"]:
        for k, v in out["features_provided"].items():
            print(f"   - {k}: {v}")
    else:
        print("   (‡πÑ‡∏°‡πà‡∏°‡∏µ ‚Äî ‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡πÑ‡∏°‡πà‡∏Å‡∏£‡∏≠‡∏Å‡πÄ‡∏•‡∏¢)")

    print("‚Üí Features (‡∏£‡∏∞‡∏ö‡∏ö‡πÄ‡∏ï‡∏¥‡∏°‡πÉ‡∏´‡πâ):")
    if out["features_filled"]:
        for k, v in out["features_filled"].items():
            print(f"   - {k}: {v}")
    else:
        print("   (‡πÑ‡∏°‡πà‡∏°‡∏µ ‚Äî ‡∏ú‡∏π‡πâ‡πÉ‡∏ä‡πâ‡∏Å‡∏£‡∏≠‡∏Å‡∏Ñ‡∏£‡∏ö)")

    print()


=== Regression metrics ===
Train RMSE: 34,471.10 | MAE: 19,641.75 | R2: 0.801
 Test RMSE: 29,620.65 | MAE: 19,180.85 | R2: 0.886

=== Classification-style metrics on price bins ===
Accuracy: 0.7363 | F1-macro: 0.5845 | F1-weighted: 0.7357

Classification Report:
              precision    recall  f1-score   support

    100-150k       0.81      0.79      0.80       113
    150-200k       0.69      0.70      0.70        71
    200-250k       0.57      0.74      0.64        34
    250-500k       0.85      0.76      0.80        45
     50-100k       0.64      0.67      0.65        24
       500k+       1.00      0.33      0.50         3
        <50k       0.00      0.00      0.00         2

    accuracy                           0.74       292
   macro avg       0.65      0.57      0.58       292
weighted avg       0.74      0.74      0.74       292

Confusion Matrix (rows=true, cols=pred):
            P:<50k  P:50-100k  P:100-150k  P:150-200k  P:200-250k  P:250-500k  \
T:<50k           0

In [29]:
# train_and_pack.py
# ‡πÄ‡∏ó‡∏£‡∏ô RidgeCV + ‡∏´‡πà‡∏≠‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏•‡∏≤‡∏™ HousePriceModel (predict + suggest_profiles)
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Any, Optional

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


FEATURES_NUM = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath"
]
FEATURES_CAT = ["Neighborhood"]
FEATURES_ALL = FEATURES_NUM + FEATURES_CAT
TARGET = "SalePrice"
INT_LIKE = ["OverallQual","GarageCars","Fireplaces","BedroomAbvGr","FullBath"]

class HousePriceModel:
    """‡∏ö‡∏£‡∏£‡∏à‡∏∏ Pipeline + ‡∏î‡∏≤‡∏ï‡πâ‡∏≤‡πÄ‡∏ö‡∏™‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏™‡∏£‡πâ‡∏≤‡∏á 3 ‡πÇ‡∏õ‡∏£‡πÑ‡∏ü‡∏•‡πå‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥"""
    def __init__(self, pipe: Pipeline, df_train: pd.DataFrame):
        self.pipe = pipe
        # ‡πÄ‡∏Å‡πá‡∏ö‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ (‡πÄ‡∏û‡∏∑‡πà‡∏≠ suggestion)
        df = df_train.copy()
        for c in FEATURES_ALL:
            if c not in df.columns:
                df[c] = np.nan
        self.df = df[FEATURES_ALL].copy()

    # ---------- Utils ----------
    def _coerce_row_types(self, d: Dict[str, Any]) -> Dict[str, Any]:
        d = dict(d)
        for k in INT_LIKE:
            if k in d and d[k] is not None and not pd.isna(d[k]):
                d[k] = int(round(float(d[k])))
        if "LotArea" in d and d["LotArea"] is not None and not pd.isna(d["LotArea"]):
            d["LotArea"] = float(d["LotArea"])
        if "Neighborhood" in d and d["Neighborhood"] is not None and not pd.isna(d["Neighborhood"]):
            d["Neighborhood"] = str(d["Neighborhood"])
        return d

    def _predict_price(self, row: Dict[str, Any]) -> float:
        X = pd.DataFrame([{c: row.get(c, np.nan) for c in FEATURES_ALL}], columns=FEATURES_ALL)
        y_log = self.pipe.predict(X)[0]
        return float(np.expm1(y_log))

    def _group_by_lotarea(self, lot: float, pct_window: float = 0.15, min_rows: int = 50) -> pd.DataFrame:
        g = pd.DataFrame()
        for mul in [1.0, 1.5, 2.0, 3.0]:
            low, high = lot*(1 - pct_window*mul), lot*(1 + pct_window*mul)
            g = self.df[(self.df["LotArea"] >= low) & (self.df["LotArea"] <= high)].copy()
            if len(g) >= min_rows:
                break
        if g.empty:
            g = self.df.copy()
        if "Neighborhood" in g.columns:
            g = g.dropna(subset=["Neighborhood"])
        return g.reset_index(drop=True)

    # ---------- Public: prediction ----------
    def predict(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """‡∏£‡∏±‡∏ö‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡πÑ‡∏°‡πà‡∏Ñ‡∏£‡∏ö‡∏Å‡πá‡πÑ‡∏î‡πâ ‚Üí ‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏£‡∏≤‡∏Ñ‡∏≤ + ‡πÅ‡∏™‡∏î‡∏á preview ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ (median/mode ‡πÄ‡∏ï‡∏¥‡∏°‡∏Ñ‡∏£‡πà‡∏≤‡∏ß ‡πÜ)"""
        row = self._coerce_row_types({k: v for k, v in payload.items() if v is not None})
        price = self._predict_price(row)

        # preview ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ (‡πÄ‡∏ï‡∏¥‡∏°‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥‡∏á‡πà‡∏≤‡∏¢ ‡πÜ ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÇ‡∏ä‡∏ß‡πå)
        g = self.df
        preview = {}
        for c in FEATURES_NUM:
            preview[c] = row.get(c, float(g[c].median()))
        for c in FEATURES_CAT:
            preview[c] = row.get(c, str(g[c].mode(dropna=True).iloc[0]) if g[c].notna().any() else None)

        return {
            "predicted_price_usd": round(price, 2),
            "features_used_preview": self._coerce_row_types(preview)
        }

    # ---------- Public: 3 suggestions ----------
    def suggest_profiles(
        self,
        lotarea: float,
        method: str = "neighborhood",
        pct_window: float = 0.15
    ) -> Dict[str, Any]:
        """
        ‡∏™‡∏£‡πâ‡∏≤‡∏á 3 ‡πÇ‡∏õ‡∏£‡πÑ‡∏ü‡∏•‡πå (features ‡∏Ñ‡∏£‡∏ö 9) + ‡∏£‡∏≤‡∏Ñ‡∏≤
        method: 'neighborhood' | 'quantile' | 'knn'
        """
        lot = float(lotarea)
        group = self._group_by_lotarea(lot, pct_window=pct_window, min_rows=60)

        if method == "neighborhood":
            items = self._suggest_neighborhood(group, lot, n=3)
        elif method == "quantile":
            items = self._suggest_quantile(group, lot, jitter=0.05)
        elif method == "knn":
            items = self._suggest_knn(group, lot, k=5, n=3, jitter=0.03)
        else:
            raise ValueError("method must be one of: neighborhood | quantile | knn")

        return {
            "summary": {
                "method": method,
                "n_in_group": int(len(group)),
                "lotarea_input": lot,
                "lotarea_range": [int(group["LotArea"].min()), int(group["LotArea"].max())],
                "neighborhood_top": group["Neighborhood"].value_counts().head(3).to_dict()
            },
            "items": items
        }

    # ----- methods impl -----
    def _suggest_neighborhood(self, group_df: pd.DataFrame, lot: float, n: int = 3) -> List[Dict[str, Any]]:
        df = group_df.copy()
        df["_diff"] = (df["LotArea"] - lot).abs()
        df = df.sort_values("_diff").reset_index(drop=True)

        picks = []
        if len(df) >= 1: picks.append(df.iloc[0])           # ‡πÉ‡∏Å‡∏•‡πâ‡∏™‡∏∏‡∏î
        if len(df) >= 3: picks.append(df.iloc[len(df)//2])  # ‡∏Å‡∏•‡∏≤‡∏á
        if len(df) >= 2: picks.append(df.iloc[-1])          # ‡∏´‡πà‡∏≤‡∏á‡∏™‡∏∏‡∏î

        items = []
        for r in picks[:n]:
            feat = {c: r[c] for c in FEATURES_ALL}
            feat = self._coerce_row_types(feat)
            items.append({
                "features": feat,
                "predicted_price_usd": round(self._predict_price(feat), 2)
            })
        return items

    def _suggest_quantile(self, group_df: pd.DataFrame, lot: float, jitter: float = 0.05) -> List[Dict[str, Any]]:
        g = group_df.copy()
        q = g[FEATURES_NUM].quantile([0.25, 0.5, 0.75]).to_dict(orient="index")
        modes = g["Neighborhood"].value_counts().index.tolist()[:3]
        if len(modes) < 3:
            modes += ([modes[-1]] * (3 - len(modes))) if modes else ["NAmes", "CollgCr", "OldTown"]

        items = []
        for qn, neigh in zip([0.25, 0.5, 0.75], modes):
            base = {k: q[qn][k] for k in FEATURES_NUM}
            base["LotArea"] = lot * (1.0 + np.random.uniform(-jitter, jitter))
            base["Neighborhood"] = neigh
            feat = self._coerce_row_types(base)
            items.append({
                "features": feat,
                "predicted_price_usd": round(self._predict_price(feat), 2)
            })
        return items

    def _suggest_knn(self, group_df: pd.DataFrame, lot: float, k: int = 5, n: int = 3, jitter: float = 0.03) -> List[Dict[str, Any]]:
        g = group_df.copy()
        g_num = g.copy()
        neigh_map = {v: i for i, v in enumerate(g["Neighborhood"].fillna("NA").unique())}
        g_num["Neighborhood"] = g["Neighborhood"].map(neigh_map).astype(float)

        imputer = KNNImputer(n_neighbors=k, weights="distance")
        imputer.fit(g_num[FEATURES_ALL])

        items = []
        for j in [-jitter, 0.0, jitter]:
            row = {c: np.nan for c in FEATURES_ALL}
            row["LotArea"] = lot * (1.0 + (np.random.uniform(-abs(j), abs(j)) if j != 0 else 0.0))
            df_row = pd.DataFrame([row], columns=FEATURES_ALL)
            df_row["Neighborhood"] = np.nan
            pred = imputer.transform(df_row)[0]
            d = {FEATURES_ALL[i]: pred[i] for i in range(len(FEATURES_ALL))}
            inv_neigh = {v: k for k, v in neigh_map.items()}
            d["Neighborhood"] = inv_neigh.get(int(round(d["Neighborhood"])), "NAmes")
            feat = self._coerce_row_types(d)
            items.append({
                "features": feat,
                "predicted_price_usd": round(self._predict_price(feat), 2)
            })
        return items[:n]


# ================== ‡πÄ‡∏ó‡∏£‡∏ô + ‡πÅ‡∏û‡πá‡∏Å + ‡πÄ‡∏ã‡∏ü ==================
DATA_PATH = "train_cleaned.csv"
assert Path(DATA_PATH).exists(), "‡πÑ‡∏°‡πà‡∏û‡∏ö train_cleaned.csv"
df = pd.read_csv(DATA_PATH).dropna(subset=[TARGET])

X = df[FEATURES_ALL].copy()
y = np.log1p(df[TARGET].astype(float))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num_tf = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_tf = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))])
pre = ColumnTransformer([("num", num_tf, FEATURES_NUM), ("cat", cat_tf, FEATURES_CAT)])

model = RidgeCV(alphas=(0.1, 1.0, 10.0))
pipe = Pipeline([("preprocessor", pre), ("model", model)])
pipe.fit(X_train, y_train)

# ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏ô
pred_test = np.expm1(pipe.predict(X_test)); true_test = np.expm1(y_test)
rmse = mean_squared_error(true_test, pred_test) ** 0.5
mae  = mean_absolute_error(true_test, pred_test)
r2   = r2_score(true_test, pred_test)
print(f"Test RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.3f}")

# ‡πÅ‡∏û‡πá‡∏Å‡πÅ‡∏•‡∏∞‡πÄ‡∏ã‡∏ü
packed = HousePriceModel(pipe, df_train=df)
joblib.dump(packed, "house_price_model.joblib")
print("‚úÖ Saved packed model to house_price_model.joblib")


Test RMSE: 29,620.65 | MAE: 19,180.85 | R¬≤: 0.886
‚úÖ Saved packed model to house_price_model.joblib
