In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib

# โหลดข้อมูล
train_df = pd.read_csv("train_cleaned.csv")

print("Shape:", train_df.shape)
print("Columns:", len(train_df.columns))


Shape: (1460, 78)
Columns: 78


In [2]:
selected_features = [
    'OverallQual', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'GarageCars', 'YearBuilt', 'YearRemodAdd', 'FullBath', 'BsmtQual',
    'KitchenQual', 'Neighborhood', 'LotArea', 'Fireplaces'
]
target = 'SalePrice'

X = train_df[selected_features]
y = train_df[target]

# ✅ STEP: Create Price Category Groups
# Convert continuous SalePrice into groups (classification)
bins = [0, 50000, 100000, 150000, 200000, 250000, 500000, 1000000]
labels = ['<50k', '50-100k', '100-150k', '150-200k', '200-250k', '250-500k', '500k+']

X = train_df[selected_features]
y = pd.cut(train_df['SalePrice'], bins=bins, labels=labels)



In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [4]:
from sklearn.preprocessing import OrdinalEncoder

# แบ่งชนิดของข้อมูล
numeric_features = [
    'GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'GarageCars', 'YearBuilt', 'YearRemodAdd', 'FullBath', 'LotArea', 'Fireplaces'
]
categorical_features = ['OverallQual', 'BsmtQual', 'KitchenQual', 'Neighborhood']

# ตัวแปลงข้อมูลตัวเลข
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# ตัวแปลงข้อมูลหมวดหมู่
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# รวมทั้งหมดใน ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# สร้าง pipeline รวมทั้งหมด
model = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])


In [10]:
feature_names = list(pipeline.named_steps['preprocessor'].get_feature_names_out())
print("จำนวน features หลัง encode:", len(feature_names))
print("ตัวอย่าง feature:", feature_names[:20])


จำนวน features หลัง encode: 53
ตัวอย่าง feature: ['num__GrLivArea', 'num__TotalBsmtSF', 'num__1stFlrSF', 'num__2ndFlrSF', 'num__GarageCars', 'num__YearBuilt', 'num__YearRemodAdd', 'num__FullBath', 'num__LotArea', 'num__Fireplaces', 'cat__OverallQual_1', 'cat__OverallQual_2', 'cat__OverallQual_3', 'cat__OverallQual_4', 'cat__OverallQual_5', 'cat__OverallQual_6', 'cat__OverallQual_7', 'cat__OverallQual_8', 'cat__OverallQual_9', 'cat__OverallQual_10']


In [11]:
model = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

In [15]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [16]:
pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)


In [17]:
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Accuracy: 0.7568493150684932

Classification Report:
               precision    recall  f1-score   support

    100-150k       0.80      0.94      0.86       113
    150-200k       0.73      0.72      0.72        71
    200-250k       0.50      0.59      0.54        34
    250-500k       0.91      0.69      0.78        45
     50-100k       0.86      0.50      0.63        24
       500k+       1.00      0.33      0.50         3
        <50k       0.00      0.00      0.00         2

    accuracy                           0.76       292
   macro avg       0.68      0.54      0.58       292
weighted avg       0.77      0.76      0.75       292



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [18]:
joblib.dump(pipeline, "house_price_pipeline_new.joblib")
print("✅ บันทึกโมเดลใหม่เรียบร้อยแล้ว")


✅ บันทึกโมเดลใหม่เรียบร้อยแล้ว


In [19]:
import joblib
import inspect

def inspect_model(file_path):
    print(f"\n🔎 Checking file: {file_path}")
    try:
        model = joblib.load(file_path)
        print("✅ File loaded successfully!")
        print("📦 Object type:", type(model))
        
        # ถ้าเป็น Pipeline
        if hasattr(model, 'named_steps'):
            print("\n🧱 Pipeline Steps:")
            for step_name, step_obj in model.named_steps.items():
                print(f"  - {step_name}: {type(step_obj)}")
            
            # ถ้ามี preprocessor
            if 'preprocessor' in model.named_steps:
                pre = model.named_steps['preprocessor']
                if hasattr(pre, 'transformers'):
                    print("\n📊 Transformers in preprocessor:")
                    for name, transformer, cols in pre.transformers:
                        print(f"  • {name}: {type(transformer)} (columns: {cols})")
            
            # ถ้ามี model
            if 'model' in model.named_steps:
                print("\n🤖 Model details:")
                mdl = model.named_steps['model']
                print("  - Model type:", type(mdl))
                if hasattr(mdl, 'get_params'):
                    print("  - Model parameters:")
                    for k, v in list(mdl.get_params().items())[:10]:
                        print(f"     {k}: {v}")
        
        # ถ้าไม่ใช่ pipeline
        else:
            print("\n⚙️ This object is not a Pipeline.")
            if hasattr(model, 'get_params'):
                print("Model parameters:")
                for k, v in list(model.get_params().items())[:10]:
                    print(f"   {k}: {v}")
        
    except Exception as e:
        print("❌ Error loading file:", e)


# 🔹 เรียกใช้ตรวจสอบทั้งสองไฟล์
inspect_model("house_price_pipeline.pkl")
inspect_model("house_price_pipeline.joblib")



🔎 Checking file: house_price_pipeline.pkl
✅ File loaded successfully!
📦 Object type: <class 'sklearn.linear_model._base.LinearRegression'>

⚙️ This object is not a Pipeline.
Model parameters:
   copy_X: True
   fit_intercept: True
   n_jobs: None
   positive: False
   tol: 1e-06

🔎 Checking file: house_price_pipeline.joblib
✅ File loaded successfully!
📦 Object type: <class 'sklearn.pipeline.Pipeline'>

🧱 Pipeline Steps:
  - preprocess: <class 'sklearn.compose._column_transformer.ColumnTransformer'>
  - reg: <class 'sklearn.compose._target.TransformedTargetRegressor'>


***Train 10 Model***

In [20]:
# 0) Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib

# 1) Load data
df = pd.read_csv("train_cleaned.csv")

# 2) Define features/target (SalePrice = target)
feature_cols = [
    "OverallQual",
    "TotalBsmtSF",
    "LotArea",
    "GarageCars",
    "Fireplaces",
    "BedroomAbvGr",
    "Neighborhood",
    "GrLivArea",
    "FullBath",
]
target_col = "SalePrice"

X = df[feature_cols].copy()
y = df[target_col].astype(float)

# 3) Enforce dtypes (สำคัญมากสำหรับ front-end)
numeric_int_cols = [
    "OverallQual", "TotalBsmtSF", "LotArea", "GarageCars",
    "Fireplaces", "BedroomAbvGr", "GrLivArea", "FullBath"
]
cat_cols = ["Neighborhood"]

# แปลง numeric เป็นตัวเลข (กันพังถ้ามีสตริงหลงมา)
for c in numeric_int_cols:
    X[c] = pd.to_numeric(X[c], errors="coerce")

# Neighborhood เป็น string
X[cat_cols] = X[cat_cols].astype(str)

# 4) Basic sanity clips (ช่วยให้ dropdown/front-end มี guardrail)
X["OverallQual"] = X["OverallQual"].clip(1, 10)
X["GarageCars"]  = X["GarageCars"].clip(0, 4)
X["FullBath"]    = X["FullBath"].clip(lower=0)
X["Fireplaces"]  = X["Fireplaces"].clip(lower=0)
X["BedroomAbvGr"]= X["BedroomAbvGr"].clip(lower=0)
X["TotalBsmtSF"] = X["TotalBsmtSF"].clip(lower=0)
X["GrLivArea"]   = X["GrLivArea"].clip(lower=0)
X["LotArea"]     = X["LotArea"].clip(lower=0)

# 5) Preprocessor
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numeric_int_cols),
        ("cat", cat_transformer, cat_cols)
    ]
)

# 6) Model
model = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# 7) 10-fold CV score (R^2)
cv = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring="r2")
print(f"10-fold R^2: mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")

# 8) Hold-out test (optional แต่ช่วยเช็ค over/underfit)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
pipeline.fit(X_train, y_train)
pred_train = pipeline.predict(X_train)
pred_test  = pipeline.predict(X_test)

rmse = lambda a,b: np.sqrt(((a-b)**2).mean())

print(f"Train R^2: {r2_score(y_train, pred_train):.3f}")
print(f"Test  R^2: {r2_score(y_test,  pred_test):.3f}")
print(f"MAE: {mean_absolute_error(y_test, pred_test):.2f}")
print(f"RMSE: {rmse(y_test, pred_test):.2f}")

# 9) Fit on all data & save
pipeline.fit(X, y)
joblib.dump(pipeline, "house_price_pipeline_9f_cv10.joblib")
print("✅ Saved: house_price_pipeline_9f_cv10.joblib")

# (Optional) ดูชื่อฟีเจอร์หลังแปลงแล้ว
feat_names = pipeline.named_steps["preprocessor"].get_feature_names_out()
print("Features after transform:", len(feat_names))
print(feat_names[:15])


10-fold R^2: mean=0.793, std=0.095
Train R^2: 0.808
Test  R^2: 0.834
MAE: 21798.35
RMSE: 35685.43
✅ Saved: house_price_pipeline_9f_cv10.joblib
Features after transform: 33
['num__OverallQual' 'num__TotalBsmtSF' 'num__LotArea' 'num__GarageCars'
 'num__Fireplaces' 'num__BedroomAbvGr' 'num__GrLivArea' 'num__FullBath'
 'cat__Neighborhood_Blmngtn' 'cat__Neighborhood_Blueste'
 'cat__Neighborhood_BrDale' 'cat__Neighborhood_BrkSide'
 'cat__Neighborhood_ClearCr' 'cat__Neighborhood_CollgCr'
 'cat__Neighborhood_Crawfor']
