In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Step 2: Read the training table from disk
train = pd.read_csv("train.csv")

# Steps 3-4: Declare the input columns, grouped by how they are preprocessed
num_features = ["OverallQual", "GrLivArea", "GarageCars", "TotalBsmtSF"]
cat_features = ["Neighborhood"]

# The full feature list is the numeric columns followed by the categorical
# ones, so the two typed lists are the single source of truth.
features = num_features + cat_features
X = train.loc[:, features]
y = train.loc[:, "SalePrice"]

# Step 5: Preprocessing, one sub-pipeline per feature type
# Numeric columns: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fitting are ignored at
# transform time instead of raising an error.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its matching sub-pipeline.
column_routes = [
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Step 6: Assemble the end-to-end estimator
# Regressor hyperparameters are kept together; the fixed random_state makes
# repeated fits reproducible.
gbr_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 4,
    "random_state": 42,
}

# Preprocessing and the regressor are chained so they are always fitted and
# applied together.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor(**gbr_params)),
])

# Step 7: Hold out 20% of the rows for validation (seeded split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 8: Fit the pipeline on the training portion only
model.fit(X_train, y_train)

# Step 9: Score the held-out rows; RMSE is the square root of the MSE
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("Validation RMSE:", rmse)

# Step 10: 5-fold cross-validation over the full feature table
# The scorer reports negated RMSE (so that larger is better); flip the sign
# back to get plain RMSE per fold.
neg_rmse_per_fold = cross_val_score(
    model,
    X,
    y,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
scores = -neg_rmse_per_fold
print("Cross-validation RMSE:", scores.mean())

# Step 11: Refit the pipeline on every available row
model.fit(X, y)

# Step 12: Persist the fitted pipeline (preprocessing + regressor) to disk
model_path = "house_price_model.pkl"
joblib.dump(model, model_path)
print("Model saved as house_price_model.pkl")


Validation RMSE: 28576.539641839114
Cross-validation RMSE: 31802.746178334557
Model saved as house_price_model.pkl
