In [1]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

# 📥 Load + 🧹 clean in a single chained pass
# Steps, in order:
#   1. read the CSV from the notebook's /content directory
#   2. drop rows that are exact duplicates of an earlier row
#   3. keep only rows where both `bedrooms` and `bathrooms` are strictly positive
#   4. remove `id` and `date` so they are not carried forward as model features
df = (
    pd.read_csv("/content/kc_house_data.csv")
    .drop_duplicates()
    .loc[lambda frame: (frame['bedrooms'] > 0) & (frame['bathrooms'] > 0)]
    .drop(columns=['id', 'date'])
)

# 🧪 Feature engineering: add a grade × living-area interaction column to `df`
df['grade_x_sqft'] = df['grade'] * df['sqft_living']

# 🎯 Target is `price`; every other remaining column is a feature.
# get_dummies one-hot encodes any non-numeric columns (dropping the first level of each).
y = df['price']
X = pd.get_dummies(
    df.drop(columns=['price']),
    drop_first=True,
)

# 🧪 Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 🎯 Candidate hyperparameter values for tuning
# Five hyperparameters with three candidate values each (3**5 = 243 possible combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# 🚀 Randomized search over the XGBoost hyperparameter candidates
# Base regressor: seeded for repeatability, with XGBoost's own logging silenced.
xgb = XGBRegressor(verbosity=0, random_state=42)

# Sample 10 configurations from `param_grid`, score each by R² with 3-fold
# cross-validation on the training split (30 fits in total), and run the fits
# in parallel (n_jobs=-1). `fit` returns the fitted search object, so the
# construction and the fit are chained into one expression.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    scoring='r2',
    cv=3,
    n_iter=10,
    n_jobs=-1,
    random_state=42,
    verbose=1,
).fit(X_train, y_train)

# Estimator carrying the best-scoring configuration found by the search.
best_model = random_search.best_estimator_

# 📈 Evaluate the tuned model on the held-out test split
y_pred = best_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the mean squared error, so it is in the same units as `price`.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

# 📊 Report: the winning configuration followed by both test-set metrics, one per line
print(
    "✅ Best Hyperparameters:",
    random_search.best_params_,
    f"R² Score on test set: {r2:.4f}",
    f"RMSE on test set: {rmse:.2f}",
    sep="\n",
)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
✅ Best Hyperparameters:
{'subsample': 0.7, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
R² Score on test set: 0.8745
RMSE on test set: 127780.25
