# Finding the Best Model

In [3]:
!pip install optuna

import optuna
import pandas as pd
import pickle

from pgeocode import Nominatim
from sklearn.compose import ColumnTransformer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

Collecting optuna
  Obtaining dependency information for optuna from https://files.pythonhosted.org/packages/5c/5e/068798a8c7087863e7772e9363a880ab13fe55a5a7ede8ec42fab8a1acbb/optuna-4.4.0-py3-none-any.whl.metadata
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Obtaining dependency information for colorlog from https://files.pythonhosted.org/packages/e3/51/9b208e85196941db2f0654ad0357ca6388ab3ed67efdbfc799f35d1f83aa/colorlog-6.9.0-py3-none-any.whl.metadata
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
   ---------------------------------------- 0.0/395.9 kB ? eta -:--:--
   -- ------------------------------------ 20.5/395.9 kB 640.0 kB/s eta 0:00:01
   ------ -------------------------------- 61.4/395.9 kB 812.7 kB/s eta 0:00:01
   --------- ---------------------------- 102.4/395.9 kB 980.4 kB/s eta 0:00:01
   -------------- ----------------------- 153.6/395.9 kB 913.1 k

In [5]:
df = pd.read_csv('../data/housing_cleaned.csv')

df.head()

Unnamed: 0,price,city,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,sqft_basement,pct_basement,house_age,was_renovated,renovation_age,sqft_living15,sqft_lot15
0,231300.0,Seattle,2,1.0,1180,5650,1.0,3,0,0.0,70,0,70,1340,5650
1,538000.0,Seattle,3,2.25,2570,7242,2.0,3,400,0.155642,74,1,34,1690,7639
2,180000.0,Kenmore,2,1.0,770,10000,1.0,3,0,0.0,92,0,92,2720,8062
3,604000.0,Seattle,4,3.0,1960,5000,1.0,5,910,0.464286,60,0,60,1360,5000
4,510000.0,Sammamish,3,2.0,1680,8080,1.0,3,0,0.0,38,0,38,1800,7503


In [7]:
X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=X['city'],
    random_state=42,
)

In [9]:
binary = ['was_renovated']
categorical = ['city']
numerical = X.select_dtypes(include=['float64', 'int64']).columns.drop(binary).tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical),
    ('cat', OneHotEncoder(drop='first'), categorical),
    ('bin', 'passthrough', binary)
])

In [13]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42
    }

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**params))
    ])

    # Use 3-fold cross-validation
    score = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1
    )

    return -1.0 * score.mean()

In [15]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[I 2025-07-15 15:03:10,800] A new study created in memory with name: no-name-0730738e-507a-49cb-9553-22f227d88473
[I 2025-07-15 15:03:15,961] Trial 0 finished with value: 199004.2259497925 and parameters: {'n_estimators': 1000, 'max_depth': 12, 'learning_rate': 0.1769121924539134, 'subsample': 0.9970248078473156, 'colsample_bytree': 0.8113458803248855, 'min_child_weight': 8}. Best is trial 0 with value: 199004.2259497925.
[I 2025-07-15 15:03:22,981] Trial 1 finished with value: 199172.26735637154 and parameters: {'n_estimators': 600, 'max_depth': 12, 'learning_rate': 0.2045942140143342, 'subsample': 0.8131698224974224, 'colsample_bytree': 0.8051512342389666, 'min_child_weight': 1}. Best is trial 0 with value: 199004.2259497925.
[I 2025-07-15 15:03:27,739] Trial 2 finished with value: 202620.6770043779 and parameters: {'n_estimators': 800, 'max_depth': 9, 'learning_rate': 0.22821650785282405, 'subsample': 0.7565497684399295, 'colsample_bytree': 0.6289480414960645, 'min_child_weight': 4}

In [16]:
study.best_params

{'n_estimators': 400,
 'max_depth': 7,
 'learning_rate': 0.0392644727808183,
 'subsample': 0.9436534387455704,
 'colsample_bytree': 0.5036947715909679,
 'min_child_weight': 6}

In [19]:
best_params = study.best_params

best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**best_params))
])

best_pipeline.fit(X_train, y_train)

In [23]:
y_pred_train = best_pipeline.predict(X_train)
y_pred_test = best_pipeline.predict(X_test)

In [25]:
print(f'Train RMSE: {root_mean_squared_error(y_train, y_pred_train):,.0f}')
print(f'Test RMSE: {root_mean_squared_error(y_test, y_pred_test):,.0f}')

Train RMSE: 106,192
Test RMSE: 164,350


In [27]:
with open('../api/models/best_optuna_model.pkl', 'wb') as f:
    pickle.dump(best_pipeline, f)