In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import randint, uniform
from sklearn.pipeline import Pipeline


In [2]:
df = pd.read_csv("../data/processed/zillow_cleaned_feature_engineered_encoded.csv")
df.head()


Unnamed: 0,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedfinishedsquarefeet,fireplacecnt,garagecarcnt,garagetotalsqft,latitude,longitude,lotsizesquarefeet,...,regionidneighborhood_top_274517.0,regionidneighborhood_top_275078.0,regionidneighborhood_top_275405.0,regionidneighborhood_top_275496.0,regionidneighborhood_top_276119.0,regionidneighborhood_top_276450.0,regionidneighborhood_top_276476.0,regionidneighborhood_top_276514.0,regionidneighborhood_top_403184.0,has_garage
0,3.5,4.0,6.0,3100.0,1.0,2.0,633.0,33634931.0,-117869207.0,4506.0,...,False,False,False,False,False,False,False,False,False,1
1,1.0,2.0,6.0,1465.0,1.0,1.0,360.0,34449266.0,-119281531.0,12647.0,...,False,False,False,False,False,False,False,False,False,1
2,2.0,3.0,6.0,1243.0,1.0,2.0,440.0,33886168.0,-117823170.0,8432.0,...,False,False,False,False,False,False,False,False,False,1
3,3.0,4.0,8.0,2376.0,1.0,2.0,436.0,34245180.0,-118240722.0,13038.0,...,False,False,False,False,False,False,False,False,False,1
4,3.0,3.0,8.0,1312.0,1.0,2.0,436.0,34185120.0,-118414640.0,278581.0,...,False,False,False,False,False,False,False,False,False,1


In [4]:
X = df.drop(columns=["taxvaluedollarcnt"])
y = df["taxvaluedollarcnt"]    

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train:", X_train.shape, " Test:", X_test.shape)




Train: (61378, 213)  Test: (15345, 213)


In [None]:
#do pipeline
#randomized search
#final model predict, x test, y test, r2, mae, rmse
#confusion matrix, feature importance analysis, permutation importance


Ridge

In [None]:
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

ridge_param_grid = {'ridge__alpha': [100.0, 200.0, 300.0, 400.0, 500.0]}

ridge_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=ridge_param_grid,
    scoring={
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'rmse': 'neg_root_mean_squared_error'
    },
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=1
)

ridge_search.fit(X_train, y_train)

ridge_cv = pd.DataFrame({
    'alpha': [p['ridge__alpha'] for p in ridge_search.cv_results_['params']],
    'R2': ridge_search.cv_results_['mean_test_r2'],
    'MAE': -ridge_search.cv_results_['mean_test_mae'],
    'RMSE': -ridge_search.cv_results_['mean_test_rmse']
}).sort_values(by='R2', ascending=False)

print("=== Ridge CV Results ===")
print(ridge_cv.head())

Fitting 5 folds for each of 5 candidates, totalling 25 fits
=== Ridge CV Results ===
   alpha        R2            MAE           RMSE
1  200.0  0.801581  113087.623478  273550.929098
0  100.0  0.801558  113475.839852  273549.146675
2  300.0  0.801530  112747.612476  273600.339713
3  400.0  0.801423  112440.625671  273687.131196
4  500.0  0.801270  112161.640661  273803.661319


Random Forest

In [None]:
# Define parameter space
rf_param_dist = {
    'n_estimators': randint(380, 450),                # tighter range around 413
    'max_depth': randint(14, 20),                     # around 16
    'min_samples_leaf': randint(1, 4),                # low-leaf size
    'min_samples_split': randint(2, 6),               # try a few regularized values
    'max_features': uniform(0.9, 0.1),                # around 0.95
}


# Initialize model
rf_model = RandomForestRegressor(n_jobs=-1, random_state=42)

# Run RandomizedSearchCV
rf_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_dist,
    n_iter=20,
    scoring={
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'rmse': 'neg_root_mean_squared_error'
    },
    refit='r2',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

rf_search.fit(X_train, y_train)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
cv_results = rf_search.cv_results_

rf_cv = pd.DataFrame({
    'R2': cv_results['mean_test_r2'],
    'MAE': -cv_results['mean_test_mae'],
    'RMSE': -cv_results['mean_test_rmse']
}).sort_values(by='R2', ascending=False)

print("=== Fine-tuned RF CV Results ===")
print(rf_cv.head())


          R2           MAE          RMSE
3   0.979280   5265.619016  87716.769806
18  0.976920   6570.545530  92750.329219
22  0.976339  12666.437189  94019.474813
8   0.975483   5667.323652  95797.118062
49  0.974129   5688.135292  98182.594622


In [None]:
# Extract best params
best_params = rf_search.best_params_
print(best_params)


{'max_depth': 16, 'max_features': np.float64(0.9569868963110251), 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 413}


LightGBM

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import randint, uniform
import numpy as np
import pandas as pd

# Define parameter distribution
lgb_param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 30),
    'learning_rate': uniform(0.01, 0.3),
    'num_leaves': randint(20, 150),
    'min_child_samples': randint(5, 30),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
}

# Initialize model
lgb_model = LGBMRegressor(random_state=42, n_jobs=-1)

# RandomizedSearchCV setup (CV-only)
lgb_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=lgb_param_dist,
    n_iter=20,
    cv=3,
    scoring={
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'rmse': 'neg_root_mean_squared_error',
    },
    refit='r2',  # Use R² to select the best model
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit using only training data
lgb_search.fit(X_train, y_train)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3513
[LightGBM] [Info] Number of data points in the train set: 61378, number of used features: 209
[LightGBM] [Info] Start training from score 487319.324253


0,1,2
,estimator,LGBMRegressor...ndom_state=42)
,param_distributions,"{'colsample_bytree': <scipy.stats....001FDB5EB8E90>, 'learning_rate': <scipy.stats....001FDB5EBBEF0>, 'max_depth': <scipy.stats....001FDB5791C10>, 'min_child_samples': <scipy.stats....001FDB5EBA810>, ...}"
,n_iter,20
,scoring,"{'mae': 'neg_mean_absolute_error', 'r2': 'r2', 'rmse': 'neg_root_mean_squared_error'}"
,n_jobs,-1
,refit,'r2'
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,boosting_type,'gbdt'
,num_leaves,78
,max_depth,27
,learning_rate,np.float64(0....9891565915222)
,n_estimators,574
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# Extract CV results
cv_results = lgb_search.cv_results_

cv_summary = pd.DataFrame({
    'R2':   cv_results['mean_test_r2'],
    'MAE': -cv_results['mean_test_mae'],   # negate because it's neg_mae
    'RMSE': -cv_results['mean_test_rmse'], # negate because it's neg_rmse
})

# Sort by R² descending
cv_summary_sorted = cv_summary.sort_values(by='R2', ascending=False)
print(cv_summary_sorted.head())


          R2           MAE           RMSE
3   0.954329  15440.415854  131330.688215
17  0.952750  16748.634221  133535.370281
14  0.952366  23750.929197  134078.216661
8   0.951909  19660.952114  134730.936561
19  0.951257  16885.391894  135565.343149
