# LAB | Hyperparameter Tuning

**Load the data**

Finally step in order to maximize the performance on your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters.

In [7]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.datasets import  fetch_california_housing
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor


from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

In [8]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [9]:
spaceship_clean = spaceship.dropna()
spaceship_clean['Cabin'] = spaceship_clean['Cabin'].str[0]
spaceship_clean = spaceship.drop(columns=['PassengerId', 'Name'])
spaceship_clean_dummies = pd.get_dummies(spaceship_clean, drop_first=False)
target = 'Transported'
features = spaceship_clean_dummies.drop(columns=[target])


from sklearn.model_selection import train_test_split

mask = spaceship_clean_dummies[target].notna()
x=features.loc[mask]
y=spaceship_clean_dummies.loc[mask,target]
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42, stratify=y
)   
print("NaNs in X:", x.isna().sum().sum(), " | NaNs in y:", y.isna().sum())

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Pipeline: impute → scale → KNN
knn_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),   # fill NaNs
    ('scaler', StandardScaler()),                   # scale features
    ('knn', KNeighborsClassifier(n_neighbors=5))    # model
])

# Train
knn_pipe.fit(X_train, y_train)

# Predict
y_pred = knn_pipe.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

normalizer = MinMaxScaler()
normalizer.fit(X_train)


X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spaceship_clean['Cabin'] = spaceship_clean['Cabin'].str[0]


NaNs in X: 1122  | NaNs in y: 0
Accuracy: 0.6877515813686026

Classification Report:
               precision    recall  f1-score   support

       False       0.66      0.75      0.70       863
        True       0.72      0.63      0.67       876

    accuracy                           0.69      1739
   macro avg       0.69      0.69      0.69      1739
weighted avg       0.69      0.69      0.69      1739



- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters.

In [10]:
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=8), n_estimators=100, max_samples=3000, bootstrap=True)
bagging_reg.fit(X_train_norm, y_train)
pred = bagging_reg.predict  (X_test_norm)
print("TEST DATA")

print("R2 score ", r2_score(y_test, pred))
print("RMSE ", root_mean_squared_error(y_test, pred))
print("MAE ", mean_absolute_error(y_test, pred))


print("----------------------------")

print("TRAIN DATA")
pred = bagging_reg.predict(X_train_norm)

print("R2 score ", r2_score(y_train, pred))
print("RMSE ", root_mean_squared_error(y_train, pred))
print("MAE ", mean_absolute_error(y_train, pred))

TEST DATA
R2 score  0.43456355054783635
RMSE  0.3759670366526507
MAE  0.27768965191994577
----------------------------
TRAIN DATA
R2 score  0.49330587227064115
RMSE  0.3559030531070573
MAE  0.26547961552658983


In [12]:
# --- prefer successive halving if available ---
use_halving = False
try:
    from sklearn.experimental import enable_halving_search_cv  # noqa: F401
    from sklearn.model_selection import HalvingRandomSearchCV
    use_halving = True
except Exception:
    pass

cv = KFold(n_splits=3, shuffle=True, random_state=42)

# shared base estimator (small trees during search; no OOB; avoid nested parallelism)
bag_search = BaggingRegressor(
    estimator=DecisionTreeRegressor(random_state=42),
    n_estimators=40,           # ignored by Halving as it will overwrite via `resource`
    max_samples=0.8,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    oob_score=False,
    random_state=42,
    n_jobs=1
)

# param spaces
param_space_common = {
    "estimator__max_depth": [None, 6, 10, 14],
    "estimator__min_samples_leaf": [1, 2, 4, 8],
    "estimator__min_samples_split": [2, 5, 10],
    "max_samples": [0.6, 0.8, 1.0],
    "max_features": [0.6, 0.8, 1.0],
    "bootstrap_features": [False],
}

scoring = "neg_root_mean_squared_error"

if use_halving:
    # NOTE: do NOT include 'n_estimators' here
    search = HalvingRandomSearchCV(
        estimator=bag_search,
        param_distributions=param_space_common,
        resource="n_estimators",
        min_resources=20,      # start with fewer trees
        max_resources=120,     # grow to more trees for survivors
        factor=3,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
else:
    # For randomized search, it's fine to include n_estimators as a hyperparam
    param_space_rand = {
        **param_space_common,
        "n_estimators": [30, 40, 60]
    }
    search = RandomizedSearchCV(
        estimator=bag_search,
        param_distributions=param_space_rand,
        n_iter=20,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

search.fit(Xtr, ytr)

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 20
max_resources_: 120
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 6
n_resources: 20
Fitting 3 folds for each of 6 candidates, totalling 18 fits
----------
iter: 1
n_candidates: 2
n_resources: 60
Fitting 3 folds for each of 2 candidates, totalling 6 fits


- Evaluate your model

In [None]:
#your code here

**Grid/Random Search**

For this lab we will use Grid Search.

- Define hyperparameters to fine tune.

In [14]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

rand = RandomizedSearchCV(
    bag,
    param_distributions=param_distributions,
    n_iter=40,                 # increase if you can afford it
    scoring=rmse_scorer,
    cv=cv,
    verbose=1,
    
    random_state=42
)

rand.fit(Xtr, ytr)
print("RandomizedSearch best params:", rand.best_params_)
print("CV RMSE:", -rand.best_score_)
print("OOB R^2 (best model):", getattr(rand.best_estimator_, "oob_score_", None))

best = rand.best_params_
grid_candidates = {
    "estimator__max_depth": sorted({best["estimator__max_depth"],  None,  (best["estimator__max_depth"] or 12)}),
    "estimator__min_samples_leaf": sorted({max(1, best["estimator__min_samples_leaf"]-1),
                                           best["estimator__min_samples_leaf"],
                                           best["estimator__min_samples_leaf"]+1}),
    "estimator__min_samples_split": sorted({max(2, best["estimator__min_samples_split"]-3),
                                            best["estimator__min_samples_split"],
                                            best["estimator__min_samples_split"]+3}),
    "n_estimators": sorted({best["n_estimators"], max(100, best["n_estimators"]//2),
                            min(800, best["n_estimators"]*2)}),
    "max_samples": sorted({best["max_samples"], min(1.0, float(best["max_samples"])+0.1),
                           max(0.5, float(best["max_samples"])-0.1)}),
    "max_features": sorted({best["max_features"], min(1.0, float(best["max_features"])+0.1),
                            max(0.5, float(best["max_features"])-0.1)}),
    "bootstrap_features": [best["bootstrap_features"]],
}




Fitting 5 folds for each of 40 candidates, totalling 200 fits


KeyboardInterrupt: 

- Run Grid Search

In [None]:
grid = GridSearchCV(
    rand.best_estimator_,
    param_grid=grid_candidates,
    scoring=rmse_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1
)
grid.fit(Xtr, ytr)

print("Final best params:", grid.best_params_)
print("Final CV RMSE:", -grid.best_score_)


- Evaluate your model

In [None]:
# Evaluate on holdout
pred_test = grid.best_estimator_.predict(Xte)
pred_train = grid.best_estimator_.predict(Xtr)
