# LAB | Hyperparameter Tuning

**Load the data**

Finally step in order to maximize the performance on your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters.

In [17]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from sklearn.tree import  DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [3]:
spaceship.shape

(8693, 14)

In [4]:
spaceship.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [5]:
spaceship.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
spaceship.dropna(inplace=True)
spaceship.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [7]:
# Splitting the value of column "Cabin" by / and get the first value
spaceship['Cabin'] = spaceship['Cabin'].str.split('/').str.get(0)
spaceship

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [8]:
#Drop PassengerId and Name
spaceship2 = spaceship.drop(columns=["PassengerId","Name"])
spaceship2

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,Earth,True,G,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,Earth,False,G,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,Europa,False,E,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [9]:
#For non-numerical columns, do dummies

print(spaceship2['HomePlanet'].value_counts())
print(spaceship2['Destination'].value_counts())

HomePlanet
Earth     3566
Europa    1673
Mars      1367
Name: count, dtype: int64
Destination
TRAPPIST-1e      4576
55 Cancri e      1407
PSO J318.5-22     623
Name: count, dtype: int64


In [10]:
spaceship2_enc = spaceship2.copy()
spaceship2_transformed = pd.get_dummies(spaceship2_enc[['HomePlanet','Destination']],drop_first=True)
spaceship2_transformed

Unnamed: 0,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,True,False,False,True
1,False,False,False,True
2,True,False,False,True
3,True,False,False,True
4,False,False,False,True
...,...,...,...,...
8688,True,False,False,False
8689,False,False,True,False
8690,False,False,False,True
8691,True,False,False,False


In [11]:
dummies = pd.get_dummies(spaceship2_enc[['HomePlanet', 'Destination']], prefix=['HomePlanet', 'Destination'], drop_first=True)

spaceship2_transformed = spaceship2_enc.join(dummies)

spaceship2_transformed

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,Europa,False,B,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,True,False,False,True
1,Earth,False,F,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,False,False,False,True
2,Europa,False,A,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,True,False,False,True
3,Europa,False,A,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,True,False,False,True
4,Earth,False,F,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,True,False,False,False
8689,Earth,True,G,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,False,False,True,False
8690,Earth,False,G,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,False,False,False,True
8691,Europa,False,E,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,True,False,False,False


In [12]:
booleans = ['CryoSleep', 'VIP', 'Transported', 'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']

spaceship2_transformed[booleans] = spaceship2_transformed[booleans].astype(int)

spaceship2_transformed


Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,Europa,0,B,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1
1,Earth,0,F,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,0,0,1
2,Europa,0,A,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,1,0,0,1
3,Europa,0,A,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,1,0,0,1
4,Earth,0,F,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,0,A,55 Cancri e,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0,1,0,0,0
8689,Earth,1,G,PSO J318.5-22,18.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
8690,Earth,0,G,TRAPPIST-1e,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,0,0,0,1
8691,Europa,0,E,55 Cancri e,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0,1,0,0,0


- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters.

In [14]:

# Perform train test split 

features = spaceship2_transformed.drop(columns=["HomePlanet", "Cabin", "Destination", "Transported"], axis=1)
target = spaceship2_transformed["Transported"]


features

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,1,0,0,1
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,0,0,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,1,0,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,1,0,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,1,0,0,0
8689,1,18.0,0,0.0,0.0,0.0,0.0,0.0,0,0,1,0
8690,0,26.0,0,0.0,0.0,1872.0,1.0,0.0,0,0,0,1
8691,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,1,0,0,0


In [15]:
target

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 6606, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [18]:
normalizer = MinMaxScaler()
normalizer.fit(X_train)

X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)
X_train_norm =pd.DataFrame(X_train_norm, columns = X_train.columns)
X_test_norm =pd.DataFrame(X_test_norm, columns = X_test.columns)



knn = KNeighborsClassifier(n_neighbors=4)

knn.fit(X_train_norm.values,y_train)
print(knn.score(X_test_norm.values,y_test))


pred =knn.predict(X_test_norm.values)


print(classification_report(y_test, pred))

0.7428139183055976
              precision    recall  f1-score   support

           0       0.72      0.81      0.76       661
           1       0.78      0.68      0.73       661

    accuracy                           0.74      1322
   macro avg       0.75      0.74      0.74      1322
weighted avg       0.75      0.74      0.74      1322



In [19]:
# Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters
# BAGGING

bagging_class = BaggingClassifier(DecisionTreeClassifier(max_depth=20),n_estimators=100,max_samples=1000)

bagging_class.fit(X_train_norm, y_train)

pred = bagging_class.predict(X_test_norm)


print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79       661
           1       0.78      0.80      0.79       661

    accuracy                           0.79      1322
   macro avg       0.79      0.79      0.79      1322
weighted avg       0.79      0.79      0.79      1322



In [20]:
# GRADIENT BOOSTING
gb_class = GradientBoostingClassifier(max_depth=20,n_estimators=100)
gb_class.fit(X_train_norm, y_train)

pred = gb_class.predict(X_test_norm)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78       661
           1       0.77      0.79      0.78       661

    accuracy                           0.78      1322
   macro avg       0.78      0.78      0.78      1322
weighted avg       0.78      0.78      0.78      1322



Given these comparisons, the first model seem to perform slightly better than the second (looking at the precision, recall & F1 score)

**Grid/Random Search**

For this lab we will use Grid Search.

- Define hyperparameters to fine tune.

In [21]:
grid_bagging = {"n_estimators": [50, 100,500],
       
        "estimator__max_depth":[10],
          'max_samples': [0.5, 0.75, 1.0],
     }

- Run Grid Search

In [22]:
bagging_class = BaggingClassifier(DecisionTreeClassifier(max_depth=20),n_estimators=100,max_samples=1000)
bagging_class.fit(X_train_norm, y_train)

- Evaluate your model

In [23]:
model = GridSearchCV(estimator = bagging_class, param_grid = grid_bagging, cv=5)
model.fit(X_train_norm, y_train)

In [24]:
model.best_params_

{'estimator__max_depth': 10, 'max_samples': 0.75, 'n_estimators': 50}

In [25]:
best_model_grid_bagging = model.best_estimator_
pred_grid = bagging_class.predict(X_test_norm)
print(classification_report(y_test, pred_grid))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78       661
           1       0.78      0.79      0.78       661

    accuracy                           0.78      1322
   macro avg       0.78      0.78      0.78      1322
weighted avg       0.78      0.78      0.78      1322



In [26]:
grid_random = {"n_estimators": [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
        # "estimator__max_leaf_nodes": [int(x) for x in np.linspace(start = 500, stop = 3000, num = 10)],
        "estimator__max_depth":[int(x) for x in np.linspace(10, 110, num = 11)]}

In [27]:
bagging_class = BaggingClassifier(DecisionTreeClassifier(max_depth=20),n_estimators=100,max_samples=1000)
bagging_class.fit(X_train_norm, y_train)

In [28]:
model_random_bagging = RandomizedSearchCV(estimator = bagging_class, param_distributions = grid_random, n_iter = 10, cv = 5, n_jobs = -1)

In [29]:
model_random_bagging.fit(X_train_norm,y_train)

In [30]:
model_random_bagging.best_params_

{'n_estimators': 200, 'estimator__max_depth': 80}

In [31]:
best_model_random = model.best_estimator_

In [32]:
pred_random= best_model_random.predict(X_test_norm)
print(classification_report(y_test, pred_random))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78       661
           1       0.78      0.80      0.79       661

    accuracy                           0.79      1322
   macro avg       0.79      0.79      0.79      1322
weighted avg       0.79      0.79      0.79      1322



In [33]:
#remove left
pred_random= best_model_random.predict(X_test_norm)
print(classification_report(y_test, pred_random))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78       661
           1       0.78      0.80      0.79       661

    accuracy                           0.79      1322
   macro avg       0.79      0.79      0.79      1322
weighted avg       0.79      0.79      0.79      1322

