# Imports

In [1]:
import pandas as pd
import numpy as np
# For timing training times
import time

In [2]:
import sys
import os
# Make the parent of the current working directory importable so that the
# `src.*` package imports used by this notebook can be resolved.
src_dir = os.path.join(os.getcwd(), '..')

# Guard the append: notebook cells are frequently re-run, and an unconditional
# append would add one more duplicate entry to sys.path on every execution.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
# OPTIONAL (development convenience): load IPython's "autoreload" extension so
# that edits to imported modules can be picked up without restarting the kernel.
%load_ext autoreload

# OPTIONAL: mode 2 asks autoreload to reload modules before each cell executes,
# so changes made to the code under src are loaded automatically.
%autoreload 2

In [19]:
from sklearn.model_selection import cross_val_score

In [72]:
from src.data.make_dataset import load_data
from src.models.train_model import split_X_y, model_find_hyperparameters
from src.models.train_model import train_and_save_xgboost
from src.features.build_features import create_preprocessor

# Load data

In [73]:
# Location of the transformed training set, as understood by the project's
# load_data helper (how the relative path is resolved is defined there).
TRAIN_CSV = "interim/train_transformed.csv"
df = load_data(TRAIN_CSV)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupNumber,GroupSize,Deck,Side,SpendingTotal,GroupTotalSpending,AvgSpendingPerMember
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,1,B,P,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,1,F,S,736.0,736.0,736.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,2,A,S,10383.0,15559.0,7779.5
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,2,A,S,5176.0,15559.0,7779.5
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,1,F,S,1091.0,1091.0,1091.0


# Split data

In [53]:
# Name of the label column handed to split_X_y; the helper returns the
# features (X) and the target (y) as a pair.
TARGET_COLUMN = "Transported"
X, y = split_X_y(df, TARGET_COLUMN)

# Show both shapes side by side as a quick sanity check of the split.
(X.shape, y.shape)

((8693, 17), (8693,))

# Finding best hyperparameters for XGBoost

In [54]:
# Candidate values for every XGBoost hyperparameter explored by the search.
# The full Cartesian product of these lists has 302,400 combinations.
param_grid = dict(
    # Learning rate
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    # Number of boosting rounds
    n_estimators=[50, 100, 200, 300],
    # Maximum depth of a tree
    max_depth=[3, 4, 5, 6, 7],
    # Minimum sum of instance weight needed in a child
    min_child_weight=[1, 5, 10],
    # Minimum loss reduction required to make a further partition on a leaf node
    gamma=[0, 0.1, 0.2, 0.3, 0.4, 1, 5],
    # Subsample ratio of the training instances
    subsample=[0.8, 0.9, 1.0],
    # Subsample ratio of features when constructing each tree
    colsample_bytree=[0.8, 0.9, 1.0],
    # L1 regularization term on weights
    reg_alpha=[0, 0.1, 0.5, 1],
    # L2 regularization term on weights
    reg_lambda=[0, 0.1, 0.5, 1, 5],
)

In [61]:
# Start the timer.
# time.perf_counter() is a monotonic, high-resolution clock: unlike
# time.time(), it cannot jump when the system clock is adjusted, which matters
# for a measurement that can span a long-running search. Its readings are only
# meaningful as differences, not as timestamps.
start_time = time.perf_counter()

# Run the project's hyperparameter search over param_grid. Judging by the
# argument names this is a randomized search that evaluates n_iter sampled
# candidates -- confirm in src.models.train_model.
gs_clf = model_find_hyperparameters(X, y, param_distributions=param_grid, n_iter=1000)

# Stop the timer; the elapsed time is end_time - start_time (in seconds).
end_time = time.perf_counter()

In [62]:
# Elapsed time of the search: difference between the two timer readings.
total_time = end_time - start_time

# Build the report line first, then emit it.
timing_message = f"[INFO] The total running time for running RandomizedSearchCV was {total_time:.2f} seconds."
print(timing_message)

[INFO] The total running time for running RandomizedSearchCV was 4547.11 seconds.


In [63]:
# Pull the winning hyperparameter combination off the fitted search object
# and show it as the cell output.
best_params = gs_clf.best_params_
best_params

{'subsample': 0.8,
 'reg_lambda': 5,
 'reg_alpha': 0,
 'n_estimators': 300,
 'min_child_weight': 5,
 'max_depth': 4,
 'learning_rate': 0.1,
 'gamma': 0.4,
 'colsample_bytree': 1.0}

# Training final model

In [70]:
# Train the final model on the full X / y using the best hyperparameters found
# by the search. train_and_save_xgboost is a project helper; per its name it
# presumably also persists the fitted model -- see src.models.train_model.
final_params = gs_clf.best_params_
model = train_and_save_xgboost(X, y, final_params)