# Scikit-Learn Hyperparameter Tuning
Using local data (data was created from preprocessor script)

## Install fedml_gcp package

In [None]:
pip install fedml_gcp

## Import Libraries

In [None]:
import os
import json
from fedml_gcp import dwcgcp

## Some constant variables to use throughout the notebook

In [None]:
# --- GCP project settings (replace the placeholders with your own values) ---
PROJECT_ID = '<project_id>'
REGION = '<region>'

# --- Cloud Storage layout ---
BUCKET_NAME = '<bucket_name>'
BUCKET_URI = f"gs://{BUCKET_NAME}"
BUCKET_FOLDER = 'h_tuning'
MODEL_OUTPUT_DIR = f"{BUCKET_URI}/{BUCKET_FOLDER}"
GCS_PATH_TO_MODEL_ARTIFACTS = f"{MODEL_OUTPUT_DIR}/model/"

# --- Training package and job naming ---
TRAINING_PACKAGE_PATH = 'HyperparameterTuning'
JOB_NAME = "h-tuning-training"

# --- Model / deployment display names ---
MODEL_DISPLAY_NAME = "h-tuning-model"
DEPLOYED_MODEL_DISPLAY_NAME = 'h-tuning-deployed-model'

# --- Name of the tar archive built from the training package ---
TAR_BUNDLE_NAME = 'HyperparameterTuning.tar.gz'

# Create DwcGCP Instance to access class methods and train model

It is expected that the bucket name passed here already exists in Cloud Storage.

For information on this constructor, please refer to the readme.

In [None]:
# Settings handed to the DwcGCP constructor: project, location and staging bucket.
params = dict(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [None]:
# Build the fedml_gcp client from the project / location / staging-bucket settings in
# `params`. The later download, tar-bundle, training, deployment and prediction calls
# in this notebook all go through this `dwc` object.
dwc = dwcgcp.DwcGCP(params)


## Determine which training image and deployment image you want to use.

Please refer here for the training pre-built containers: https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container

Please refer here for the deployment pre-built containers: https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers

In [None]:
# Replace both placeholders with image names taken from the GCP pages linked above.
# The value is the path segment that sits between ".../training/" (or ".../prediction/")
# and ":latest" in the URIs built below.
# The placeholders are quoted strings, like the ones in the constants cell, so that this
# cell stays valid Python before it has been filled in.
TRAIN_VERSION = '<training_image>'
DEPLOY_VERSION = '<deploy_image>'

# Full container URIs used for the training job and for the serving container.
TRAIN_IMAGE = "us-docker.pkg.dev/vertex-ai/training/{}:latest".format(TRAIN_VERSION)
DEPLOY_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/{}:latest".format(DEPLOY_VERSION)

# Training using a custom training job and pre-built container

For information on the dwc.train_model() function, please refer to the readme.

In the training inputs, we are using a script. When using a script, we have to pass the required packages needed as well.

We are also passing args that hold the names of the data files to read, along with some other arguments we want to access in our training script. Before training, we first download the preprocessed data we want to use (next cell), then create the tar bundle, and finally train.


In [None]:
# Pull the preprocessor outputs into the trainer package folder.
# Each pair is (blob path inside BUCKET_NAME, local file path) -- presumably
# source then destination; confirm against the fedml_gcp readme.
for _blob_path, _local_path in (
    ('datapreprocessor/output/preprocessed_data.csv',
     'HyperparameterTuning/trainer/preprocessed_data.csv'),
    ('datapreprocessor/output/y_train.csv',
     'HyperparameterTuning/trainer/labels.csv'),
):
    dwc.download_blob(BUCKET_NAME, _blob_path, _local_path)

# Create tar bundle of script folder so GCP can use it for training

Please refer to the readme for more information on the dwc.make_tar_bundle() function

Before running this cell, please ensure that the script package has all the necessary files for a training job.

In [None]:
# Bundle the training package folder into the tar archive; the third argument is the
# path "<BUCKET_FOLDER>/train/<TAR_BUNDLE_NAME>", which matches the package URI the
# training inputs point at later in this notebook.
dwc.make_tar_bundle(
    TAR_BUNDLE_NAME,
    TRAINING_PACKAGE_PATH,
    '/'.join([BUCKET_FOLDER, 'train', TAR_BUNDLE_NAME]),
)


In [None]:
# Candidate hyperparameter values; serialized to JSON and passed to the training script.
hyperparameters = dict(
    max_depth=[2, 4, 6],
    n_estimators=[100, 250, 300],
    max_features=[4, 5, 6, 'sqrt'],
    min_samples_leaf=[25, 30],
)

job_dir = f'gs://{BUCKET_NAME}'

# Command-line arguments forwarded to the training module.
cmd_args = [
    '--preprocessed_file_name=preprocessed_data.csv',
    '--labels_file_name=labels.csv',
    f'--hyperparameters={json.dumps(hyperparameters)}',
    '--n_jobs=24',
    f'--job-dir={job_dir}',
    f'--bucket_name={BUCKET_NAME}',
    f'--bucket_folder={BUCKET_FOLDER}',
]

In [None]:
# Definition of the custom Python-package training job: where the tar bundle lives,
# which module to run, and which containers to train and serve with.
inputs = dict(
    display_name=JOB_NAME,
    python_package_gcs_uri=f'{BUCKET_URI}/{BUCKET_FOLDER}/train/{TAR_BUNDLE_NAME}',
    python_module_name='trainer.task',
    container_uri=TRAIN_IMAGE,
    model_serving_container_image_uri=DEPLOY_IMAGE,
)

In [None]:
# Run-time options for the training job (model name, script args, replicas, output dir).
run_job_params = dict(
    model_display_name=MODEL_DISPLAY_NAME,
    args=cmd_args,
    replica_count=1,
    base_output_dir=MODEL_OUTPUT_DIR,
    sync=True,
)

In [None]:
# Launch the training job as a custom Python package, using the job definition in
# `inputs` and the run options in `run_job_params`.
# The returned object is kept in `job` and is passed as the model to dwc.deploy() below.
# NOTE(review): run_job_params sets 'sync': True, so this call presumably blocks until
# training finishes -- confirm in the fedml_gcp readme.
job = dwc.train_model(training_inputs=inputs, 
                      training_type='customPythonPackage',
                     params=run_job_params)

## Deployment

For information on the dwc.deploy() function please refer to the readme.

Here we are deploying the model we trained in the above cell.

In [None]:
# Deployment settings: a single n1-standard-2 replica receiving all traffic.
model_config = dict(
    deployed_model_display_name=DEPLOYED_MODEL_DISPLAY_NAME,
    traffic_split={"0": 100},
    machine_type='n1-standard-2',
    min_replica_count=1,
    max_replica_count=1,
    sync=True,
)

# Deploy the model produced by the training cell; the result is used as the endpoint
# for the final prediction call.
deployed_endpoint = dwc.deploy(model=job, model_config=model_config)

## Prediction

Once the model is deployed to an endpoint, we can run predictions on it.

For information on the dwc.predict() function please refer to the readme.

Since we are using DbConnection here, we will need to have the config.json in this notebook instance as well.

In [None]:
from fedml_gcp import DbConnection
import pandas as pd
import numpy as np

In [None]:
# Read test data from the TITANIC_TEST_VIEW table. The result is indexed as
# [0] = row data and [1] = column headers when building the DataFrame.
# NOTE(review): size=1 presumably limits the fetch to a single row -- confirm.
db = DbConnection()
org_data = db.get_data_with_headers(table_name="TITANIC_TEST_VIEW", size=1)
org_data = pd.DataFrame(org_data[0], columns=org_data[1])

# Normalise missing ages to NaN. The filled column is assigned back explicitly:
# calling fillna(inplace=True) on the `org_data.Age` attribute is an in-place
# operation on a chained access, which is not guaranteed to update `org_data`
# (it never does under pandas Copy-on-Write).
org_data['Age'] = org_data['Age'].fillna(value=np.nan)

# Remove the columns that are not sent to the endpoint.
org_data = org_data.drop(columns=['Name', 'Ticket', 'Cabin'])
org_data

In [None]:
# Target dtype for every remaining column, applied in one astype() call.
select_dtypes = dict(
    PassengerId='int64',
    Pclass='int64',
    Sex='string',
    Age='float64',
    SibSp='int64',
    Parch='int64',
    Fare='float64',
    Embarked='string',
)

org_data = org_data.astype(select_dtypes)
org_data

In [None]:
# Request payload: the DataFrame rows as plain nested lists under the 'instances' key.
data = dict(instances=org_data.values.tolist())

In [None]:
# NOTE(review): this endpoint ID is hard-coded and belongs to a specific project;
# replace it with the ID of your own endpoint. Its output is forwarded to the
# deployed model below and labelled with preprocessed feature names in the last
# cell, so it is presumably the data-preprocessor endpoint -- confirm.
UPSTREAM_ENDPOINT_ID = '4063249618492719104'
predictions = dwc.predict(endpoint=UPSTREAM_ENDPOINT_ID, predict_params=data)

In [None]:
# Show which Python type dwc.predict() returned before the result is forwarded
# as the 'instances' payload of the next request.
type(predictions)

In [None]:
# Payload for the second request: the first endpoint's output becomes the instances.
# Note: this rebinds `params`, which earlier held the DwcGCP constructor settings.
params = dict(instances=predictions)

In [None]:
# Send the output of the first predict call (wrapped in `params`) to the endpoint
# returned by dwc.deploy() above, i.e. the model trained in this notebook.
final_predictions = dwc.predict(endpoint=deployed_endpoint, predict_params=params)

In [None]:
# Column labels for the output of the first predict call (`predictions`),
# which is shown here as a DataFrame.
column_names = [
    'num__PassengerId',
    'num__Pclass',
    'num__Age',
    'num__SibSp',
    'num__Parch',
    'num__Fare',
    'onehotencoder__x0_female',
    'onehotencoder__x0_male',
    'onehotencoder__x1_C',
    'onehotencoder__x1_Q',
    'onehotencoder__x1_S',
]
df = pd.DataFrame(data=predictions, columns=column_names)
df