In [13]:
import boto3
import sagemaker

from sagemaker import get_execution_role
from sagemaker.network import NetworkConfig
from sagemaker.processing import FrameworkProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn

In [14]:
# Shared SageMaker context for every job launched from this notebook.
# The session is created first so that the role and region are resolved
# from the same session the jobs will use.
sess = sagemaker.Session()
role = get_execution_role(sess)
region = sess.boto_region_name

## Step 1.1: Get field data

In [2]:
%%writefile requirements.txt
# Pinned extra packages for the step 1.1 script (1_1_get_fields.py); a later
# cell copies this file into code/step1_1 next to the script.
# NOTE(review): presumably installed by the processing container from source_dir - confirm.
psycopg2-binary==2.9.3
sqlalchemy-redshift==0.8.11
pydantic==1.10.2 

Writing requirements.txt


In [3]:
!mkdir code/step1_1
!cp 1_1_get_fields.py ./code/step1_1
!cp requirements.txt ./code/step1_1

In [None]:
# Step 1.1: run 1_1_get_fields.py as a SageMaker Processing job on the
# scikit-learn framework image, then look up the S3 URI of its output.
est_cls = SKLearn
framework_version_str = "0.20.0"

tags = [
    {'Key':'Application', 'Value':'CAD'},
    {'Key':"Cost Center", 'Value':'68230'}
]
security_group_ids = ['sg-08ec780df00d293b0']
subnets = ['subnet-05bd78860c2f05164','subnet-0d8e3fd5532d83e69']

script_processor = FrameworkProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    estimator_cls=est_cls,
    framework_version=framework_version_str,
    tags=tags,
    base_job_name='cad',
    # Use the same session as the training jobs in this notebook.
    sagemaker_session=sess,
    # The security groups / subnets above were declared but never handed to
    # the processor; attach them so the job runs in the same VPC
    # configuration as the estimators below.
    network_config=NetworkConfig(
        security_group_ids=security_group_ids,
        subnets=subnets,
    ),
)
script_processor.run(
    code="1_1_get_fields.py",
    source_dir="code/step1_1",
    inputs=[],
    outputs=[
        ProcessingOutput(output_name="field_data", source="/opt/ml/processing")
    ],
    arguments=['--bucket', 'cad-alok-singh', 
               '--folder', 'us_in_season_corn_yield', 
               '--field-file', '1_field_raw_data.csv'],
)
preprocessing_job_description = script_processor.jobs[-1].describe()

# Find the S3 destination of the "field_data" output. Fail with a clear
# message instead of a NameError at print() when no such output is reported.
output_config = preprocessing_job_description["ProcessingOutputConfig"]
field_data = next(
    (
        output["S3Output"]["S3Uri"]
        for output in output_config["Outputs"]
        if output["OutputName"] == "field_data"
    ),
    None,
)
if field_data is None:
    raise RuntimeError("Processing job reported no output named 'field_data'")
print(field_data)

## Training Job

In [None]:
# S3 locations for the single-stage run: train / test inputs and the model
# output all live under the same bucket/folder prefix.
bucket = 'cad-alok-singh'
folder = 'us_in_season_corn_yield/8_stages/V0/train_test_2020'

preprocessed_training_data, preprocessed_testing_data, output_dir = (
    f's3://{bucket}/{folder}/{leaf}' for leaf in ('train', 'test', 'model')
)

In [None]:
%%writefile requirements.txt
# Pinned extra dependency for the training script; a later cell moves this
# file into code/train.
catboost==1.1

In [None]:
!mkdir code/train
!cp train_catboost.py ./code/train
!mv requirements.txt ./code/train

In [None]:
# Launch one training job that runs train_catboost.py on the scikit-learn
# framework image, reading the "train" channel and writing the model to
# `output_dir`.
base_job_name = 'cad'
tags = [
    {'Key':'Application', 'Value':'CAD'},
    {'Key':"Cost Center", 'Value':'68230'}
]
security_group_ids = ['sg-08ec780df00d293b0']
subnets = ['subnet-05bd78860c2f05164','subnet-0d8e3fd5532d83e69']
# NOTE(review): 'num_boost_round' is passed as a string while the other
# values are numeric, and it is set alongside 'iterations' - confirm how
# train_catboost.py consumes these two keys.
hp = {
    "learning_rate": 0.1,
    "iterations": 50,
    "max_depth": 4,
    "l2_leaf_reg": 2,
    "subsample":  0.8,
    'num_boost_round':'500'
}

sklearn = SKLearn(
    entry_point="train_catboost.py",
    source_dir='code/train',
    framework_version="0.20.0",
    instance_type="ml.m4.xlarge",
    role=role,
    instance_count=1,
    tags=tags,
    sagemaker_session=sess,
    # Reuse the VPC settings declared above instead of repeating the
    # literals, so the two cannot drift apart.
    security_group_ids=security_group_ids,
    subnets=subnets,
    base_job_name=base_job_name,
    output_path=output_dir,
    hyperparameters=hp
)
sklearn.fit({"train": preprocessed_training_data})

## Multiple training jobs for multiple stages 

In [20]:
# Build, for every stage, the S3 URIs of its training data, its test data and
# its model output. The three lists are index-aligned with `stages`.
# (The S3 prefix is literally '8_stages' even though ten stages are listed.)
bucket = 'cad-alok-singh'
test_year = '2020'
stages = ['V0', 'VE', 'bin1', 'bin2', 'bin3', 'bin4', 'R2', 'R3', 'R4', 'R5']

preprocessed_training_data_list, preprocessed_testing_data_list, output_dir_list = [], [], []

for stage in stages:
    folder = f'us_in_season_corn_yield/8_stages/{stage}/train_test_{test_year}'
    stage_root = f's3://{bucket}/{folder}'
    preprocessed_training_data_list += [f'{stage_root}/train']
    preprocessed_testing_data_list += [f'{stage_root}/test']
    output_dir_list += [f'{stage_root}/model']

In [24]:
%%writefile requirements.txt
# Pinned extra dependency for the training script; a later cell copies this
# file into code/train.
catboost==1.1

Writing requirements.txt


In [26]:
!mkdir code/train
!cp 10_train_catboost.py ./code/train
!cp requirements.txt ./code/train

In [None]:
# Launch one training job per stage, sequentially: each iteration pairs a
# stage's training-data URI with its model output URI and runs
# 10_train_catboost.py on the scikit-learn framework image.
base_job_name = 'cad'
tags = [
    {'Key':'Application', 'Value':'CAD'},
    {'Key':"Cost Center", 'Value':'68230'}
]
security_group_ids = ['sg-08ec780df00d293b0']
subnets = ['subnet-05bd78860c2f05164','subnet-0d8e3fd5532d83e69']
# NOTE(review): 'num_boost_round' is passed as a string while the other
# values are numeric, and it is set alongside 'iterations' - confirm how
# 10_train_catboost.py consumes these two keys.
hp = {
    "learning_rate": 0.1,
    "iterations": 50,
    "max_depth": 4,
    "l2_leaf_reg": 2,
    "subsample":  0.8,
    'num_boost_round':'500'
}


for preprocessed_training_data, output_dir in zip(preprocessed_training_data_list, output_dir_list):
    sklearn = SKLearn(
        entry_point="10_train_catboost.py",
        source_dir='code/train',
        framework_version="0.20.0",
        instance_type="ml.m4.xlarge",
        role=role,
        instance_count=1,
        tags=tags,
        sagemaker_session=sess,
        # Reuse the VPC settings declared above instead of repeating the
        # literals, so the two cannot drift apart.
        security_group_ids=security_group_ids,
        subnets=subnets,
        base_job_name=base_job_name,
        output_path=output_dir,
        hyperparameters=hp,
    )
    # fit() is called with logs=True and streams the job log into the cell
    # output; jobs are started one after another inside this loop.
    sklearn.fit({"train": preprocessed_training_data}, logs=True)


2022-09-29 02:29:55 Starting - Starting the training job....
2022-09-29 02:30:20 Starting - Preparing the instances for training.............
2022-09-29 02:31:28 Downloading - Downloading input data...
2022-09-29 02:31:48 Training - Downloading the training image............
2022-09-29 02:32:53 Training - Training image download completed. Training in progress.......
2022-09-29 02:33:28 Uploading - Uploading generated training model..
2022-09-29 02:33:44 Completed - Training job completed

2022-09-29 02:33:47 Starting - Starting the training job.....
2022-09-29 02:34:12 Starting - Preparing the instances for training............
2022-09-29 02:35:20 Downloading - Downloading input data....
2022-09-29 02:35:46 Training - Downloading the training image...........
2022-09-29 02:36:47 Training - Training image download completed. Training in progress.......
2022-09-29 02:37:22 Uploading - Uploading generated training model..
2022-09-29 02:37:38 Completed - Training job completed

2022-09-2