In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import boto3
import yaml
from pathlib import Path
from cloudpathlib import S3Path
import sagemaker
from sagemaker import get_execution_role
from sagemaker import Session
from sagemaker.local import LocalSession
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner
from sagemaker.serverless import ServerlessInferenceConfig
from sagemaker.network import NetworkConfig

In [2]:
# Project root: the parent of the directory this notebook is launched from.
project_directory = Path.cwd().joinpath("..")

In [3]:
# One boto3 session backs everything below: the low-level SageMaker client,
# the high-level SageMaker session and the region lookup.
boto_session = boto3.Session()
region = boto_session.region_name
sagemaker_client = boto_session.client("sagemaker")

sagemaker_session = Session(boto_session=boto_session, sagemaker_client=sagemaker_client)

# Execution role used for the training job and the model, plus the session's default bucket.
role = get_execution_role()
sagemaker_bucket = sagemaker_session.default_bucket()

In [12]:
# Model-side configuration (hyperparameters, directories, dataset columns).
# Opened in a `with` block so the file handle is closed as soon as it is parsed.
with open('../code/params.yaml') as code_params_file:
    params = yaml.safe_load(code_params_file)
hyperparameters = params["hyperparameters"]
directories = params["directories"]
datasets = params["datasets"]

# SageMaker-side configuration: one section each for training, model and inference.
with open('../params.yaml') as sagemaker_params_file:
    sagemaker_params = yaml.safe_load(sagemaker_params_file)
train_params = sagemaker_params["train"]
model_params = sagemaker_params["model"]
inference_params = sagemaker_params["inference"]

In [5]:
# S3 locations of the "train" and "test" input channels from the training config.
train_s3_dir, test_s3_dir = (
    train_params["inputs"][channel] for channel in ("train", "test")
)

In [6]:
# Recursively copy everything under the training-data S3 location into ../data/train
# (IPython substitutes the Python variable train_s3_dir into the shell command).
!aws s3 cp $train_s3_dir ../data/train --recursive

download: s3://cad-alok-singh/us_in_season_corn_yield/8_stages/V0/train_test_2020/train/train.csv to ../data/train/train.csv


In [7]:
# Recursively copy everything under the test-data S3 location into ../data/test
# (IPython substitutes the Python variable test_s3_dir into the shell command).
!aws s3 cp $test_s3_dir ../data/test --recursive

download: s3://cad-alok-singh/us_in_season_corn_yield/8_stages/V0/train_test_2020/test/test.csv to ../data/test/test.csv


In [8]:
# Configure the scikit-learn training job. Apart from the source directory
# (resolved against the project root), the execution role and the model
# hyperparameters, every argument is forwarded unchanged from `train_params`.
sklearn_estimator = SKLearn(
    source_dir=str(project_directory / train_params["source_dir"]),
    role=role,
    hyperparameters=hyperparameters,
    **{
        option: train_params[option]
        for option in (
            "entry_point",
            "framework_version",
            "instance_type",
            "instance_count",
            "tags",
            "base_job_name",
            "output_path",
            "container_log_level",
            "volume_size",
            "max_run",
            "max_wait",
            "enable_sagemaker_metrics",
            "metric_definitions",
            "use_spot_instances",
            "security_group_ids",
            "subnets",
        )
    },
)

# Launch training on the configured input channels; the `wait` flag is also config-driven.
sklearn_estimator.fit(inputs=train_params["inputs"], wait=train_params["wait"])

2022-11-24 21:48:43 Starting - Starting the training job...
2022-11-24 21:49:10 Starting - Preparing the instances for trainingProfilerReport-1669326522: InProgress
............
2022-11-24 21:51:08 Downloading - Downloading input data...
2022-11-24 21:51:48 Training - Downloading the training image...
2022-11-24 21:52:08 Training - Training image download completed. Training in progress.[34m2022-11-24 21:52:09,749 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-11-24 21:52:09,751 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-24 21:52:09,762 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-11-24 21:52:09,946 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting PyYAML
  Downloading PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.man

In [9]:
# Wrap the artifact produced by the training job in a deployable model.
# The inference source directory is resolved against the project root; the
# remaining settings are forwarded unchanged from `model_params`.
sklearn_inference = SKLearnModel(
    model_data=sklearn_estimator.model_data,
    role=role,
    source_dir=str(project_directory / model_params["source_dir"]),
    **{
        option: model_params[option]
        for option in ("entry_point", "framework_version", "name")
    },
)

In [13]:
# Deploy the model behind an endpoint; all settings come from `inference_params`.
# The "serverles_inference_config" key is spelled exactly as the config file spells it.
# NOTE(review): with a serverless config the instance settings may be unused, and
# security_group_ids / subnets may not be named deploy() parameters — confirm
# against the installed SageMaker SDK version.
predictor = sklearn_inference.deploy(
    **{
        option: inference_params[option]
        for option in (
            "endpoint_name",
            "instance_type",
            "initial_instance_count",
            "security_group_ids",
            "subnets",
            "tags",
        )
    },
    serverless_inference_config=ServerlessInferenceConfig(
        **inference_params["serverles_inference_config"]
    ),
)

--------!

In [14]:
# Load the downloaded test set, remove the configured drop columns and keep
# only the first 10 rows for scoring against the endpoint.
df = (
    pd.read_csv("../data/test/test.csv")
    .drop(datasets["drop_columns"], axis=1)
    .iloc[:10, :]
)

In [15]:
# Split the sample into the target column and the remaining feature columns.
y_true = df[datasets["y_column"]]
X_test = df.drop(columns=datasets["y_column"])

In [16]:
# Send the feature rows to the deployed endpoint and collect its predictions.
y_pred = predictor.predict(data=X_test)

In [17]:
# Regression scores for the endpoint predictions against the true target values.
mae = metrics.mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
r2 = metrics.r2_score(y_true, y_pred)

# One line per item, each score rounded to two decimals.
print(
    "Testing performance",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R2: {r2:.2f}",
    sep="\n",
)

Testing performance
MAE: 21.36
RMSE: 26.47
R2: 0.35


In [18]:
# Tear down the resources created for this test: first the model, then the endpoint.
# NOTE(review): the order looks deliberate — presumably the model lookup still needs
# the endpoint's configuration to exist; confirm before reordering.
predictor.delete_model()
predictor.delete_endpoint()