In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
import boto3
import yaml
from pathlib import Path
from cloudpathlib import S3Path
import sagemaker
from sagemaker import get_execution_role
from sagemaker import Session
from sagemaker.local import LocalSession
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner
from sagemaker.serverless import ServerlessInferenceConfig
from sagemaker.network import NetworkConfig

from sagemaker.analytics import ExperimentAnalytics
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [3]:
# Parent of the notebook's working directory; the source_dir entries from the
# parameter files are resolved against it further down.
project_directory = Path.cwd().joinpath("../")

In [4]:
# A single boto3 session backs both the low-level SageMaker client and the
# SageMaker SDK session, so every call below shares the same credentials.
boto_session = boto3.Session()
region = boto_session.region_name
sagemaker_client = boto_session.client("sagemaker")
sagemaker_session = Session(boto_session=boto_session, sagemaker_client=sagemaker_client)

# IAM role handed to the jobs/models created below, and the session's default bucket.
role = get_execution_role()
sagemaker_bucket = sagemaker_session.default_bucket()

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [5]:
# Model-side parameters (hyperparameters, directories, dataset columns).
# The files are opened in `with` blocks so the handles are closed right after
# parsing instead of being left to the garbage collector.
with open('../code/params.yaml') as params_file:
    params = yaml.safe_load(params_file)
hyperparameters = params["hyperparameters"]
directories = params["directories"]
datasets = params["datasets"]

# SageMaker-side parameters (training job, model, inference endpoint).
with open('../params.yaml') as sagemaker_params_file:
    sagemaker_params = yaml.safe_load(sagemaker_params_file)
train_params = sagemaker_params["train"]
model_params = sagemaker_params["model"]
inference_params = sagemaker_params["inference"]

In [6]:
# S3 locations of the "train" and "test" input channels of the training job.
train_s3_dir, test_s3_dir = (
    train_params["inputs"][channel] for channel in ("train", "test")
)

In [6]:
!aws s3 cp $train_s3_dir ../data/train --recursive

download: s3://cad-alok-singh/us_in_season_corn_yield/8_stages/V0/train_test_2020/train/train.csv to ../data/train/train.csv


In [7]:
!aws s3 cp $test_s3_dir ../data/test --recursive

download: s3://cad-alok-singh/us_in_season_corn_yield/8_stages/V0/train_test_2020/test/test.csv to ../data/test/test.csv


In [9]:
experiment_name="gda-yeild-clarify"
trial_name=f"{experiment_name}-trail"

# Load-or-create the experiment. Only a "ResourceNotFound" failure means the
# experiment does not exist yet; any other error (credentials, throttling,
# permissions, ...) is re-raised instead of being swallowed, which previously
# left `experiment` unbound or pointing at a stale object from an earlier run.
try:
    experiment = Experiment.load(
        experiment_name=experiment_name,
        sagemaker_boto_client=sagemaker_client,
    )
except Exception as ex:
    if "ResourceNotFound" not in str(ex):
        raise
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Clarify yield",
        tags=train_params["tags"],
        sagemaker_boto_client=sagemaker_client,
    )

# Same load-or-create pattern for the trial that the training job is attached to.
try:
    trial = Trial.load(
        trial_name=trial_name,
        sagemaker_boto_client=sagemaker_client,
    )
except Exception as ex:
    if "ResourceNotFound" not in str(ex):
        raise
    trial = Trial.create(
        experiment_name=experiment.experiment_name,
        trial_name=trial_name,
        tags=train_params["tags"],
        sagemaker_boto_client=sagemaker_client,
    )

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [10]:
# Build the scikit-learn estimator. Most arguments are forwarded unchanged from
# the "train" section of the parameter file; only the source directory, the
# role and the hyperparameters come from elsewhere in the notebook.
sklearn_estimator = SKLearn(
    source_dir=str(project_directory.joinpath(train_params['source_dir'])),
    role=role,
    hyperparameters=hyperparameters,
    **{
        key: train_params[key]
        for key in (
            "entry_point",
            "framework_version",
            "instance_type",
            "instance_count",
            "tags",
            "base_job_name",
            "output_path",
            "container_log_level",
            "volume_size",
            "max_run",
            "max_wait",
            "enable_sagemaker_metrics",
            "metric_definitions",
            "use_spot_instances",
            "security_group_ids",
            "subnets",
        )
    },
)

# Run the training job synchronously and record it under the trial as the
# "Training" trial component.
sklearn_estimator.fit(
    inputs=train_params["inputs"],
    experiment_config={"TrialName": trial.trial_name, "TrialComponentDisplayName": "Training"},
    wait=True,
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: gda-yield-training-job-2022-11-24-23-00-13-860


2022-11-24 23:00:14 Starting - Starting the training job...
2022-11-24 23:00:40 Starting - Preparing the instances for trainingProfilerReport-1669330814: InProgress
............
2022-11-24 23:02:38 Downloading - Downloading input data...
2022-11-24 23:03:19 Training - Downloading the training image...
2022-11-24 23:03:39 Training - Training image download completed. Training in progress..[34m2022-11-24 23:03:51,679 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-11-24 23:03:51,682 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-24 23:03:51,692 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-11-24 23:03:51,875 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting PyYAML
  Downloading PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.ma

In [41]:
# Wrap the artifacts produced by the training job in a deployable model; the
# inference script location and model name come from the "model" parameters.
sklearn_inference = SKLearnModel(
    role=role,
    model_data=sklearn_estimator.model_data,
    source_dir=str(project_directory.joinpath(model_params['source_dir'])),
    **{key: model_params[key] for key in ("entry_point", "framework_version", "name")},
)

In [42]:
# Deploy the model to a real-time endpoint; every argument is forwarded
# unchanged from the "inference" section of the parameter file.
# NOTE(review): security_group_ids / subnets do not appear to be named
# parameters of Model.deploy — confirm they take effect here rather than
# being ignored (VPC settings are usually supplied on the model itself).
predictor = sklearn_inference.deploy(
    **{
        key: inference_params[key]
        for key in (
            "endpoint_name",
            "instance_type",
            "initial_instance_count",
            "security_group_ids",
            "subnets",
            "tags",
        )
    }
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker:Creating model with name: gda-yeild-infrence-provisioned
INFO:sagemaker:Creating endpoint-config with name gda-yeild-infrence-provisioned
INFO:sagemaker:Creating endpoint with name gda-yeild-infrence-provisioned


-------!

In [43]:
from sagemaker import clarify
from sagemaker.s3 import S3Downloader, S3Uploader

# Processor that runs the Clarify job: one instance, tagged and placed in the
# same security groups/subnets as the inference parameters specify.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    sagemaker_session=sagemaker_session,
    job_name_prefix="gda-yield-clarify",
    instance_type="ml.c4.xlarge",
    instance_count=1,
    tags=inference_params["tags"],
    network_config=NetworkConfig(
        **{key: inference_params[key] for key in ("security_group_ids", "subnets")}
    ),
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [48]:
# Small sample for the Clarify run: read the downloaded test set, remove the
# configured drop columns, and keep only the first ten rows.
df = (
    pd.read_csv("../data/test/test.csv")
    .drop(datasets["drop_columns"], axis=1)
    .iloc[:10, :]
)

In [55]:
# Show the S3 output location configured for training; the same prefix is
# reused below as the upload target and output path of the Clarify job.
train_params["output_path"]

's3://sagemaker-eu-central-1-226275233641/cad-alok-singh/us_in_season_corn_yield/8_stages/V0/train_test_2020/model/Train'

In [51]:
# Persist the ten-row sample (header included, index omitted) next to the test data.
clarify_file = Path("../data/test/", "clarify_test.csv")
df.to_csv(path_or_buf=clarify_file, index=False)

In [57]:
# Upload the sample CSV under the training output prefix and show the resulting S3 URI.
test_raw = S3Uploader.upload(
    local_path=str(clarify_file),
    desired_s3_uri=train_params["output_path"],
)
print(test_raw)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


s3://sagemaker-eu-central-1-226275233641/cad-alok-singh/us_in_season_corn_yield/8_stages/V0/train_test_2020/model/Train/clarify_test.csv


In [58]:
# Split the sample into the target column(s) and the remaining feature columns.
y_true = df[datasets["y_column"]]
X_test = df.drop(columns=datasets["y_column"])

In [59]:
# Send the feature rows to the deployed endpoint and keep the returned predictions.
y_pred = predictor.predict(X_test)

In [60]:
# Regression metrics of the endpoint predictions against the sample's true values.
r2 = metrics.r2_score(y_true, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
mae = metrics.mean_absolute_error(y_true, y_pred)

# Report each metric rounded to two decimals.
print("Testing performance")
for label, value in (("MAE", mae), ("RMSE", rmse), ("R2", r2)):
    print(f"{label}: {value:.2f}")

Testing performance
MAE: 21.36
RMSE: 26.47
R2: 0.35


In [61]:
# Kernel SHAP settings for the explainability job.
shap_config = clarify.SHAPConfig(
    # The sampled feature rows themselves are used as the baseline dataset.
    baseline=X_test.values.tolist(),
    # num_samples is not set. Hint kept from the original author: it should be
    # large relative to the number of features, e.g. 2k + 2 * num_features.
    agg_method="mean_abs",
    # The model is a regressor (it is scored with MAE/RMSE/R2 above), so its
    # predictions are not probabilities. The logit transform only makes sense
    # for values in (0, 1) and must therefore stay disabled here.
    use_logit=False,
)

In [62]:
# Describe the dataset for Clarify: the uploaded CSV sample, its column names,
# which column is the label, and where the analysis output is written.
explainability_data_config = clarify.DataConfig(
    dataset_type="text/csv",
    headers=list(df.columns),
    label=datasets["y_column"],
    s3_data_input_path=test_raw,
    s3_output_path=train_params["output_path"],
)

In [63]:
# Tell Clarify which model to query and on what instance configuration;
# responses are requested as CSV.
model_config = clarify.ModelConfig(
    accept_type="text/csv",
    instance_count=1,
    instance_type="ml.c5.xlarge",
    model_name=sklearn_inference.name,  # name of the SKLearnModel created earlier
)

In [64]:
# Launch the Clarify explainability (SHAP) processing job using the data,
# model and SHAP configurations defined in the preceding cells.
# NOTE: the recorded run of this cell failed with an AccessDeniedException on
# sagemaker:CreateEndpointConfig — the role used by the job was not allowed
# to perform that action, so the job's permissions need to be extended first.
clarify_processor.run_explainability(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config,
)

INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['year', 'variety_name2', 'yield', 'relative_maturity_2', 'soil_type_2', 'previous_crop_2', 'irrigation', 'bd_0_5', 'bd_100_200', 'bd_15_30', 'bd_30_60', 'bd_5_15', 'bd_60_100', 'clay_0_5', 'clay_100_200', 'clay_15_30', 'clay_30_60', 'clay_5_15', 'clay_60_100', 'ksat_0_5', 'ksat_100_200', 'ksat_15_30', 'ksat_30_60', 'ksat_5_15', 'ksat_60_100', 'om_0_5', 'om_30_60', 'om_5_15', 'ph_0_5', 'ph_15_30', 'ph_30_60', 'ph_5_15', 'sand_0_5', 'sand_100_200', 'sand_15_30', 'sand_5_15', 'sand_60_100', 'silt_0_5', 'silt_100_200', 'silt_15_30', 'silt_30_60', 'silt_5_15', 'silt_60_100', 'theta_r_0_5', 'theta_r_100_200', 'theta_r_15_30', 'theta_r_30_60', 'theta_r_5_15', 'theta_r_60_100', 'theta_s_0_5', 'theta_s_100_200', 'theta_s_15_30', 'theta_s_30_60', 'theta_s_5_15', 'theta_s_60_100', 'V0_as2', 'V0_as1', 'V0_frost_days', 'V0_heat_stress', 'V0_dry_days', 'V0_low_humidity', 'V0_high_humidity', 'V0_evapotranspiration_mm_sur


Job Name:  gda-yield-clarify-2022-11-25-00-30-53-769
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-central-1-226275233641/cad-alok-singh/us_in_season_corn_yield/8_stages/V0/train_test_2020/model/Train/clarify_test.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-central-1-226275233641/cad-alok-singh/us_in_season_corn_yield/8_stages/V0/train_test_2020/model/Train/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-226275233641/cad-alok-singh/us_in_season_corn_yie

UnexpectedStatusException: Error for Processing job gda-yield-clarify-2022-11-25-00-30-53-769: Failed. Reason: ClientError: An error occurred (AccessDeniedException) when calling the CreateEndpointConfig operation: User: arn:aws:sts::226275233641:assumed-role/globaldataanalyticsDataScientistRole/SageMaker is not authorized to perform: sagemaker:CreateEndpointConfig on resource: arn:aws:sagemaker:eu-central-1:226275233641:endpoint-config/sm-clarify-config-1669336636-ff94 because no identity-based policy allows the sagemaker:CreateEndpointConfig action

In [65]:
# Clean up the deployed resources: first the model, then the endpoint
# (the recorded log shows the endpoint configuration being deleted as well).
predictor.delete_model()
predictor.delete_endpoint()

INFO:sagemaker:Deleting model with name: gda-yeild-infrence-provisioned
INFO:sagemaker:Deleting endpoint configuration with name: gda-yeild-infrence-provisioned
INFO:sagemaker:Deleting endpoint with name: gda-yeild-infrence-provisioned


In [None]:
clarify_processor.