# Pre-Processing

In [2]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

region = boto3.session.Session().region_name

role = get_execution_role()
sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0", role=role, instance_type="ml.t3.medium", instance_count=1
)

In [4]:
import pandas as pd

input_data = "s3://sagemaker-workshopdata1/census-income.csv"
df = pd.read_csv(input_data, nrows=10)
#df.dtypes
df.head(n=10)

Unnamed: 0,age,class_of_worker,education,major_industry_code,capital_gains,capital_losses,dividends_from_stocks,num_persons_worked_for_employer,income
0,73,Not in universe,High school graduate,Not in universe or children,0,0,0,0,0
1,58,Self-employed-not incorporated,Some college but no degree,Construction,0,0,0,1,0
2,18,Not in universe,10th grade,Not in universe or children,0,0,0,0,0
3,9,Not in universe,Children,Not in universe or children,0,0,0,0,0
4,10,Not in universe,Children,Not in universe or children,0,0,0,0,0
5,48,Private,Some college but no degree,Entertainment,0,0,0,1,0
6,42,Private,Bachelors degree(BA AB BS),Finance insurance and real estate,5178,0,0,6,0
7,28,Private,High school graduate,Construction,0,0,0,4,0
8,47,Local government,Some college but no degree,Education,0,0,0,5,0
9,34,Private,Some college but no degree,Construction,0,0,0,6,0


In [5]:
%%writefile preprocessing.py

import argparse
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer

from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action="ignore", category=DataConversionWarning)

## Write comments everywhere and include ref slide no.
columns = [
    "age",
    "class_of_worker",
    "education",
    "major_industry_code",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "income",
]
class_labels = [0, 1]


def print_shape(df):
    print('*****IN print_shape df')
    negative_examples, positive_examples = np.bincount(df["income"])
    print(
        "Data shape: {}, {} positive examples, {} negative examples".format(
            df.shape, positive_examples, negative_examples
        )
    )


if __name__ == "__main__":
    print('**** IN MAIN')
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-test-split-ratio", type=float, default=0.3)
    args, _ = parser.parse_known_args()

    print("Received arguments {}".format(args))

    input_data_path = os.path.join("/opt/ml/processing/input", "census-income.csv")

    print("Reading input data from {}".format(input_data_path))
    df = pd.read_csv(input_data_path)
    pd.set_option('max_columns', None)
    print('******read_csv df', df.head(n=5))
    # df = pd.DataFrame(data=df, columns=columns)
    # print('******read_csv df', df.show(10))
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    #df.replace(class_labels, [0, 1], inplace=True)

    negative_examples, positive_examples = np.bincount(df["income"])
    print(
        "Data after cleaning: {}, {} positive examples, {} negative examples".format(
            df.shape, positive_examples, negative_examples
        )
    )

    split_ratio = args.train_test_split_ratio
    print("Splitting data into train and test sets with ratio {}".format(split_ratio))
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop("income", axis=1), df["income"], test_size=split_ratio, random_state=0
    )

    preprocess = make_column_transformer(
        (
            ["age", "num_persons_worked_for_employer"],
            KBinsDiscretizer(encode="onehot-dense", n_bins=10),
        ),
        (["capital_gains", "capital_losses", "dividends_from_stocks"], StandardScaler()),
        (["education", "major_industry_code", "class_of_worker"], OneHotEncoder(sparse=False)),
    )
    print("Running preprocessing and feature engineering transformations")
    train_features = preprocess.fit_transform(X_train)
    test_features = preprocess.transform(X_test)

    print("Train data shape after preprocessing: {}".format(train_features.shape))
    print("Test data shape after preprocessing: {}".format(test_features.shape))

    train_features_output_path = os.path.join("/opt/ml/processing/train", "train_features.csv")
    train_labels_output_path = os.path.join("/opt/ml/processing/train", "train_labels.csv")

    test_features_output_path = os.path.join("/opt/ml/processing/test", "test_features.csv")
    test_labels_output_path = os.path.join("/opt/ml/processing/test", "test_labels.csv")

    print("Saving training features to {}".format(train_features_output_path))
    pd.DataFrame(train_features).to_csv(train_features_output_path, header=False, index=False)

    print("Saving test features to {}".format(test_features_output_path))
    pd.DataFrame(test_features).to_csv(test_features_output_path, header=False, index=False)

    print("Saving training labels to {}".format(train_labels_output_path))
    y_train.to_csv(train_labels_output_path, header=False, index=False)

    print("Saving test labels to {}".format(test_labels_output_path))
    y_test.to_csv(test_labels_output_path, header=False, index=False)

Overwriting preprocessing.py


In [6]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(
    code="preprocessing.py",
    inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),
    ],
    arguments=["--train-test-split-ratio", "0.2"],
)

preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description["ProcessingOutputConfig"]
for output in output_config["Outputs"]:
    if output["OutputName"] == "train_data":
        preprocessed_training_data = output["S3Output"]["S3Uri"]
    if output["OutputName"] == "test_data":
        preprocessed_test_data = output["S3Output"]["S3Uri"]


Job Name:  sagemaker-scikit-learn-2022-10-24-13-11-37-691
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-workshopdata1/census-income.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-815648291685/sagemaker-scikit-learn-2022-10-24-13-11-37-691/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-eu-west-1-815648291685/sagemaker-scikit-learn-2022-10-24-13-11-37-691/output/train_data', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test_data', 'AppManag

In [7]:
training_features = pd.read_csv(preprocessed_training_data + "/train_features.csv", nrows=10)
print("Training features shape: {}".format(training_features.shape))
training_features.head(n=10)

Training features shape: (10, 73)


Unnamed: 0,0.0,0.0.1,0.0.2,0.0.3,0.0.4,1.0,0.0.5,0.0.6,0.0.7,0.0.8,...,0.0.56,0.0.57,0.0.58,0.0.59,0.0.60,1.0.4,0.0.61,0.0.62,0.0.63,0.0.64
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Training

In [8]:
from sagemaker.sklearn.estimator import SKLearn

sklearn = SKLearn(
    entry_point="train.py", framework_version="0.20.0", instance_type="ml.m5.xlarge", role=role
)

In [27]:
%%writefile train.py

import os

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib

def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__ == "__main__":
    training_data_directory = "/opt/ml/input/data/train"
    train_features_data = os.path.join(training_data_directory, "train_features.csv")
    train_labels_data = os.path.join(training_data_directory, "train_labels.csv")
    print("Reading input data")
    X_train = pd.read_csv(train_features_data, header=None)
    y_train = pd.read_csv(train_labels_data, header=None)

    model = LogisticRegression(class_weight="balanced", solver="lbfgs")
    print("Training LR model")
    model.fit(X_train, y_train)
    model_output_directory = os.path.join("/opt/ml/model", "model.joblib")
    print("Saving model to {}".format(model_output_directory))
    joblib.dump(model, model_output_directory)

Overwriting train.py


In [28]:
sklearn.fit({"train": preprocessed_training_data})
training_job_description = sklearn.jobs[-1].describe()
model_data_s3_uri = "{}{}/{}".format(
    training_job_description["OutputDataConfig"]["S3OutputPath"],
    training_job_description["TrainingJobName"],
    "output/model.tar.gz",
)
print('model_data_s3_uri: ', model_data_s3_uri)

2022-10-24 14:18:50 Starting - Starting the training job...
2022-10-24 14:19:13 Starting - Preparing the instances for trainingProfilerReport-1666621130: InProgress
......
2022-10-24 14:20:17 Downloading - Downloading input data...
2022-10-24 14:20:42 Training - Downloading the training image...
2022-10-24 14:21:17 Training - Training image download completed. Training in progress..[34m2022-10-24 14:21:19,994 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-10-24 14:21:19,996 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-24 14:21:20,005 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-10-24 14:21:20,397 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-24 14:21:20,409 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-24 14:21:20,422 sagemaker

# Model Evaluation

In [11]:
%%writefile evaluation.py

import json
import os
import tarfile

import pandas as pd

from sklearn.externals import joblib
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

if __name__ == "__main__":
    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")
    print("Extracting model from path: {}".format(model_path))
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    print("Loading model")
    model = joblib.load("model.joblib")

    print("Loading test input data")
    test_features_data = os.path.join("/opt/ml/processing/test", "test_features.csv")
    test_labels_data = os.path.join("/opt/ml/processing/test", "test_labels.csv")

    X_test = pd.read_csv(test_features_data, header=None)
    y_test = pd.read_csv(test_labels_data, header=None)
    predictions = model.predict(X_test)

    print("Creating classification evaluation report")
    report_dict = classification_report(y_test, predictions, output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test, predictions)
    report_dict["roc_auc"] = roc_auc_score(y_test, predictions)

    print("Classification report:\n{}".format(report_dict))

    evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", "evaluation.json")
    print("Saving classification report to {}".format(evaluation_output_path))

    with open(evaluation_output_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting evaluation.py


In [12]:
import json
from sagemaker.s3 import S3Downloader

sklearn_processor.run(
    code="evaluation.py",
    inputs=[
        ProcessingInput(source=model_data_s3_uri, destination="/opt/ml/processing/model"),
        ProcessingInput(source=preprocessed_test_data, destination="/opt/ml/processing/test"),
    ],
    outputs=[ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation")],
)
evaluation_job_description = sklearn_processor.jobs[-1].describe()


Job Name:  sagemaker-scikit-learn-2022-10-24-13-29-33-234
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-815648291685/sagemaker-scikit-learn-2022-10-24-13-24-53-877/output/model.tar.gz', 'LocalPath': '/opt/ml/processing/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-815648291685/sagemaker-scikit-learn-2022-10-24-13-11-37-691/output/test_data', 'LocalPath': '/opt/ml/processing/test', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-815648291685/sagemaker-scikit-learn-2022-10-24-13-29-33-234/input/code/evaluation.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3I

In [13]:
evaluation_output_config = evaluation_job_description["ProcessingOutputConfig"]
for output in evaluation_output_config["Outputs"]:
    if output["OutputName"] == "evaluation":
        evaluation_s3_uri = output["S3Output"]["S3Uri"] + "/evaluation.json"
        break

evaluation_output = S3Downloader.read_file(evaluation_s3_uri)
evaluation_output_dict = json.loads(evaluation_output)
print(json.dumps(evaluation_output_dict, sort_keys=True, indent=4))

{
    "0": {
        "f1-score": 0.8389297724060626,
        "precision": 0.9404501748251748,
        "recall": 0.757191871206123,
        "support": 11367
    },
    "1": {
        "f1-score": 0.5136129506990433,
        "precision": 0.3873473917869034,
        "recall": 0.7620087336244541,
        "support": 2290
    },
    "accuracy": 0.7579995606648605,
    "macro avg": {
        "f1-score": 0.676271361552553,
        "precision": 0.6638987833060391,
        "recall": 0.7596003024152885,
        "support": 13657
    },
    "micro avg": {
        "f1-score": 0.7579995606648605,
        "precision": 0.7579995606648605,
        "recall": 0.7579995606648605,
        "support": 13657
    },
    "roc_auc": 0.7596003024152885,
    "weighted avg": {
        "f1-score": 0.7843807849484165,
        "precision": 0.8477061334429062,
        "recall": 0.7579995606648605,
        "support": 13657
    }
}


# Batch Inference

In [29]:
import sagemaker as sage
from time import gmtime, strftime

sagemaker_session = sage.Session()

transform_output_folder = "batch-transform-output"
output_path = "s3://{}/{}".format(sagemaker_session.default_bucket(), transform_output_folder)
print("output_path: ", output_path)
transformer = sklearn.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=output_path,
    assemble_with="Line",
    accept="text/csv",
)

output_path:  s3://sagemaker-eu-west-1-815648291685/batch-transform-output


In [30]:
data_location = "{}/{}".format(preprocessed_test_data, "test_features.csv") 
#'s3://sagemaker-data-sci-poc/sagemaker_data/resi_loss_inference.csv'
print("data_location: ", data_location)
df = pd.read_csv(data_location, nrows=5)
df.head(n=5)

data_location:  s3://sagemaker-eu-west-1-815648291685/sagemaker-scikit-learn-2022-10-24-13-11-37-691/output/test_data/test_features.csv


Unnamed: 0,0.0,0.0.1,0.0.2,1.0,0.0.3,0.0.4,0.0.5,0.0.6,0.0.7,0.0.8,...,0.0.56,0.0.57,0.0.58,0.0.59,1.0.4,0.0.60,0.0.61,0.0.62,0.0.63,0.0.64
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [31]:
transformer.transform(
    data_location, content_type="text/csv", split_type="Line"#, input_filter="$[1:]"
)
transformer.wait()

................................[34mProcessing /opt/ml/code
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'[0m
[34mBuilding wheels for collected packages: train
  Building wheel for train (setup.py): started
  Building wheel for train (setup.py): finished with status 'done'
  Created wheel for train: filename=train-1.0.0-py2.py3-none-any.whl size=3377 sha256=cbccacbfc276fe163544c1a27244ecdc761aafd627d32c84429f346e272bdb88
  Stored in directory: /tmp/pip-ephem-wheel-cache-gl1sbsob/wheels/3e/0f/51/2f1df833dd0412c1bc2f5ee56baac195b5be563353d111dca6[0m
[34mSuccessfully built train[0m
[34mInstalling collected packages: train[0m
[34mSuccessfully installed train-1.0.0[0m
  import imp[0m
  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'[0m
[34m[2022-10-24 14:27:34 +0000] [37] [INFO] Starting gunicorn 20.1.0[0m
[34m[2022-10-24 14:27:34 +0000] [37] [INFO] Listening at: unix:/tmp/gunicorn.sock (37)[0m
[34m[202

In [32]:
import pandas as pd

inference_output_data = "{}/{}".format(output_path, "test_features.csv.out")
df = pd.read_csv(inference_output_data, nrows=10)
df.head(n=10)

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,1
5,0
6,1
7,1
8,1
9,0


# Custom Container

In [20]:
!pip install sagemaker-studio-image-build

Collecting sagemaker-studio-image-build
  Downloading sagemaker_studio_image_build-0.6.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sagemaker-studio-image-build
  Building wheel for sagemaker-studio-image-build (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker-studio-image-build: filename=sagemaker_studio_image_build-0.6.0-py3-none-any.whl size=13469 sha256=0721b320d4114bf11a742364e5d22c2d2689f9b266592c8436cfadd76f240586
  Stored in directory: /root/.cache/pip/wheels/ee/a3/ba/fa5677c340fe6a76f3f39cb3e9823726f994c5d481658bb089
Successfully built sagemaker-studio-image-build
Installing collected packages: sagemaker-studio-image-build
Successfully installed sagemaker-studio-image-build-0.6.0
[0m

In [21]:
!mkdir docker

mkdir: cannot create directory ‘docker’: File exists


In [37]:
%%writefile docker/Dockerfile


FROM python:3.8
#-slim-buster

RUN pip install pandas==1.1.5 numpy==1.16.5 scikit-learn==0.23.1 
RUN pip install "snowflake-snowpark-python[pandas]"

#ENV PYTHONUNBUFFERED=TRUE


ENTRYPOINT ["python3"]

Overwriting docker/Dockerfile


In [38]:
!sm-docker build ./docker --repository mydockerrepo1:2

...[Container] 2022/10/24 14:45:25 Waiting for agent ping

[Container] 2022/10/24 14:45:26 Waiting for DOWNLOAD_SOURCE
[Container] 2022/10/24 14:45:30 Phase is DOWNLOAD_SOURCE
[Container] 2022/10/24 14:45:30 CODEBUILD_SRC_DIR=/codebuild/output/src035417250/src
[Container] 2022/10/24 14:45:30 YAML location is /codebuild/output/src035417250/src/buildspec.yml
[Container] 2022/10/24 14:45:30 Setting HTTP client timeout to higher timeout for S3 source
[Container] 2022/10/24 14:45:30 Processing environment variables
[Container] 2022/10/24 14:45:30 No runtime version selected in buildspec.
[Container] 2022/10/24 14:45:30 Moving to directory /codebuild/output/src035417250/src
[Container] 2022/10/24 14:45:30 Configuring ssm agent with target id: codebuild:39071b22-ba9b-4db9-a74c-795d7ce1f90f
[Container] 2022/10/24 14:45:30 Successfully updated ssm agent configuration
[Container] 2022/10/24 14:45:30 Registering with agent
[Container] 2022/10/24 14:45:30 Phases found in YAML: 3
[Container] 2022/1

In [29]:
processing_repository_uri = '815648291685.dkr.ecr.eu-west-1.amazonaws.com/mydockerrepo1:1'

# Preprocessing using custom container

In [30]:
from sagemaker.processing import ScriptProcessor

script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=processing_repository_uri,
    role=role,
    instance_count=1,
    instance_type="ml.t3.medium",
)

In [37]:
%%writefile preprocessing_custom.py


import argparse
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer

from sklearn.exceptions import DataConversionWarning

from snowflake.snowpark import (
    Column,
    DataFrame,
    Session,
    Window
)
from snowflake.snowpark import functions as f
from snowflake.snowpark.types import IntegerType, StringType, StructType, DateType, StructField, MapType
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col, when, to_date


warnings.filterwarnings(action="ignore", category=DataConversionWarning)


columns = [
    "age",
    "class_of_worker",
    "education",
    "major_industry_code",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "income",
]
class_labels = [0, 1]

CONNECTION_PARAMETERS = {
"account": "gi02106.eu-west-2.aws",
"user": "pujaverma",
"password": "Itzme#123",
"role": "accountadmin",
"warehouse": "workshopwh",
"database": "workshopdb",
"schema": "workshopsch",
}


def print_shape(df):
    print('*****IN print_shape df')
    negative_examples, positive_examples = np.bincount(df["income"])
    print(
        "Data shape: {}, {} positive examples, {} negative examples".format(
            df.shape, positive_examples, negative_examples
        )
    )
    

def transformation_pipeline():
    print('INSIDE TRANSFORMSTION PIPELINE')
    session = Session.builder.configs(CONNECTION_PARAMETERS).create()
    
    session.sql("select current_warehouse(), current_database(), current_schema()").show()
    
    df_train = session.table('SAGEMAKER_TABLE')

    df_train_pd = df_train.to_pandas()
    
    print("df_train_pd: ",df_train_pd.columns)
    return df_train_pd


if __name__ == "__main__":
    print('**** IN MAIN')
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-test-split-ratio", type=float, default=0.3)
    args, _ = parser.parse_known_args()

    print("Received arguments {}".format(args))

#     input_data_path = os.path.join("/opt/ml/processing/input", "census-income.csv")

#     print("Reading input data from {}".format(input_data_path))
#     df = pd.read_csv(input_data_path)
    df = transformation_pipeline()
    pd.set_option('max_columns', None)
    print('******read snowflake table df', df.head(n=5))
    # df = pd.DataFrame(data=df, columns=columns)
    # print('******read_csv df', df.show(10))
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    #df.replace(class_labels, [0, 1], inplace=True)

    negative_examples, positive_examples = np.bincount(df["income"])
    print(
        "Data after cleaning: {}, {} positive examples, {} negative examples".format(
            df.shape, positive_examples, negative_examples
        )
    )

    split_ratio = args.train_test_split_ratio
    print("Splitting data into train and test sets with ratio {}".format(split_ratio))
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop("income", axis=1), df["income"], test_size=split_ratio, random_state=0
    )

    preprocess = make_column_transformer(
        (
            KBinsDiscretizer(encode="onehot-dense", n_bins=10),
            ["age", "num_persons_worked_for_employer"],
        ),
        (StandardScaler(), ["capital_gains", "capital_losses", "dividends_from_stocks"]),
        (OneHotEncoder(sparse=False), ["education", "major_industry_code", "class_of_worker"]),
    )
    print("Running preprocessing and feature engineering transformations")
    train_features = preprocess.fit_transform(X_train)
    test_features = preprocess.transform(X_test)

    print("Train data shape after preprocessing: {}".format(train_features.shape))
    print("Test data shape after preprocessing: {}".format(test_features.shape))

    train_features_output_path = os.path.join("/opt/ml/processing/train", "train_features.csv")
    train_labels_output_path = os.path.join("/opt/ml/processing/train", "train_labels.csv")

    test_features_output_path = os.path.join("/opt/ml/processing/test", "test_features.csv")
    test_labels_output_path = os.path.join("/opt/ml/processing/test", "test_labels.csv")

    print("Saving training features to {}".format(train_features_output_path))
    pd.DataFrame(train_features).to_csv(train_features_output_path, header=False, index=False)

    print("Saving test features to {}".format(test_features_output_path))
    pd.DataFrame(test_features).to_csv(test_features_output_path, header=False, index=False)

    print("Saving training labels to {}".format(train_labels_output_path))
    y_train.to_csv(train_labels_output_path, header=False, index=False)

    print("Saving test labels to {}".format(test_labels_output_path))
    y_test.to_csv(test_labels_output_path, header=False, index=False)


Overwriting preprocessing_custom.py


In [63]:
script_processor.run(
    code="preprocessing_custom.py",
    #inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    inputs=[],
    outputs=[
        ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),
    ],
    arguments=["--train-test-split-ratio", "0.2"],
)
script_processor_job_description = script_processor.jobs[-1].describe()
print('$$$$$$$$ outside script processor')
output_config = script_processor_job_description["ProcessingOutputConfig"]
for output in output_config["Outputs"]:
    if output["OutputName"] == "train_data":
        preprocessed_training_data_custom = output["S3Output"]["S3Uri"]
        print('$$$$$$$$ preprocessed_training_data_custom: ', preprocessed_training_data_custom)
    if output["OutputName"] == "test_data":
        preprocessed_test_data_custom = output["S3Output"]["S3Uri"]
        print('$$$$$$$$ preprocessed_test_data_custom: ', preprocessed_test_data_custom)
        


Job Name:  mydockerrepo1-2022-10-23-17-23-16-631
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-2-157575526935/mydockerrepo1-2022-10-23-17-23-16-631/input/code/preprocessing_custom.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-workshopdata/sagemaker_outputs/train_data/', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-workshopdata/sagemaker_outputs/test_data/', 'LocalPath': '/opt/ml/processing/test', 'S3UploadMode': 'EndOfJob'}}]
...............

KeyboardInterrupt: 

In [40]:
training_custom_features = pd.read_csv(preprocessed_training_data_custom + "/train_features.csv", nrows=10)
print("Training features shape: {}".format(training_features.shape))
training_custom_features.head(n=10)

Training features shape: (10, 73)


Unnamed: 0,0.0,0.0.1,0.0.2,0.0.3,0.0.4,1.0,0.0.5,0.0.6,0.0.7,0.0.8,...,0.0.52,0.0.53,0.0.54,0.0.55,0.0.56,1.0.4,0.0.57,0.0.58,0.0.59,0.0.60
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Model Training

In [55]:
from sagemaker.sklearn.estimator import SKLearn

sklearn = SKLearn(
    entry_point="train.py", framework_version="0.20.0", instance_type="ml.m5.xlarge", role=role
)

In [56]:
%%writefile train.py

import os

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib

if __name__ == "__main__":
    training_data_directory = "/opt/ml/input/data/train"
    train_features_data = os.path.join(training_data_directory, "train_features.csv")
    train_labels_data = os.path.join(training_data_directory, "train_labels.csv")
    print("Reading input data")
    X_train = pd.read_csv(train_features_data, header=None)
    y_train = pd.read_csv(train_labels_data, header=None)

    model = LogisticRegression(class_weight="balanced", solver="lbfgs")
    print("Training LR model")
    model.fit(X_train, y_train)
    model_output_directory = os.path.join("/opt/ml/model", "model.joblib")
    print("Saving model to {}".format(model_output_directory))
    joblib.dump(model, model_output_directory)

Overwriting train.py


In [57]:
sklearn.fit({"train": preprocessed_training_data_custom})
training_job_description = sklearn.jobs[-1].describe()
model_data_s3_uri = "{}{}/{}".format(
    training_job_description["OutputDataConfig"]["S3OutputPath"],
    training_job_description["TrainingJobName"],
    "output/model.tar.gz",
)
print('model_data_s3_uri: ', model_data_s3_uri)

2022-10-23 16:46:05 Starting - Starting the training job...
2022-10-23 16:46:29 Starting - Preparing the instances for trainingProfilerReport-1666543565: InProgress
......
2022-10-23 16:47:30 Downloading - Downloading input data...
2022-10-23 16:47:50 Training - Downloading the training image...
2022-10-23 16:48:30 Training - Training image download completed. Training in progress..[34m2022-10-23 16:48:32,864 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-10-23 16:48:32,867 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-23 16:48:32,875 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-10-23 16:48:33,299 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-23 16:48:33,311 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-23 16:48:33,322 sagemaker

# Model Evaluation

In [51]:
%%writefile evaluation.py

import json
import os
import tarfile

import pandas as pd

from sklearn.externals import joblib
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

if __name__ == "__main__":
    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")
    print("Extracting model from path: {}".format(model_path))
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    print("Loading model")
    model = joblib.load("model.joblib")

    print("Loading test input data")
    test_features_data = os.path.join("/opt/ml/processing/test", "test_features.csv")
    test_labels_data = os.path.join("/opt/ml/processing/test", "test_labels.csv")

    X_test = pd.read_csv(test_features_data, header=None)
    y_test = pd.read_csv(test_labels_data, header=None)
    predictions = model.predict(X_test)

    print("Creating classification evaluation report")
    report_dict = classification_report(y_test, predictions, output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test, predictions)
    report_dict["roc_auc"] = roc_auc_score(y_test, predictions)

    print("Classification report:\n{}".format(report_dict))

    evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", "evaluation.json")
    print("Saving classification report to {}".format(evaluation_output_path))

    with open(evaluation_output_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting evaluation.py


In [52]:
import json
from sagemaker.s3 import S3Downloader

sklearn_processor.run(
    code="evaluation.py",
    inputs=[
        ProcessingInput(source=model_data_s3_uri, destination="/opt/ml/processing/model"),
        ProcessingInput(source=preprocessed_test_data_custom, destination="/opt/ml/processing/test"),
    ],
    outputs=[ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation")],
)
evaluation_job_description = sklearn_processor.jobs[-1].describe()


Job Name:  sagemaker-scikit-learn-2022-10-23-16-30-22-154
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-2-157575526935/sagemaker-scikit-learn-2022-10-23-16-24-04-133/output/model.tar.gz', 'LocalPath': '/opt/ml/processing/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-2-157575526935/mydockerrepo1-2022-10-23-16-06-31-688/output/test_data', 'LocalPath': '/opt/ml/processing/test', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-2-157575526935/sagemaker-scikit-learn-2022-10-23-16-30-22-154/input/code/evaluation.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode'

In [53]:
evaluation_output_config = evaluation_job_description["ProcessingOutputConfig"]
for output in evaluation_output_config["Outputs"]:
    if output["OutputName"] == "evaluation":
        evaluation_s3_uri = output["S3Output"]["S3Uri"] + "/evaluation.json"
        break

evaluation_output = S3Downloader.read_file(evaluation_s3_uri)
evaluation_output_dict = json.loads(evaluation_output)
print(json.dumps(evaluation_output_dict, sort_keys=True, indent=4))

{
    "0": {
        "f1-score": 0.8357202209296642,
        "precision": 0.9402771667399912,
        "recall": 0.7520893815430633,
        "support": 11367
    },
    "1": {
        "f1-score": 0.5097009482129832,
        "precision": 0.3826944140197152,
        "recall": 0.762882096069869,
        "support": 2290
    },
    "accuracy": 0.7538990993629641,
    "macro avg": {
        "f1-score": 0.6727105845713237,
        "precision": 0.6614857903798532,
        "recall": 0.7574857388064662,
        "support": 13657
    },
    "micro avg": {
        "f1-score": 0.7538990993629641,
        "precision": 0.7538990993629641,
        "recall": 0.7538990993629641,
        "support": 13657
    },
    "roc_auc": 0.7574857388064662,
    "weighted avg": {
        "f1-score": 0.7810534467829849,
        "precision": 0.8467819259309238,
        "recall": 0.7538990993629641,
        "support": 13657
    }
}


# Batch Inference using Batch Transform

In [58]:
import sagemaker as sage
from time import gmtime, strftime

sagemaker_session = sage.Session()

transform_output_folder = "batch-transform-output"
#output_path = "s3://{}/{}".format(sagemaker_session.default_bucket(), transform_output_folder)
output_path = "s3://snowflake-stage-area/batch-transform-output"
print("output_path: ", output_path)
transformer = sklearn.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=output_path,
    assemble_with="Line",
    accept="text/csv",
)

output_path:  s3://sagemaker-eu-west-2-157575526935/batch-transform-output


In [59]:
data_location = "{}/{}".format(preprocessed_test_data_custom, "test_features.csv") 
#'s3://sagemaker-data-sci-poc/sagemaker_data/resi_loss_inference.csv'
print("data_location: ", data_location)
df = pd.read_csv(data_location, nrows=5)
df.head(n=5)

data_location:  s3://sagemaker-eu-west-2-157575526935/sagemaker-scikit-learn-2022-10-23-13-27-38-998/output/test_data/test_features.csv


Unnamed: 0,0.0,0.0.1,0.0.2,1.0,0.0.3,0.0.4,0.0.5,0.0.6,0.0.7,0.0.8,...,0.0.56,0.0.57,0.0.58,0.0.59,1.0.4,0.0.60,0.0.61,0.0.62,0.0.63,0.0.64
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [60]:
transformer.transform(
    data_location, content_type="text/csv", split_type="Line"#, input_filter="$[1:]"
)
transformer.wait()

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTransformJob operation: The account-level service limit 'ml.m5.xlarge for transform job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

In [61]:
import pandas as pd

inference_output_data = "{}/{}".format(output_path, "test_features.csv.out")
df = pd.read_csv(inference_output_data, nrows=10)
df.head(n=10)

FileNotFoundError: sagemaker-eu-west-2-157575526935/batch-transform-output/test_features.csv.out

# Write Output to Snowflake

In [None]:
%%writefile write_output.py

import argparse
import os
import warnings
import datetime
import pandas as pd
import numpy as np
import io
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.exceptions import DataConversionWarning
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from snowflake.snowpark import (
    Column,
    DataFrame,
    Session,
    Window
)
from snowflake.snowpark import functions as f
from snowflake.snowpark.types import IntegerType, StringType, StructType, DateType, StructField, MapType
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col, when, to_date


warnings.filterwarnings(action="ignore", category=DataConversionWarning)



CONNECTION_PARAMETERS = {
"account": "gi02106.eu-west-2.aws",
"user": "pujaverma",
"password": "Itzme#123",
"role": "accountadmin",
"warehouse": "workshopwh",
"database": "workshopdb",
"schema": "workshopsch",
}
    
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    #parser.add_argument("--train-test-split-ratio", type=float, default=0.3)
    args, _ = parser.parse_known_args()

    print("Received arguments {}".format(args))
    
   
    snowflake_conn_session = Session.builder.configs(CONNECTION_PARAMETERS).create()
    
    snowflake_conn_session.sql("select current_warehouse(), current_database(), current_schema(), current_role()").show()
    
    table_query = """create table inference_output (
    col1 varchar
    )"""
    
    snowflake_conn_session.sql(table_query).show()
    
    copy_data = """copy into inference_output
    from 's3://snowflake-stage-area/batch-transform-output/test_features.csv.out'
    credentials = (aws_key_id = 'AKIASJMB3XILZG3F2VOO' aws_secret_key = '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')
    file_format = (format_name='public.csv_file_format');"""
    
    snowflake_conn_session.sql(copy_data).show()