<a href="https://colab.research.google.com/github/Prasang-Biyani/aws-sagemaker/blob/main/AWS_SageMaker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sagemaker
import boto3

In [None]:
# Name of the S3 bucket used throughout this notebook.
bucket_name = "dsmlsagemaker-prasang"

# Look up the AWS region configured for the current boto3 session.
aws_session = boto3.session.Session()
my_region = aws_session.region_name
print(my_region)

us-east-1


In [None]:
# Create the S3 bucket that will hold the training data and model artifacts.
# No access keys are passed here: boto3 resolves credentials through its
# default chain (environment variables, shared config files or the attached
# IAM role), so no secret ever has to be typed into the notebook.
s3 = boto3.resource("s3", region_name=my_region)

try:
    if my_region == "us-east-1":
        # us-east-1 is the default location and is created without a
        # CreateBucketConfiguration.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly as the location constraint.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={"LocationConstraint": my_region},
        )
    print("S3 Bucket Created Successfully!")
except Exception as e:
    # Best effort: report the failure (e.g. the name is already taken) and carry on.
    print(str(e))
    

S3 Bucket Created Successfully!


In [None]:
## set an output path for model artifacts
# `path` is the key prefix under the bucket; it is reused by later cells for the
# train/ and test/ uploads as well as for the output location built here.
path = "xgboost-as-a-built-in-algo"
output_path = f"s3://{bucket_name}/{path}/output"
print(output_path)

s3://dsmlsagemaker-prasang/xgboost-as-a-built-in-algo/output


## Download the data and store in S3

In [None]:
# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded, so import the submodule explicitly.
import urllib.request

# The bank data is already in one-hot encoded format.
data_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"

try:
    urllib.request.urlretrieve(data_url, "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    # Best effort: a copy downloaded by an earlier run may still be on disk,
    # so report the problem and let the read below decide.
    print('Data load error: ',e)

try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
    # Without the DataFrame every later cell would fail with a confusing
    # NameError on `model_data`, so stop here with the real cause.
    raise

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [None]:
# Preview the first rows of the loaded DataFrame.
model_data.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41188 entries, 0 to 41187
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   age                            41188 non-null  int64
 1   campaign                       41188 non-null  int64
 2   pdays                          41188 non-null  int64
 3   previous                       41188 non-null  int64
 4   no_previous_contact            41188 non-null  int64
 5   not_working                    41188 non-null  int64
 6   job_admin.                     41188 non-null  int64
 7   job_blue-collar                41188 non-null  int64
 8   job_entrepreneur               41188 non-null  int64
 9   job_housemaid                  41188 non-null  int64
 10  job_management                 41188 non-null  int64
 11  job_retired                    41188 non-null  int64
 12  job_self-employed              41188 non-null  int64
 13  job_services    

## Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# A fixed random_state makes the shuffle deterministic, so the split (and all
# metrics computed from it) is the same after Restart Kernel -> Run All.
train_data, test_data = train_test_split(model_data, test_size=0.3, random_state=42)

In [None]:
## saving training and testing data in s3 buckets
import os
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'],
axis = 1)],
axis = 1).to_csv("train.csv", index=False, header=False)

In [None]:
# Upload the local train.csv to <bucket>/<path>/train/train.csv.
# The object key is built with "/" explicitly: S3 keys always use forward
# slashes, whereas os.path.join would insert the local OS path separator.
train_key = f"{path}/train/train.csv"
boto3.Session().resource("s3").Bucket(bucket_name).Object(train_key).upload_file("train.csv")

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [None]:
## now storing the training csv into a variable
s3_input_train = sagemaker.TrainingInput(s3_data=f"s3://{bucket_name}/{path}/train/", content_type="csv")

In [None]:
# Write the test split to a local CSV file: label column `y_yes` first, then
# every feature column, with both one-hot label columns removed from the features.
test_labels = test_data['y_yes']
test_features = test_data.drop(columns=['y_no', 'y_yes'])
test_csv_frame = pd.concat([test_labels, test_features], axis=1)

# No header row and no index column are written.
test_csv_frame.to_csv("test.csv", index=False, header=False)

In [None]:
# Upload the local test.csv to <bucket>/<path>/test/test.csv.
# The object key is built with "/" explicitly: S3 keys always use forward
# slashes, whereas os.path.join would insert the local OS path separator.
test_key = f"{path}/test/test.csv"
boto3.Session().resource("s3").Bucket(bucket_name).Object(test_key).upload_file("test.csv")

## Describe the uploaded test CSV as an input channel for the training job
s3_input_test = sagemaker.TrainingInput(s3_data=f"s3://{bucket_name}/{path}/test/", content_type="csv")

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


## Building the model with the built-in XGBoost algorithm

In [None]:
# image_uris is a public module of the top-level sagemaker package; import it
# from there instead of relying on another module happening to re-export it.
from sagemaker import image_uris
# NOTE(review): s3_input and Session are not referenced by any visible cell — confirm before removing.
from sagemaker.session import s3_input, Session

In [None]:
# Look up the container image URI for XGBoost version 1.5-1 in the region of
# the current boto3 session.
current_region = boto3.Session().region_name
container = image_uris.retrieve(
    framework="xgboost",
    region=current_region,
    version="1.5-1",
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [None]:
##initialise all the parameters 
hyperparameters = {
"max_depth": "5",
"eta": "0.2",
"gamma":"4",
"min_child_weight":"6",
"subsample":"0.7",
"objective":"binary:logistic",
"num_round":50
}

In [None]:
# create sagemaker estimator that calls the xgboost container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                         hyperparameters=hyperparameters,
                                         role=sagemaker.get_execution_role(),
                                         instance_count=1,
                                         instance_type="ml.m5.2xlarge",
                                         volume_size=5,
                                         output_path=output_path)

In [None]:
# Start the training job; the test split is passed as the "validation" channel.
training_channels = {
    "train": s3_input_train,
    "validation": s3_input_test,
}
estimator.fit(training_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-06-08-16-50-36-867


2023-06-08 16:50:40 Starting - Starting the training job...
2023-06-08 16:50:55 Starting - Preparing the instances for training......
2023-06-08 16:52:14 Downloading - Downloading input data
2023-06-08 16:52:14 Training - Downloading the training image...
2023-06-08 16:52:35 Training - Training image download completed. Training in progress...[34m[2023-06-08 16:52:54.214 ip-10-2-95-175.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-06-08 16:52:54.283 ip-10-2-95-175.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-06-08:16:52:54:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-06-08:16:52:54:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2023-06-08:16:52:54:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-06-08:16:52:54:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2023

## Deploying the ML Model as an endpoint

In [None]:
from sagemaker.serializers import CSVSerializer

# Deploy the trained model behind a single-instance endpoint; request payloads
# are serialized with CSVSerializer.
csv_serializer = CSVSerializer()
xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=csv_serializer,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-06-08-16-53-29-875
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-06-08-16-53-29-875
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-06-08-16-53-29-875


-------!

## Prediction of test data

In [None]:
# Drop both label columns and keep the remaining feature columns as a NumPy array.
test_features_only = test_data.drop(columns=['y_yes', 'y_no'])
test_data_array = test_features_only.to_numpy()
test_data_array

array([[ 34,   4, 999, ...,   0,   1,   0],
       [ 31,   6, 999, ...,   0,   1,   0],
       [ 32,   1, 999, ...,   0,   1,   0],
       ...,
       [ 36,   3, 999, ...,   0,   1,   0],
       [ 47,   3, 999, ...,   0,   1,   0],
       [ 39,   8, 999, ...,   0,   1,   0]])

In [None]:
# Send the feature rows to the endpoint and decode the raw byte response to text.
raw_response = xgb_predictor.predict(test_data_array)
predictions = raw_response.decode("utf-8")

In [None]:
# Parse the newline-separated response text into a float array.
predictions_array = np.fromstring(predictions, sep="\n")
predictions_array

array([0.08596137, 0.08066913, 0.2411833 , ..., 0.10747377, 0.07555814,
       0.03684008])

In [None]:
import sklearn.metrics

# Score threshold above which a row is predicted as the positive class (y_yes = 1).
# The cutoff is now actually applied; previously it was defined but np.round()
# hard-coded the threshold. For scores in [0, 1] np.round() maps exactly the
# values greater than 0.5 to 1, so the default cutoff reproduces the same labels.
cutoff = 0.5
predicted_labels = (predictions_array > cutoff).astype(int)

print(sklearn.metrics.confusion_matrix(test_data['y_yes'], predicted_labels))
print(sklearn.metrics.classification_report(test_data['y_yes'], predicted_labels))

[[10828   163]
 [ 1076   290]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     10991
           1       0.64      0.21      0.32      1366

    accuracy                           0.90     12357
   macro avg       0.77      0.60      0.63     12357
weighted avg       0.88      0.90      0.88     12357



In [None]:
# Show the name of the deployed endpoint.
xgb_predictor.endpoint_name

'sagemaker-xgboost-2023-06-08-16-53-29-875'

In [None]:
# Tear down the endpoint created by the deploy step.
# `endpoint_name` replaces the deprecated `Predictor.endpoint` attribute, which
# triggers a deprecation warning pointing at the SDK v2 migration guide.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint_name)

# Handle on the notebook's bucket, used by the next cell to remove its contents.
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-06-08-16-53-29-875


In [None]:
# Delete every object stored in the bucket (uploaded data and training output).
# NOTE(review): only the objects are removed here; no call that deletes the
# bucket itself is visible in this notebook — confirm whether that is intended.
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '3ZFZK6BFTT1TP8MY',
   'HostId': 'oGJmvH3ZJE/pzZ8b4ipnm20ms75AFSLBPaOYYV9qlTBwihvWdnULVLgtk+xQdk9hJrFeFaEBz5E=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'oGJmvH3ZJE/pzZ8b4ipnm20ms75AFSLBPaOYYV9qlTBwihvWdnULVLgtk+xQdk9hJrFeFaEBz5E=',
    'x-amz-request-id': '3ZFZK6BFTT1TP8MY',
    'date': 'Thu, 08 Jun 2023 16:57:35 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-06-08-16-50-36-867/debug-output/index/000000000/000000000030_worker_0.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-06-08-16-50-36-867/debug-output/claim.smd'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-06-08-16-50-36-867/debug-output/index/000000000/000000000010_worker_0.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagem