Steps:

1)Importing libraries
2)Creating S3 bucket
3)Mapping train and Test Data in S3
4)Mapping the path of the model in S3

In [47]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.session import s3_input, Session



Getting my region name for creation of bucket

In [48]:
my_region =boto3.session.Session().region_name
print(my_region)

eu-north-1


Creating a New Bucket by mentioning the region 

In [49]:
import boto3

s3 = boto3.client('s3')
region = boto3.session.Session().region_name
bucket_name = "bankapplication-sahil"  # must be globally unique

try:
    if region == "us-east-1":
        # Special case: no LocationConstraint allowed
        response = s3.create_bucket(Bucket=bucket_name)
    else:
        response = s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region}
        )
    print("✅ S3 bucket created successfully in region:", region)
except Exception as e:
    print("❌ S3 error: ", e)


❌ S3 error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


Setting a output path where the model willl be trained and stored

In [50]:
prefix = 'xgboost-as-a-built-in-algo'
output_path= 's3://{}/{}/output'.format(bucket_name,prefix)
print(output_path)

s3://bankapplication-sahil/xgboost-as-a-built-in-algo/output


Downloading the Dataset and Store it in S3

In [51]:
import pandas as pd
import urllib

try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('success: Data loaded into dataframe.')
except Exception as e:
    print('Data load Error:', e)

try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Sucess: Data Loaded into dataframe.')
except Exception as e:
    print('Data Load Error: ',e)

success: Data loaded into dataframe.
Sucess: Data Loaded into dataframe.


In [52]:
#Train Test Split

import numpy as np

train_data,test_data = np.split(model_data.sample(frac=1,random_state=1729),[int(0.7*len(model_data))])
print(train_data.shape,test_data.shape)

(28831, 61) (12357, 61)


In AWS Sagemaker we generally store the dependent feature as the fist column and the rest as others 

In [53]:
import os
import pandas as pd
import boto3
from sagemaker.inputs import TrainingInput

# creating training set with y_yes and all the independent featues except the y_no feature
pd.concat([train_data['y_yes'], train_data.drop(['y_no','y_yes'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)

# Uploading the new train.csv with indepedent dataas first column and the remaining 
# features except the y_no into the S3 bucket
s3 = boto3.Session().resource('s3')
s3.Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')

# Define S3 input for SageMaker training
train = TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket_name, prefix),
    content_type='csv'
)


In [54]:
# Save testing dataset similarly as the train CSV
pd.concat([test_data['y_yes'], test_data.drop(['y_no','y_yes'], axis=1)], axis=1).to_csv('test.csv', index=False, header=False)

# Upload it to S3  bucket
s3 = boto3.Session().resource('s3')
s3.Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')

# Define S3 input for SageMaker training
test = TrainingInput(
    s3_data='s3://{}/{}/test'.format(bucket_name, prefix),
    content_type='csv'
)

BUilding the XGBoost model

In [55]:
#for using a model we need to first pull them , 
# each model is a container type so initially we need to pull them first
container = image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.0-1'
)


In [56]:
hyperparameters = {
    "max_depth": "5", #used for assiging the depth of the decision tree , higher the value complex is the model
    "eta": "0.2", #learning rate through which the model reaches the global minima
    "gamma": "4", #this act a regularizer preventing unecessary splits in the decision tree
    "min_child_weight": "6", # it is the benchmark value which should be excedded to create a new node in the tree 
    "subsample": "0.7", #percentage of training data
    "objective": "binary:logistic", # type of model use i.e binary one and regression type is logistic
    "num_round": "100"  # REQUIRED! tells xgboost how many trees is to build
}


In [57]:
from sagemaker.estimator import Estimator
from sagemaker import get_execution_role

# Define the role (works in notebook — otherwise, pass the ARN directly)
role = get_execution_role()

estimator = Estimator(
    image_uri=container, #calling the predefined algorithm container
    role=role, # used to provide role like root user or IAM role
    instance_count=1, # number os machines which would be used for training
    instance_type='ml.m5.2xlarge', #type of EC2 instance to use
    volume_size=5,  # in GB
    max_run=300, # max time for which training should occur
    max_wait=600,  # required if using spot
    use_spot_instances=True, #used to reduce billings in AWS
    output_path=output_path,
    hyperparameters=hyperparameters
)


In [58]:
estimator.fit({'train': train, 'validation': test})


2025-04-12 19:25:57 Starting - Starting the training job...
2025-04-12 19:26:28 Downloading - Downloading input data...
2025-04-12 19:26:43 Downloading - Downloading the training image...
2025-04-12 19:27:19 Training - Training image download completed. Training in progress...[34m[2025-04-12 19:27:36.967 ip-10-0-236-227.eu-north-1.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0

Deploying the model

In [59]:
xgb_predictor =estimator.deploy(initial_instance_count=1,instance_type='ml.m5.large')

------!

In [62]:
from sagemaker.serializers import CSVSerializer
import numpy as np

# Correct variable and module usage
csv_serializer = CSVSerializer()

# Prepare test data
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values

# Set predictor content type and serializer
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

# Get predictions
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')

# Convert predictions to numpy array
predictions_array = np.fromstring(predictions.strip(), sep=',')

# Print shape
print(predictions_array.shape)


(12357,)


In [None]:
# creating confusion matrix

cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))
     


Overall Classification Rate: 89.6%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10766)    36% (170)
Purchase        9% (1119)     64% (302) 

