# Configuring Notebook

In [2]:
import configparser, urllib.request, os, time
import boto3, sagemaker, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


from sagemaker.inputs import TrainingInput
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.serializers import CSVSerializer
from sagemaker.session import Session

#boto is a special package that allows python to talk to AWS services, including S3, EC2 and others. 

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# Let's setup BOTO

In [3]:
s3_client = boto3.client("s3")


BUCKET = "mlops-dsml-oct31"
region = 'us-west-2'

try:
    
    s3_client.create_bucket(
        Bucket=BUCKET, CreateBucketConfiguration={"LocationConstraint": region}
    )
    print("Created bucket", BUCKET)
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    print("Using existing bucket", BUCKET)


Created bucket mlops-dsml-oct31


## Some Boto Operations for interacting/playing with S3

In [4]:
response = s3_client.list_buckets()
print("Available S3 Buckets:")
for bucket in response['Buckets']:
    print(f"  - {bucket['Name']} (Created: {bucket['CreationDate']})")

Available S3 Buckets:
  - mlops-dsml-oct31 (Created: 2025-10-31 02:39:25+00:00)
  - shivam-oct31-2025 (Created: 2025-10-31 02:14:32+00:00)


In [5]:
local_file_path = 'empty.txt'
s3_key = 's3_empty_v2.txt' #  This is the file name with which it will be present inside the bucket.

s3_client.upload_file(local_file_path, BUCKET, s3_key)

# Downloading Dataset

In [6]:
DATA_URL = (
    "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-"
    "sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
)


# Define local file path
LOCAL_CSV = "bank_clean.csv"


urllib.request.urlretrieve(DATA_URL, LOCAL_CSV)
print("Downloaded", LOCAL_CSV)


Downloaded bank_clean.csv


In [8]:
df = pd.read_csv(LOCAL_CSV, index_col=0)
print("Shape:", df.shape)
display(df.head(3))

Shape: (41188, 61)


Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


## Uploading data to S3, as that's where model will get it from in production

In [9]:
train_df, test_df = train_test_split(
    df, train_size=0.7, shuffle=True, random_state=1729
)

In [10]:
train_df.drop(["y_no", "y_yes"], axis=1).head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
28868,31,1,999,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
10545,29,1,999,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
25907,53,2,999,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
1289,60,1,999,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
23064,55,1,999,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [11]:
pd.concat(
    [train_df["y_yes"], train_df.drop(["y_no", "y_yes"], axis=1)], axis=1
).head()

Unnamed: 0,y_yes,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
28868,0,31,1,999,1,1,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
10545,0,29,1,999,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
25907,0,53,2,999,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1289,0,60,1,999,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
23064,0,55,1,999,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [12]:
PREFIX = "xgboost-bank"

In [13]:
train_payload = pd.concat(
    [train_df["y_yes"], train_df.drop(["y_no", "y_yes"], axis=1)], axis=1
)


train_csv_path = "train.csv"

train_payload.to_csv(train_csv_path, index=False, header=False)



In [16]:
s3_train_key = f"{PREFIX}/train/{train_csv_path}"

print("S3 train key: ", s3_train_key)

s3_client.upload_file(train_csv_path, BUCKET, s3_train_key)

print(f"Uploaded training data to s3://{BUCKET}/{s3_train_key}")

S3 train key:  xgboost-bank/train/train.csv
Uploaded training data to s3://mlops-dsml-oct31/xgboost-bank/train/train.csv


In [15]:
test_payload = pd.concat(
    [test_df["y_yes"], test_df.drop(["y_no", "y_yes"], axis=1)], axis=1
)
test_csv_path = "test.csv"
test_payload.to_csv(test_csv_path, index=False, header=False)
s3_test_key = f"{PREFIX}/test/{test_csv_path}"
s3_client.upload_file(test_csv_path, BUCKET, s3_test_key)
print(f"Uploaded test data to s3://{BUCKET}/{s3_test_key}")

Uploaded test data to s3://mlops-dsml-oct31/xgboost-bank/test/test.csv


# Configuring and setting up Sagemaker Xgboost Container

In [17]:
f"s3://{BUCKET}/{PREFIX}/train/"

's3://mlops-dsml-oct31/xgboost-bank/train/'

In [18]:
# Create TrainingInput objects -------------------------------------------------


s3_train = TrainingInput(f"s3://{BUCKET}/{PREFIX}/train/", content_type="text/csv")
s3_val = TrainingInput(f"s3://{BUCKET}/{PREFIX}/test/", content_type="text/csv")

## Training Instance

In [19]:
# Training Model

container = image_uris.retrieve('xgboost',boto3.Session().region_name,'1.5-1')

# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
}

estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=f"s3://{BUCKET}/{PREFIX}/output")


In [20]:
print("Starting training …")

estimator.fit({"train": s3_train, "validation": s3_val}, wait=True)


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-10-31-03-00-55-500


Starting training …
2025-10-31 03:00:55 Starting - Starting the training job...
2025-10-31 03:01:17 Starting - Preparing the instances for training...
2025-10-31 03:01:53 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-10-31 03:02:56.883 ip-10-0-163-145.us-west-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-10-31 03:02:56.904 ip-10-0-163-145.us-west-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-10-31:03:02:57:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-10-31:03:02:57:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-31:03:02:57:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-10-31:03:02:57:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-10-31:03:02:57:INFO] Determined 0 GPU(s) ava

In [21]:
# 6. Deploy Realtime Endpoint

print("Deploying endpoint … this may take a few minutes")
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    serializer=CSVSerializer(),
)
endpoint_name = predictor.endpoint_name
print("Endpoint active →", endpoint_name)

Deploying endpoint … this may take a few minutes


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-10-31-03-03-42-729
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-10-31-03-03-42-729
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-10-31-03-03-42-729


------!Endpoint active → sagemaker-xgboost-2025-10-31-03-03-42-729


In [24]:
X_test = test_df.drop(["y_no", "y_yes"], axis=1).values
X_test

array([[ 54,   3, 999, ...,   0,   1,   0],
       [ 56,   2, 999, ...,   0,   1,   0],
       [ 32,   2, 999, ...,   0,   1,   0],
       ...,
       [ 58,   2, 999, ...,   0,   1,   0],
       [ 42,   1, 999, ...,   0,   1,   0],
       [ 32,   1, 999, ...,   1,   0,   0]])

In [25]:
probs_text = predictor.predict(X_test)
probs_text[:50].decode("utf-8").strip().split("\n")

['0.24062314629554749', '0.04597575217485428', '0.10725669']

In [26]:
predictor.delete_endpoint(delete_endpoint_config=True)
print("Endpoint deleted.")

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2025-10-31-03-03-42-729
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2025-10-31-03-03-42-729


Endpoint deleted.
