Final Project - Modeling

In [3]:
!pip3 install --upgrade sagemaker



Note: we had to upload the data files manually into the Jupyter notebook environment, because we were unable to download them from the S3 bucket.


In [4]:
import os
import boto3
import sagemaker

# Open the SageMaker session first; the region and the default bucket are
# both read from it.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

# Execution role of this notebook environment (used by the jobs below).
role = sagemaker.get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [5]:
# Load the train / validation user-product pair tables from the local CSV
# files (they were uploaded manually next to this notebook) and take a look.

import pandas as pd
import numpy as np

# S3 client handle; this cell itself only reads local files.
s3 = boto3.client("s3")
df_train = pd.read_csv('train_user_product_pairs.csv')
df_val = pd.read_csv('validation_user_product_pairs.csv')

column_names = df_train.columns
print(column_names)

# Only the LAST expression of a cell is rendered automatically, so a bare
# `df_train.sample(8)` in the middle of the cell is silently discarded.
# Call display() explicitly so that both samples are actually shown.
display(df_train.sample(8))
display(df_val.sample(8))

Index(['user_id', 'product_id', 'purchased', 'view_count', 'cart_count',
       'total_interactions', 'price', 'category_id', 'category_code', 'brand',
       'first_interaction', 'last_interaction', 'product_view_count',
       'product_purchase_count', 'product_conversion_rate',
       'user_total_events', 'user_total_purchases'],
      dtype='object')


Unnamed: 0,user_id,product_id,purchased,view_count,cart_count,total_interactions,price,category_id,category_code,brand,first_interaction,last_interaction,product_view_count,product_purchase_count,product_conversion_rate,user_total_events,user_total_purchases
367430,540483901,1802133,1,0,1,2,102.94,2232732099754852875,appliances.personal.massager,horizont,2020-01-20 05:17:51 UTC,2020-01-20 05:16:43 UTC,92,30,0.326087,5,3
19691,512614634,5800970,1,0,0,1,84.43,2232732082390433922,electronics.audio.subwoofer,alphard,2020-01-07 08:34:54 UTC,2020-01-07 08:34:54 UTC,1,1,1.0,3,2
462413,552434345,22700426,1,0,2,3,79.8,2232732091643068746,,helfer,2020-01-18 13:33:37 UTC,2020-01-19 11:03:46 UTC,6,2,0.333333,3,1
920421,599609502,1005253,0,0,1,1,264.87,2232732093077520756,construction.tools.light,xiaomi,2020-01-15 09:47:28 UTC,2020-01-15 09:47:28 UTC,4346,1933,0.444777,3,2
835834,595726506,1002540,0,0,1,1,445.69,2232732093077520756,construction.tools.light,apple,2020-01-08 14:11:44 UTC,2020-01-08 14:11:44 UTC,1902,1014,0.533123,1,0
604043,570631373,12702958,0,0,1,1,39.9,2053013553199186187,,cordiant,2020-01-16 14:46:40 UTC,2020-01-16 14:46:40 UTC,207,98,0.47343,3,1
228008,520606824,100065117,1,0,0,1,78.74,2232732105912091273,appliances.kitchen.mixer,vitek,2020-01-11 22:10:29 UTC,2020-01-11 22:10:29 UTC,150,82,0.546667,2,1
337272,535903898,8801100,1,0,0,1,102.64,2053013560530830019,electronics.camera.video,nokia,2020-01-24 16:50:21 UTC,2020-01-24 16:50:21 UTC,877,414,0.472064,1,1


In [6]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Session, default bucket, key prefix and execution role used by the
# processing job.
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = "commerce-recommender"
role = sagemaker.get_execution_role()

# S3 locations of the ORIGINAL, unprocessed CSV files.
raw_key_prefix = f"{prefix}/input/raw"
s3_input_train = f"s3://{bucket}/{raw_key_prefix}/train_user_product_pairs.csv"
s3_input_validation = f"s3://{bucket}/{raw_key_prefix}/validation_user_product_pairs.csv"

# Upload both local raw files under that prefix (train first, then validation).
for raw_csv in ('train_user_product_pairs.csv', 'validation_user_product_pairs.csv'):
    sess.upload_data(path=raw_csv, bucket=bucket, key_prefix=raw_key_prefix)

sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name="recommender-preprocessing",
)

# Inside the processing container every channel lives under this directory.
processing_root = "/opt/ml/processing"

# One fully replicated input channel per raw file.
processing_inputs = [
    ProcessingInput(
        source=s3_uri,
        destination=f"{processing_root}/input/{channel}",
        s3_data_distribution_type="FullyReplicated",
    )
    for channel, s3_uri in (
        ("train", s3_input_train),
        ("validation", s3_input_validation),
    )
]

# One output channel per split; each is copied to <prefix>/processed/<split>.
processing_outputs = [
    ProcessingOutput(
        source=f"{processing_root}/output/{split}",
        destination=f"s3://{bucket}/{prefix}/processed/{split}",
    )
    for split in ("train", "validation", "batch")
]

print("Starting SageMaker Processing Job.")
sklearn_processor.run(
    code="preprocess.py",
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=[
        "--train-input", "train_user_product_pairs.csv",
        "--validation-input", "validation_user_product_pairs.csv",
    ],
)

Starting SageMaker Processing Job.


INFO:sagemaker:Creating processing-job with name recommender-preprocessing-2025-10-12-01-20-17-678


...........[34mLoading data...[0m
[34mApplying manual Target Encoding...[0m
[34mCleaning and reordering columns...[0m
[34mSplitting validation data...[0m
[34mSaving processed files...[0m



# The below cell uses the SageMaker Python SDK to kick off the training job using both our training set and validation set. 
# We use the 'binary:logistic' objective, which trains a model to output a probability between 0 and 1 (here, the probability of a person making a purchase).

In [7]:
%%time
from time import gmtime, strftime

job_name = "xgb-recommender-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "s3://{}/{}/output/{}".format(bucket, prefix, job_name)
image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=boto3.Session().region_name, version="1.7-1"
)

sm_estimator = sagemaker.estimator.Estimator(
    image,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

sm_estimator.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    num_round=100,
)

# --- Define the data input channels ---
s3_processed_train_path = f"s3://{bucket}/{prefix}/processed/train"
s3_processed_validation_path = f"s3://{bucket}/{prefix}/processed/validation"

train_data = sagemaker.inputs.TrainingInput(
    s3_processed_train_path,
    content_type="text/csv"
)

validation_data = sagemaker.inputs.TrainingInput(
    s3_processed_validation_path,
    content_type="text/csv"
)

data_channels = {"train": train_data, "validation": validation_data}

# --- Start the training job ---
sm_estimator.fit(inputs=data_channels, job_name=job_name, logs=True)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: xgb-recommender-2025-10-12-01-34-42


2025-10-12 01:34:43 Starting - Starting the training job...
2025-10-12 01:34:58 Starting - Preparing the instances for training...
2025-10-12 01:35:19 Downloading - Downloading input data...
2025-10-12 01:35:49 Downloading - Downloading the training image...
  import pkg_resources[0m
[34m[2025-10-12 01:36:50.900 ip-10-0-136-211.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-10-12 01:36:50.962 ip-10-0-136-211.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-10-12:01:36:51:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-10-12:01:36:51:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-12:01:36:51:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-12:01:36:51:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-10

In [10]:
# Build the input file for the Batch Transform job from the processed batch
# split: download it, drop its first column, and upload the result to S3.

s3_processed_batch_key = f"{prefix}/processed/batch/batch_commerce.csv"
s3_processed_batch_path = f"s3://{bucket}/{s3_processed_batch_key}"
s3_inference_input_prefix = f"{prefix}/batch-inference-input"

local_batch_file = 'batch_commerce.csv'
# Download with boto3 instead of `!aws s3 cp`: a failing shell command does not
# stop the cell, so read_csv below could silently pick up a stale or missing
# local file. download_file raises if the object cannot be fetched.
boto3.client("s3").download_file(bucket, s3_processed_batch_key, local_batch_file)
print(f"download: {s3_processed_batch_path} to ./{local_batch_file}")

processed_batch_df = pd.read_csv(local_batch_file, header=None)
# NOTE(review): the first column is presumably the label written by
# preprocess.py; it is removed so the model only receives features — confirm.
inference_df = processed_batch_df.drop(processed_batch_df.columns[0], axis=1)
inference_file_local = 'batch_inference_data.csv'
inference_df.to_csv(inference_file_local, header=False, index=False)

# upload_data returns the S3 URI of the uploaded file; the transform job reads it.
s3_inference_input_path = sess.upload_data(
    path=inference_file_local,
    key_prefix=s3_inference_input_prefix
)

download: s3://sagemaker-us-east-1-691541310573/commerce-recommender/processed/batch/batch_commerce.csv to ./batch_commerce.csv


In [9]:
%%time

# 1. Create a Transformer from your trained estimator
# sm_estimator is the variable from your training job
sm_transformer = sm_estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=f"s3://{bucket}/{prefix}/batch-output" # Define where to save predictions
)

# 2. Start the transform job using the inference-ready data
sm_transformer.transform(
    data=s3_inference_input_path, # 👈 Pointing to the correct S3 location
    content_type="text/csv",
    split_type="Line"
)

# 3. Wait for the job to complete
print("Waiting for Batch Transform job to complete...")
sm_transformer.wait()
print("All done")

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-10-12-01-40-58-544
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-10-12-01-40-59-195


  import pkg_resources[0m
[34m[2025-10-12:01:46:04:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-10-12:01:46:04:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-10-12:01:46:04:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
  import pkg_resources[0m
[35m[2025-10-12:01:46:04:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2025-10-12:01:46:04:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2025-10-12:01:46:04:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
   

In [11]:
# time to see model results

import gc  # need to collect some garbage

# Create Product Lookup from Original Training Data to make sense of results

product_cols = ['product_id', 'category_code', 'brand', 'price']
products_df = df_train[product_cols].drop_duplicates(subset=['product_id']).reset_index(drop=True)
products_df.to_csv('products.csv', index=False)

del df_train

np.random.seed(42)
rand_split_recreated = np.random.rand(len(df_val))
batch_list_recreated = rand_split_recreated >= 0.5
original_batch_data = df_val[batch_list_recreated].reset_index(drop=True)
batch_input_filename = 'batch_inference_data.csv'

del df_val
gc.collect()

output_filename_on_s3 = f"{batch_input_filename}.out"
output_s3_path = f"{sm_transformer.output_path}/{output_filename_on_s3}"

# Download the file
!aws s3 cp {output_s3_path} .

download: s3://sagemaker-us-east-1-691541310573/commerce-recommender/batch-output/batch_inference_data.csv.out to ./batch_inference_data.csv.out


In [14]:
# Download Prediction Output
print("\nDownloading prediction results from S3")
# Define the name of the file that was submitted to the batch transform job
batch_input_filename = 'batch_inference_data.csv'
# SageMaker names the output file based on the input file, adding ".out"
output_filename_on_s3 = f"{batch_input_filename}.out"
output_s3_path = f"{sm_transformer.output_path}/{output_filename_on_s3}"

# Use the AWS CLI to download the file
!aws s3 cp {output_s3_path} .

# Merge Predictions with Product Details and Analyze
print("\nMerging all data to generate final recommendations")
# Load the downloaded predictions
predictions_df = pd.read_csv(output_filename_on_s3, header=None)
predictions_df.columns = ['purchase_probability']

# Merge predictions with the user/product IDs from the recreated batch data
results_df = pd.concat([original_batch_data[['user_id', 'product_id']], predictions_df], axis=1)

# Merge the results with the product details from the lookup file we created
final_recommendations_df = pd.merge(
    results_df,
    products_df,
    on='product_id',
    how='left'
)


Downloading prediction results from S3
download: s3://sagemaker-us-east-1-691541310573/commerce-recommender/batch-output/batch_inference_data.csv.out to ./batch_inference_data.csv.out

Merging all data to generate final recommendations


In [13]:
# Print the highest-scoring recommendations (at most five) for one example
# user: the first user who has at least one row with a known brand.
if not final_recommendations_df.empty:
    has_brand = final_recommendations_df['brand'].notna()
    recommendations_with_brands = final_recommendations_df[has_brand]

    if not recommendations_with_brands.empty:
        example_user_id = recommendations_with_brands['user_id'].iloc[0]

        # All rows of that user, including those without a brand.
        is_example_user = final_recommendations_df['user_id'] == example_user_id
        user_recommendations = final_recommendations_df[is_example_user]

        display_cols = ['product_id', 'category_code', 'brand', 'price', 'purchase_probability']
        ranked = user_recommendations[display_cols].sort_values(
            by='purchase_probability', ascending=False
        )
        top_rows = ranked.head(5)

        print(f"\nTop 5 Human-Readable Recommendations for user: {example_user_id}")
        print(top_rows)


Top 5 Human-Readable Recommendations for user: 107620212
   product_id category_code brand   price  purchase_probability
1    20500418     sport.ski  elan  244.28               0.99997


In [15]:
# Using SageMaker Model Registry with Model Package Groups for collaboration

sm_client = sess.sagemaker_client
model_package_group_name = 'AAI540ECommerceRecommendationGroup'
model_package_group_description = 'Model group for final project of AAI540 - ecommerce prediction model.'

# Request payload for the model package group
model_package_group_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageGroupDescription": model_package_group_description,
}

try:
    sm_client.create_model_package_group(**model_package_group_input_dict)
    print(f"Model Package Group '{model_package_group_name}' created successfully.")
except sm_client.exceptions.ClientError as e:
    # Treat "group already exists" as success so the cell can be re-run.
    # The check reads the service error message and matches it
    # case-insensitively on "already exists", instead of relying on one exact
    # spelling of the whole exception text.
    error_message = e.response.get("Error", {}).get("Message", "")
    if "already exists" not in error_message.lower():
        # Anything else is a real failure: re-raise with the original traceback.
        raise
    print(f"Model Package Group '{model_package_group_name}' already exists.")

Model Package Group 'AAI540ECommerceRecommendationGroup' created successfully.


In [23]:
# Create a SageMaker Model resource from the trained artifact.
# NOTE(review): this calls create_model only; no model package is added to the
# Model Package Group created earlier — confirm whether registering a model
# package was intended as well.
sm_client = boto3.client('sagemaker')

model_name = job_name
model_data_url = sm_estimator.model_data # The S3 path to model
# `inference_image_uri` is not defined by any cell of this notebook, so the
# original line raised NameError on a fresh kernel. Take the container image
# from the estimator that produced the model artifact instead.
inference_image_uri = sm_estimator.image_uri
image_uri = inference_image_uri
execution_role_arn = role # The IAM role ARN

print(f"Model Name: {model_name}")
print(f"Model Data URL: {model_data_url}")
print(f"Execution Role ARN: {execution_role_arn}")

# Assemble the primary container definition
primary_container = {
    'Image': image_uri,
    'ModelDataUrl': model_data_url,
}

# Call the create_model API

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=execution_role_arn,
    PrimaryContainer=primary_container
)

print("\nModel created successfully!")
print(f"Model ARN: {create_model_response['ModelArn']}")

Model Name: xgb-recommender-2025-10-12-01-34-42
Model Data URL: s3://sagemaker-us-east-1-691541310573/commerce-recommender/output/xgb-recommender-2025-10-12-01-34-42/xgb-recommender-2025-10-12-01-34-42/output/model.tar.gz
Execution Role ARN: arn:aws:iam::691541310573:role/LabRole

Model created successfully!
Model ARN: arn:aws:sagemaker:us-east-1:691541310573:model/xgb-recommender-2025-10-12-01-34-42


In [None]:
# --- Code a colleague can run to reuse the model created above ---
# import sagemaker

# # Name of the existing model, as shared by the person who created it
# model_name_from_colleague = "xgb-recommender-2025-10-12-01-34-42" # or whatever name you want

# # Create a SageMaker Model object referencing the existing model
# reusable_model = sagemaker.Model(
#     name=model_name_from_colleague,
#     sagemaker_session=sagemaker.Session()
# )

# # Deploy it to an endpoint
# predictor = reusable_model.deploy(
#     initial_instance_count=1,
#     instance_type='ml.m5.xlarge'
# )