## Section 1: Environment Setup and Configuration

In [31]:
import sys
import os
import time
import json
import io
from datetime import datetime
from time import gmtime, strftime

import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, TransformStep
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet, Join
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.inputs import TrainingInput, TransformInput
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.processing import ScriptProcessor
from sagemaker.model_monitor import (
    DataCaptureConfig,
    DefaultModelMonitor,
    CronExpressionGenerator,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [32]:
# Initialize AWS services and sessions
sagemaker_session = sagemaker.Session()
pipeline_session = PipelineSession()
region = sagemaker_session.boto_region_name
role = get_execution_role()
s3_client = boto3.client('s3')
sm_client = boto3.client('sagemaker')
cw_client = boto3.client('cloudwatch')

# Configuration
BUCKET_NAME = 'aai540-ecommerce-recommendation-project-group3'
S3_PREFIX = 'mlops-pipeline'
PROCESSED_PREFIX = f'{S3_PREFIX}/processed/'
FEATURE_STORE_PREFIX = f'{S3_PREFIX}/feature-store/'
MODEL_PACKAGE_GROUP_NAME = 'AAI540ECommerceRecommendationGroup'

default_bucket = sagemaker_session.default_bucket()

print(f"\n SageMaker Role ARN: {role}")
print(f" Region: {region}")
print(f" Default S3 Bucket: {default_bucket}")
print(f" Project Bucket: {BUCKET_NAME}")


 SageMaker Role ARN: arn:aws:iam::115718999037:role/LabRole
 Region: us-east-1
 Default S3 Bucket: sagemaker-us-east-1-115718999037
 Project Bucket: aai540-ecommerce-recommendation-project-group3


## Section 2: Data Loading and Exploratory Data Analysis

In [33]:
# Load training and validation data from S3
def load_data_from_s3(bucket_name=BUCKET_NAME, s3_prefix=''):
    
    train_key = f"{s3_prefix}train_user_product_pairs.csv"
    val_key = f"{s3_prefix}validation_user_product_pairs.csv"
    
    print(f"\n→ Loading training data from s3://{bucket_name}/{train_key}")
    obj = s3_client.get_object(Bucket=bucket_name, Key=train_key)
    df_train = pd.read_csv(io.BytesIO(obj['Body'].read()))
    print(f"   Training data loaded: {df_train.shape}")
    
    print(f"\n→ Loading validation data from s3://{bucket_name}/{val_key}")
    obj = s3_client.get_object(Bucket=bucket_name, Key=val_key)
    df_val = pd.read_csv(io.BytesIO(obj['Body'].read()))
    print(f"   Validation data loaded: {df_val.shape}")
    
    # Combine for proper splitting
    df_all = pd.concat([df_train, df_val], ignore_index=True)
    print(f"\n Combined dataset: {df_all.shape}")
    
    return df_all

In [34]:
# EXECUTE: Load data from S3
df_all = load_data_from_s3()

print("\nData loading complete! Ready for EDA.")


→ Loading training data from s3://aai540-ecommerce-recommendation-project-group3/train_user_product_pairs.csv
   Training data loaded: (2018753, 17)

→ Loading validation data from s3://aai540-ecommerce-recommendation-project-group3/validation_user_product_pairs.csv
   Validation data loaded: (1062101, 17)

 Combined dataset: (3080854, 17)

Data loading complete! Ready for EDA.


In [36]:
# exploratory data analysis
def perform_eda(df):
    print("\n1. DATASET OVERVIEW")
    print("-" * 80)
    print(f"Total records: {len(df):,}")
    print(f"Total features: {len(df.columns)}")
    print(f"\nColumns: {df.columns.tolist()}")
    print(f"\nData types:\n{df.dtypes}")
    print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    print("\n2. TARGET VARIABLE ANALYSIS")
    print("-" * 80)
    purchase_counts = df['purchased'].value_counts()
    purchase_pct = df['purchased'].value_counts(normalize=True) * 100
    print(f"Purchase distribution:")
    print(f"  Not Purchased (0): {purchase_counts[0]:,} ({purchase_pct[0]:.2f}%)")
    print(f"  Purchased (1): {purchase_counts[1]:,} ({purchase_pct[1]:.2f}%)")
    print(f"  Class imbalance ratio: {purchase_counts[0]/purchase_counts[1]:.2f}:1")
    
    print("\n3. MISSING VALUES ANALYSIS")
    print("-" * 80)
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Percentage': missing_pct
    })
    missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)
    if len(missing_df) > 0:
        print(missing_df)
    else:
        print("  No missing values found!")
    
    print("\n4. NUMERICAL FEATURES STATISTICS")
    print("-" * 80)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    print(df[numeric_cols].describe())
    
    print("\n5. CATEGORICAL FEATURES ANALYSIS")
    print("-" * 80)
    categorical_cols = ['category_code', 'brand']
    for col in categorical_cols:
        if col in df.columns:
            print(f"\n{col}:")
            print(f"  Unique values: {df[col].nunique():,}")
            print(f"  Top 5 categories:")
            print(df[col].value_counts().head())
    
    print("\n6. USER BEHAVIOR PATTERNS")
    print("-" * 80)
    print(f"Unique users: {df['user_id'].nunique():,}")
    print(f"Unique products: {df['product_id'].nunique():,}")
    print(f"Avg interactions per user: {df.groupby('user_id')['total_interactions'].sum().mean():.2f}")
    print(f"Avg price: ${df['price'].mean():.2f}")
    print(f"Price range: ${df['price'].min():.2f} - ${df['price'].max():.2f}")
    
    print("\n7. FEATURE CORRELATIONS WITH TARGET")
    print("-" * 80)
    correlations = df[numeric_cols].corrwith(df['purchased']).sort_values(ascending=False)
    print("Top correlations with purchase:")
    print(correlations.head(10))
    
    print("\n EDA Complete!")
    
    return df

In [37]:
# EXECUTE: Perform Exploratory Data Analysis

df_all = perform_eda(df_all)

print("\n EDA complete! Ready for preprocessing.")


1. DATASET OVERVIEW
--------------------------------------------------------------------------------
Total records: 3,080,854
Total features: 17

Columns: ['user_id', 'product_id', 'purchased', 'view_count', 'cart_count', 'total_interactions', 'price', 'category_id', 'category_code', 'brand', 'first_interaction', 'last_interaction', 'product_view_count', 'product_purchase_count', 'product_conversion_rate', 'user_total_events', 'user_total_purchases']

Data types:
user_id                      int64
product_id                   int64
purchased                    int64
view_count                   int64
cart_count                   int64
total_interactions           int64
price                      float64
category_id                  int64
category_code               object
brand                       object
first_interaction           object
last_interaction            object
product_view_count           int64
product_purchase_count       int64
product_conversion_rate    float64
user_t

## Section 3: Data Preprocessing and Feature Engineering

In [38]:
# Handle missing values and encode categorical features
def preprocess_data(df):
    df = df.copy()
    
    # Handle missing values
    print("\n1. Handling Missing Values")
    print("-" * 80)
    categorical_cols = ['category_code', 'brand']
    
    for col in categorical_cols:
        missing_count = df[col].isnull().sum()
        df[col] = df[col].fillna('unknown')
        print(f"✓ {col}: Filled {missing_count:,} missing values ({missing_count/len(df)*100:.1f}%)")
    
    # Encode categorical features
    print("\n2. Encoding Categorical Features")
    print("-" * 80)
    label_encoders = {}
    
    for col in categorical_cols:
        print(f"\n{col}:")
        print(f"  Unique categories: {df[col].nunique():,}")
        
        le = LabelEncoder()
        df[col + '_encoded'] = le.fit_transform(df[col])
        label_encoders[col] = le
        
        print(f"  ✓ Encoded range: 0 to {df[col + '_encoded'].max()}")
    
    print("\n Preprocessing Complete!")
    
    return df, label_encoders

In [39]:
# Split data: 40% train, 10% test, 10% val, 40% production
def split_data_properly(df):
    
    print("\nSplitting strategy:")
    print("  - 40% Training data")
    print("  - 10% Test data")
    print("  - 10% Validation data")
    print("  - 40% Production/Holdout data")
    
    # First split: separate 40% for production
    df_working, df_production = train_test_split(
        df, 
        test_size=0.40, 
        random_state=42,
        stratify=df['purchased']
    )
    
    print(f"\n Production data reserved: {len(df_production):,} samples ({len(df_production)/len(df)*100:.1f}%)")
    
    # Split remaining 60% into train (40%), test (10%), validation (10%)
    df_train, df_temp = train_test_split(
        df_working,
        test_size=0.3333,
        random_state=42,
        stratify=df_working['purchased']
    )
    
    df_test, df_val = train_test_split(
        df_temp,
        test_size=0.5,
        random_state=42,
        stratify=df_temp['purchased']
    )
    
    print(f" Training data: {len(df_train):,} samples ({len(df_train)/len(df)*100:.1f}%)")
    print(f" Test data: {len(df_test):,} samples ({len(df_test)/len(df)*100:.1f}%)")
    print(f" Validation data: {len(df_val):,} samples ({len(df_val)/len(df)*100:.1f}%)")
    
    # Verify class distribution
    print("\nVerifying class distribution:")
    for name, dataset in [('Train', df_train), ('Test', df_test), 
                          ('Validation', df_val), ('Production', df_production)]:
        purchase_rate = dataset['purchased'].mean() * 100
        print(f"  {name}: {purchase_rate:.2f}% purchased")
    
    return df_train, df_test, df_val, df_production

In [40]:
# EXECUTE: Preprocess data and create splits
df_processed, label_encoders = preprocess_data(df_all)
df_train, df_test, df_val, df_prod = split_data_properly(df_processed)

print("\nData preprocessing and splitting complete!")
print(f"   - Training set: {len(df_train):,} records")
print(f"   - Test set: {len(df_test):,} records")
print(f"   - Validation set: {len(df_val):,} records")
print(f"   - Production set: {len(df_prod):,} records")


1. Handling Missing Values
--------------------------------------------------------------------------------
✓ category_code: Filled 647,785 missing values (21.0%)
✓ brand: Filled 284,278 missing values (9.2%)

2. Encoding Categorical Features
--------------------------------------------------------------------------------

category_code:
  Unique categories: 136
  ✓ Encoded range: 0 to 135

brand:
  Unique categories: 4,156
  ✓ Encoded range: 0 to 4155

 Preprocessing Complete!

Splitting strategy:
  - 40% Training data
  - 10% Test data
  - 10% Validation data
  - 40% Production/Holdout data

 Production data reserved: 1,232,342 samples (40.0%)
 Training data: 1,232,402 samples (40.0%)
 Test data: 308,055 samples (10.0%)
 Validation data: 308,055 samples (10.0%)

Verifying class distribution:
  Train: 60.94% purchased
  Test: 60.94% purchased
  Validation: 60.94% purchased
  Production: 60.94% purchased

Data preprocessing and splitting complete!
   - Training set: 1,232,402 records


## Section 4: Feature Store Setup

In [41]:
# Initialize SageMaker Feature Store and design feature groups
def create_feature_store(df_sample):
    feature_group_name = f"ecommerce-user-product-features-{int(time.time())}"
    
    print(f"\n→ Creating Feature Group: {feature_group_name}")
    
    # Prepare dataframe for feature store
    df_fs = df_sample.copy()
    df_fs['event_time'] = pd.Timestamp.now().isoformat()
    df_fs['record_id'] = df_fs.index.astype(str) + '_' + df_fs['user_id'].astype(str)
    
    # Select features for feature store
    feature_columns = [
        'record_id', 'event_time', 'user_id', 'product_id', 'purchased',
        'view_count', 'cart_count', 'total_interactions', 'price',
        'category_id', 'category_code_encoded', 'brand_encoded',
        'product_view_count', 'product_purchase_count',
        'product_conversion_rate', 'user_total_events', 'user_total_purchases'
    ]
    
    df_fs = df_fs[feature_columns]
    
    # Initialize Feature Group
    feature_group = FeatureGroup(
        name=feature_group_name,
        sagemaker_session=sagemaker_session
    )
    
    print("\n Feature Group Design:")
    print(f"  Name: {feature_group_name}")
    print(f"  Record Identifier: record_id")
    print(f"  Event Time Feature: event_time")
    print(f"  Number of features: {len(feature_columns)}")
    
    # Load feature definitions
    feature_group.load_feature_definitions(data_frame=df_fs)
    
    print(f"\nFeatures in group:")
    for i, col in enumerate(feature_columns, 1):
        dtype = df_fs[col].dtype
        print(f"  {i:2d}. {col:30s} ({dtype})")
    
    # Create feature group
    try:
        feature_group.create(
            s3_uri=f"s3://{BUCKET_NAME}/{FEATURE_STORE_PREFIX}",
            record_identifier_name='record_id',
            event_time_feature_name='event_time',
            role_arn=role,
            enable_online_store=True
        )
        
        print("\n Feature group created successfully!")
        print(f"  Online store: Enabled")
        print(f"  Offline store: s3://{BUCKET_NAME}/{FEATURE_STORE_PREFIX}")
        
    except Exception as e:
        print(f"\nNote: {str(e)}")
    
    return feature_group, feature_group_name, df_fs

In [42]:
# EXECUTE: Save splits to S3 and create Feature Store
save_splits_to_s3(df_train, df_test, df_val, df_prod)

feature_group, feature_group_name, df_fs_sample = create_feature_store(df_train.head(100))

print("\n Feature Store setup complete!")
print(f"   Feature Group: {feature_group_name}")

 Production data saved: s3://aai540-ecommerce-recommendation-project-group3/mlops-pipeline/production_data.csv
 Train saved: s3://aai540-ecommerce-recommendation-project-group3/mlops-pipeline/train_processed.csv
 Test saved: s3://aai540-ecommerce-recommendation-project-group3/mlops-pipeline/test_processed.csv
 Validation saved: s3://aai540-ecommerce-recommendation-project-group3/mlops-pipeline/validation_processed.csv

 All splits saved successfully!

→ Creating Feature Group: ecommerce-user-product-features-1760669319

 Feature Group Design:
  Name: ecommerce-user-product-features-1760669319
  Record Identifier: record_id
  Event Time Feature: event_time
  Number of features: 17

Features in group:
   1. record_id                      (object)
   2. event_time                     (object)
   3. user_id                        (int64)
   4. product_id                     (int64)
   5. purchased                      (int64)
   6. view_count                     (int64)
   7. cart_count   

## Section 5: Benchmark Model Training

In [44]:
import os

# Create code directory manually
try:
    os.makedirs('code', exist_ok=True)
    print(f"✓ Created/verified 'code' directory at: {os.path.join(os.getcwd(), 'code')}")
    
    # Check write permissions
    test_file = 'code/.test_write'
    try:
        with open(test_file, 'w') as f:
            f.write('test')
        os.remove(test_file)
        print(" Write permissions verified")
    except Exception as e:
        print(f" Write permission error: {e}")
except Exception as e:
    print(f" Directory creation error: {e}")

✓ Created/verified 'code' directory at: /home/sagemaker-user/code
 Write permissions verified


In [49]:
# Save all data splits to S3
def create_preprocessing_script():
    
    # Save production data
    prod_key = f"{prefix}/production_data.csv"
    csv_buffer = io.StringIO()
    df_prod.to_csv(csv_buffer, index=False)
    s3_client.put_object(
        Bucket=bucket,
        Key=prod_key,
        Body=csv_buffer.getvalue()
    )
    print(f" Production data saved: s3://{bucket}/{prod_key}")
    
    # Save other splits
    for name, data in [('train', df_train), ('test', df_test), ('validation', df_val)]:
        key = f"{prefix}/{name}_processed.csv"
        csv_buffer = io.StringIO()
        data.to_csv(csv_buffer, index=False)
        s3_client.put_object(Bucket=bucket, Key=key, Body=csv_buffer.getvalue())
        print(f"✓ {name.capitalize()} saved: s3://{bucket}/{key}")
    
    print("\n All splits saved successfully!")

In [53]:
preprocess_content = '''import argparse
import os
import pandas as pd
import numpy as np

def target_encode(train_series, test_series, target_series, min_samples_leaf=1, smoothing=1):
    temp = pd.concat([train_series, target_series], axis=1)
    averages = temp.groupby(train_series.name)[target_series.name].agg(["mean", "count"])
    smoothing_component = target_series.mean()
    averages["smoothed"] = (averages["mean"] * averages["count"] + smoothing_component * smoothing) / (averages["count"] + smoothing)
    mapping = averages["smoothed"]
    train_encoded = train_series.map(mapping)
    test_encoded = test_series.map(mapping)
    test_encoded.fillna(smoothing_component, inplace=True)
    return train_encoded, test_encoded

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-input", type=str)
    parser.add_argument("--test-input", type=str, default=None)
    parser.add_argument("--validation-input", type=str, default=None)
    args, _ = parser.parse_known_args()

    base_dir = "/opt/ml/processing"
    train_input_path = os.path.join(base_dir, "input/train", args.train_input)
    
    print("Loading training data...")
    df_train = pd.read_csv(train_input_path)
    
    df_test = None
    if args.test_input:
        test_input_path = os.path.join(base_dir, "input/test", args.test_input)
        if os.path.exists(test_input_path):
            print("Loading test data...")
            df_test = pd.read_csv(test_input_path)
    
    df_val = None
    if args.validation_input:
        val_input_path = os.path.join(base_dir, "input/validation", args.validation_input)
        if os.path.exists(val_input_path):
            print("Loading validation data...")
            df_val = pd.read_csv(val_input_path)

    print("Applying target encoding...")
    categorical_cols = ['brand', 'category_code']
    target_column = 'purchased'

    for col in categorical_cols:
        df_train[col] = df_train[col].fillna('unknown')
        train_encoded, _ = target_encode(
            df_train[col], 
            df_train[col], 
            df_train[target_column]
        )
        df_train[f"{col}_encoded"] = train_encoded
        
        if df_test is not None:
            df_test[col] = df_test[col].fillna('unknown')
            _, test_encoded = target_encode(
                df_train[col],
                df_test[col],
                df_train[target_column]
            )
            df_test[f"{col}_encoded"] = test_encoded
        
        if df_val is not None:
            df_val[col] = df_val[col].fillna('unknown')
            _, val_encoded = target_encode(
                df_train[col],
                df_val[col],
                df_train[target_column]
            )
            df_val[f"{col}_encoded"] = val_encoded

    print("Cleaning columns...")
    columns_to_drop = ['user_id', 'product_id', 'first_interaction', 'last_interaction'] + categorical_cols
    df_train_processed = df_train.drop(columns=columns_to_drop)
    
    feature_columns = [col for col in df_train_processed.columns if col != target_column]
    final_train_df = df_train_processed[[target_column] + feature_columns]

    print("Saving processed training data...")
    train_output_path = os.path.join(base_dir, "output/train/train_commerce.csv")
    os.makedirs(os.path.dirname(train_output_path), exist_ok=True)
    final_train_df.to_csv(train_output_path, header=False, index=False)
    
    if df_test is not None:
        df_test_processed = df_test.drop(columns=columns_to_drop)
        final_test_df = df_test_processed[[target_column] + feature_columns]
        test_output_path = os.path.join(base_dir, "output/test/test_commerce.csv")
        os.makedirs(os.path.dirname(test_output_path), exist_ok=True)
        final_test_df.to_csv(test_output_path, header=False, index=False)
        print("Saved processed test data")
    
    if df_val is not None:
        df_val_processed = df_val.drop(columns=columns_to_drop)
        final_val_df = df_val_processed[[target_column] + feature_columns]
        
        rand_split = np.random.rand(len(final_val_df))
        new_validation_set = final_val_df[rand_split < 0.5]
        batch_set = final_val_df[rand_split >= 0.5]
        
        validation_output_path = os.path.join(base_dir, "output/validation/validation_commerce.csv")
        batch_output_path = os.path.join(base_dir, "output/batch/batch_commerce.csv")
        
        os.makedirs(os.path.dirname(validation_output_path), exist_ok=True)
        os.makedirs(os.path.dirname(batch_output_path), exist_ok=True)
        
        new_validation_set.to_csv(validation_output_path, header=False, index=False)
        batch_set.to_csv(batch_output_path, header=False, index=False)
        print("Saved processed validation and batch data")

    print("Preprocessing complete!")
'''

with open('code/preprocess.py', 'w') as f:
    f.write(preprocess_content)

print("✓ Created code/preprocess.py ({} bytes)".format(os.path.getsize('code/preprocess.py')))

✓ Created code/preprocess.py (4837 bytes)


In [54]:
evaluation_content = '''import json
import pathlib
import pickle
import tarfile
import os

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

if __name__ == "__main__":
    model_path = "/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    
    model = xgb.Booster()
    model.load_model("xgboost-model")
    
    test_path = "/opt/ml/processing/test/test_commerce.csv"
    df_test = pd.read_csv(test_path, header=None)
    
    y_test = df_test.iloc[:, 0].values
    X_test = df_test.iloc[:, 1:].values
    
    dtest = xgb.DMatrix(X_test)
    predictions_prob = model.predict(dtest)
    predictions = (predictions_prob > 0.5).astype(int)
    
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, zero_division=0)
    recall = recall_score(y_test, predictions, zero_division=0)
    f1 = f1_score(y_test, predictions, zero_division=0)
    
    try:
        auc = roc_auc_score(y_test, predictions_prob)
    except:
        auc = 0.0
    
    cm = confusion_matrix(y_test, predictions)
    tn, fp, fn, tp = cm.ravel()
    
    report = {
        "binary_classification_metrics": {
            "accuracy": {"value": float(accuracy)},
            "precision": {"value": float(precision)},
            "recall": {"value": float(recall)},
            "f1_score": {"value": float(f1)},
            "auc": {"value": float(auc)},
            "confusion_matrix": {
                "true_negatives": int(tn),
                "false_positives": int(fp),
                "false_negatives": int(fn),
                "true_positives": int(tp)
            }
        }
    }
    
    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        json.dump(report, f, indent=4)
    
    print("Evaluation complete!")
    print(json.dumps(report, indent=4))
'''

with open('code/evaluation.py', 'w') as f:
    f.write(evaluation_content)

print("✓ Created code/evaluation.py ({} bytes)".format(os.path.getsize('code/evaluation.py')))

✓ Created code/evaluation.py (2171 bytes)


In [55]:
if os.path.exists('code/preprocess.py'):
    size = os.path.getsize('code/preprocess.py')
    print(f" preprocess.py exists ({size:,} bytes)")
else:
    print(" preprocess.py NOT FOUND")

if os.path.exists('code/evaluation.py'):
    size = os.path.getsize('code/evaluation.py')
    print(f" evaluation.py exists ({size:,} bytes)")
else:
    print(" evaluation.py NOT FOUND")

print("\nContents of code/ directory:")
for item in os.listdir('code'):
    item_path = os.path.join('code', item)
    if os.path.isfile(item_path):
        size = os.path.getsize(item_path)
        print(f"  - {item} ({size:,} bytes)")

print("\n Scripts created successfully! You can now run Section 6.")

 preprocess.py exists (4,837 bytes)
 evaluation.py exists (2,171 bytes)

Contents of code/ directory:
  - preprocess.py (4,837 bytes)
  - evaluation.py (2,171 bytes)

 Scripts created successfully! You can now run Section 6.


## Section 6: Model Training Pipeline

In [10]:
# Run SageMaker Processing job for data preprocessing
def run_preprocessing_job(train_path, test_path, val_path):
    
    sklearn_processor = SKLearnProcessor(
        framework_version="1.2-1",
        role=role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        base_job_name="ecommerce-preprocessing"
    )
    
    print("\n→ Starting SageMaker Processing Job...")
    sklearn_processor.run(
        code="code/preprocess.py",
        inputs=[
            ProcessingInput(
                source=train_path,
                destination="/opt/ml/processing/input/train",
                s3_data_distribution_type="FullyReplicated"
            ),
            ProcessingInput(
                source=test_path,
                destination="/opt/ml/processing/input/test",
                s3_data_distribution_type="FullyReplicated"
            ),
            ProcessingInput(
                source=val_path,
                destination="/opt/ml/processing/input/validation",
                s3_data_distribution_type="FullyReplicated"
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/output/train",
                destination=f"s3://{default_bucket}/{S3_PREFIX}/processed/train"
            ),
            ProcessingOutput(
                source="/opt/ml/processing/output/test",
                destination=f"s3://{default_bucket}/{S3_PREFIX}/processed/test"
            ),
            ProcessingOutput(
                source="/opt/ml/processing/output/validation",
                destination=f"s3://{default_bucket}/{S3_PREFIX}/processed/validation"
            ),
            ProcessingOutput(
                source="/opt/ml/processing/output/batch",
                destination=f"s3://{default_bucket}/{S3_PREFIX}/processed/batch"
            )
        ],
        arguments=[
            "--train-input", "train_processed.csv",
            "--test-input", "test_processed.csv",
            "--validation-input", "validation_processed.csv"
        ]
    )
    
    print("\n Preprocessing job completed successfully!")

In [11]:
# Train XGBoost model with SageMaker
def train_xgboost_model():
    
    job_name = "xgb-recommender-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    output_location = f"s3://{default_bucket}/{S3_PREFIX}/output/{job_name}"
    
    image = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.7-1"
    )
    
    estimator = sagemaker.estimator.Estimator(
        image,
        role,
        instance_count=1,
        instance_type="ml.m5.xlarge",
        volume_size=50,
        input_mode="File",
        output_path=output_location,
        sagemaker_session=sagemaker_session
    )
    
    estimator.set_hyperparameters(
        objective="binary:logistic",
        eval_metric="auc",
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.8,
        verbosity=0,
        num_round=100,
    )
    
    # Define training data
    train_data = TrainingInput(
        f"s3://{default_bucket}/{S3_PREFIX}/processed/train",
        content_type="text/csv"
    )
    
    validation_data = TrainingInput(
        f"s3://{default_bucket}/{S3_PREFIX}/processed/validation",
        content_type="text/csv"
    )
    
    print(f"\n→ Starting training job: {job_name}")
    estimator.fit(
        inputs={"train": train_data, "validation": validation_data},
        job_name=job_name,
        logs=True
    )
    
    print("\n Training completed successfully!")
    return estimator, job_name

In [56]:
# EXECUTE: Run preprocessing job
# Verify the preprocessing script exists
import os
if not os.path.exists('code/preprocess.py'):
    print("\n ERROR: code/preprocess.py not found!")
    print("Please run Section 5 first to create the preprocessing script.")
    raise FileNotFoundError("code/preprocess.py must exist before running preprocessing job")

train_s3_path = f"s3://{BUCKET_NAME}/{S3_PREFIX}/train_processed.csv"
test_s3_path = f"s3://{BUCKET_NAME}/{S3_PREFIX}/test_processed.csv"
val_s3_path = f"s3://{BUCKET_NAME}/{S3_PREFIX}/validation_processed.csv"

run_preprocessing_job(train_s3_path, test_s3_path, val_s3_path)

print("\n Preprocessing job complete!")
print(f"   Processed data available at: s3://{default_bucket}/{S3_PREFIX}/processed/")

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3



→ Starting SageMaker Processing Job...


INFO:sagemaker:Creating processing-job with name ecommerce-preprocessing-2025-10-17-03-00-51-682


.............[34mLoading training data...[0m
[34mLoading test data...[0m
[34mLoading validation data...[0m
[34mApplying target encoding...[0m
[34mCleaning columns...[0m
[34mSaving processed training data...[0m
[34mSaved processed test data[0m
[34mSaved processed validation and batch data[0m
[34mPreprocessing complete![0m


 Preprocessing job completed successfully!

 Preprocessing job complete!
   Processed data available at: s3://sagemaker-us-east-1-115718999037/mlops-pipeline/processed/


In [57]:
# EXECUTE: Train XGBoost model
trained_estimator, training_job_name = train_xgboost_model()

print("\n Model training complete!")
print(f"   Job Name: {training_job_name}")
print(f"   Model artifacts: {trained_estimator.model_data}")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: xgb-recommender-2025-10-17-03-06-28



→ Starting training job: xgb-recommender-2025-10-17-03-06-28
2025-10-17 03:06:30 Starting - Starting the training job...
2025-10-17 03:07:04 Downloading - Downloading input data...
2025-10-17 03:07:34 Downloading - Downloading the training image......
  import pkg_resources[0m
[34m[2025-10-17 03:08:32.551 ip-10-0-180-38.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-10-17 03:08:32.612 ip-10-0-180-38.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-10-17:03:08:32:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-10-17:03:08:32:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-17:03:08:32:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-17:03:08:32:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-10-17:03:0

## Section 7: Model Deployment and Batch Transform

In [58]:
# Deploy model for batch transformation
def deploy_batch_transform(estimator, model_name):
    
    transformer = estimator.transformer(
        instance_count=1,
        instance_type="ml.m5.xlarge",
        output_path=f"s3://{default_bucket}/{S3_PREFIX}/batch-output"
    )
    
    batch_input = f"s3://{default_bucket}/{S3_PREFIX}/processed/batch"
    
    print(f"\n Starting batch transform job...")
    print(f"  Input: {batch_input}")
    print(f"  Output: {transformer.output_path}")
    
    transformer.transform(
        data=batch_input,
        content_type="text/csv",
        split_type="Line"
    )
    
    print("\n Waiting for transform job to complete...")
    transformer.wait()
    
    print("\n Batch transform completed successfully!")
    return transformer

In [60]:
import pandas as pd
import boto3

# Read the current batch file
s3_client = boto3.client('s3')
obj = s3_client.get_object(
    Bucket=default_bucket, 
    Key=f'{S3_PREFIX}/processed/batch/batch_commerce.csv'
)
batch_df = pd.read_csv(io.BytesIO(obj['Body'].read()), header=None)

print(f"Current batch shape: {batch_df.shape}")

# Remove first column (target variable)
batch_df_fixed = batch_df.iloc[:, 1:]
print(f"Fixed batch shape: {batch_df_fixed.shape}")

# Save back to S3
csv_buffer = io.StringIO()
batch_df_fixed.to_csv(csv_buffer, header=False, index=False)
s3_client.put_object(
    Bucket=default_bucket,
    Key=f'{S3_PREFIX}/processed/batch/batch_commerce.csv',
    Body=csv_buffer.getvalue()
)

print(" Batch file fixed! Now re-run Section 7.")

Current batch shape: (153975, 13)
Fixed batch shape: (153975, 12)
 Batch file fixed! Now re-run Section 7.


In [61]:
# EXECUTE: Deploy batch transform
batch_transformer = deploy_batch_transform(trained_estimator, training_job_name)

print("\n Batch transform complete!")
print(f"   Output location: {batch_transformer.output_path}")

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-10-17-03-19-27-032
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-10-17-03-19-27-789



 Starting batch transform job...
  Input: s3://sagemaker-us-east-1-115718999037/mlops-pipeline/processed/batch
  Output: s3://sagemaker-us-east-1-115718999037/mlops-pipeline/batch-output
  import pkg_resources[0m
[34m[2025-10-17:03:24:32:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-10-17:03:24:32:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-10-17:03:24:32:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forw

## Section 8: Model and Data Monitoring

In [62]:
# Set up model monitoring for deployed endpoint
def setup_model_monitoring(endpoint_name):
    
    # Create data capture configuration
    data_capture_config = DataCaptureConfig(
        enable_capture=True,
        sampling_percentage=100,
        destination_s3_uri=f"s3://{default_bucket}/{S3_PREFIX}/data-capture"
    )
    
    print(f"\n Data Capture Configuration:")
    print(f"  Sampling: 100%")
    print(f"  Destination: s3://{default_bucket}/{S3_PREFIX}/data-capture")
    
    # Create model monitor
    monitor = DefaultModelMonitor(
        role=role,
        instance_count=1,
        instance_type='ml.m5.xlarge',
        volume_size_in_gb=20,
        max_runtime_in_seconds=3600,
    )
    
    # Suggest baseline
    baseline_results_uri = f"s3://{default_bucket}/{S3_PREFIX}/baselining"
    
    try:
        print(f"\n Suggesting baseline...")
        monitor.suggest_baseline(
            baseline_dataset=f"s3://{default_bucket}/{S3_PREFIX}/processed/train/train_commerce.csv",
            dataset_format={"csv": {"header": False}},
            output_s3_uri=baseline_results_uri,
            wait=True
        )
        print(f" Baseline created: {baseline_results_uri}")
    except Exception as e:
        print(f"Note: {str(e)}")
    
    # Create monitoring schedule
    schedule_name = f"ecommerce-monitor-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
    
    try:
        print(f"\n Creating monitoring schedule: {schedule_name}")
        monitor.create_monitoring_schedule(
            monitor_schedule_name=schedule_name,
            endpoint_input=endpoint_name,
            output_s3_uri=f"s3://{default_bucket}/{S3_PREFIX}/monitoring-output",
            statistics=monitor.baseline_statistics(),
            constraints=monitor.suggested_constraints(),
            schedule_cron_expression=CronExpressionGenerator.hourly(),
        )
        print(f" Monitoring schedule created")
    except Exception as e:
        print(f"Note: {str(e)}")
    
    return monitor, schedule_name

In [63]:
# Create CloudWatch dashboard for monitoring
def create_cloudwatch_dashboard(endpoint_name):
    
    dashboard_name = f"ECommerce-ML-Dashboard-{strftime('%Y%m%d', gmtime())}"
    
    dashboard_body = {
        "widgets": [
            {
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["AWS/SageMaker", "ModelLatency", {"stat": "Average"}],
                        [".", ".", {"stat": "Maximum"}]
                    ],
                    "period": 300,
                    "stat": "Average",
                    "region": region,
                    "title": "Model Latency",
                    "yAxis": {"left": {"label": "Milliseconds"}}
                }
            },
            {
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["AWS/SageMaker", "Invocations", {"stat": "Sum"}],
                        [".", "InvocationErrors", {"stat": "Sum"}]
                    ],
                    "period": 300,
                    "stat": "Sum",
                    "region": region,
                    "title": "Invocations and Errors"
                }
            },
            {
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["AWS/SageMaker", "CPUUtilization", {"stat": "Average"}],
                        [".", "MemoryUtilization", {"stat": "Average"}]
                    ],
                    "period": 300,
                    "stat": "Average",
                    "region": region,
                    "title": "Infrastructure Metrics",
                    "yAxis": {"left": {"label": "Percent"}}
                }
            },
            {
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["AWS/SageMaker", "ModelSetupTime", {"stat": "Average"}]
                    ],
                    "period": 300,
                    "stat": "Average",
                    "region": region,
                    "title": "Model Setup Time"
                }
            }
        ]
    }
    
    try:
        response = cw_client.put_dashboard(
            DashboardName=dashboard_name,
            DashboardBody=json.dumps(dashboard_body)
        )
        print(f"\n✓ CloudWatch Dashboard created: {dashboard_name}")
        print(f"  View at: https://console.aws.amazon.com/cloudwatch/home?region={region}#dashboards:name={dashboard_name}")
    except Exception as e:
        print(f"Note: {str(e)}")
    
    return dashboard_name

In [64]:
# Create CloudWatch alarms for monitoring
def create_cloudwatch_alarms():
    
    alarms = [
        {
            "AlarmName": f"ECommerce-HighLatency-{strftime('%Y%m%d', gmtime())}",
            "MetricName": "ModelLatency",
            "Threshold": 1000,
            "ComparisonOperator": "GreaterThanThreshold",
            "Description": "Alert when model latency exceeds 1000ms"
        },
        {
            "AlarmName": f"ECommerce-HighErrorRate-{strftime('%Y%m%d', gmtime())}",
            "MetricName": "InvocationErrors",
            "Threshold": 10,
            "ComparisonOperator": "GreaterThanThreshold",
            "Description": "Alert when invocation errors exceed 10"
        },
        {
            "AlarmName": f"ECommerce-HighCPU-{strftime('%Y%m%d', gmtime())}",
            "MetricName": "CPUUtilization",
            "Threshold": 80,
            "ComparisonOperator": "GreaterThanThreshold",
            "Description": "Alert when CPU utilization exceeds 80%"
        }
    ]
    
    created_alarms = []
    for alarm in alarms:
        try:
            cw_client.put_metric_alarm(
                AlarmName=alarm["AlarmName"],
                MetricName=alarm["MetricName"],
                Namespace="AWS/SageMaker",
                Statistic="Average",
                Period=300,
                EvaluationPeriods=2,
                Threshold=alarm["Threshold"],
                ComparisonOperator=alarm["ComparisonOperator"],
                AlarmDescription=alarm["Description"]
            )
            print(f" Created alarm: {alarm['AlarmName']}")
            created_alarms.append(alarm["AlarmName"])
        except Exception as e:
            print(f"Note: {str(e)}")
    
    return created_alarms

In [65]:
# EXECUTE: Setup CloudWatch monitoring
dashboard_name = create_cloudwatch_dashboard("ecommerce-endpoint")
alarm_names = create_cloudwatch_alarms()

print("\nMonitoring setup complete!")
print(f"   Dashboard: {dashboard_name}")
print(f"   Alarms created: {len(alarm_names)}")


✓ CloudWatch Dashboard created: ECommerce-ML-Dashboard-20251017
  View at: https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#dashboards:name=ECommerce-ML-Dashboard-20251017
 Created alarm: ECommerce-HighLatency-20251017
 Created alarm: ECommerce-HighErrorRate-20251017
 Created alarm: ECommerce-HighCPU-20251017

Monitoring setup complete!
   Dashboard: ECommerce-ML-Dashboard-20251017
   Alarms created: 3


## Section 9: Model Evaluation and Metrics

In [66]:
# %%
def create_evaluation_script():
    """Create evaluation script for model performance assessment."""
    
    # Ensure the code directory exists
    import os
    os.makedirs('code', exist_ok=True)
    print("→ Created/verified 'code' directory")
    
    evaluation_script = """import json
import pathlib
import pickle
import tarfile
import os

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

if __name__ == "__main__":
    # Load model
    model_path = "/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    
    model = xgb.Booster()
    model.load_model("xgboost-model")
    
    # Load test data
    test_path = "/opt/ml/processing/test/test_commerce.csv"
    df_test = pd.read_csv(test_path, header=None)
    
    # Separate features and target
    y_test = df_test.iloc[:, 0].values
    X_test = df_test.iloc[:, 1:].values
    
    # Make predictions
    dtest = xgb.DMatrix(X_test)
    predictions_prob = model.predict(dtest)
    predictions = (predictions_prob > 0.5).astype(int)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, zero_division=0)
    recall = recall_score(y_test, predictions, zero_division=0)
    f1 = f1_score(y_test, predictions, zero_division=0)
    
    try:
        auc = roc_auc_score(y_test, predictions_prob)
    except:
        auc = 0.0
    
    # Calculate confusion matrix
    cm = confusion_matrix(y_test, predictions)
    tn, fp, fn, tp = cm.ravel()
    
    # Create evaluation report
    report = {
        "binary_classification_metrics": {
            "accuracy": {"value": float(accuracy)},
            "precision": {"value": float(precision)},
            "recall": {"value": float(recall)},
            "f1_score": {"value": float(f1)},
            "auc": {"value": float(auc)},
            "confusion_matrix": {
                "true_negatives": int(tn),
                "false_positives": int(fp),
                "false_negatives": int(fn),
                "true_positives": int(tp)
            }
        }
    }
    
    # Save evaluation report
    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        json.dump(report, f, indent=4)
    
    print("Evaluation complete!")
    print(json.dumps(report, indent=4))
"""
    
    # Save the script
    script_path = 'code/evaluation.py'
    with open(script_path, 'w') as f:
        f.write(evaluation_script)
    
    # Verify it was created
    if os.path.exists(script_path):
        file_size = os.path.getsize(script_path)
        print(f"✓ Evaluation script created: {script_path} ({file_size} bytes)")
    else:
        print(f"❌ ERROR: Failed to create {script_path}")
        raise IOError(f"Could not create {script_path}")

## Section 10: CI/CD Pipeline with SageMaker Pipelines

In [67]:
# Create complete CI/CD pipeline using SageMaker Pipelines
def create_cicd_pipeline():
    
    # Define pipeline parameters
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value="PendingManualApproval"
    )
    
    # Upload data to S3 for pipeline
    train_input_uri = f"s3://{default_bucket}/{S3_PREFIX}/train_processed.csv"
    test_input_uri = f"s3://{default_bucket}/{S3_PREFIX}/test_processed.csv"
    val_input_uri = f"s3://{default_bucket}/{S3_PREFIX}/validation_processed.csv"
    batch_input_uri = f"s3://{default_bucket}/{S3_PREFIX}/batch_commerce.csv"
    
    input_data = ParameterString(name="InputDataTrain", default_value=train_input_uri)
    test_data = ParameterString(name="InputDataTest", default_value=test_input_uri)
    val_data = ParameterString(name="InputDataVal", default_value=val_input_uri)
    batch_data = ParameterString(name="BatchData", default_value=batch_input_uri)
    auc_threshold = ParameterFloat(name="AUCThreshold", default_value=0.75)
    
    print("\n Setting up pipeline steps...")
    
    # Step 1: Processing
    sklearn_processor = SKLearnProcessor(
        framework_version="1.2-1",
        role=role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        base_job_name="pipeline-preprocessing",
        sagemaker_session=pipeline_session
    )
    
    processor_args = sklearn_processor.run(
        code="code/preprocess.py",
        inputs=[
            ProcessingInput(
                source=input_data,
                destination="/opt/ml/processing/input/train",
                s3_data_distribution_type="FullyReplicated"
            ),
            ProcessingInput(
                source=test_data,
                destination="/opt/ml/processing/input/test",
                s3_data_distribution_type="FullyReplicated"
            ),
            ProcessingInput(
                source=val_data,
                destination="/opt/ml/processing/input/validation",
                s3_data_distribution_type="FullyReplicated"
            )
        ],
        outputs=[
            ProcessingOutput(
                output_name="train",
                source="/opt/ml/processing/output/train",
                destination=f"s3://{default_bucket}/{S3_PREFIX}/pipeline/train"
            ),
            ProcessingOutput(
                output_name="test",
                source="/opt/ml/processing/output/test",
                destination=f"s3://{default_bucket}/{S3_PREFIX}/pipeline/test"
            ),
            ProcessingOutput(
                output_name="validation",
                source="/opt/ml/processing/output/validation",
                destination=f"s3://{default_bucket}/{S3_PREFIX}/pipeline/validation"
            ),
            ProcessingOutput(
                output_name="batch",
                source="/opt/ml/processing/output/batch",
                destination=f"s3://{default_bucket}/{S3_PREFIX}/pipeline/batch"
            )
        ],
        arguments=[
            "--train-input", "train_processed.csv",
            "--test-input", "test_processed.csv",
            "--validation-input", "validation_processed.csv"
        ]
    )
    
    step_process = ProcessingStep(name="PreprocessData", step_args=processor_args)
    print("  Processing step configured")
    
    # Step 2: Training
    image = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.7-1"
    )
    
    estimator = sagemaker.estimator.Estimator(
        image,
        role,
        instance_count=1,
        instance_type=instance_type,
        volume_size=50,
        input_mode="File",
        output_path=f"s3://{default_bucket}/{S3_PREFIX}/pipeline/output",
        sagemaker_session=pipeline_session
    )
    
    estimator.set_hyperparameters(
        objective="binary:logistic",
        eval_metric="auc",
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.8,
        verbosity=0,
        num_round=100,
    )
    
    train_args = estimator.fit(
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
                content_type="text/csv"
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv"
            )
        }
    )
    
    step_train = TrainingStep(name="TrainModel", step_args=train_args)
    print("  Training step configured")
    
    # Step 3: Evaluation
    script_eval = ScriptProcessor(
        image_uri=image,
        command=["python3"],
        instance_type="ml.m5.xlarge",
        instance_count=1,
        base_job_name="pipeline-evaluation",
        role=role,
        sagemaker_session=pipeline_session,
    )
    
    eval_args = script_eval.run(
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(
                output_name="evaluation",
                source="/opt/ml/processing/evaluation"
            ),
        ],
        code="code/evaluation.py",
    )
    
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json"
    )
    
    step_eval = ProcessingStep(
        name="EvaluateModel",
        step_args=eval_args,
        property_files=[evaluation_report],
    )
    print("  Evaluation step configured")
    
    # Step 4: Create Model
    model = sagemaker.model.Model(
        image_uri=image,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=pipeline_session,
        role=role,
    )
    
    step_create_model = ModelStep(
        name="CreateModel",
        step_args=model.create(instance_type="ml.m5.large"),
    )
    print("  Model creation step configured")
    
    # Step 5: Batch Transform
    transformer = estimator.transformer(
        instance_count=1,
        instance_type="ml.m5.xlarge",
        output_path=f"s3://{default_bucket}/{S3_PREFIX}/pipeline/batch-output"
    )
    
    step_transform = TransformStep(
        name="BatchTransform",
        transformer=transformer,
        inputs=TransformInput(
            data=step_process.properties.ProcessingOutputConfig.Outputs["batch"].S3Output.S3Uri,
            content_type="text/csv"
        ),
    )
    print("  Transform step configured")
    
    # Step 6: Register Model
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    
    register_args = model.register(
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=MODEL_PACKAGE_GROUP_NAME,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )
    
    step_register = ModelStep(name="RegisterModel", step_args=register_args)
    print("  Registration step configured")
    
    # Step 7: Conditional Deployment
    step_fail = FailStep(
        name="ModelPerformanceFail",
        error_message=Join(
            on=" ",
            values=["Model AUC below threshold:", auc_threshold]
        ),
    )
    
    cond_gte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step_name=step_eval.name,
            property_file=evaluation_report,
            json_path="binary_classification_metrics.auc.value",
        ),
        right=auc_threshold,
    )
    
    step_cond = ConditionStep(
        name="CheckModelPerformance",
        conditions=[cond_gte],
        if_steps=[step_register, step_create_model, step_transform],
        else_steps=[step_fail],
    )
    print("  Conditional step configured")
    
    # Create Pipeline
    pipeline_name = f"ECommercePipeline-{strftime('%Y%m%d-%H%M%S', gmtime())}"
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_count,
            instance_type,
            model_approval_status,
            input_data,
            test_data,
            val_data,
            batch_data,
            auc_threshold,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=pipeline_session,
    )
    
    print(f"\n Creating pipeline: {pipeline_name}")
    pipeline.upsert(role_arn=role)
    print(f" Pipeline created successfully!")
    
    return pipeline

In [68]:
# Execute the CI/CD pipeline
def execute_pipeline(pipeline):
    
    print(f"\n Starting pipeline execution...")
    execution = pipeline.start()
    
    print(f" Pipeline execution started")
    print(f"  Execution ARN: {execution.arn}")
    
    return execution

## Section 11: Model Registry and Versioning

In [22]:
# Set up Model Registry for model versioning
def setup_model_registry():
    
    model_package_group_description = (
        'Model package group for E-Commerce Recommendation System. '
        'Tracks all trained model versions with performance metrics.'
    )
    
    model_package_group_input = {
        "ModelPackageGroupName": MODEL_PACKAGE_GROUP_NAME,
        "ModelPackageGroupDescription": model_package_group_description,
    }
    
    try:
        response = sm_client.create_model_package_group(**model_package_group_input)
        print(f"\n✓ Model Package Group created: {MODEL_PACKAGE_GROUP_NAME}")
        print(f"  ARN: {response['ModelPackageGroupArn']}")
    except Exception as e:
        if "already exists" in str(e):
            print(f"\n✓ Model Package Group already exists: {MODEL_PACKAGE_GROUP_NAME}")
        else:
            print(f"Note: {str(e)}")
    
    return MODEL_PACKAGE_GROUP_NAME