# Lab 4.4.3: AWS SageMaker Deployment - SOLUTION

**Module:** 4.4 - Containerization & Cloud Deployment  
**This is the complete solution notebook with all exercises solved.**

---

## Exercise 1 Solution: Multi-Model Endpoint Configuration

In [None]:
# Multi-model endpoint with A/B testing
#
# The solution module is stored as a string and printed so it can be copied
# into a project file; running this cell performs no AWS calls.

multimodel_config = '''
"""Multi-Model SageMaker Endpoint with A/B Testing."""

from typing import Optional

import sagemaker
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.model import Model
from sagemaker.session import production_variant

# Variant names are shared by deployment and analysis so the CloudWatch
# dimensions always match the variants that were actually created.
VARIANT_A = "AllTraffic"
VARIANT_B = "VariantB"

# Friendly aliases -> CloudWatch metric names in the AWS/SageMaker namespace.
METRIC_NAMES = {
    "latency": "ModelLatency",
    "overhead_latency": "OverheadLatency",
    "invocations": "Invocations",
}


def _register_model(model_uri, variant_label, role, session, instance_type):
    """Create the SageMaker Model entity for one variant and return it."""
    model = HuggingFaceModel(
        model_data=model_uri,
        role=role,
        transformers_version="4.37",
        pytorch_version="2.1",
        py_version="py310",
        # HF_MODEL_ID is intentionally not set: it names a Hugging Face Hub
        # repository, while these weights are loaded from S3 via model_data.
        env={"VARIANT": variant_label},
        sagemaker_session=session,
    )
    # Registers the model without deploying it; the instance type lets the
    # SDK select the matching inference container image.
    model.create(instance_type=instance_type)
    return model


def create_ab_test_endpoint(
    model_a_uri: str,
    model_b_uri: str,
    traffic_split: float = 0.5,  # 50% to model A
    endpoint_name: str = "ab-test-endpoint",
    role: Optional[str] = None,
    instance_type: str = "ml.g5.xlarge",
    data_capture_s3_uri: Optional[str] = None,
):
    """
    Create A/B test endpoint with two model variants.

    Both variants are declared in a single endpoint configuration, so the
    traffic split is in effect from the moment the endpoint is in service.

    Args:
        model_a_uri: S3 path to model A (control)
        model_b_uri: S3 path to model B (treatment)
        traffic_split: Fraction of traffic sent to model A (0.0-1.0)
        endpoint_name: Name for the endpoint
        role: SageMaker execution role (defaults to the current one)
        instance_type: Instance type used by both variants
        data_capture_s3_uri: Destination for captured requests/responses
            (defaults to a prefix in the session's default bucket)

    Returns:
        Dict with the endpoint name and the name/weight/model of each variant.

    Raises:
        ValueError: If traffic_split is outside [0.0, 1.0].
    """
    if not 0.0 <= traffic_split <= 1.0:
        raise ValueError(
            f"traffic_split must be between 0.0 and 1.0, got {traffic_split}"
        )

    role = role or sagemaker.get_execution_role()
    session = sagemaker.Session()

    if data_capture_s3_uri is None:
        data_capture_s3_uri = (
            f"s3://{session.default_bucket()}/data-capture/{endpoint_name}"
        )

    # Variant weights are relative to each other. round() avoids float
    # truncation (int(0.29 * 100) would give 28).
    weight_a = round(traffic_split * 100)
    weight_b = 100 - weight_a

    # Create Model A (Control) and Model B (Treatment)
    model_a = _register_model(model_a_uri, "control", role, session, instance_type)
    model_b = _register_model(model_b_uri, "treatment", role, session, instance_type)

    # One production variant per model, both in the same endpoint config
    variants = [
        production_variant(
            model_name=model_a.name,
            instance_type=instance_type,
            initial_instance_count=1,
            variant_name=VARIANT_A,
            initial_weight=weight_a,
        ),
        production_variant(
            model_name=model_b.name,
            instance_type=instance_type,
            initial_instance_count=1,
            variant_name=VARIANT_B,
            initial_weight=weight_b,
        ),
    ]

    # Deploy with A/B split and capture all traffic for offline analysis
    session.endpoint_from_production_variants(
        name=endpoint_name,
        production_variants=variants,
        data_capture_config_dict={
            "EnableCapture": True,
            "InitialSamplingPercentage": 100,
            "DestinationS3Uri": data_capture_s3_uri,
            "CaptureOptions": [
                {"CaptureMode": "Input"},
                {"CaptureMode": "Output"},
            ],
        },
    )

    return {
        "endpoint_name": endpoint_name,
        "variants": [
            {"name": VARIANT_A, "weight": weight_a, "model": "model_a"},
            {"name": VARIANT_B, "weight": weight_b, "model": "model_b"},
        ],
    }


def analyze_ab_results(endpoint_name: str, metric: str = "latency"):
    """
    Analyze A/B test results from CloudWatch metrics.

    Args:
        endpoint_name: Endpoint created by create_ab_test_endpoint
        metric: Alias from METRIC_NAMES, or a raw CloudWatch metric name

    Returns:
        Dict mapping variant name to a time-ordered list of hourly datapoints
        for the last 24 hours. Each datapoint carries "Average" and an
        "ExtendedStatistics" dict with "p50" and "p99" when available.
    """
    import boto3
    from datetime import datetime, timedelta, timezone

    cloudwatch = boto3.client("cloudwatch")

    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(hours=24)
    metric_name = METRIC_NAMES.get(metric, metric)

    # Percentiles are extended statistics, and CloudWatch does not accept
    # Statistics and ExtendedStatistics in the same request.
    statistic_requests = (
        {"Statistics": ["Average"]},
        {"ExtendedStatistics": ["p50", "p99"]},
    )

    results = {}
    for variant in [VARIANT_A, VARIANT_B]:
        merged = {}
        for statistic_kwargs in statistic_requests:
            response = cloudwatch.get_metric_statistics(
                Namespace="AWS/SageMaker",
                MetricName=metric_name,
                Dimensions=[
                    {"Name": "EndpointName", "Value": endpoint_name},
                    {"Name": "VariantName", "Value": variant},
                ],
                StartTime=start_time,
                EndTime=end_time,
                Period=3600,
                **statistic_kwargs,
            )
            for point in response["Datapoints"]:
                merged.setdefault(point["Timestamp"], {}).update(point)

        # Datapoints are not guaranteed to arrive in time order
        results[variant] = [merged[ts] for ts in sorted(merged)]

    return results
'''

print("MULTI-MODEL ENDPOINT WITH A/B TESTING:")
print("=" * 60)
print(multimodel_config)

## Exercise 2 Solution: Auto-Scaling Configuration

In [None]:
# Complete auto-scaling configuration
#
# The solution module is stored as a string and printed so it can be copied
# into a project file; running this cell performs no AWS calls.

autoscaling_config = '''
"""SageMaker Auto-Scaling Configuration."""

import boto3


def _business_hours_min_capacity(min_capacity: int, max_capacity: int) -> int:
    """Return the weekday floor: at least 2 instances, never above the max."""
    return min(max(2, min_capacity), max_capacity)


def configure_autoscaling(
    endpoint_name: str,
    variant_name: str = "AllTraffic",
    min_capacity: int = 1,
    max_capacity: int = 10,
    target_value: float = 70.0,  # Invocations per instance per minute
    scale_in_cooldown: int = 600,  # 10 minutes
    scale_out_cooldown: int = 300,  # 5 minutes
):
    """
    Configure auto-scaling for a SageMaker endpoint.

    Registers the variant as a scalable target, attaches a target-tracking
    policy, and adds two scheduled actions that raise the minimum capacity
    during weekday business hours (UTC) and restore it afterwards.

    Args:
        endpoint_name: SageMaker endpoint name
        variant_name: Production variant name
        min_capacity: Minimum instances
        max_capacity: Maximum instances
        target_value: Target invocations per instance per minute
        scale_in_cooldown: Seconds before scaling in again
        scale_out_cooldown: Seconds before scaling out again

    Returns:
        Dict with the resource id and the configured capacity/target values.

    Raises:
        ValueError: If min_capacity is greater than max_capacity.
    """
    if min_capacity > max_capacity:
        raise ValueError(
            f"min_capacity ({min_capacity}) cannot exceed "
            f"max_capacity ({max_capacity})"
        )

    autoscaling = boto3.client("application-autoscaling")

    resource_id = f"endpoint/{endpoint_name}/variant/{variant_name}"

    # Step 1: Register scalable target
    autoscaling.register_scalable_target(
        ServiceNamespace="sagemaker",
        ResourceId=resource_id,
        ScalableDimension="sagemaker:variant:DesiredInstanceCount",
        MinCapacity=min_capacity,
        MaxCapacity=max_capacity,
    )
    print(f"Registered scalable target: {min_capacity}-{max_capacity} instances")

    # Step 2: Create target tracking scaling policy
    autoscaling.put_scaling_policy(
        PolicyName=f"{endpoint_name}-scaling-policy",
        ServiceNamespace="sagemaker",
        ResourceId=resource_id,
        ScalableDimension="sagemaker:variant:DesiredInstanceCount",
        PolicyType="TargetTrackingScaling",
        TargetTrackingScalingPolicyConfiguration={
            "TargetValue": target_value,
            "PredefinedMetricSpecification": {
                "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance",
            },
            "ScaleInCooldown": scale_in_cooldown,
            "ScaleOutCooldown": scale_out_cooldown,
        },
    )
    print(f"Created scaling policy: target {target_value} invocations/instance/minute")

    # Step 3: Add scheduled scaling for known traffic patterns (optional)
    # Scale up during business hours. The floor is clamped to max_capacity so
    # a single-instance ceiling does not produce MinCapacity > MaxCapacity.
    autoscaling.put_scheduled_action(
        ServiceNamespace="sagemaker",
        ScheduledActionName=f"{endpoint_name}-scale-up-business-hours",
        ResourceId=resource_id,
        ScalableDimension="sagemaker:variant:DesiredInstanceCount",
        Schedule="cron(0 8 ? * MON-FRI *)",  # 8 AM UTC weekdays
        ScalableTargetAction={
            "MinCapacity": _business_hours_min_capacity(min_capacity, max_capacity),
            "MaxCapacity": max_capacity,
        },
    )

    # Scale down after hours
    autoscaling.put_scheduled_action(
        ServiceNamespace="sagemaker",
        ScheduledActionName=f"{endpoint_name}-scale-down-after-hours",
        ResourceId=resource_id,
        ScalableDimension="sagemaker:variant:DesiredInstanceCount",
        Schedule="cron(0 20 ? * MON-FRI *)",  # 8 PM UTC weekdays
        ScalableTargetAction={
            "MinCapacity": min_capacity,
            "MaxCapacity": max_capacity,
        },
    )
    print("Created scheduled scaling actions for business hours")

    return {
        "resource_id": resource_id,
        "min_capacity": min_capacity,
        "max_capacity": max_capacity,
        "target_value": target_value,
    }


def get_scaling_status(endpoint_name: str, variant_name: str = "AllTraffic"):
    """
    Get current scaling status.

    Args:
        endpoint_name: SageMaker endpoint name
        variant_name: Production variant whose status is reported

    Returns:
        Dict with the endpoint status, the registered scalable target (or
        None if the variant is not registered), and the current instance
        count of the requested variant (0 if the variant is not found).
    """
    autoscaling = boto3.client("application-autoscaling")
    # Named *_client so it cannot be confused with the sagemaker SDK module
    sagemaker_client = boto3.client("sagemaker")

    # Get current capacity
    resource_id = f"endpoint/{endpoint_name}/variant/{variant_name}"

    targets = autoscaling.describe_scalable_targets(
        ServiceNamespace="sagemaker",
        ResourceIds=[resource_id],
    )
    scalable_targets = targets["ScalableTargets"]

    # Get endpoint status
    endpoint = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)

    # Report the requested variant, not whichever one happens to be first
    variant = next(
        (
            v
            for v in endpoint.get("ProductionVariants", [])
            if v.get("VariantName") == variant_name
        ),
        {},
    )

    return {
        "endpoint_status": endpoint["EndpointStatus"],
        "scaling_config": scalable_targets[0] if scalable_targets else None,
        "current_instances": variant.get("CurrentInstanceCount", 0),
    }
'''

print("AUTO-SCALING CONFIGURATION:")
print("=" * 60)
print(autoscaling_config)

## Exercise 3 Solution: Cost Calculator

In [None]:
# SageMaker cost calculator
#
# The solution module is stored as a string and printed so it can be copied
# into a project file. It depends only on the standard library.

cost_calculator = '''
"""SageMaker Cost Calculator."""

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class SageMakerCostEstimate:
    """Cost estimate for SageMaker endpoint."""
    instance_type: str
    instance_count: int
    hourly_cost: float
    daily_cost: float
    monthly_cost: float
    cost_per_1k_requests: float
    spot_savings: float = 0.0


# SageMaker instance pricing (us-west-2, on-demand)
# NOTE: reference values for the exercise; check the current AWS price list
# before using them for real budgeting.
SAGEMAKER_PRICING = {
    # GPU instances
    "ml.g5.xlarge": {"price": 1.006, "gpu": "A10G", "vram": 24, "vcpu": 4, "memory": 16},
    "ml.g5.2xlarge": {"price": 1.515, "gpu": "A10G", "vram": 24, "vcpu": 8, "memory": 32},
    "ml.g5.4xlarge": {"price": 2.533, "gpu": "A10G", "vram": 24, "vcpu": 16, "memory": 64},
    "ml.g5.8xlarge": {"price": 4.051, "gpu": "A10G", "vram": 24, "vcpu": 32, "memory": 128},
    "ml.g5.12xlarge": {"price": 7.598, "gpu": "4xA10G", "vram": 96, "vcpu": 48, "memory": 192},
    "ml.g5.24xlarge": {"price": 10.131, "gpu": "4xA10G", "vram": 96, "vcpu": 96, "memory": 384},
    "ml.g5.48xlarge": {"price": 20.262, "gpu": "8xA10G", "vram": 192, "vcpu": 192, "memory": 768},
    "ml.p4d.24xlarge": {"price": 32.773, "gpu": "8xA100", "vram": 320, "vcpu": 96, "memory": 1152},
    # Inf2 (Inferentia)
    "ml.inf2.xlarge": {"price": 0.758, "accelerator": "Inferentia2", "vcpu": 4, "memory": 16},
    "ml.inf2.8xlarge": {"price": 3.032, "accelerator": "Inferentia2", "vcpu": 32, "memory": 128},
}

# Spot discount (approximate)
# NOTE: managed Spot applies to SageMaker training jobs; real-time endpoints
# are billed on-demand. use_spot is kept as a what-if discount only.
SPOT_DISCOUNT = 0.7  # 70% discount

HOURS_PER_DAY = 24
DAYS_PER_MONTH = 30
SECONDS_PER_DAY = 86400


def calculate_endpoint_cost(
    instance_type: str,
    instance_count: int = 1,
    requests_per_day: int = 10000,
    avg_latency_ms: float = 100,
    use_spot: bool = False,
) -> SageMakerCostEstimate:
    """
    Calculate estimated cost for a SageMaker endpoint.

    An endpoint is billed per instance-hour whether or not it serves
    traffic, so the cost per 1K requests is the daily bill divided by the
    requests actually served in a day. Served requests are capped at the
    fleet's sequential capacity (one request at a time per instance, each
    taking avg_latency_ms). With requests_per_day=0 the figure falls back to
    the cost at full utilisation.

    Args:
        instance_type: SageMaker instance type
        instance_count: Number of instances
        requests_per_day: Expected daily requests
        avg_latency_ms: Average inference latency
        use_spot: Apply SPOT_DISCOUNT as a what-if discount

    Returns:
        SageMakerCostEstimate; spot_savings is the monthly amount saved by
        the discount (0.0 when use_spot is False).

    Raises:
        ValueError: For an unknown instance type, instance_count < 1,
            avg_latency_ms <= 0, or negative requests_per_day.
    """
    if instance_type not in SAGEMAKER_PRICING:
        raise ValueError(f"Unknown instance type: {instance_type}")
    if instance_count < 1:
        raise ValueError(f"instance_count must be at least 1, got {instance_count}")
    if avg_latency_ms <= 0:
        raise ValueError(f"avg_latency_ms must be positive, got {avg_latency_ms}")
    if requests_per_day < 0:
        raise ValueError(f"requests_per_day cannot be negative, got {requests_per_day}")

    list_price = SAGEMAKER_PRICING[instance_type]["price"]

    # Apply spot discount if applicable
    discount = SPOT_DISCOUNT if use_spot else 0.0
    hourly_savings = list_price * discount
    base_price = list_price * (1 - discount)

    hourly_cost = base_price * instance_count
    daily_cost = hourly_cost * HOURS_PER_DAY
    monthly_cost = daily_cost * DAYS_PER_MONTH

    # Calculate cost per 1k requests
    # Assuming each instance can handle requests_per_second = 1000 / avg_latency_ms
    daily_capacity = (1000 / avg_latency_ms) * SECONDS_PER_DAY * instance_count
    if requests_per_day > 0:
        served_per_day = min(requests_per_day, daily_capacity)
    else:
        served_per_day = daily_capacity
    cost_per_1k = (daily_cost / served_per_day) * 1000

    return SageMakerCostEstimate(
        instance_type=instance_type,
        instance_count=instance_count,
        hourly_cost=hourly_cost,
        daily_cost=daily_cost,
        monthly_cost=monthly_cost,
        cost_per_1k_requests=cost_per_1k,
        spot_savings=hourly_savings * instance_count * HOURS_PER_DAY * DAYS_PER_MONTH,
    )


def recommend_instance(
    model_size_gb: float,
    target_latency_ms: float = 100,
    budget_monthly: Optional[float] = None,
) -> List[Dict]:
    """
    Recommend instance types for a model.

    Only entries with a "vram" value in SAGEMAKER_PRICING are compared, so
    the Inferentia instances are skipped for any non-zero model size.

    Args:
        model_size_gb: Model size in GB (fp16)
        target_latency_ms: Target latency in ms (accepted for interface
            compatibility; not used for filtering yet)
        budget_monthly: Monthly budget cap; None means no cap

    Returns:
        Matching instances as dicts, cheapest first.

    Raises:
        ValueError: If model_size_gb is negative.
    """
    if model_size_gb < 0:
        raise ValueError(f"model_size_gb cannot be negative, got {model_size_gb}")

    recommendations = []

    # Rule: need ~2x model size for inference memory
    required_vram = model_size_gb * 2

    for instance_type, specs in SAGEMAKER_PRICING.items():
        vram = specs.get("vram", 0)
        if vram < required_vram:
            continue

        monthly_cost = specs["price"] * HOURS_PER_DAY * DAYS_PER_MONTH

        # "is not None" so that a budget of 0 is honoured as a real cap
        if budget_monthly is not None and monthly_cost > budget_monthly:
            continue

        recommendations.append({
            "instance_type": instance_type,
            "gpu": specs.get("gpu", "N/A"),
            "vram_gb": vram,
            "monthly_cost": monthly_cost,
            "headroom_gb": vram - required_vram,
        })

    # Sort by cost
    return sorted(recommendations, key=lambda x: x["monthly_cost"])


# Example usage
if __name__ == "__main__":
    # Calculate cost for 7B model deployment
    estimate = calculate_endpoint_cost(
        instance_type="ml.g5.xlarge",
        instance_count=2,
        requests_per_day=50000,
        avg_latency_ms=150,
    )

    print(f"Instance: {estimate.instance_type} x {estimate.instance_count}")
    print(f"Hourly: ${estimate.hourly_cost:.2f}")
    print(f"Monthly: ${estimate.monthly_cost:,.0f}")
    print(f"Per 1K requests: ${estimate.cost_per_1k_requests:.4f}")
'''

print("SAGEMAKER COST CALCULATOR:")
print("=" * 60)
print(cost_calculator)

---

## Summary

This solution demonstrated:

1. **Multi-Model Endpoints with A/B Testing**
   - Production variants for traffic splitting
   - Data capture for analysis
   - CloudWatch metrics analysis

2. **Auto-Scaling Configuration**
   - Target tracking scaling policy
   - Scheduled scaling for known patterns
   - Proper cooldown configuration

3. **Cost Calculator**
   - Instance pricing reference
   - Cost estimation per request
   - Instance recommendation engine