# Lab 4.4.4: GCP Vertex AI Deployment - SOLUTION

**Module:** 4.4 - Containerization & Cloud Deployment  
**This is the complete solution notebook with all exercises solved.**

---

## Exercise 1 Solution: Custom Container for Vertex AI

In [None]:
# Vertex AI compliant container implementation
#
# The server source is held in a plain string and only printed by this cell;
# nothing here imports or runs FastAPI/torch.
#
# Review fixes applied to the embedded server:
#   * The health route answers 503 until the model is loaded. Previously it
#     answered 200 with status="unhealthy", which a status-code based health
#     check treats as "ready".
#   * Per-request generation parameters are resolved with explicit None checks
#     instead of `value or DEFAULT`, so a caller-supplied 0 / 0.0 is honoured.
#     temperature == 0 selects greedy decoding rather than being sampled.
#   * Request parameters are range-validated by pydantic.

vertex_container_code = '''
"""FastAPI Application for Vertex AI Custom Container.

This server is designed to meet Vertex AI's requirements:
- Health endpoint at /health (or AIP_HEALTH_ROUTE)
- Prediction endpoint at /predict (or AIP_PREDICT_ROUTE)
- Listens on port 8080 (or AIP_HTTP_PORT)
- Model loaded from AIP_STORAGE_URI
"""

import os
import time
from typing import List, Dict, Any, Optional
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException, Response
from pydantic import BaseModel, Field
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# ============================================
# Configuration from Vertex AI environment
# ============================================

# NOTE(review): on Vertex AI, AIP_STORAGE_URI is normally a gs:// URI, while
# from_pretrained() expects a local directory or a Hub model id. The artifacts
# presumably have to be downloaded to local disk first - confirm before deploying.
MODEL_PATH = os.environ.get("AIP_STORAGE_URI", "/models")
HTTP_PORT = int(os.environ.get("AIP_HTTP_PORT", 8080))
HEALTH_ROUTE = os.environ.get("AIP_HEALTH_ROUTE", "/health")
PREDICT_ROUTE = os.environ.get("AIP_PREDICT_ROUTE", "/predict")

# Model configuration
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", 512))
TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.7))
TOP_P = float(os.environ.get("TOP_P", 0.9))


# ============================================
# Global model storage
# ============================================

model = None
tokenizer = None
model_loaded = False


# ============================================
# Request/Response Models (Vertex AI format)
# ============================================

class PredictionInstance(BaseModel):
    """Single prediction instance.

    Every generation parameter is optional; None means "use the server default".
    """
    prompt: str
    max_tokens: Optional[int] = Field(default=None, gt=0)
    temperature: Optional[float] = Field(default=None, ge=0.0)
    top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0)


class PredictionRequest(BaseModel):
    """Vertex AI prediction request format."""
    instances: List[PredictionInstance]


class PredictionResponse(BaseModel):
    """Vertex AI prediction response format."""
    predictions: List[Dict[str, Any]]
    metadata: Optional[Dict[str, Any]] = None


class HealthResponse(BaseModel):
    """Health check response."""
    status: str
    model_loaded: bool
    gpu_available: bool


def _or_default(value, default):
    """Return default only when value is None (0 and 0.0 are valid choices)."""
    return default if value is None else value


# ============================================
# Model loading
# ============================================

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load model on startup and release it on shutdown."""
    global model, tokenizer, model_loaded

    print(f"Loading model from: {MODEL_PATH}")
    start_time = time.time()

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        model_loaded = True

        load_time = time.time() - start_time
        print(f"Model loaded in {load_time:.2f}s")
        print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    except Exception as e:
        # Keep the process alive so the failure is visible through the health
        # route (503) and the container logs instead of a crash loop.
        print(f"Failed to load model: {e}")
        model_loaded = False

    yield

    # Cleanup
    del model, tokenizer
    torch.cuda.empty_cache()


# ============================================
# FastAPI App
# ============================================

app = FastAPI(
    title="Vertex AI LLM Server",
    description="Custom container for Vertex AI deployment",
    lifespan=lifespan,
)


@app.get(HEALTH_ROUTE, response_model=HealthResponse)
@app.get("/ping")  # Alternative health route
async def health(response: Response):
    """Health check endpoint (required by Vertex AI).

    Answers 200 only once the model is loaded. Until then (or after a failed
    load) it answers 503 with the same body, so a status-code based health
    check does not mark a replica that cannot serve as ready.
    """
    if not model_loaded:
        response.status_code = 503

    return HealthResponse(
        status="healthy" if model_loaded else "unhealthy",
        model_loaded=model_loaded,
        gpu_available=torch.cuda.is_available(),
    )


@app.post(PREDICT_ROUTE, response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    """Prediction endpoint (required by Vertex AI).

    Generates one completion per instance, sequentially, and reports the
    number of generated tokens per instance and in total.
    """
    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    predictions = []
    total_tokens = 0

    for instance in request.instances:
        # Explicit None checks: `value or DEFAULT` would silently replace a
        # caller-supplied 0 / 0.0 with the server default.
        max_new_tokens = _or_default(instance.max_tokens, MAX_NEW_TOKENS)
        temperature = _or_default(instance.temperature, TEMPERATURE)
        top_p = _or_default(instance.top_p, TOP_P)

        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "pad_token_id": tokenizer.eos_token_id,
        }
        if temperature > 0:
            generation_kwargs.update(
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
            )
        else:
            # temperature == 0 asks for deterministic output; sampling needs a
            # strictly positive temperature, so fall back to greedy decoding.
            generation_kwargs["do_sample"] = False

        # Tokenize
        inputs = tokenizer(
            instance.prompt,
            return_tensors="pt",
        ).to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # Generate
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens (the output echoes the prompt)
        generated = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        )

        tokens_generated = outputs.shape[1] - prompt_length
        total_tokens += tokens_generated

        predictions.append({
            "generated_text": generated,
            "tokens_generated": tokens_generated,
        })

    return PredictionResponse(
        predictions=predictions,
        metadata={
            "total_tokens": total_tokens,
            "model_path": MODEL_PATH,
        },
    )


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=HTTP_PORT)
'''

print("VERTEX AI CUSTOM CONTAINER CODE:")
print("=" * 60)
print(vertex_container_code)

## Exercise 2 Solution: Platform Comparison

In [None]:
# Comprehensive platform comparison
#
# The comparison script is held in a plain string and only printed by this
# cell; it is not executed here.
#
# Review fixes applied to the embedded script:
#   * Vertex AI tiers named machine/GPU pairings that cannot be deployed
#     ("n1-standard-8 + L4", "n1-standard-8 + A100"). L4 GPUs attach to G2
#     machine types and A100 40GB to A2 machine types, so the tiers now name
#     g2-standard-8 and a2-highgpu-1g. Hourly prices are unchanged.
#   * estimated_monthly_cost is rounded to cents so the JSON output does not
#     carry binary floating-point noise.
#
# NOTE(review): several tiers map models larger than 24 GB to instances with a
# single 24 GB A10G (e.g. <= 48 GB -> g5.4xlarge). Presumably the sizing
# assumes quantization or offloading - confirm with the lab author.

platform_comparison = '''
"""Cloud Platform Comparison for ML Deployment."""

from dataclasses import dataclass
from typing import List, Dict, Any


HOURS_PER_MONTH = 24 * 30  # one always-on instance, 30-day month


@dataclass
class PlatformComparison:
    """Side-by-side platform comparison."""

    def compare_all(self, model_size_gb: float, requests_per_day: int) -> Dict[str, Any]:
        """Generate comprehensive comparison.

        Args:
            model_size_gb: Model size used to pick an instance tier per platform.
            requests_per_day: Echoed in the result only; the cost estimate
                assumes a single always-on instance regardless of traffic.

        Returns:
            Dict with the inputs and, per platform, the recommended instance,
            pros/cons, best-fit scenarios and estimated monthly cost (USD).
        """

        comparison = {
            "model_size_gb": model_size_gb,
            "requests_per_day": requests_per_day,
            "platforms": {},
        }

        # AWS SageMaker
        comparison["platforms"]["sagemaker"] = {
            "name": "AWS SageMaker",
            "recommended_instance": self._recommend_sagemaker_instance(model_size_gb),
            "pros": [
                "Best GPU instance selection (A10G, A100, H100)",
                "Mature MLOps tooling (Pipelines, Model Registry)",
                "HuggingFace integration built-in",
                "Inference recommender for instance selection",
                "Shadow testing for safe deployments",
            ],
            "cons": [
                "Complex IAM configuration",
                "Steeper learning curve",
                "Pricing can be confusing",
                "Cold start times can be high",
            ],
            "best_for": [
                "AWS-centric organizations",
                "Complex ML pipelines",
                "Large-scale production workloads",
            ],
        }

        # GCP Vertex AI
        comparison["platforms"]["vertex"] = {
            "name": "GCP Vertex AI",
            "recommended_instance": self._recommend_vertex_instance(model_size_gb),
            "pros": [
                "Native BigQuery integration",
                "Simpler pricing model",
                "Strong AutoML capabilities",
                "Easy custom container deployment",
                "Good Kubernetes integration (GKE)",
            ],
            "cons": [
                "Fewer GPU options than AWS",
                "Less mature than SageMaker",
                "Smaller community/ecosystem",
                "Limited spot instance support",
            ],
            "best_for": [
                "GCP-centric organizations",
                "Data in BigQuery",
                "Simpler deployment needs",
            ],
        }

        # Self-hosted (EKS/GKE + vLLM)
        comparison["platforms"]["self_hosted"] = {
            "name": "Self-hosted (K8s + vLLM)",
            "recommended_instance": self._recommend_k8s_instance(model_size_gb),
            "pros": [
                "Maximum flexibility",
                "30-50% cost savings vs managed",
                "No vendor lock-in",
                "Latest optimizations (vLLM, SGLang)",
                "Full control over infrastructure",
            ],
            "cons": [
                "Operational overhead",
                "Need K8s expertise",
                "Build own monitoring/scaling",
                "Security responsibility",
            ],
            "best_for": [
                "High-volume workloads",
                "Cost-sensitive projects",
                "Teams with K8s experience",
            ],
        }

        # Calculate costs (hourly price x hours per month, rounded to cents)
        for platform in comparison["platforms"].values():
            instance = platform["recommended_instance"]
            platform["estimated_monthly_cost"] = round(
                instance["price"] * HOURS_PER_MONTH, 2
            )

        return comparison

    def _recommend_sagemaker_instance(self, model_size_gb: float) -> Dict:
        """Pick a SageMaker instance tier; price is USD per hour."""
        if model_size_gb <= 14:
            return {"type": "ml.g5.xlarge", "gpu": "A10G 24GB", "price": 1.006}
        elif model_size_gb <= 28:
            return {"type": "ml.g5.2xlarge", "gpu": "A10G 24GB", "price": 1.515}
        elif model_size_gb <= 48:
            return {"type": "ml.g5.4xlarge", "gpu": "A10G 24GB", "price": 2.533}
        else:
            return {"type": "ml.g5.12xlarge", "gpu": "4xA10G 96GB", "price": 7.598}

    def _recommend_vertex_instance(self, model_size_gb: float) -> Dict:
        """Pick a Vertex AI machine tier; price is USD per hour."""
        if model_size_gb <= 14:
            return {"type": "n1-standard-4 + T4", "gpu": "T4 16GB", "price": 0.54}
        elif model_size_gb <= 24:
            # L4 GPUs are offered on G2 machine types, not N1
            return {"type": "g2-standard-8", "gpu": "L4 24GB", "price": 1.08}
        elif model_size_gb <= 40:
            # A100 40GB GPUs are offered on A2 machine types, not N1
            return {"type": "a2-highgpu-1g", "gpu": "A100 40GB", "price": 3.31}
        else:
            return {"type": "a2-highgpu-2g", "gpu": "2xA100 80GB", "price": 7.34}

    def _recommend_k8s_instance(self, model_size_gb: float) -> Dict:
        """Pick a self-hosted (EKS) instance tier; price is USD per hour."""
        if model_size_gb <= 14:
            return {"type": "g5.xlarge (EKS)", "gpu": "A10G 24GB", "price": 0.75}  # ~25% cheaper
        elif model_size_gb <= 28:
            return {"type": "g5.2xlarge (EKS)", "gpu": "A10G 24GB", "price": 1.13}
        elif model_size_gb <= 48:
            return {"type": "g5.4xlarge (EKS)", "gpu": "A10G 24GB", "price": 1.90}
        else:
            return {"type": "g5.12xlarge (EKS)", "gpu": "4xA10G 96GB", "price": 5.70}


# Generate comparison
if __name__ == "__main__":
    import json

    comparator = PlatformComparison()
    result = comparator.compare_all(model_size_gb=14.0, requests_per_day=50000)

    print(json.dumps(result, indent=2))
'''

print("PLATFORM COMPARISON CODE:")
print("=" * 60)
print(platform_comparison)

---

## Summary

This solution demonstrated:

1. **Custom Container for Vertex AI**
   - Proper environment variable handling
   - Required endpoint structure
   - GPU memory management
   - Health check implementation

2. **Platform Comparison**
   - Side-by-side feature comparison
   - Instance recommendations by model size
   - Cost estimates across platforms
   - Best-fit scenarios for each platform