# Amazon SageMaker Training Job

This notebook demonstrates:
- Preparing training data
- Uploading data to S3
- Launching a SageMaker training job in script mode
- Deploying a model endpoint
- Making predictions

In [None]:
import boto3
import sagemaker
from sagemaker.sklearn import SKLearn
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Initialize the SageMaker session and derive the region / S3 locations that
# the later cells reuse (bucket and key prefix for data and model output).
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/sklearn-rf-demo"

print(f"Region: {region}")
print(f"S3 Bucket: {bucket}")
print(f"Prefix: {prefix}")

# Resolve the IAM execution role, falling back to a placeholder ARN that must
# be edited by hand when the lookup fails (e.g. when running locally).
# Catch `Exception` instead of using a bare `except:` so KeyboardInterrupt and
# SystemExit still propagate, and report why the lookup failed rather than
# silently swallowing the error.
try:
    role = get_execution_role()
except Exception as exc:
    role = "arn:aws:iam::YOUR-ACCOUNT-ID:role/SageMakerExecutionRole"
    print(f"Could not resolve execution role ({type(exc).__name__}: {exc})")
    print(f"Note: Update role ARN if running locally: {role}")

## 1. Prepare Sample Dataset

In [None]:
# Synthetic binary-classification problem: 1000 rows x 20 features
# (15 informative + 5 redundant), seeded for reproducibility.
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=15, n_redundant=5,
    n_classes=2, random_state=42,
)

# Hold out 20% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Assemble one frame per split with the target appended as the last column.
train_df = pd.DataFrame(np.hstack([X_train, y_train.reshape(-1, 1)]))
test_df = pd.DataFrame(np.hstack([X_test, y_test.reshape(-1, 1)]))

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Write both splits to the working directory as header-less, index-less CSVs.
for split_frame, csv_name in ((train_df, "train.csv"), (test_df, "test.csv")):
    split_frame.to_csv(csv_name, index=False, header=False)

print("✓ Data saved locally")

## 2. Upload Data to S3

In [None]:
# Push each local CSV to s3://<bucket>/<prefix>/data/<channel>/ and keep the
# returned S3 URI per channel (train first, then test).
uploaded_uris = {
    channel: sagemaker_session.upload_data(
        path=f"{channel}.csv",
        bucket=bucket,
        key_prefix=f"{prefix}/data/{channel}",
    )
    for channel in ("train", "test")
}
train_input = uploaded_uris["train"]
test_input = uploaded_uris["test"]

print(f"Train data uploaded to: {train_input}")
print(f"Test data uploaded to: {test_input}")

## 3. Launch SageMaker Training Job

In [None]:
# Hyperparameters handed to the training script.
rf_hyperparameters = {
    "n-estimators": 100,
    "max-depth": 10,
    "random-state": 42,
}

# S3 location for the training job's output.
model_output_path = f"s3://{bucket}/{prefix}/output"

# Configure the SKLearn estimator: entry-point script, a single ml.m5.large
# instance, and the framework / Python versions to use.
sklearn_estimator = SKLearn(
    entry_point="../src/sagemaker/train_script.py",
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    framework_version="1.2-1",
    py_version="py3",
    hyperparameters=rf_hyperparameters,
    output_path=model_output_path,
    sagemaker_session=sagemaker_session,
)

print("Estimator created. Starting training job...")
print("Note: This will take 5-10 minutes")

In [None]:
# Map each input channel name to the S3 URI uploaded earlier, then launch
# the training job with those channels.
training_channels = {"train": train_input, "test": test_input}
sklearn_estimator.fit(training_channels)

print("✓ Training complete!")
print(f"Model artifacts: {sklearn_estimator.model_data}")

## 4. Deploy Model to Endpoint

In [None]:
# Endpoint settings: one ml.t2.medium instance behind a fixed endpoint name.
deploy_kwargs = {
    "initial_instance_count": 1,
    "instance_type": "ml.t2.medium",
    "endpoint_name": "sklearn-rf-endpoint",
}

# Deploy the trained model (this takes 5-10 minutes).
predictor = sklearn_estimator.deploy(**deploy_kwargs)

print("✓ Endpoint deployed!")
print(f"Endpoint name: {predictor.endpoint_name}")

## 5. Make Predictions

In [None]:
# Send the first held-out row (kept 2-D via slicing) to the endpoint and
# compare the returned prediction with the known label.
sample = X_test[:1]
prediction = predictor.predict(sample)

print(f"Sample features: {sample[0, :5]}... (showing first 5)")
print(f"Prediction: {prediction}")
print(f"Actual: {y_test[0]}")

## 6. Clean Up Resources

In [None]:
# Endpoint deletion is left commented out so the cell does not tear the
# endpoint down by accident. Uncomment the two lines below to delete it:
# predictor.delete_endpoint()
# print("✓ Endpoint deleted")

# Print a two-line reminder instead of deleting anything.
print(
    "⚠️  Remember to delete the endpoint when done to avoid charges:",
    "   predictor.delete_endpoint()",
    sep="\n",
)

## Cost Optimization Tips

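1. **Use Managed Spot Training** (`use_spot_instances=True` with `max_run` / `max_wait` on the estimator) for training jobs; AWS cites savings of up to 90% over on-demand instances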
2. **Delete endpoints** immediately after testing
3. **Use Inference Recommender** to find optimal instance types
4. **Enable autoscaling** for production endpoints
5. **Use batch transform** instead of real-time endpoints when possible

See `docs/06-sagemaker.md` for more details.