# ANA680 – Problem 2B (With Container Technology)
Linear Regression using the SageMaker **SKLearn container**: train, deploy, invoke, and clean up.

In [1]:
import os
import json
import pandas as pd

import boto3
import sagemaker
from sagemaker.s3 import S3Uploader
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import StringDeserializer

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Resolve the SageMaker session, execution role, region and default bucket
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sess.boto_region_name
bucket = sess.default_bucket()  # e.g., sagemaker-<region>-<acctid>
prefix = "ana680/wine-quality"  # folder prefix in S3

# Echo the resolved settings, one per line
print(
    f"Role: {role}",
    f"Region: {region}",
    f"Bucket: {bucket}",
    f"S3 prefix: {prefix}",
    sep="\n",
)

Role: arn:aws:iam::167819473785:role/service-role/AmazonSageMaker-ExecutionRole-20250824T160549
Region: us-east-2
Bucket: sagemaker-us-east-2-167819473785
S3 prefix: ana680/wine-quality


## Prepare & Upload Data
We combine the red and white wine CSVs into one dataset, save locally, and upload to S3 for training.

In [3]:
# Load datasets (the source CSVs are semicolon-delimited) and label each with
# its wine type. `.assign` returns a new frame, so re-running this cell is
# idempotent and the freshly loaded frames are never mutated in place.
red   = pd.read_csv("winequality-red.csv", sep=";").assign(type="red")
white = pd.read_csv("winequality-white.csv", sep=";").assign(type="white")

# Combine
df = pd.concat([red, white], ignore_index=True)
print("Combined shape:", df.shape)

# Save locally (comma-separated, no index column) so train.py can read it back
local_train_path = "winequality_combined.csv"
df.to_csv(local_train_path, index=False)

# Upload to S3
s3_train_uri = S3Uploader.upload(
    local_path=local_train_path,
    desired_s3_uri=f"s3://{bucket}/{prefix}/training"
)
print("S3 training data:", s3_train_uri)

# Keep feature order for inference: every column except the target ("quality")
# and the "type" label added above.
FEATURES = [c for c in df.columns if c not in ("quality", "type")]
print("Number of features:", len(FEATURES))

# Derive the displayed list from FEATURES instead of recomputing it, so the
# two names can never drift apart.
feature_order = list(FEATURES)
feature_order

Combined shape: (6497, 13)
S3 training data: s3://sagemaker-us-east-2-167819473785/ana680/wine-quality/training/winequality_combined.csv
Number of features: 11


['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

## Training Script
This script will be executed inside the SKLearn container.

In [4]:
%%writefile train.py
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
import joblib

def main():
    """Fit a linear regression on the combined wine CSV and save it with joblib.

    Reads ``winequality_combined.csv`` from the directory named by the
    ``SM_CHANNEL_TRAINING`` environment variable and writes ``model.joblib``
    into the directory named by ``SM_MODEL_DIR``. Both fall back to the
    ``/opt/ml/...`` paths below when the variables are not set.

    Raises:
        KeyError: if the CSV has no ``quality`` column.
        ValueError: if no usable rows remain after dropping missing values.
    """
    # SageMaker I/O env
    input_dir = os.environ.get("SM_CHANNEL_TRAINING", "/opt/ml/input/data/training")
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    data_path = os.path.join(input_dir, "winequality_combined.csv")
    print(f"[INFO] Reading data from: {data_path}")

    df = pd.read_csv(data_path)  # our combined CSV is comma-separated

    # Everything except the target and the (string) wine-type label is a feature.
    drop_cols = [c for c in ["quality", "type"] if c in df.columns]
    X = df.drop(columns=drop_cols)
    y = df["quality"]

    X = X.apply(pd.to_numeric, errors="coerce")
    y = pd.to_numeric(y, errors="coerce")

    # errors="coerce" turns unparseable cells into NaN, and LinearRegression.fit
    # rejects NaN input. Drop the affected rows (and say so) instead of failing
    # later with a less obvious error.
    valid = X.notna().all(axis=1) & y.notna()
    dropped = int((~valid).sum())
    if dropped:
        print(f"[WARN] Dropping {dropped} row(s) with missing or non-numeric values")
        X, y = X[valid], y[valid]
    if X.empty:
        raise ValueError(f"No usable training rows found in {data_path}")

    print(f"[INFO] X shape: {X.shape}, y shape: {y.shape}")

    model = LinearRegression()
    model.fit(X, y)

    # Make sure the target directory exists so the script also runs outside
    # the container (e.g. a local dry run with SM_MODEL_DIR pointing elsewhere).
    os.makedirs(model_dir, exist_ok=True)
    out_path = os.path.join(model_dir, "model.joblib")
    joblib.dump(model, out_path)
    print(f"[INFO] Saved model to: {out_path}")

if __name__ == "__main__":
    main()

Overwriting train.py


## Train in SageMaker
Run the estimator inside the SKLearn container.

In [5]:
# Version tag of the SKLearn container image used for this job.
# If your region lacks this, try "1.4-1" or "1.0-1"
framework_version = "1.2-1"

# Describe the training job: script, role, hardware and container version.
estimator = SKLearn(
    entry_point="train.py",
    role=role,
    instance_type="ml.m5.large",  # training instance
    instance_count=1,
    framework_version=framework_version,
    py_version="py3",
    base_job_name="wine-quality-lr",
)

# Launch the job; the "training" channel points at the uploaded CSV.
estimator.fit(inputs={"training": s3_train_uri})

INFO:sagemaker:Creating training-job with name: wine-quality-lr-2025-08-25-06-28-57-530


2025-08-25 06:28:59 Starting - Starting the training job...
2025-08-25 06:29:14 Starting - Preparing the instances for training...
  import pkg_resources[0m
[34m2025-08-25 06:31:09,996 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-08-25 06:31:09,999 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-08-25 06:31:10,002 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-08-25 06:31:10,019 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-08-25 06:31:10,286 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-08-25 06:31:10,289 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-08-25 06:31:10,307 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-08-25 06:31

## Create inference.py (tells the container how to load the model)
Defines how the container loads the model, parses request input, runs the prediction, and formats the response.

In [6]:
%%writefile inference.py
import os
import json
import joblib
import pandas as pd

# Names of the 11 input columns, in the order input_fn assigns them to the
# values of an incoming request.
# NOTE(review): this list is meant to mirror the training columns (combined
# red+white CSV minus "quality" and "type") in the SAME order — keep it in
# sync with the data that train.py is fitted on.
FEATURES = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

def model_fn(model_dir):
    """Load and return the object stored as ``model.joblib`` inside ``model_dir``."""
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

def input_fn(request_body, content_type="text/csv"):
    """Parse one request into a single-row DataFrame with FEATURES as columns.

    Args:
        request_body: The payload as ``str`` (or UTF-8 ``bytes``). For CSV it is
            one comma-separated line of numbers; for JSON it is either a bare
            list of numbers or an object holding that list under ``"features"``
            or ``"data"``.
        content_type: Media type of the payload. Parameters such as
            ``; charset=utf-8`` and letter case are ignored.

    Returns:
        pandas.DataFrame with exactly one row and the FEATURES columns.

    Raises:
        ValueError: for an unsupported content type, a payload that cannot be
            parsed as numbers, or a wrong number of feature values.
    """
    # Be tolerant of raw bytes so the same function works however the payload
    # is handed over.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = bytes(request_body).decode("utf-8")

    # Compare only the bare media type: "text/csv; charset=utf-8" -> "text/csv".
    media_type = (content_type or "").split(";", 1)[0].strip().lower()

    if media_type == "text/csv":
        row = [float(x) for x in request_body.strip().split(",")]
    elif media_type == "application/json":
        obj = json.loads(request_body)
        # Accept {"features": [...]}, {"data": [...]} or a bare [...] list.
        values = (obj.get("features") or obj.get("data")) if isinstance(obj, dict) else obj
        if not isinstance(values, (list, tuple)):
            raise ValueError(
                'JSON payload must be a list of numbers or an object with a '
                '"features" or "data" list'
            )
        row = [float(x) for x in values]
    else:
        raise ValueError(f"Unsupported content type: {content_type}")

    # Fail with a clear message instead of pandas' column-count error.
    if len(row) != len(FEATURES):
        raise ValueError(f"Expected {len(FEATURES)} feature values, got {len(row)}")

    return pd.DataFrame([row], columns=FEATURES)

def predict_fn(input_data, model):
    """Run ``model.predict`` on ``input_data`` and return its result unchanged."""
    predictions = model.predict(input_data)
    return predictions

def output_fn(prediction, accept="text/csv"):
    """Serialize the first predicted value in the format the client accepts.

    Args:
        prediction: Indexable result of ``predict_fn``; only element 0 is used.
        accept: Value of the request's Accept header. It may carry parameters
            (``application/json; charset=utf-8``), list several media types
            separated by commas, or use wildcards. The first supported entry
            wins; an empty value or a wildcard falls back to CSV, the
            signature's default.

    Returns:
        ``str``: the bare number for ``text/csv``, or a JSON object of the form
        ``{"prediction": <float>}`` for ``application/json``.

    Raises:
        ValueError: if none of the accepted media types is supported.
    """
    val = float(prediction[0])

    # Reduce each entry of the Accept header to its bare, lower-cased media type.
    candidates = [part.split(";", 1)[0].strip().lower() for part in (accept or "").split(",")]
    candidates = [c for c in candidates if c] or ["text/csv"]

    for media_type in candidates:
        if media_type in ("text/csv", "text/*", "*/*"):
            return str(val)
        if media_type in ("application/json", "application/*"):
            return json.dumps({"prediction": val})

    raise ValueError(f"Unsupported accept: {accept}")

Overwriting inference.py


## Deploy the model as a real-time endpoint
We deploy the trained artifact to a small instance and set the serializer to CSV.

In [7]:
# (optional) clean up an endpoint left over from a previous run of this cell
try:
    predictor.delete_endpoint()
except NameError:
    # Fresh kernel: no predictor exists yet, so there is nothing to delete.
    pass
except Exception as e:
    # Best-effort cleanup: report the problem but still go on to deploy.
    print("Cleanup note:", e)

sk_model = SKLearnModel(
    model_data        = estimator.model_data,
    role              = role,
    entry_point       = "inference.py",
    framework_version = framework_version,   # same container version as the training job
    py_version        = "py3",
)

predictor = sk_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large"
)

predictor.serializer   = CSVSerializer()         # send as CSV
predictor.accept       = "text/csv"              # expect CSV
# StringDeserializer defaults to accepting application/json; state text/csv
# explicitly so the requested response format matches the line above.
predictor.deserializer = StringDeserializer(accept="text/csv")    # read textual responses
print("Endpoint:", predictor.endpoint_name)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-08-25-06-33-50-111
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2025-08-25-06-33-50-639
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2025-08-25-06-33-50-639


------!Endpoint: sagemaker-scikit-learn-2025-08-25-06-33-50-639


## Smoke Test (Single Prediction)
Send one row from the dataset to verify predictions work.

In [8]:
# Build the request from the first record's feature values
payload = row = df[FEATURES].iloc[0].astype(float).tolist()

# Report a failed call instead of raising, so the remaining cells still run
try:
    resp = predictor.predict(payload)
except Exception as err:
    print("Predict failed:", err)
else:
    print("Raw prediction:", resp)

Raw prediction: 4.997607397150475


## Utility Function & Batch Test

Helper for repeated predictions.

In [9]:
def predict_quality(features_row):
    """Send one row of feature values to the endpoint and return its answer as a float."""
    payload = [float(value) for value in features_row]
    response = predictor.predict(payload)
    return float(response)

# Example batch: score the first five records, one request per record
test = df[FEATURES].iloc[:5].astype(float).to_numpy().tolist()
for position, features in enumerate(test, start=1):
    print(position, predict_quality(features))

1 4.997607397150475
2 4.924992756593689
3 5.034663031356061
4 5.680332591878226
5 4.997607397150475


## Cleanup (Stop Charges)
Always shut down the endpoint when done.

In [10]:
# Tear down the endpoint; report (rather than raise) any problem so the
# notebook still finishes.
try:
    predictor.delete_endpoint()
except Exception as err:
    print("Cleanup note:", err)

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-scikit-learn-2025-08-25-06-33-50-639
INFO:sagemaker:Deleting endpoint with name: sagemaker-scikit-learn-2025-08-25-06-33-50-639


## Results & Conclusion

We successfully trained and deployed a **Linear Regression model** for predicting wine quality using the **Amazon SageMaker SKLearn container**.  
The workflow followed these steps:

1. **Data preparation** – Combined red and white wine datasets, uploaded to S3.  
2. **Training** – Ran a custom `train.py` inside the SageMaker container.  
3. **Deployment** – Wrapped the trained artifact in an `SKLearnModel` with a custom `inference.py` and deployed it to a real-time endpoint.  
4. **Testing** – Ran a single-row smoke test and a five-row batch of predictions, which returned plausible quality values (≈4.9–5.7).  
5. **Cleanup** – Deleted the endpoint to avoid unnecessary costs.

This confirms a complete end-to-end machine learning pipeline: **train → deploy → invoke → clean up**, fully operational inside SageMaker Studio with container technology.