# house-price-sagemaker.ipynb



## Startup cells

In [0]:
# Set environment variables for sagemaker_studio imports

import os
os.environ['DataZoneProjectId'] = 'cub33qyt7rpdbd'
os.environ['DataZoneDomainId'] = 'dzd-5oywjcwy06igyx'
os.environ['DataZoneEnvironmentId'] = 'azcqgy5cyjke95'
os.environ['DataZoneDomainRegion'] = 'us-east-2'

# create both a function and variable for metadata access
_resource_metadata = None


def _get_resource_metadata():
    """Return the resource metadata dictionary, building it on first use.

    The dictionary is stored in the module-level ``_resource_metadata`` so
    that later calls return the very same object instead of rebuilding it.
    """
    global _resource_metadata
    if _resource_metadata is not None:
        return _resource_metadata

    # DataZone identifiers describing the project this kernel belongs to.
    additional_metadata = dict(
        DataZoneProjectId="cub33qyt7rpdbd",
        DataZoneDomainId="dzd-5oywjcwy06igyx",
        DataZoneEnvironmentId="azcqgy5cyjke95",
        DataZoneDomainRegion="us-east-2",
    )
    _resource_metadata = {"AdditionalMetadata": additional_metadata}
    return _resource_metadata


# Expose the metadata as a plain variable as well as through the accessor.
metadata = _get_resource_metadata()

In [0]:
"""
Logging Configuration

Purpose:
--------
This sets up the logging framework for code executed in the user namespace.
"""

from typing import Optional


def _set_logging(log_dir: str, log_file: str, log_name: Optional[str] = None):
    import os
    import logging
    from logging.handlers import RotatingFileHandler

    level = logging.INFO
    max_bytes = 5 * 1024 * 1024
    backup_count = 5

    # fallback to /tmp dir on access, helpful for local dev setup
    try:
        os.makedirs(log_dir, exist_ok=True)
    except Exception:
        log_dir = "/tmp/kernels/"

    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger() if not log_name else logging.getLogger(log_name)
    logger.handlers = []
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Rotating file handler
    fh = RotatingFileHandler(filename=log_path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info(f"Logging initialized for {log_name}.")


_set_logging("/var/log/computeEnvironments/kernel/", "kernel.log")
_set_logging("/var/log/studio/data-notebook-kernel-server/", "metrics.log", "metrics")

In [0]:
import logging
from sagemaker_studio import ClientConfig, sqlutils, sparkutils, dataframeutils

logger = logging.getLogger(__name__)
logger.info("Initializing sparkutils")
spark = sparkutils.init()
logger.info("Finished initializing sparkutils")

In [0]:
def _reset_os_path():
    """
    Reset the process's working directory to handle mount timing issues.
    
    This function resolves a race condition where the Python process starts
    before the filesystem mount is complete, causing the process to reference
    old mount paths and inodes. By explicitly changing to the mounted directory
    (/home/sagemaker-user), we ensure the process uses the correct, up-to-date
    mount point.
    
    The function logs stat information (device ID and inode) before and after
    the directory change to verify that the working directory is properly
    updated to reference the new mount.
    
    Note:
        This is executed at module import time to ensure the fix is applied
        as early as possible in the kernel initialization process.
    """
    try:
        import os
        import logging

        logger = logging.getLogger(__name__)
        logger.info("---------Before------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)

        os.chdir("/home/sagemaker-user")

        logger.info("---------After------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)
    except Exception as e:
        logger.exception(f"Failed to reset working directory: {e}")

_reset_os_path()

## Notebook

In [0]:
import pandas as pd

s3_path = "s3://dream-house-prediction/dataset/raw/housing_dataset for ML tasks.csv"

df = pd.read_csv(s3_path)

df.head()


Unnamed: 0,Id,Neighborhood,LotFrontage,LotArea,number of bedrooms,Street,Alley,OverallQual,OverallCond,Date,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,SquareFootage,Heating,HeatingQC,SaleType,SaleCondition,Price
0,1,RL,65.0,8450.0,2.0,Pave,,7,5,2003,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,WD,Normal,208500
1,2,RL,80.0,9600.0,3.0,Pave,,6,8,1976,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,WD,Normal,181500
2,3,RL,68.0,11250.0,3.0,Pave,,7,5,2001,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,WD,Normal,223500
3,4,RL,60.0,9550.0,3.0,Pave,,7,5,1915,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,WD,Abnorml,140000
4,5,RL,84.0,,2.0,Pave,,8,5,2000,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,WD,Normal,250000


In [0]:
df.shape


(1460, 23)

In [0]:
df.columns


Index(['Id', 'Neighborhood', 'LotFrontage', 'LotArea', 'number of bedrooms',
       'Street', 'Alley', 'OverallQual', 'OverallCond', 'Date', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'SquareFootage', 'Heating', 'HeatingQC',
       'SaleType', 'SaleCondition', 'Price'],
      dtype='object')

In [0]:
df.dtypes


Id                      int64
Neighborhood           object
LotFrontage           float64
LotArea               float64
number of bedrooms    float64
Street                 object
Alley                  object
OverallQual             int64
OverallCond             int64
Date                    int64
BsmtCond               object
BsmtExposure           object
BsmtFinType1           object
BsmtFinSF1              int64
BsmtFinType2           object
BsmtFinSF2              int64
BsmtUnfSF               int64
SquareFootage           int64
Heating                object
HeatingQC              object
SaleType               object
SaleCondition          object
Price                   int64
dtype: object

In [0]:
target_column = "Price"


In [0]:
numeric_features = df.select_dtypes(include=["int64", "float64"]).columns
numeric_features


Index(['Id', 'LotFrontage', 'LotArea', 'number of bedrooms', 'OverallQual',
       'OverallCond', 'Date', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'SquareFootage', 'Price'],
      dtype='object')

In [0]:
# Use every numeric column except the target as a model feature.
# NOTE(review): this keeps 'Id' as a feature; in the df.head() output it looks
# like a sequential row identifier rather than a property of the house —
# confirm whether it should be excluded before training.
feature_columns = [col for col in numeric_features if col != target_column]
feature_columns


['Id',
 'LotFrontage',
 'LotArea',
 'number of bedrooms',
 'OverallQual',
 'OverallCond',
 'Date',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'SquareFootage']

In [0]:
# Total number of missing cells across the target and feature columns.
# The columns are selected from `df` directly because `training_df` is only
# assigned in a later cell; referencing it here raises NameError when the
# notebook is run top to bottom on a fresh kernel.
df[[target_column] + feature_columns].isnull().sum().sum()


298

In [0]:
# Modelling frame: the target goes in the first column, followed by the
# numeric feature columns in their original order.
training_df = df[[target_column, *feature_columns]]
training_df.head()


Unnamed: 0,Price,Id,LotFrontage,LotArea,number of bedrooms,OverallQual,OverallCond,Date,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,SquareFootage
0,208500,1,65.0,8450.0,2.0,7,5,2003,706,0,150,856
1,181500,2,80.0,9600.0,3.0,6,8,1976,978,0,284,1262
2,223500,3,68.0,11250.0,3.0,7,5,2001,486,0,434,920
3,140000,4,60.0,9550.0,3.0,7,5,1915,216,0,540,756
4,250000,5,84.0,,2.0,8,5,2000,655,0,490,1145


In [0]:
import numpy as np

# Clean the frame in a single chained pass:
#   1. turn empty strings into NaN,
#   2. coerce every column to numeric (unparseable values become NaN),
#   3. drop every row that still contains at least one NaN.
training_df = (
    training_df
    .replace("", np.nan)
    .apply(pd.to_numeric, errors="coerce")
    .dropna()
)

# Report what is left after cleaning.
print("Remaining missing values:", training_df.isnull().sum().sum())
print("Final shape:", training_df.shape)


Remaining missing values: 0
Final shape: (1170, 12)


In [0]:
# Write the cleaned frame to a local CSV with no header row and no index
# column. The target ("Price") ends up in the first column because of how
# training_df was assembled.
# NOTE(review): presumably this is the layout the SageMaker linear-learner
# container expects for text/csv training data (label first, no header) —
# confirm against the algorithm documentation.
training_df.to_csv("training_data.csv", header=False, index=False)


In [0]:
import boto3

bucket_name = "dream-house-prediction"

s3 = boto3.client("s3")
s3.upload_file("training_data.csv", bucket_name, "training/training_data.csv")

print("Upload completed")


Upload completed


In [0]:
import sagemaker
from sagemaker import image_uris
from sagemaker.estimator import Estimator

sess = sagemaker.Session()
region = sess.boto_region_name
role = sagemaker.get_execution_role()

print("Region:", region)
print("Role:", role)


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Region: us-east-2
Role: arn:aws:iam::339713060325:role/service-role/AmazonSageMakerAdminIAMExecutionRole


In [0]:
linear_image = image_uris.retrieve(
    framework="linear-learner",
    region=region
)

linear_image


'404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:1'

In [0]:
bucket_name = "dream-house-prediction"


In [0]:
linear_estimator = Estimator(
    image_uri=linear_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket_name}/output",
    sagemaker_session=sess
)


In [0]:
linear_estimator.set_hyperparameters(
    predictor_type="regressor",
    mini_batch_size=32,
    epochs=10
)


In [0]:
train_input = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket_name}/training/training_data.csv",
    content_type="text/csv"
)

linear_estimator.fit({"train": train_input})


2026-02-11 08:23:11 Starting - Starting the training job.

.

.


2026-02-11 08:23:25 Starting - Preparing the instances for training.

.

.


2026-02-11 08:23:47 Downloading - Downloading input data.

.

.


2026-02-11 08:24:27 Downloading - Downloading the training image.

.

.

.

.

.

.

.

.


2026-02-11 08:26:03 Training - Training image download completed. Training in progress..

[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[02/11/2026 08:26:08 INFO 140636901365568] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile': '0.5', 'loss_insensitivity': '0.01', 'huber_delta': '1.0', 'num_classes': '1', 'accuracy_top_k': '3', 'wd': 'auto', 'l1': 'auto', 'momentum': 'auto', 'learning_rate': 'auto', 'beta_1': 'auto', 'beta_2': 'auto', 'bias_lr_mult': 'auto', 'bias_wd_mult': 'auto', 'use_lr_scheduler': 'true', 'lr_


2026-02-11 08:26:42 Uploading - Uploading generated training model
2026-02-11 08:26:42 Completed - Training job completed


Training seconds: 174
Billable seconds: 174


In [0]:
train_input = sagemaker.inputs.TrainingInput(
    s3_data="s3://dream-house-prediction/training/training_data.csv",
    content_type="text/csv"
)


In [0]:
# NOTE(review): this launches another training job on the same estimator and
# the same S3 object (training/training_data.csv) that the earlier fit() call
# in this notebook already trained on. It looks redundant and is billed
# again — confirm it is needed.
linear_estimator.fit({"train": train_input})


2026-02-11 08:27:08 Starting - Starting the training job.

.

.


2026-02-11 08:27:23 Starting - Preparing the instances for training.

.

.


2026-02-11 08:27:45 Downloading - Downloading input data.

.

.


2026-02-11 08:28:25 Downloading - Downloading the training image.

.

.

.

.

.

.

.

.


2026-02-11 08:30:01 Training - Training image download completed. Training in progress..

[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[02/11/2026 08:30:06 INFO 139977277740864] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile': '0.5', 'loss_insensitivity': '0.01', 'huber_delta': '1.0', 'num_classes': '1', 'accuracy_top_k': '3', 'wd': 'auto', 'l1': 'auto', 'momentum': 'auto', 'learning_rate': 'auto', 'beta_1': 'auto', 'beta_2': 'auto', 'bias_lr_mult': 'auto', 'bias_wd_mult': 'auto', 'use_lr_scheduler': 'true', 'lr_


2026-02-11 08:30:35 Uploading - Uploading generated training model
2026-02-11 08:30:35 Completed - Training job completed


Training seconds: 169
Billable seconds: 169


In [0]:
predictor = linear_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large"
)


-

-

-

-

-

-

-

-

!

In [0]:
import numpy as np

# Take one row of features (exclude target)
sample = training_df.iloc[0, 1:].values

# Convert to comma-separated string
sample_csv = ",".join(map(str, sample))

sample_csv


'1.0,65.0,8450.0,2.0,7.0,5.0,2003.0,706.0,0.0,150.0,856.0'

In [0]:
# Build the request payload from the first row of training_df: skip column 0
# (the target) and join the remaining values into one comma-separated line.
sample = training_df.iloc[0, 1:].values
sample_csv = ",".join(map(str, sample))

# Send the row to the deployed endpoint, declaring the body as text/csv.
result = predictor.predict(
    sample_csv,
    initial_args={"ContentType": "text/csv"}
)

# Print the response exactly as predict() returned it.
print(result)


b'{"predictions": [{"score": 217831.5625}]}'


In [0]:
predictor.delete_endpoint()
print("Endpoint deleted")


Endpoint deleted


In [0]:
from sklearn.model_selection import train_test_split

# Column 0 holds the target; every column after it is a feature.
y = training_df.iloc[:, 0]
X = training_df.iloc[:, 1:]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for label, part in (("Train size:", X_train), ("Test size:", X_test)):
    print(label, part.shape)


Train size: (936, 11)
Test size: (234, 11)


In [0]:
import pandas as pd

train_df = pd.concat([y_train, X_train], axis=1)
test_df = pd.concat([y_test, X_test], axis=1)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

train_df.head()


Train shape: (936, 12)
Test shape: (234, 12)


Unnamed: 0,Price,Id,LotFrontage,LotArea,number of bedrooms,OverallQual,OverallCond,Date,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,SquareFootage
1416,122500,1417,60.0,11340.0,3.0,4,6,1885,0,0,777,777
1000,82000,1001,74.0,10206.0,4.0,3,3,1952,0,0,0,0
1351,171000,1352,70.0,9247.0,3.0,6,6,1962,319,0,539,858
228,125000,229,70.0,8521.0,3.0,5,5,1967,842,0,70,912
1390,235000,1391,70.0,9100.0,4.0,7,5,2000,1400,0,125,1525


In [0]:
train_df.to_csv("train_data.csv", header=False, index=False)


In [0]:
s3.upload_file("train_data.csv", bucket_name, "training/train_data.csv")
print("Train data uploaded to S3")


Train data uploaded to S3


In [0]:
linear_estimator = Estimator(
    image_uri=linear_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket_name}/output",
    sagemaker_session=sess
)

linear_estimator.set_hyperparameters(
    predictor_type="regressor",
    mini_batch_size=32,
    epochs=10
)


In [0]:
train_input = sagemaker.inputs.TrainingInput(
    s3_data="s3://dream-house-prediction/training/train_data.csv",
    content_type="text/csv"
)


In [0]:
linear_estimator.fit({"train": train_input})


2026-02-11 08:54:46 Starting - Starting the training job.

.

.


2026-02-11 08:55:01 Starting - Preparing the instances for training.

.

.


2026-02-11 08:55:21 Downloading - Downloading input data.

.

.


2026-02-11 08:56:02 Downloading - Downloading the training image.

.

.

.

.

.

.

.

.


2026-02-11 08:57:38 Training - Training image download completed. Training in progress..

[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[02/11/2026 08:57:42 INFO 139939879597888] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile': '0.5', 'loss_insensitivity': '0.01', 'huber_delta': '1.0', 'num_classes': '1', 'accuracy_top_k': '3', 'wd': 'auto', 'l1': 'auto', 'momentum': 'auto', 'learning_rate': 'auto', 'beta_1': 'auto', 'beta_2': 'auto', 'bias_lr_mult': 'auto', 'bias_wd_mult': 'auto', 'use_lr_scheduler': 'true', 'lr_


2026-02-11 08:58:16 Uploading - Uploading generated training model
2026-02-11 08:58:16 Completed - Training job completed


Training seconds: 175
Billable seconds: 175


In [0]:
predictor = linear_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large"
)


-

-

-

-

-

-

-

!

In [0]:
import json

import numpy as np

predictions = []

# Score the held-out rows one request at a time against the deployed endpoint.
for i in range(len(X_test)):
    row = X_test.iloc[i].values
    row_csv = ",".join(map(str, row))

    result = predictor.predict(
        row_csv,
        initial_args={"ContentType": "text/csv"}
    )

    # The endpoint answers with a JSON document, e.g.
    # b'{"predictions": [{"score": ...}]}'. Parse it with json.loads instead
    # of eval(): eval would execute whatever expression the response contains
    # and fails on JSON literals such as true/false/null.
    score = json.loads(result)["predictions"][0]["score"]
    predictions.append(score)

# Convert to an array so the scores can be compared against y_test.
predictions = np.array(predictions)

print("Total predictions:", len(predictions))


Total predictions: 234


In [0]:
from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test, predictions))

print("RMSE:", rmse)


RMSE: 40897.87577953279


In [0]:
print("Min price:", y_test.min())
print("Max price:", y_test.max())
print("Mean price:", y_test.mean())


Min price: 37900
Max price: 625000
Mean price: 177830.08974358975


In [0]:
predictor.delete_endpoint()
print("Endpoint deleted")


Endpoint deleted


## Shutdown cells

In [0]:
"""
Stop spark session and associated Athena Spark session
"""

from IPython import get_ipython as _get_ipython
# Look up the object bound to the name `spark` in the IPython user namespace
# and call its stop() method.
# NOTE(review): this presumably raises if `spark` was never created or if the
# code runs outside IPython (get_ipython() returning None) — confirm whether
# a guard is wanted here.
_get_ipython().user_ns["spark"].stop()