In [1]:
import pandas as pd

# Source location of the raw sales data (adjust bucket/key for your account).
bucket = 'sales-bucket-rec'
key = 'sales_dataset.csv'
s3_path = f's3://{bucket}/{key}'

# Read the CSV straight from S3, keep only the modelling columns and drop
# every row that has a missing value in any of them.
df = pd.read_csv(s3_path)[['Total Revenue', 'Region', 'Units Sold', 'Unit Price']].dropna()

# Treat Region as a categorical so each distinct name gets an integer code;
# remember the code → name lookup before the names are replaced.
region_categorical = df['Region'].astype('category')
region_mapping = {idx: label for idx, label in enumerate(region_categorical.cat.categories)}

# Build the final training frame in one step:
#   * Region        → integer category code
#   * Revenue_Calc  → interaction feature (Units Sold × Unit Price)
#   * Total Revenue → target divided by 1000 to keep loss values small
# and put the label in the first column.
df = df.assign(**{
    'Region': region_categorical.cat.codes,
    'Revenue_Calc': df['Units Sold'] * df['Unit Price'],
    'Total Revenue': df['Total Revenue'] / 1000,
})[['Total Revenue', 'Region', 'Units Sold', 'Unit Price', 'Revenue_Calc']]

# Show which integer code stands for which region.
print("Region Mapping (code → name):")
for idx, label in region_mapping.items():
    print(f"{idx} → {label}")

# Write the training file locally without index or header row.
df.to_csv('region_sales.csv', index=False, header=False)
df.head()

  from pandas.core.computation.check import NUMEXPR_INSTALLED
severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Region Mapping (code → name):
0 → Asia
1 → Australia and Oceania
2 → Central America and the Caribbean
3 → Europe
4 → Middle East and North Africa
5 → North America
6 → Sub-Saharan Africa


Unnamed: 0,Total Revenue,Region,Units Sold,Unit Price,Revenue_Calc
0,14.86269,6,1593,9.33,14862.69
1,503.89008,4,4611,109.28,503890.08
2,151.8804,1,360,421.89,151880.4
3,61.41536,6,562,109.28,61415.36
4,188.51885,3,3973,47.45,188518.85


In [2]:
# Display the preprocessed frame; the rendered output elides the middle rows.
df

Unnamed: 0,Total Revenue,Region,Units Sold,Unit Price,Revenue_Calc
0,14.86269,6,1593,9.33,14862.69
1,503.89008,4,4611,109.28,503890.08
2,151.88040,1,360,421.89,151880.40
3,61.41536,6,562,109.28,61415.36
4,188.51885,3,3973,47.45,188518.85
...,...,...,...,...,...
499995,1.67940,0,180,9.33,1679.40
499996,4614.47406,1,7086,651.21,4614474.06
499997,664.33332,3,4354,152.58,664333.32
499998,405.79240,0,8552,47.45,405792.40


In [3]:
import boto3
import json

# Serialise the code → name lookup so it can be reloaded at inference time.
# JSON object keys are always strings, so the integer codes are stored as
# "0", "1", ... and have to be converted back with int() when read.
mapping_json = json.dumps(region_mapping)
with open('region_mapping.json', 'w') as mapping_file:
    mapping_file.write(mapping_json)
# The file is only written locally here; the upload to
# s3://<bucket>/xgboost-region-sales/region_mapping.json is done in the next cell.


In [4]:
import boto3
import sagemaker

# SageMaker session reused by the upload below and by the later training cell.
session = sagemaker.Session()

# Push the region lookup next to the training data so both live under the
# same S3 prefix.
s3_client = boto3.client('s3')
s3_client.upload_file(
    'region_mapping.json',
    bucket,
    'xgboost-region-sales/region_mapping.json',
)

# Upload the preprocessed CSV; upload_data returns the resulting S3 URI,
# which the training cell uses as its input channel.
processed_s3_path = session.upload_data(
    'region_sales.csv',
    bucket=bucket,
    key_prefix='xgboost-region-sales',
)
print(f"Processed data uploaded to: {processed_s3_path}")


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Processed data uploaded to: s3://sales-bucket-rec/xgboost-region-sales/region_sales.csv


In [5]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role

# Execution role handed to the estimator.
role = get_execution_role()

# XGBoost 1.5-1 container image for the region the session is bound to.
container_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=session.boto_region_name,
    version="1.5-1",
)

# Single ml.m5.large training instance; model artifacts land under the
# xgboost-region-sales/output prefix of the bucket.
xgb_estimator = Estimator(
    sagemaker_session=session,
    role=role,
    image_uri=container_uri,
    instance_type='ml.m5.large',
    instance_count=1,
    output_path=f's3://{bucket}/xgboost-region-sales/output',
)

# Regression on squared error, 300 boosting rounds, fixed seed for
# repeatable runs.
xgb_hyperparameters = {
    'objective': 'reg:squarederror',
    'num_round': 300,
    'max_depth': 5,
    'eta': 0.1,
    'seed': 42,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# Launch the training job on the uploaded CSV (label in the first column,
# no header row) as the single 'train' channel.
train_channel = TrainingInput(processed_s3_path, content_type='csv')
xgb_estimator.fit({'train': train_channel})



INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-06-17-04-40-04-937


2025-06-17 04:40:06 Starting - Starting the training job...
2025-06-17 04:40:21 Starting - Preparing the instances for training...
2025-06-17 04:40:41 Downloading - Downloading input data...
2025-06-17 04:41:21 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-06-17 04:42:27.671 ip-10-0-129-66.ap-south-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-06-17 04:42:27.696 ip-10-0-129-66.ap-south-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-06-17:04:42:28:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-06-17:04:42:28:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-06-17:04:42:28:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-17:04:42:28:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-06-

In [6]:
# Host the trained model behind a real-time endpoint on one ml.m5.large
# instance.
# NOTE(review): no visible cell deletes this endpoint afterwards; call
# predictor.delete_endpoint() once you are done with it.
deploy_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m5.large',
}
predictor = xgb_estimator.deploy(**deploy_settings)

# Keep the generated endpoint name around for the inference cell.
endpoint_name = predictor.endpoint_name
print(f"SageMaker endpoint created: {endpoint_name}")


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-06-17-04-43-52-396
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-06-17-04-43-52-396
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-06-17-04-43-52-396


------!SageMaker endpoint created: sagemaker-xgboost-2025-06-17-04-43-52-396


In [7]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
import json

# Reload the code → name mapping written during preprocessing. JSON stores
# the integer codes as string keys, so convert them back to int while
# inverting the mapping to name → code.
with open('region_mapping.json', 'r') as f:
    region_mapping = json.load(f)
region_reverse = {v: int(k) for k, v in region_mapping.items()}

# Attach to the deployed endpoint; requests are sent as CSV rows.
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=CSVSerializer()
)

# EXAMPLE INPUT
# Look the region code up by name instead of hard-coding the integer, so the
# example cannot silently drift from the mapping the model was trained with.
# 'Australia and Oceania' is code 1 in the mapping printed during
# preprocessing (Europe, for comparison, is code 3).
region_name = 'Australia and Oceania'
region_code = region_reverse[region_name]
units_sold = 1000
unit_price = 15.5
# Same interaction feature that was added to the training data.
revenue_calc = units_sold * unit_price

# Feature order must match the training columns after the label:
# Region, Units Sold, Unit Price, Revenue_Calc.
sample = [[region_code, units_sold, unit_price, revenue_calc]]

# Predict: the endpoint answers with raw bytes holding the predicted value
# as text.
response = predictor.predict(sample)
scaled_prediction = float(response.decode('utf-8').strip())
# The target was divided by 1000 before training, so multiply back.
actual_prediction = scaled_prediction * 1000  # Rescale

print("✅ Predicted Total Revenue:", round(actual_prediction, 2))


✅ Predicted Total Revenue: 15968.11
