## Mobile Phone Price Classification Using SageMaker

In [1]:
import sagemaker
import boto3
from sklearn.model_selection import train_test_split
import pandas as pd

# Low-level SageMaker API client, used further down for describe/delete calls.
sm_boto = boto3.client("sagemaker")

# S3 bucket that receives the uploaded train/test CSVs.
bucket = "sainir-ml-learn"

# High-level SDK session wrapping an explicit boto3 session; its region is
# reported below together with the bucket.
sagemaker_session = sagemaker.Session(boto_session=boto3.session.Session())
region = sagemaker_session.boto_region_name
print(f"SageMaker session created in region: {region} using bucket: {bucket}")

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/rahulsaini/Library/Application Support/sagemaker/config.yaml
SageMaker session created in region: us-west-2 using bucket: sainir-ml-learn


In [2]:
# Load the raw dataset from the local dataset/ folder (path is relative to the notebook).
train_df = pd.read_csv("dataset/train.csv")

In [3]:
# Preview the first rows to sanity-check the columns and values.
train_df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
# (rows, columns) of the raw frame.
train_df.shape

(2000, 21)

In [5]:
# Class balance of the target: normalize=True returns proportions instead of raw counts.
train_df["price_range"].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [6]:
# Column names of the raw frame.
train_df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [7]:
# Percentage of missing values per column: isnull() flags each missing cell,
# mean() turns the flags into a per-column fraction, and * 100 converts it to percent.
train_df.isnull().mean() * 100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [8]:
# Start from every column name in the frame (the label column is still included here).
features = list(train_df.columns)
features


['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [9]:
# Use the last column as the label and rebuild `features` from train_df each
# time this cell runs. Popping from `features` in place is not idempotent:
# every re-run of the cell would silently drop one more real feature.
label = train_df.columns[-1]
features = [column for column in train_df.columns if column != label]
label

'price_range'

In [10]:
# Split the frame into the feature matrix (x) and the target series (y).
x = train_df[features]
y = train_df[label]

In [11]:
# Preview the feature matrix (the label column is not part of it).
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [12]:
# Preview the target. Class codes per the author's note: 0 = Low, 1 = Medium, 2 = High, 3 = Very High.
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [13]:
# Hold out 15% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): no `stratify=` is passed, so the class proportions of each split are not
# guaranteed to match the balanced source data — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

In [14]:
# Sanity-check the split sizes (features and targets must have matching row counts).
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1700, 20) (300, 20) (1700,) (300,)


In [15]:
# Re-attach the label to each split so every exported CSV carries the features
# followed by the target as the last column.
# `DataFrame.assign` returns a new frame, so X_train / X_test stay untouched.
# Wrapping them in `pd.DataFrame(...)` does not copy the data, and depending on
# the pandas version the added column could leak back into the feature frames
# or trigger SettingWithCopyWarning.
trainX = X_train.assign(**{label: y_train})

testX = X_test.assign(**{label: y_test})

In [16]:
# Persist both splits locally before uploading them.
# index=False keeps the row index out of the files, so it is not written as an
# extra column (utils/script.py treats every column except the last as a feature).
trainX.to_csv("dataset/train_split_v1.csv", index=False)
testX.to_csv("dataset/test_split_v1.csv", index=False)

In [17]:
# Upload both CSV splits to S3 under a shared key prefix, one sub-folder per
# split, and keep the returned S3 URIs for the training job inputs.
sk_prefix = "mobile-price-classification/sagemaker/sklearn-mobile-price-classification/sklearncontainer"

trainpath = sagemaker_session.upload_data(
    path="dataset/train_split_v1.csv",
    bucket=bucket,
    key_prefix=f"{sk_prefix}/train",
)
testpath = sagemaker_session.upload_data(
    path="dataset/test_split_v1.csv",
    bucket=bucket,
    key_prefix=f"{sk_prefix}/test",
)

print(trainpath, testpath, sep="\n")

s3://sainir-ml-learn/mobile-price-classification/sagemaker/sklearn-mobile-price-classification/sklearncontainer/train/train_split_v1.csv
s3://sainir-ml-learn/mobile-price-classification/sagemaker/sklearn-mobile-price-classification/sklearncontainer/test/test_split_v1.csv


In [29]:
%%writefile utils/script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load the persisted classifier from ``model_dir``.

    Intended as the model-loading hook for serving: this script is passed as
    ``entry_point`` to ``SKLearnModel`` in the notebook.

    Args:
        model_dir: Directory that contains ``model.joblib``.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

if __name__ == "__main__":
    # Training entry point: parse the arguments, load the train/test CSVs, fit a
    # RandomForestClassifier, persist it as model.joblib and print test metrics.
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model and output directories. The defaults are read from SM_*
    # environment variables (expected to be set inside the training container).
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train_split_v1.csv")
    parser.add_argument("--test-file", type=str, default="test_split_v1.csv")

    # parse_known_args: flags this script does not declare are ignored instead of failing.
    args, _ = parser.parse_known_args()

    # Fail fast with a readable message when a location is missing. Otherwise the
    # None default reaches os.path.join and raises an opaque TypeError, which for
    # the model directory would only happen after training had already finished.
    missing = [
        flag
        for flag, value in (
            ("--model-dir", args.model_dir),
            ("--train", args.train),
            ("--test", args.test),
        )
        if value is None
    ]
    if missing:
        parser.error(
            "missing required location(s): "
            + ", ".join(missing)
            + " (pass the flag or set the matching SM_* environment variable)"
        )

    print("SKlearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is taken by position: the last column of the training CSV.
    # Every other column is used as a feature, in file order.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)

    print('Label column is: ', label)
    print()

    print("----- SHAPE OF TRAINING DATA (85%) -----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("----- SHAPE OF TESTING DATA (15%) -----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model")
    print()
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(X_train, y_train)
    print()

    # Persist under the same file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate on the held-out split; the metrics are only printed, not saved.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("----- METRICS RESULTS FOR TESTING DATA -----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print("[TESTING] Model accuracy is: ", test_acc)
    print("[TESTING] Testing Report: ")
    print(test_rep)
    

Overwriting utils/script.py


In [30]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag of the SageMaker scikit-learn container; reused later when the
# serving model is created.
FRAMEWORK_VERSION = "1.2-1"

# Estimator describing the managed training job that runs utils/script.py on a
# single ml.m5.large instance.
sklearn_estimator = SKLearn(
    entry_point="utils/script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::804008072808:role/SageMakerExecutionRole",
    base_job_name="RF-custom-sklearn",
    instance_type="ml.m5.large",
    instance_count=1,
    # Matches the --n_estimators / --random_state arguments the script parses.
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    # Spot training; max_run / max_wait are time limits in seconds per the
    # SageMaker SDK documentation.
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

In [31]:
# Launch the training job and block until it finishes: wait=True makes this call
# synchronous (the job's log output is streamed into the cell while it runs).
# The channel names presumably map to the SM_CHANNEL_TRAIN / SM_CHANNEL_TEST
# variables read by utils/script.py — confirm against the SageMaker docs.
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2025-11-17-01-45-57-470


2025-11-17 01:46:00 Starting - Starting the training job...
2025-11-17 01:46:14 Starting - Preparing the instances for training...
  import pkg_resources
2025-11-17 01:48:08,934 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-11-17 01:48:08,938 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-11-17 01:48:08,941 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-11-17 01:48:08,963 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-11-17 01:48:09,207 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-11-17 01:48:09,210 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-11-17 01:48:09,228 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-11-17 01:48:09,232 sagemaker-training-toolkit INFO     No Neurons detected (normal 

In [32]:
# Wait for the most recent training job (logs="None": no container log stream),
# then ask the SageMaker API where its model artifact was written.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

job_description = sm_boto.describe_training_job(TrainingJobName=training_job.job_name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact stored at: ", artifact)


2025-11-17 01:48:30 Starting - Preparing the instances for training
2025-11-17 01:48:30 Downloading - Downloading the training image
2025-11-17 01:48:30 Training - Training image download completed. Training in progress.
2025-11-17 01:48:30 Uploading - Uploading generated training model
2025-11-17 01:48:30 Completed - Training job completed
Model artifact stored at:  s3://sagemaker-us-west-2-804008072808/RF-custom-sklearn-2025-11-17-01-45-57-470/output/model.tar.gz


In [33]:
# Echo the S3 URI of the trained model artifact.
artifact

's3://sagemaker-us-west-2-804008072808/RF-custom-sklearn-2025-11-17-01-45-57-470/output/model.tar.gz'

In [34]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# The timestamp suffix (UTC, second resolution) gives each run its own model name.
model_name = f"Custom-sklearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Serving model built from the training artifact; the training script is passed
# as the entry point again because it also defines model_fn.
model = SKLearnModel(
    name=model_name,
    entry_point="utils/script.py",
    framework_version=FRAMEWORK_VERSION,
    model_data=artifact,
    role="arn:aws:iam::804008072808:role/SageMakerExecutionRole",
)

In [35]:
# Show the SKLearnModel object's repr.
model

<sagemaker.sklearn.model.SKLearnModel at 0x140cc7530>

### Endpoint deployment

In [36]:
# Timestamped endpoint name (UTC); kept in a variable because the cleanup cell
# deletes the endpoint by this name.
endpoint_name = f"sklearn-rf-endpoint-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Deploy the model behind a real-time endpoint on one ml.m5.large instance.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m5.large",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: Custom-sklearn-model-2025-11-17-01-49-08
INFO:sagemaker:Creating endpoint-config with name sklearn-rf-endpoint-2025-11-17-01-50-45
INFO:sagemaker:Creating endpoint with name sklearn-rf-endpoint-2025-11-17-01-50-45


------!

### Endpoint testing

In [37]:
# Smoke-test the endpoint: send the first four held-out rows (feature columns
# only, as plain nested lists) and print the predicted classes.
sample_rows = testX[features][0:4].values.tolist()
print(predictor.predict(sample_rows))

[0 2 1 3]


### Endpoint cleanup

In [38]:
# Tear down the endpoint created above.
# NOTE(review): only the endpoint itself is deleted here. The endpoint config and
# model that the deploy step logged as created are presumably left in place —
# confirm whether they should be cleaned up as well.
sm_boto.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '2a046757-f59e-45a2-90d6-64e2a6a47630',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2a046757-f59e-45a2-90d6-64e2a6a47630',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 17 Nov 2025 01:55:40 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}