In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3 # used to access s3 bucket
import pandas as pd

# Low-level boto3 client for direct SageMaker API calls.
sm_boto3 = boto3.client('sagemaker')

# SageMaker SDK session; the region is taken from this session.
sess = sagemaker.Session()
region = sess.boto_region_name

# S3 bucket used for the train/test CSV uploads later in the notebook.
bucket = 'mobile-classification-sklearn'
print(f"Using Bucket {bucket}")

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\rsk13\AppData\Local\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\rsk13\AppData\Local\sagemaker\sagemaker\config.yaml
Using Bucket mobile-classification-sklearn


In [2]:
# Show the AWS region name read from the SageMaker session.
region

'us-east-1'

In [3]:
# Load the training CSV (path is relative to the notebook's working directory)
# and display the resulting frame.
df = pd.read_csv('archive/train.csv')
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [4]:
# Row count per price_range value, to see how balanced the target classes are.
df['price_range'].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [5]:
# Number of missing values in each column.
df.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [6]:
# Positional split: every column except the last is a feature,
# and the last column is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [7]:
# Inspect the label Series (last column of df).
y

0       1
1       2
2       2
3       2
4       1
       ..
1995    0
1996    2
1997    3
1998    0
1999    3
Name: price_range, Length: 2000, dtype: int64

In [8]:
# Hold out 15% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

# Shapes of the four pieces, in the same order as the assignment above.
print(*(part.shape for part in (Xtrain, Xtest, ytrain, ytest)))

(1700, 20) (300, 20) (1700,) (300,)


In [9]:
# Attach the label to the training features so both can be exported as one CSV.
# assign() returns a brand-new frame (label aligned on the index) and never
# touches Xtrain. The previous pd.DataFrame(Xtrain) wrapper does not copy, so
# adding a column to it could also modify Xtrain on older pandas versions or
# raise SettingWithCopyWarning.
trainX = Xtrain.assign(price_range=ytrain)

In [10]:
# Attach the label to the test features so both can be exported as one CSV.
# assign() returns a brand-new frame (label aligned on the index) and never
# touches Xtest. The previous pd.DataFrame(Xtest) wrapper does not copy, so
# adding a column to it could also modify Xtest on older pandas versions or
# raise SettingWithCopyWarning.
testX = Xtest.assign(price_range=ytest)

In [11]:
# Shapes of the train and test frames after the label column was attached.
print(*(frame.shape for frame in (trainX, testX)))

(1700, 21) (300, 21)


In [12]:
# Write both frames to local CSV files (row index omitted); these files are
# what gets pushed to S3.
trainX.to_csv(path_or_buf='train-V-1.csv', index=False)
testX.to_csv(path_or_buf='test-V-1.csv', index=False)

# Data Ingestion

In [13]:
# Both CSV files are uploaded under the same key prefix inside `bucket`.
sk_prefix = 'sagemaker/mobile_price_classification/sklearncontainer'

trainpath = sess.upload_data(path='train-V-1.csv', bucket=bucket, key_prefix=sk_prefix)
testpath = sess.upload_data(path='test-V-1.csv', bucket=bucket, key_prefix=sk_prefix)

# Print the two returned locations, one per line.
print(f"{trainpath}\n{testpath}")

s3://mobile-classification-sklearn/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv
s3://mobile-classification-sklearn/sagemaker/mobile_price_classification/sklearncontainer/test-V-1.csv


Arguments help customize the model's behavior so that it learns the way we want it to. Providing arguments is like giving the model a set of rules to follow during the learning process.

In [62]:
%%writefile script.py 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import joblib
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load the persisted classifier from ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        Whatever object ``joblib.load`` deserializes from that file.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

if __name__ == "__main__":
    # ---- Command-line arguments -------------------------------------------
    print("[INFO] Extracting the Arguments")
    cli = argparse.ArgumentParser()

    # Hyperparameters handed to the RandomForestClassifier below.
    cli.add_argument('--n_estimators', type=int, default=100)
    cli.add_argument('--random_state', type=int, default=0)

    # Locations and file names. The directory defaults are read from the
    # SM_MODEL_DIR / SM_CHANNEL_TRAIN / SM_CHANNEL_TEST environment variables.
    for flag, default in (
        ('--model_dir', os.environ.get('SM_MODEL_DIR')),
        ('--train', os.environ.get('SM_CHANNEL_TRAIN')),
        ('--test', os.environ.get('SM_CHANNEL_TEST')),
        ('--train_file', 'train-V-1.csv'),
        ('--test_file', 'test-V-1.csv'),
    ):
        cli.add_argument(flag, type=str, default=default)

    # Unrecognized arguments are ignored rather than treated as errors.
    args, _ = cli.parse_known_args()

    print('Sklearn Version:', sklearn.__version__)
    print('Joblib Version:', joblib.__version__)

    # ---- Data -------------------------------------------------------------
    print("[INFO] Reading the data")
    print()

    train_csv = os.path.join(args.train, args.train_file)
    test_csv = os.path.join(args.test, args.test_file)
    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    # The last column of the training file is the label; the rest are features.
    columns = list(train_df.columns)
    label = columns[-1]
    features = columns[:-1]

    print('Building, training and evaluating the model')
    print()

    X_train, y_train = train_df[features], train_df[label]
    X_test, y_test = test_df[features], test_df[label]

    # ---- Training ---------------------------------------------------------
    print("Training RandomForest Model.....")
    print()
    clf = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    clf.fit(X_train, y_train)
    print()

    # ---- Persistence ------------------------------------------------------
    # Same file name that model_fn() loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(clf, model_path)
    print("Model persisted at " + model_path)
    print()

    # ---- Evaluation on the held-out test file -----------------------------
    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    print()

    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", len(X_test))
    print('[TESTING] Model Accuracy is: ', accuracy)
    print('[TESTING] Testing Report: ')
    print(report)

Overwriting script.py


In [49]:
from sagemaker.sklearn.estimator import SKLearn

# Framework version string shared by the estimator here and the SKLearnModel
# created later in the notebook.
FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    # What to run and under which IAM role.
    entry_point='script.py',
    role='arn:aws:iam::523918955502:role/mobile-classification',
    framework_version=FRAMEWORK_VERSION,
    base_job_name='RF-custom-sklearn',
    # Passed to script.py as --n_estimators / --random_state.
    hyperparameters={'n_estimators': 100, 'random_state': 0},
    # Compute used for training.
    instance_type='ml.m5.large',
    instance_count=1,
    # Spot training with its run/wait limits.
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\rsk13\AppData\Local\sagemaker\sagemaker\config.yaml


In [50]:
# Start the training job on the two uploaded CSV locations and block until it
# finishes (wait=True). The "train"/"test" keys presumably surface in script.py
# as SM_CHANNEL_TRAIN / SM_CHANNEL_TEST; confirm against the SageMaker docs.
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2023-09-18-13-19-48-788


2023-09-18 13:19:54 Starting - Starting the training job...
2023-09-18 13:20:07 Starting - Preparing the instances for training.........
2023-09-18 13:22:05 Downloading - Downloading input data...
2023-09-18 13:22:35 Training - Downloading the training image...
2023-09-18 13:23:11 Training - Training image download completed. Training in progress..2023-09-18 13:23:14,669 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2023-09-18 13:23:14,673 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-18 13:23:14,723 sagemaker_sklearn_container.training INFO     Invoking user training script.
2023-09-18 13:23:14,884 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-18 13:23:14,898 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-18 13:23:14,910 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-18 

In [51]:
# Location of the model

# Make sure the most recent training job has finished (logs="None").
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

# Look up where the trained model archive was written in S3.
job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2023-09-18 13:23:31 Starting - Preparing the instances for training
2023-09-18 13:23:31 Downloading - Downloading input data
2023-09-18 13:23:31 Training - Training image download completed. Training in progress.
2023-09-18 13:23:31 Uploading - Uploading generated training model
2023-09-18 13:23:31 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-523918955502/RF-custom-sklearn-2023-09-18-13-19-48-788/output/model.tar.gz


In [52]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Timestamped (UTC) name so that each run creates a distinctly named model.
model_name = f"Custom-sklearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Wrap the trained artifact for deployment; script.py is the entry point,
# and the framework version matches the one used for training.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::523918955502:role/mobile-classification",
)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\rsk13\AppData\Local\sagemaker\sagemaker\config.yaml


In [53]:
# Display the SKLearnModel object; deployment happens in the next cell.
model

<sagemaker.sklearn.model.SKLearnModel at 0x1b3444d9ee0>

In [54]:
# Deployment as end point

# Timestamped (UTC) endpoint name, printed so it can be found later.
endpoint_name = f"Custom-sklearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
print(f"EndpointName={endpoint_name}")

# Deploy the model on a single instance; the returned predictor is used for
# the test request further down.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

EndpointName=Custom-sklearn-model-2023-09-18-13-30-30
sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\rsk13\AppData\Local\sagemaker\sagemaker\config.yaml


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2023-09-18-13-30-24
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2023-09-18-13-30-30
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2023-09-18-13-30-30


-------!

## The model is now served by the endpoint and reachable through the predictor

In [55]:
# Display the predictor object returned by model.deploy().
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x1b344b620d0>

In [61]:
# First two rows of the test frame (all columns, label included).
testX.head(2)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
405,1454,1,0.5,1,1,0,34,0.7,83,4,...,250,1033,3419,7,5,5,1,1,0,3
1190,1092,1,0.5,1,10,0,11,0.5,167,3,...,468,571,737,14,4,11,0,1,0,0


In [79]:
# Feature values of the first test row (last column, the label, excluded).
testX.iloc[0,:-1]

battery_power    1454.0
blue                1.0
clock_speed         0.5
dual_sim            1.0
fc                  1.0
four_g              0.0
int_memory         34.0
m_dep               0.7
mobile_wt          83.0
n_cores             4.0
pc                  3.0
px_height         250.0
px_width         1033.0
ram              3419.0
sc_h                7.0
sc_w                5.0
talk_time           5.0
three_g             1.0
touch_screen        1.0
wifi                0.0
Name: 405, dtype: float64

In [80]:
# One-row request payload: the first test row's features as a nested list.
test_values = testX.iloc[:1, :-1].to_numpy().tolist()

In [81]:
# Send the single feature row to the endpoint and print the returned prediction.
print(predictor.predict(test_values))

[3]


In [82]:
# Actual label of the first test row, for comparison with the prediction.
testX.iloc[0,-1]

3

In [83]:
# Tear down everything model.deploy() created, not just the endpoint.
# Deleting only the endpoint leaves its endpoint config and the registered
# model behind in the account.
delete_response = sm_boto3.delete_endpoint(EndpointName=endpoint_name)

# Endpoint deletion is asynchronous; wait for it to finish before removing the
# config the endpoint was using.
sm_boto3.get_waiter('endpoint_deleted').wait(EndpointName=endpoint_name)

# The deploy log shows the endpoint config was created under the endpoint's
# name and the model under `model_name`.
sm_boto3.delete_endpoint_config(EndpointConfigName=endpoint_name)
sm_boto3.delete_model(ModelName=model_name)

# Keep the endpoint-deletion response as the cell's displayed output.
delete_response

{'ResponseMetadata': {'RequestId': '6e5b3fac-69a3-4a23-b679-a4949661a9fd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6e5b3fac-69a3-4a23-b679-a4949661a9fd',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 18 Sep 2023 14:01:18 GMT'},
  'RetryAttempts': 0}}