# Mobile Price Prediction

### importing libraries

In [1]:
import sagemaker
import pandas as pd
import boto3   # boto3 library is used to connect s3 bucket
from sklearn.model_selection import train_test_split

### setting up the system

In [2]:
sm_boto3 = boto3.client('sagemaker')  
# boto3 client is used to connect sagemaker
sess = sagemaker.Session()
# sagemaker session is used to create sagemaker session
bucket = 'saibalpatramobbuckets'
# bucket name is given
print("Using bucket " + bucket)

Using bucket saibalpatramobbuckets


### loading data and basic preprocessing

In [3]:
data = pd.read_csv("Data/train.csv")

In [4]:
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 25)

In [5]:
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


Mobile Phones Category - 
   * 0(low cost), 
   * 1(medium cost), 
   * 2(high cost) and 
   * 3(very high cost)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [7]:
data.price_range.value_counts(normalize=True)

1    0.25
2    0.25
3    0.25
0    0.25
Name: price_range, dtype: float64

In [8]:
features = list(data.columns)
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [9]:
label = features.pop(-1)
label

'price_range'

In [10]:
X = data[features]
y = data[label]

In [11]:
X.shape, y.shape

((2000, 20), (2000,))

In [12]:
X.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [13]:
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [14]:
y.value_counts()

1    500
2    500
3    500
0    500
Name: price_range, dtype: int64

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, stratify=y, random_state=42)

In [16]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1600, 20) (400, 20) (1600,) (400,)


In [17]:
## creating dataframes

trainX = pd.DataFrame(X_train)
trainX['label'] = y_train

testX = pd.DataFrame(X_test)
testX['label'] = y_test

In [18]:
trainX.shape, testX.shape

((1600, 21), (400, 21))

In [19]:
trainX.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,label
469,1845,1,0.5,1,10,0,61,0.3,96,1,12,292,695,1731,9,5,6,0,0,0,1
565,1271,1,0.5,0,12,0,32,0.1,113,7,15,1350,1949,445,15,14,14,0,0,1,0
396,788,0,2.5,1,4,1,57,0.9,91,8,12,42,1161,3969,12,8,6,1,1,1,3
452,1406,0,0.5,1,2,1,34,0.8,98,3,5,1017,1366,3915,9,8,2,1,0,1,3
325,920,0,2.1,1,5,0,25,0.4,115,7,14,209,1078,785,10,7,5,0,1,1,0


## Data Ingestion

In [20]:
## saving it to csv file

trainX.to_csv("Data/train-V-1.csv", index = False)
testX.to_csv("Data/test-V-1.csv", index = False)

In [21]:
bucket

'saibalpatramobbuckets'

In [22]:
## send data to S3, SageMaker will take training data from S3

sk_prefix = 'sagemaker/mobile_price_classification/sklearncontainer'

trainpath = sess.upload_data(
    path = "Data/train-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path='Data/test-V-1.csv', bucket=bucket, key_prefix=sk_prefix
)

In [29]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--n_jobs", type=int, default=-1)
    parser.add_argument("--verbose", type=int, default=3)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
  
    print("Training RandomForest Model.....")
    print()
    model =  RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose = args.verbose ,n_jobs=args.n_jobs)
    model.fit(X_train, y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)


Overwriting script.py


In [27]:
# from sklearn.ensemble import RandomForestClassifier
# RandomForestClassifier().get_params()

In [36]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    # role=sagemaker.get_execution_role(),
    role = "arn:aws:iam::207413127695:role/saibalpatramobilesagemaker",
    instance_count = 1,
    instance_type = "ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name = "sklearn-mobile-price-classification",
    hyperparameters = {
        "n_estimators":100,
        "n_jobs":-1,
        "verbose":3,
        "random_state":0
    },
    use_spot_instances=True,
    max_wait=7200,
    max_run = 3600
)

In [37]:
## launch training job, with asynchronous call
sklearn_estimator.fit({"train":trainpath, "test":testpath}, wait = True)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: sklearn-mobile-price-classification-2023-06-30-07-29-38-955


2023-06-30 07:29:43 Starting - Starting the training job...
2023-06-30 07:29:59 Starting - Preparing the instances for training.........
2023-06-30 07:31:34 Downloading - Downloading input data...
2023-06-30 07:31:59 Training - Downloading the training image...
2023-06-30 07:32:34 Training - Training image download completed. Training in progress.2023-06-30 07:32:37,345 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2023-06-30 07:32:37,348 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-06-30 07:32:37,390 sagemaker_sklearn_container.training INFO     Invoking user training script.
2023-06-30 07:32:37,545 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-06-30 07:32:37,557 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-06-30 07:32:37,568 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-06-30 0

In [42]:
sklearn_estimator.latest_training_job.wait(logs='None')

artifact = sm_boto3.describe.training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']

print("Model artifact persisted at " + artifact)



2023-06-30 07:33:41 Starting - Preparing the instances for training
2023-06-30 07:33:41 Downloading - Downloading input data
2023-06-30 07:33:41 Training - Training image download completed. Training in progress.
2023-06-30 07:33:41 Uploading - Uploading generated training model
2023-06-30 07:33:41 Completed - Training job completed


AttributeError: 'SageMaker' object has no attribute 'describe'

## Deployment

In [47]:
artifact = "s3://sagemaker-ap-south-1-207413127695/sklearn-mobile-price-classification-2023-06-30-07-29-38-955/output/model.tar.gz"

from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role = "arn:aws:iam::207413127695:role/saibalpatramobilesagemaker",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [48]:
model_name

'Custom-sklearn-model-2023-06-30-07-55-32'

In [49]:
## Endpoints deployment

endpoints_name = "Customer-sklearn-model" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Endpointname{endpoints_name}")

predictor = model.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    endpoint_name = endpoints_name
)

Endpointname{endpoints_name}


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2023-06-30-07-55-32
INFO:sagemaker:Creating endpoint-config with name Customer-sklearn-model2023-06-30-07-55-35
INFO:sagemaker:Creating endpoint with name Customer-sklearn-model2023-06-30-07-55-35


-----!

In [50]:
endpoints_name

'Customer-sklearn-model2023-06-30-07-55-35'

In [52]:
predictor.endpoint_name

'Customer-sklearn-model2023-06-30-07-55-35'

In [None]:
predictor.predict(testX[features][1:2].values.tolist())

In [62]:
result = predictor.predict(testX[features][:].values.tolist())

In [67]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, result)

0.8875