In [None]:
# Importing the required Modules
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd



sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\rahul\AppData\Local\sagemaker\sagemaker\config.yaml


In [None]:
# S3 bucket that holds the project's data and artifacts
bucket = "###"  # Enter the name of your S3 bucket here

# Low-level SageMaker API client, pinned to the ap-south-1 region
sm_boto3 = boto3.client(service_name = "sagemaker", region_name = "ap-south-1")

# High-level SageMaker session; used further down to upload the CSV files to S3
sess = sagemaker.Session()

# Region that the session's underlying boto session resolved to
region = sess.boto_session.region_name

# Echo the bucket name so a wrong or placeholder value is noticed early
print(f"Using the Bucket: {bucket}")

Using the Bucket: sagemaker-1-project-example


In [None]:
# Loading the raw dataset from the local data folder into a DataFrame
df = pd.read_csv(filepath_or_buffer = "../data/Machine_Learning/train.csv")

In [4]:
# Previewing the first rows of the raw dataset (head() shows 5 rows by default)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [None]:
# Checking the shape of the dataset as a (rows, columns) tuple
df.shape

(2000, 21)

In [None]:
# Checking the class balance of the target column 'price_range'.
# normalize=True returns the relative frequency (proportion) of each unique value
# instead of raw counts, so the values sum to 1.
df["price_range"].value_counts(normalize = True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [None]:
# Listing the column names; the column order matters because the last column
# is taken as the label further down
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [None]:
# Calculating the percentage of missing values in each column of the DataFrame:
# isnull() yields a boolean frame, mean() gives the fraction of True per column,
# and * 100 converts that fraction to a percentage
df.isnull().mean() * 100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [None]:
# Plain Python list of every column name, in DataFrame order
features = df.columns.tolist()

In [None]:
# Treat the last column as the target and every other column as a feature.
# Both names are derived from df.columns instead of features.pop(), so that
# re-running this cell cannot strip a further column off the feature list
# (pop() mutates the list on every execution).
label = df.columns[-1]
features = [col for col in df.columns if col != label]
label

'price_range'

In [None]:
# Input features (x) are the columns listed in 'features';
# the target (y) is the single 'label' column
x, y = df[features], df[label]

In [12]:
# Previewing the feature frame; the label column should no longer be present
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [13]:
# Previewing the target values
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state = 0,
    test_size = 0.15,
)

In [None]:
# One shape per line, in the order: train features, train target, test features, test target
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep = "\n")

(1700, 20)
(1700,)
(300, 20)
(300,)


In [None]:
# Build the frames that get written to CSV: the features plus the label as the last column.
# assign() returns a NEW DataFrame, so X_train / X_test (which are slices of df) are left
# untouched. Wrapping them with pd.DataFrame(...) and then setting a column would write onto
# the wrapped split output instead of a fresh copy, which can raise SettingWithCopyWarning.
trainX = X_train.assign(**{label: y_train})
testX = X_test.assign(**{label: y_test})

In [17]:
# Each frame should now have one more column than its feature-only counterpart
print(*(frame.shape for frame in (trainX, testX)), sep = "\n")

(1700, 21)
(300, 21)


In [18]:
# Previewing the training frame; the label should appear as the last column
trainX.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1452,1450,0,2.1,0,1,0,31,0.6,114,5,...,1573,1639,794,11,5,9,0,1,1,1
1044,1218,1,2.8,1,3,0,39,0.8,150,7,...,1122,1746,1667,10,0,12,0,0,0,1
1279,1602,0,0.6,0,12,0,58,0.4,170,1,...,1259,1746,3622,17,2,17,0,1,1,3
674,1034,0,2.6,1,2,1,45,0.3,190,3,...,182,1293,969,15,1,7,1,0,0,0
1200,530,0,2.4,0,1,0,32,0.3,88,6,...,48,1012,959,17,7,6,0,1,0,0


In [19]:
# Counting missing values per column in the training frame before it is written to disk
trainX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [None]:
# Persist both splits as CSV files. index = False drops the row index so that the
# label stays the last column when the files are read back.
trainX.to_csv(path_or_buf = "../data/Machine_Learning/train-V-1.csv", index = False)
testX.to_csv(path_or_buf = "../data/Machine_Learning/test-V-1.csv", index = False)

In [None]:
# SageMaker reads its training input from S3, so both CSV files are uploaded first.
# Every object is stored under this key prefix inside the bucket.
sk_prefix = "sagemaker/mobile_price_classification/sklearn_container"

# upload_data returns the S3 URI of the uploaded object
train_path = sess.upload_data(
    bucket = bucket,
    key_prefix = sk_prefix,
    path = "../data/Machine_Learning/train-V-1.csv",
)
test_path = sess.upload_data(
    bucket = bucket,
    key_prefix = sk_prefix,
    path = "../data/Machine_Learning/test-V-1.csv",
)

In [23]:
# Show the S3 URIs that are handed to the training job as input channels
print(train_path, test_path, sep = "\n")

s3://sagemaker-1-project-example/sagemaker/mobile_price_classification/sklearn_container/train-V-1.csv
s3://sagemaker-1-project-example/sagemaker/mobile_price_classification/sklearn_container/test-V-1.csv


In [None]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import joblib
import os
import numpy as np
import pandas as pd

# Function to load the model from the directory
def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the serialized ``model.joblib`` file.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

def parse_args(argv = None):
    """Parse the hyperparameters and the data/model locations for the training run.

    Args:
        argv: Optional list of command line tokens. When None, argparse falls back
            to ``sys.argv[1:]``, which is what happens when the script is launched
            as the training entry point.

    Returns:
        argparse.Namespace with ``n_estimators``, ``random_state``, ``model_dir``,
        ``train``, ``test``, ``train_file`` and ``test_file``. Unknown arguments
        are ignored rather than rejected.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command line arguments to the script
    parser.add_argument("--n_estimators", type = int, default = 100)  # Number of trees in the RandomForest model
    parser.add_argument("--random_state", type = int, default = 0)    # Random seed for reproducibility

    # Data, model, and output directories (taken from the SM_* environment variables by default)
    parser.add_argument("--model_dir", type = str, default = os.environ.get("SM_MODEL_DIR"))  # Directory to save the model
    parser.add_argument("--train", type = str, default = os.environ.get("SM_CHANNEL_TRAIN"))  # Directory for training data
    parser.add_argument("--test", type = str, default = os.environ.get("SM_CHANNEL_TEST"))    # Directory for testing data

    # File names are joined onto the channel directories above, so they must be bare
    # names matching the uploaded objects. A notebook-relative path such as
    # "../data/Machine_Learning/train-V-1.csv" would resolve outside the channel directory.
    parser.add_argument("--train-file", type = str, default = "train-V-1.csv")  # Training data file
    parser.add_argument("--test-file", type = str, default = "test-V-1.csv")    # Testing data file

    args, _ = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":
    print("[INFO] Extracting arguments")
    args = parse_args()

    # Printing the versions of sklearn and joblib for debugging
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib version: ", joblib.__version__)

    print("[INFO] Reading Data")
    print()

    # Reading the training and testing data into DataFrames
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is the last column of the CSV; every other column is a feature
    features = list(train_df.columns)
    label = features.pop()

    print("[INFO] Building training and testing datasets")
    print()

    # Splitting the data into features (X) and target (y)
    X_train = train_df[features]
    y_train = train_df[label]

    X_test = test_df[features]
    y_test = test_df[label]

    print("Column order: ")
    print(features)
    print()

    print("Label column is: ", label)
    print()

    print("Data Shape")
    print()
    print("-------- SHAPE OF THE TRAINING DATA (85%)--------")
    print(X_train.shape)
    print(y_train.shape)

    print("Training the RandomForest Model...")
    print()
    # Training the RandomForest model with the specified hyperparameters
    model = RandomForestClassifier(n_estimators = args.n_estimators, random_state = args.random_state)
    model.fit(X_train, y_train)
    print()

    # Saving the trained model under the file name that model_fn loads ("model.joblib")
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at: ", model_path)
    print()

    # Evaluating on the held-out test set
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)  # Calculating accuracy
    test_rep = classification_report(y_test, y_pred_test)  # Generating a classification report

    print()
    print("-------- METRICS RESULT FOR TESTING DATA --------")
    print()
    print("Total rows are: ", X_test.shape[0])
    print("Accuracy Score: ", test_acc)
    print("Test Report: ")
    print(test_rep)

Overwriting script.py


In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag of the scikit-learn framework container used for the training job
FRAMEWORK_VERSION = "0.23-1"

# Estimator describing how script.py is run as a SageMaker training job
sklearn_estimator = SKLearn(
    # What to run
    entry_point = "script.py",
    framework_version = FRAMEWORK_VERSION,
    hyperparameters = {
        "n_estimators": 100,  # Number of trees in the RandomForest model
        "random_state": 0,    # Random seed for reproducibility
    },
    # Who runs it: IAM role with the necessary permissions. The role has to be
    # created manually; check the README file for information.
    role = "###",
    # Where it runs
    base_job_name = "RF-custom-sklearn",
    instance_type = "ml.m5.large",
    instance_count = 1,
    # Cost and time limits: spot instances reduce cost; both limits are in seconds
    use_spot_instances = True,
    max_run = 3600,   # Maximum run time for the training job
    max_wait = 7200,  # Maximum waiting time for spot instance requests
)

In [None]:
# Launch the training job. Each key names an input channel fed from the given S3 path;
# wait = True blocks until the job has finished.
sklearn_estimator.fit(
    inputs = {"train": train_path, "test": test_path},
    wait = True,
)

2024-11-20 07:36:47 Starting - Starting the training job...
2024-11-20 07:37:01 Starting - Preparing the instances for training...
2024-11-20 07:37:49 Downloading - Downloading the training image.....2024-11-20 07:38:34,062 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-11-20 07:38:34,066 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-20 07:38:34,111 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-11-20 07:38:34,287 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-20 07:38:34,298 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-20 07:38:34,310 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-20 07:38:34,319 sagemaker-training-toolkit INFO     Invoking user script
Training Env:
{
    "additional_framework_parameters": {},
    "channel_input_di