### Setting up the role, bucket and region

In [21]:
import os
import boto3
import re
import sagemaker

In [22]:
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
bucket = "project01-ml-pipeline-bucket"

prefix = (
    "sagemaker/lung-cancer-prediction"
)
print (region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
us-east-1


In [23]:
role

'arn:aws:iam::193734792448:role/fast-ai-academic-59-Student-Azure'

### Getting the data and then analysising it

In [24]:
import pandas as pd

# loading the dataset
data = pd.read_csv("lung_cancer_ds.csv")

In [25]:
data

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,P995,44,1,6,7,7,7,7,6,...,5,3,2,7,8,2,4,5,3,High
996,996,P996,37,2,6,8,7,7,7,6,...,9,6,5,7,2,4,3,1,4,High
997,997,P997,25,2,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
998,998,P998,18,2,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [26]:
data.shape

(1000, 26)

In [27]:
data.columns

Index(['index', 'Patient Id', 'Age', 'Gender', 'Air Pollution', 'Alcohol use',
       'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk',
       'chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Smoking',
       'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue',
       'Weight Loss', 'Shortness of Breath', 'Wheezing',
       'Swallowing Difficulty', 'Clubbing of Finger Nails', 'Frequent Cold',
       'Dry Cough', 'Snoring', 'Level'],
      dtype='object')

In [28]:
data.describe()

Unnamed: 0,index,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,37.174,1.402,3.84,4.563,5.165,4.84,4.58,4.38,4.491,...,4.859,3.856,3.855,4.24,3.777,3.746,3.923,3.536,3.853,2.926
std,288.819436,12.005493,0.490547,2.0304,2.620477,1.980833,2.107805,2.126999,1.848518,2.135528,...,2.427965,2.244616,2.206546,2.285087,2.041921,2.270383,2.388048,1.832502,2.039007,1.474686
min,0.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,249.75,27.75,1.0,2.0,2.0,4.0,3.0,2.0,3.0,2.0,...,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,499.5,36.0,1.0,3.0,5.0,6.0,5.0,5.0,4.0,4.0,...,4.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0
75%,749.25,45.0,2.0,6.0,7.0,7.0,7.0,7.0,6.0,7.0,...,7.0,5.0,6.0,6.0,5.0,5.0,5.0,5.0,6.0,4.0
max,999.0,73.0,2.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0,...,9.0,9.0,8.0,9.0,8.0,8.0,9.0,7.0,7.0,7.0


In [29]:
# checking for missing values
data.isnull().sum()

index                       0
Patient Id                  0
Age                         0
Gender                      0
Air Pollution               0
Alcohol use                 0
Dust Allergy                0
OccuPational Hazards        0
Genetic Risk                0
chronic Lung Disease        0
Balanced Diet               0
Obesity                     0
Smoking                     0
Passive Smoker              0
Chest Pain                  0
Coughing of Blood           0
Fatigue                     0
Weight Loss                 0
Shortness of Breath         0
Wheezing                    0
Swallowing Difficulty       0
Clubbing of Finger Nails    0
Frequent Cold               0
Dry Cough                   0
Snoring                     0
Level                       0
dtype: int64

### Cleaning the data

In [30]:
# Dropping columns 'Patient Id', index
data.drop(['Patient Id', 'index'], axis = 1, inplace = True)

In [31]:
data

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44,1,6,7,7,7,7,6,7,7,...,5,3,2,7,8,2,4,5,3,High
996,37,2,6,8,7,7,7,6,7,7,...,9,6,5,7,2,4,3,1,4,High
997,25,2,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
998,18,2,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


In [32]:
# Converting the 'Level' column to numerical values
def level_numerical(value):
    if value == 'Low':
        return 0
    elif value == 'Medium':
        return 1
    else:
        return 2
    
data["Level_numerical"] = data.Level.apply(level_numerical)

In [33]:
data

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level,Level_numerical
0,33,1,2,4,5,4,3,2,2,4,...,4,2,2,3,1,2,3,4,Low,0
1,17,1,3,1,5,3,4,2,2,2,...,3,7,8,6,2,1,7,2,Medium,1
2,35,1,4,5,6,5,5,4,6,7,...,7,9,2,1,4,6,7,2,High,2
3,37,1,7,7,7,7,6,7,7,7,...,2,3,1,4,5,6,7,5,High,2
4,46,1,6,8,7,7,7,6,7,7,...,2,4,1,4,2,4,2,3,High,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44,1,6,7,7,7,7,6,7,7,...,3,2,7,8,2,4,5,3,High,2
996,37,2,6,8,7,7,7,6,7,7,...,6,5,7,2,4,3,1,4,High,2
997,25,2,4,5,6,5,5,4,6,7,...,7,9,2,1,4,6,7,2,High,2
998,18,2,6,8,7,7,7,6,7,7,...,2,4,1,4,2,4,2,3,High,2


In [34]:
# Dropping column Level
data.drop('Level', axis = 1, inplace = True)

In [35]:
data.shape

(1000, 24)

### Splitting the data into 80% training, 10% validation and 10% testing

In [36]:
import numpy as np

rand_split = np.random.rand(len(data))

train_list = rand_split < 0.8
val_list = (rand_split >=0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

# Creating the dataset for training, validation and testing
train_data = data[train_list]
val_data = data[val_list]
test_data = data[test_list]

# 1. Separating into input features and output class
# This code gets all the columns first to last-1 as the input features while 1

# 1. Training data
train_x = train_data.iloc[:, :- 1]
train_y = train_data.iloc[:, -1]

# 2. Validation data
val_x = val_data.iloc[:, :- 1]
val_y = val_data.iloc[:, -1]

# 3. Testing data
test_x = test_data. iloc[:, :- 1]
test_y = test_data.iloc[:, -1]

# 2. Convert DataFrame to NumPy array
# 1. Training data
train_x = train_x.to_numpy().astype("float32")
train_y = train_y.to_numpy().astype("float32")

# 2. Validation data
val_x = val_x.to_numpy().astype("float32")
val_y = val_y.to_numpy().astype("float32")

# 3. Testing data
test_x = test_x.to_numpy().astype("float32")
test_y = test_y.to_numpy().astype("float32")

In [37]:
# Checking whether all the splitting are done correctly
train_x.shape, train_y.shape, val_x.shape, val_y.shape, test_x.shape, test_y.shape

((802, 23), (802,), (99, 23), (99,), (99, 23), (99,))

In [38]:
# Combine x and y for each dataset
train_combined = np.column_stack((train_x, train_y))
val_combined = np.column_stack((val_x, val_y))

# Convert to Pandas DataFrame
train_df = pd.DataFrame(train_combined, columns=data.columns)
val_df = pd.DataFrame(val_combined, columns=data.columns)
test_df = pd.DataFrame(test_x, columns = data.columns[0:23])

# Save to CSV
train_df.to_csv('train_data.csv', index=False, header=None)
val_df.to_csv('val_data.csv', index=False, header=None)
test_df.to_csv('test_data.csv', index=False, header=None)

### Converting the file to protobuf format

In [18]:
import io
import sagemaker.amazon.common as smac
import time

In [19]:
# Training file
train_file = "linear_train_lung.data"

f = io.BytesIO()

smac.write_numpy_to_dense_tensor(f, train_x.astype("float32"), train_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", train_file)).upload_fileobj(f)

In [20]:
# Validation file
val_file = "linear_validation_lung.data"

f = io.BytesIO()

smac.write_numpy_to_dense_tensor(f, val_x.astype("float32"), val_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(os.path.join(prefix, "validation", val_file)).upload_fileobj(f)

### Training the model

In [21]:
from sagemaker import image_uris

# getting the container image of linear learner algorithm
container = image_uris.retrieve(region=boto3.Session().region_name, framework="linear-learner")

In [22]:
linear_job = "Project01-linear-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is: ", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/".format(bucket, prefix)},
    "HyperParameters": {
        'feature_dim': '23',
        'mini_batch_size': '100',
        'predictor_type': 'regressor',
        'epochs': '100',
        'num_models': '256',
        'loss': 'squared_loss',
        'learning_rate': '0.001',
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

Job name is:  Project01-linear-2023-11-26-21-44-16


In [23]:
region = boto3.Session().region_name
sm = boto3.client("sagemaker")

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName = linear_job)["TrainingJobStatus"]
print(status)
sm.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName = linear_job)

if status == "Failed":
    message = sm.describe_training_job(TrainingJobName = linear_job)["FailureReason"]
    print("Training failed with the following error: {}".format(message))
    raise Exception("Training job failed")

InProgress


### Hosting

In [24]:
linear_hosting_container = {
    "Image": container,
    "ModelDataUrl": sm.describe_training_job(TrainingJobName=linear_job)["ModelArtifacts"][
        "S3ModelArtifacts"
    ],
}

create_model_response = sm.create_model(
    ModelName=linear_job, ExecutionRoleArn=role, PrimaryContainer=linear_hosting_container
)

print(create_model_response["ModelArn"])

arn:aws:sagemaker:us-east-1:193734792448:model/project01-linear-2023-11-26-21-44-16


In [25]:
# Endpoint configuration
linear_endpoint_config = "DEMO-linear-endpoint-config-" + time.strftime(
    "%Y-%m-%d-%H-%M-%S", time.gmtime()
)
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "ModelName": linear_job,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-linear-endpoint-config-2023-11-26-21-48-17
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:193734792448:endpoint-config/demo-linear-endpoint-config-2023-11-26-21-48-17


In [26]:
# Endpoint creation
linear_endpoint = "DEMO-linear-endpoint-" + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint, EndpointConfigName=linear_endpoint_config
)
print(create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Status: " + status)

sm.get_waiter("endpoint_in_service").wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    raise Exception("Endpoint creation did not succeed")

DEMO-linear-endpoint-202311262148
arn:aws:sagemaker:us-east-1:193734792448:endpoint/demo-linear-endpoint-202311262148
Status: Creating
Arn: arn:aws:sagemaker:us-east-1:193734792448:endpoint/demo-linear-endpoint-202311262148
Status: InService


### Prediction

In [27]:
# Converting the data to csv format
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    return csv.getvalue().decode().rstrip()

In [28]:
# Testing the test data on our trained model
import json
import numpy as np

runtime = boto3.client("runtime.sagemaker")

payload = np2csv(test_x)

response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)
result = json.loads(response["Body"].read().decode())
test_pred = np.array([r["score"] for r in result["predictions"]])

In [29]:
print (test_pred)

[ 1.81141853  1.15837336  1.84899807  0.93512988  0.94477248 -0.21377826
  0.23231339  1.00452971  0.95933533  2.0254693   1.9722867   0.93512988
  0.94477248  2.03886032  1.60053158  1.05689359  0.93512988  2.09324527
  0.19753325  0.77354026 -0.04930568 -0.37436008  0.31388831  0.14253831
  0.91823292  1.04393291  0.94001055  1.00955129  0.30406475  0.27792692
  0.01495814  0.18298733 -0.26991475  0.0697577   0.13172102 -0.13631332
  0.82951593  0.98779106  0.95933533  1.91201091  1.92747664  1.90320063
  1.89527225  1.87594032  2.10404444  0.19753325  0.3180232  -0.08035588
  1.04393291  0.94852066  0.97607398  1.82815719  1.82221627  1.9722867
  1.95183015  2.00576401  1.81141853  0.40806222  0.04793882  1.15837336
  1.60053158  1.84899807  0.83453751  1.8956399  -0.26991475  0.05209565
  0.84458065  0.82602739  1.00452971  0.82951593  0.91823292  0.91657639
  2.02508163  1.91109133  1.9990685   1.81141853 -0.60395455  0.01495814
  0.95146799  1.04393291  0.91657639  1.74411964  1.

In [30]:
test_mae_linear = np.mean(np.abs(test_y - test_pred))
test_mae_baseline = np.mean(np.abs(test_y - np.median(train_y)))  

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear, 3))

Test MAE Baseline : 0.619
Test MAE Linear: 0.148


In [49]:
test_pred_class = (test_pred > 0.5) + 0

test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class)) * 100
baseline_accuracy = np.mean((test_y == test_pred_baseline)) * 100

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 66.4 %
Baseline Accuracy: 38.1 %


In [32]:
sm.delete_endpoint(EndpointName=linear_endpoint)

{'ResponseMetadata': {'RequestId': 'f6b55807-9c17-4f49-aac3-00330930f8f3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f6b55807-9c17-4f49-aac3-00330930f8f3',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 26 Nov 2023 21:52:19 GMT'},
  'RetryAttempts': 0}}