## BREAST CANCER PREDICTION - XG BOOST MODEL USING AUCPR

#### Load the required libraries

#### Also ensure the notebook kernel is a Python 3 kernel (e.g. conda_python3)

In [1]:
# data managing and display libs
import pandas as pd
import numpy as np
import os
import io

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline 

In [2]:
# sagemaker libraries
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

#### Set up the Sagemaker Environment

In [3]:
# Create a SageMaker session and look up the IAM execution role of this notebook
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
print(role)

# Name of the default S3 bucket associated with the session
bucket = sagemaker_session.default_bucket()
print(bucket)

arn:aws:iam::396358375665:role/service-role/AmazonSageMaker-ExecutionRole-20200814T112856
sagemaker-eu-west-1-396358375665


#### Read the PCA data

In [4]:
# Path of the PCA-transformed dataset, relative to the notebook
local_data = 'data/pca.csv'

# Load the csv, report its dimensions followed by a blank line,
# then preview the first rows
pca_bc_df = pd.read_csv(local_data)
print('Data shape (rows, cols): ', pca_bc_df.shape, end='\n\n')
pca_bc_df.head()

Data shape (rows, cols):  (569, 13)



Unnamed: 0,id,id.1,diag_value,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10
0,842302,842302,1,-9.192838,-1.948583,1.123166,-3.633731,-1.19511,-1.411424,-2.15937,-0.398406,0.157118,0.877402
1,842517,842517,1,-2.387802,3.768172,0.529293,-1.118264,0.621775,-0.028657,-0.013358,0.240989,0.711905,-1.106994
2,84300903,84300903,1,-5.733896,1.075174,0.551748,-0.912083,-0.177086,-0.541452,0.668167,0.097373,-0.024066,-0.454275
3,84348301,84348301,1,-7.122953,-10.275589,3.23279,-0.152547,-2.960879,-3.053421,-1.42991,1.059565,1.405438,1.116976
4,84358402,84358402,1,-3.935302,1.948071,-1.389767,-2.940639,0.546748,1.226494,0.936212,0.636376,0.263806,-0.377705


In [5]:
# Discard the duplicated 'id.1' column and promote 'id' from a regular
# column to the row index, so that only the label and the components remain
pca_bc_df = pca_bc_df.drop(columns='id.1').set_index('id')
pca_bc_df.head()

Unnamed: 0_level_0,diag_value,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
842302,1,-9.192838,-1.948583,1.123166,-3.633731,-1.19511,-1.411424,-2.15937,-0.398406,0.157118,0.877402
842517,1,-2.387802,3.768172,0.529293,-1.118264,0.621775,-0.028657,-0.013358,0.240989,0.711905,-1.106994
84300903,1,-5.733896,1.075174,0.551748,-0.912083,-0.177086,-0.541452,0.668167,0.097373,-0.024066,-0.454275
84348301,1,-7.122953,-10.275589,3.23279,-0.152547,-2.960879,-3.053421,-1.42991,1.059565,1.405438,1.116976
84358402,1,-3.935302,1.948071,-1.389767,-2.940639,0.546748,1.226494,0.936212,0.636376,0.263806,-0.377705


#### Split into Train, Validate and Test datasets

In [6]:
# split into train/test
def train_test_split(df, train_frac=0.7, seed=1):
    '''Shuffle the rows of df and randomly split them into train and test sets;
       separate the class labels (the first column in df) from the features
       (all remaining columns). The dataframe passed in is never modified and
       the global numpy random state is left untouched.
       :param df: Dataframe whose first column holds the class labels and whose
                  remaining columns hold the features
       :param train_frac: The decimal fraction (0 to 1) of rows that should be training data
       :param seed: Random seed for shuffling and reproducibility, default = 1
       :return: Two tuples (in order): (train_features, train_labels), (test_features, test_labels);
                features are 2-D numpy arrays, labels are 1-D numpy arrays
       :raises ValueError: If train_frac lies outside the range [0, 1]
       '''
    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be between 0 and 1, got {}'.format(train_frac))

    # Take an explicit copy: to_numpy() may hand back a view of the dataframe's
    # own buffer, and shuffling a view in place would reorder the caller's
    # dataframe (or fail outright when that view is read-only).
    shuffled = df.to_numpy(copy=True)

    # A private RandomState seeded with `seed` produces the same permutation as
    # np.random.seed(seed) followed by np.random.shuffle, without reseeding the
    # process-wide generator as a side effect.
    rng = np.random.RandomState(seed)
    rng.shuffle(shuffled)

    # define the training cut off from the number of rows
    n_train = int(shuffled.shape[0] * train_frac)
    train_rows = shuffled[:n_train]
    test_rows = shuffled[n_train:]

    # the labels are the first column, the features are all the other columns
    return (train_rows[:, 1:], train_rows[:, 0]), (test_rows[:, 1:], test_rows[:, 0])

In [7]:
# Keep 70% of the shuffled rows for training and hold the rest out for testing
train_split, test_split = train_test_split(pca_bc_df, train_frac=0.7)
train_features, train_labels = train_split
test_features, test_labels = test_split

In [8]:
# Rebuild a single training dataframe (label column first, then the features)
# so that it can be split once more into train and validate
labels_frame = pd.DataFrame(train_labels)
features_frame = pd.DataFrame(train_features)
train_df = pd.concat([labels_frame, features_frame], axis=1)
train_df.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,9
0,0.0,-1.988462,-2.316048,-1.236304,-1.769776,0.143996,-0.133596,-0.365437,-0.498596,0.712368,-0.005232
1,1.0,-1.700007,-2.352272,3.078089,0.066021,-1.056466,0.28211,0.16062,0.01896,-0.592331,0.105427
2,0.0,1.420223,-1.393978,0.836987,-1.105445,-0.364401,-0.089466,0.263475,0.964706,0.47597,0.078254
3,1.0,-0.364876,3.574461,2.223978,-0.223168,0.778531,-0.51023,0.562793,0.235439,0.325623,-0.124681
4,1.0,0.934091,2.105945,-1.432917,2.894262,-1.275416,-1.626692,0.547604,0.850074,-0.368685,-0.292194


In [9]:
# Split the training rows again: 80% stay as training data, 20% become validation data
train_split, val_split = train_test_split(train_df, train_frac=0.8)
train_features, train_labels = train_split
val_features, val_labels = val_split

#### Save the Train, Validate and test data locally

In [10]:
data_dir = 'data'

# Write each split to its own csv file with pandas. The files carry neither a
# header row nor an index column, as required by the built in algorithms
# provided by Amazon, and the target variable is the first entry of every row.
for split_file, split_labels, split_features in (
    ('validation.csv', val_labels, val_features),
    ('train.csv', train_labels, train_features),
    ('test.csv', test_labels, test_features),
):
    split_frame = pd.concat([pd.DataFrame(split_labels), pd.DataFrame(split_features)], axis=1)
    split_frame.to_csv(os.path.join(data_dir, split_file), header=False, index=False)

#### Upload the data to S3 

In [11]:
# start the Sagemaker session
session = sagemaker.Session()

# get the IAM execution role
role = get_execution_role()

# key prefix under which the files are stored in S3
prefix = 'xgb'

# Upload the validation and training files; the returned S3 locations are
# passed to the training job later. test.csv is not uploaded here.
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

#### Train the XG Boost Model

In [12]:
# Look up the training container image: the 'xgboost' algorithm, version '1.0-1',
# for the region of the current session.
container = get_image_uri(session.boto_region_name, 'xgboost', '1.0-1')


# Now that we know which container to use, we can construct the estimator object.
xgb = sagemaker.estimator.Estimator(container,                                   # The training container image
                                    role,                                        # The IAM role to use (our current role in this case)
                                    train_instance_count=1,                      # The number of instances to use for training
                                    train_instance_type='ml.m4.xlarge',          # The type of instance to use for training
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                                                                 # Where to save the output (the model artifacts)
                                    sagemaker_session=session) 

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


##### Set the hyper parameters for the XG Boost model 

In [13]:
# Hyperparameters passed to the XGBoost training container
xgb.set_hyperparameters(max_depth=5,                      # Maximum depth of a tree. Increasing this value makes the model more complex and more likely to overfit
                        eta=0.2,                          # Step size shrinkage used in updates to prevent overfitting
                        gamma=4,                          # Minimum loss reduction required to make a further partition on a leaf node
                        min_child_weight=6,               # Minimum sum of instance weight needed in a child node
                        subsample=0.8,                    # Fraction of the training rows sampled for each tree; values below 1 help prevent overfitting
                        objective='binary:logistic',      # Logistic regression for binary classification, output is a probability
                        early_stopping_rounds=50,         # The validation metric must improve at least once every early_stopping_rounds rounds for training to continue
                        num_round=200,                    # The maximum number of boosting rounds to run
                        eval_metric='aucpr'               # Validation metric: area under the precision-recall curve
                       )

Train the model

In [14]:
# Wrap the S3 locations of the train and validation files, to make sure that
# SageMaker knows our data is in csv format.
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

# Run the training job with the two inputs as the 'train' and 'validation' channels
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-10-12 13:58:26 Starting - Starting the training job...
2020-10-12 13:58:28 Starting - Launching requested ML instances......
2020-10-12 13:59:28 Starting - Preparing the instances for training...
2020-10-12 14:00:22 Downloading - Downloading input data...
2020-10-12 14:00:43 Training - Downloading the training image...
2020-10-12 14:01:26 Uploading - Uploading generated training model
2020-10-12 14:01:26 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value aucpr to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mI

#### Deploy the XG Boost predictor

In [15]:
# Deploy the trained model behind an endpoint served by one ml.m4.xlarge instance.
# The endpoint is removed again at the end of the notebook with delete_endpoint().
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


---------------!

In [16]:
# Wrap the numpy arrays of each split in dataframes (features first, then labels)
test_features_df, test_labels_df = map(pd.DataFrame, (test_features, test_labels))
train_features_df, train_labels_df = map(pd.DataFrame, (train_features, train_labels))

Check the predictions on the training dataset as a robustness check against the test dataset results

In [17]:
# The endpoint has to be told that the payload we send is csv text
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

# Score the training features. The response is decoded into a comma delimited
# string of scores, parsed into a numpy array and stored as a one-column dataframe.
train_response = xgb_predictor.predict(train_features_df.values)
train_scores = np.fromstring(train_response.decode('utf-8'), sep=',')
train_pred = pd.DataFrame(train_scores)

Send the test features to the predictor; these predictions are used for the model performance metrics

In [18]:
# The endpoint has to be told that the payload we send is csv text
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

# Score the held-out test features. The response is decoded into a comma delimited
# string of scores, parsed into a numpy array and stored as a one-column dataframe.
test_response = xgb_predictor.predict(test_features_df.values)
test_scores = np.fromstring(test_response.decode('utf-8'), sep=',')
test_pred = pd.DataFrame(test_scores)

In [19]:
# Name the score column and derive a binary outcome from it for evaluation:
# a score of 0.5 or more counts as class 1.0, anything lower as class 0.0
for scored_frame in (train_pred, test_pred):
    scored_frame.columns = ['Score']
    scored_frame['Outcome'] = np.where(scored_frame['Score'] >= 0.5, 1.0, 0.0)

In [20]:
# Sanity check: number of test labels (should match the number of predicted outcomes)
test_labels.shape

(171,)

In [21]:
# Sanity check: number of predicted outcomes (should match the number of test labels)
test_pred['Outcome'].shape

(171,)

In [22]:
# code to evaluate the endpoint on test data
# returns a variety of model metrics
def evaluate(test_preds, test_labels,  verbose=True):
    """
    Evaluate a model on a test set given the prediction endpoint outcomes.
    Return binary classification metrics.

    Predictions and labels are compared strictly by position: both inputs are
    flattened to 1-D numpy arrays first, so any pandas index they carry is
    ignored. Non-zero values count as the positive class.

    :param test_preds: Predicted 0/1 outcomes (array-like, pandas Series or
                       single-column dataframe)
    :param test_labels: True 0/1 class labels, same length and order as test_preds
    :param verbose: If True, prints a table of all performance metrics
    :return: A dictionary of performance metrics with keys 'TP', 'FP', 'FN',
             'TN', 'Precision', 'Recall' and 'Accuracy'. A ratio whose
             denominator is zero is reported as NaN.
    :raises ValueError: If test_preds and test_labels differ in length
    """
    # Work on plain arrays: mixing pandas objects that carry different indexes
    # would align the values on index instead of position and corrupt the counts.
    pred_values = np.asarray(test_preds).ravel()
    label_values = np.asarray(test_labels).ravel()
    if pred_values.shape != label_values.shape:
        raise ValueError('test_preds and test_labels must have the same length, got {} and {}'
                         .format(pred_values.shape[0], label_values.shape[0]))

    predicted = pred_values.astype(bool)
    actual = label_values.astype(bool)

    # calculate true positives, false positives, true negatives, false negatives
    tp = int(np.sum(actual & predicted))
    fp = int(np.sum(~actual & predicted))
    tn = int(np.sum(~actual & ~predicted))
    fn = int(np.sum(actual & ~predicted))
    total = tp + fp + tn + fn

    # calculate binary classification metrics (NaN when a ratio is undefined)
    nan = float('nan')
    recall = tp / (tp + fn) if (tp + fn) else nan
    precision = tp / (tp + fp) if (tp + fp) else nan
    accuracy = (tp + tn) / total if total else nan

    # printing a table of metrics
    if verbose:
        print(pd.crosstab(label_values, pred_values, rownames=['actual (row)'], colnames=['prediction (col)']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()

    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy}

In [23]:
print('Train Metrics XG Boost Model.\n')

# Score the thresholded training predictions against the training labels;
# verbose=True also prints the confusion table and the metrics
metrics = evaluate(test_preds=train_pred['Outcome'], test_labels=train_labels, verbose=True)

Train Metrics XG Boost Model.

prediction (col)  0.0  1.0
actual (row)              
0.0               200    4
1.0                 7  107

Recall:     0.939
Precision:  0.964
Accuracy:   0.965



In [24]:
print('Test Metrics XG Boost Model.\n')

# Score the thresholded test predictions against the held-out test labels;
# verbose=True also prints the confusion table and the metrics
metrics = evaluate(test_preds=test_pred['Outcome'], test_labels=test_labels, verbose=True)

Test Metrics XG Boost Model.

prediction (col)  0.0  1.0
actual (row)              
0.0               107    1
1.0                 5   58

Recall:     0.921
Precision:  0.983
Accuracy:   0.965



#### Delete the endpoint

In [25]:
# Deletes the endpoint behind a predictor
def delete_endpoint(predictor):
    """
    Delete the SageMaker endpoint that serves the given predictor.

    Deletion is best effort: a failure is reported on stdout together with its
    cause instead of being raised, so the cell can safely be run again after
    the endpoint has already been removed.

    :param predictor: A deployed predictor exposing its endpoint name as
                      `endpoint` (SageMaker SDK v1) or `endpoint_name` (SDK v2)
    :return: None
    """
    # SDK v2 renamed the attribute `endpoint` to `endpoint_name`; accept either.
    endpoint_name = getattr(predictor, 'endpoint_name', None) or predictor.endpoint

    try:
        boto3.client('sagemaker').delete_endpoint(EndpointName=endpoint_name)
    except Exception as err:
        # Catch Exception rather than everything so that Ctrl-C still interrupts,
        # and show the real cause: the endpoint being gone already is only one of
        # the possible reasons (missing permissions, throttling, ...).
        print('Could not delete {} (it may already be deleted): {}'.format(endpoint_name, err))
    else:
        print('Deleted {}'.format(endpoint_name))

In [26]:
# Remove the endpoint that was created by xgb.deploy() above
delete_endpoint(xgb_predictor)

Deleted sagemaker-xgboost-2020-10-12-13-58-26-700
