## BREAST CANCER PREDICTION - LINEAR MODEL

#### Load the required libraries

#### Also ensure the kernel is _python3

In [1]:
# data managing and display libs
import pandas as pd
import numpy as np
import os
import io

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline 

In [2]:
# sagemaker libraries
import boto3
import sagemaker
from sagemaker import get_execution_role

#### Set up the Sagemaker Environment

In [3]:
# open a SageMaker session and look up the execution role of this notebook
sagemaker_session = sagemaker.Session()
role = get_execution_role()
print(role)

# default S3 bucket tied to the session; used later as the model output location
bucket = sagemaker_session.default_bucket()
print(bucket)

arn:aws:iam::396358375665:role/service-role/AmazonSageMaker-ExecutionRole-20200814T112856
sagemaker-eu-west-1-396358375665


#### Read the PCA data

In [7]:
# read in the pca csv file
# (relative path: resolved against the notebook's working directory)
local_data = 'data/pca.csv'

# print out some data
# load the CSV into a DataFrame, report its dimensions, then preview the first rows
pca_bc_df = pd.read_csv(local_data)
print('Data shape (rows, cols): ', pca_bc_df.shape)
print()
pca_bc_df.head()

Data shape (rows, cols):  (569, 13)



Unnamed: 0,id,id.1,diag_value,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10
0,842302,842302,1,-9.192838,-1.948583,1.123166,-3.633731,-1.19511,-1.411424,-2.15937,-0.398406,0.157118,0.877402
1,842517,842517,1,-2.387802,3.768172,0.529293,-1.118264,0.621775,-0.028657,-0.013358,0.240989,0.711905,-1.106994
2,84300903,84300903,1,-5.733896,1.075174,0.551748,-0.912083,-0.177086,-0.541452,0.668167,0.097373,-0.024066,-0.454275
3,84348301,84348301,1,-7.122953,-10.275589,3.23279,-0.152547,-2.960879,-3.053421,-1.42991,1.059565,1.405438,1.116976
4,84358402,84358402,1,-3.935302,1.948071,-1.389767,-2.940639,0.546748,1.226494,0.936212,0.636376,0.263806,-0.377705


In [8]:
# tidy up the columns and index

# This cell reassigns `pca_bc_df` from itself, so it must be safe to run twice:
# the plain drop() calls raised KeyError on a re-run because the columns were
# already gone. errors='ignore' and the membership check make it idempotent.

# 'id.1' repeats the 'id' values in the previewed rows, so it carries no extra information
pca_bc_df = pca_bc_df.drop(columns=['id.1'], errors='ignore')

# use the sample id as the row index; set_index also removes 'id' from the columns
if 'id' in pca_bc_df.columns:
    pca_bc_df = pca_bc_df.set_index('id')

pca_bc_df.head()

Unnamed: 0_level_0,diag_value,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
842302,1,-9.192838,-1.948583,1.123166,-3.633731,-1.19511,-1.411424,-2.15937,-0.398406,0.157118,0.877402
842517,1,-2.387802,3.768172,0.529293,-1.118264,0.621775,-0.028657,-0.013358,0.240989,0.711905,-1.106994
84300903,1,-5.733896,1.075174,0.551748,-0.912083,-0.177086,-0.541452,0.668167,0.097373,-0.024066,-0.454275
84348301,1,-7.122953,-10.275589,3.23279,-0.152547,-2.960879,-3.053421,-1.42991,1.059565,1.405438,1.116976
84358402,1,-3.935302,1.948071,-1.389767,-2.940639,0.546748,1.226494,0.936212,0.636376,0.263806,-0.377705


#### Calculate the percentage of malignant tumors

In [9]:
# Calculate the fraction of records that are malignant
def malignant_percentage(df):
    '''Return the fraction of rows in df whose diagnosis value is 1 (malignant).

       Despite the name, the result is a fraction (e.g. 0.37), not a 0-100 percentage.
       :param df: Dataframe of all FNA data points; has a column 'diag_value'
                  that holds 1 for malignant and 0 for benign
       :return: malignant data points / all data points that have a diagnosis value
    '''
    # With 0/1 labels the mean is exactly malignant / (malignant + benign).
    # It is computed in one vectorised pass (the builtin sum() iterated the Series
    # element by element) and it skips missing labels instead of returning NaN.
    return df['diag_value'].mean()

In [10]:
# call the function to calculate the malignant percentage
# (the value is a fraction between 0 and 1, as the printed output shows)
mal_percentage = malignant_percentage(pca_bc_df)

print('Malignant percentage = ', mal_percentage)
# fraction * number of rows recovers the malignant count (printed as a float)
# NOTE(review): these messages say 'TNA' while the rest of the notebook says 'FNA' - confirm which is intended
print('Total # of malignant TNA: ', mal_percentage*pca_bc_df.shape[0])
print('Out of (total) TNA: ', pca_bc_df.shape[0])

Malignant percentage =  0.37258347978910367
Total # of malignant TNA:  212.0
Out of (total) TNA:  569


#### Split into train and test datasets

In [11]:
# split into train/test
def train_test_split(df, train_frac=0.7, seed=1):
    '''Shuffle the rows of df and split them into train and test sets;
       separate the class labels (the FIRST column of df) from the features
       (all remaining columns).

       The input dataframe is never modified: the shuffle runs on a private copy.
       Note that the global numpy random state is re-seeded with `seed`.
       :param df: Dataframe of all measurements, label column first
       :param train_frac: The decimal fraction of data that should be training data;
                          the number of training rows is int(n_rows * train_frac)
       :param seed: Random seed for shuffling and reproducibility, default = 1
       :return: Two tuples (in order): (train_features, train_labels), (test_features, test_labels);
                features are 2-D arrays, labels are 1-D arrays
       :raises ValueError: if train_frac is outside [0, 1]
       '''
    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be between 0 and 1, got {}'.format(train_frac))

    # copy=True matters: for a single-dtype frame to_numpy() may hand back a view of
    # the frame's own buffer, and shuffling that in place would scramble the caller's
    # dataframe (or fail outright if the buffer is read-only).
    shuffled = df.to_numpy(copy=True)

    # shuffle whole rows so every label stays attached to its features
    np.random.seed(seed)
    np.random.shuffle(shuffled)

    # define the training cut off from the number of rows
    n_train = int(shuffled.shape[0] * train_frac)

    # the label is the first column; the features are all columns after it
    labels = shuffled[:, 0]
    features = shuffled[:, 1:]

    return (features[:n_train], labels[:n_train]), (features[n_train:], labels[n_train:])

In [13]:
# get train/test data
# (70% of the shuffled rows go to training; the seed is left at the function's default)
(train_features, train_labels), (test_features, test_labels) = train_test_split(pca_bc_df, train_frac=0.7)

In [17]:
# manual test

# for a split of 0.7:0.3 there should be ~2.33x as many training as test pts
print('Training data pts: ', len(train_features))
print('Test data pts: ', len(test_features))
print()

print(len(train_features)/len(test_features))

# take a look at the first (shuffled) training item and its label
print('First item: \n', train_features[0])
print('Label: ', train_labels[0])
print()

# test split
assert len(train_features) > 2.32*len(test_features), \
        'Unexpected number of train/test points for a train_frac=0.7'
# test labels
# np.all(labels) collapses the whole array to one bool, and a bool always equals
# 0 or 1, so `np.all(labels)== 0 or np.all(labels)== 1` could never fail.
# Check every element for membership in {0, 1} instead.
assert np.isin(train_labels, [0, 1]).all(), \
        'Train labels should be 0s or 1s.'
assert np.isin(test_labels, [0, 1]).all(), \
        'Test labels should be 0s or 1s.'
print('Tests passed!')

Training data pts:  398
Test data pts:  171

2.327485380116959
First item: 
 [-1.98846221 -2.31604791 -1.23630393 -1.76977611  0.14399576 -0.13359593
 -0.365437   -0.49859563  0.71236807 -0.00523187]
Label:  0.0

Tests passed!


#### Create the default <b>Linear Learner</b> estimator

In [18]:
# import LinearLearner
from sagemaker import LinearLearner

# define the location
# (training output is directed to s3://<bucket>/linear_bc via output_path below)
prefix = 'linear_bc'
outputpath = 's3://{}/{}'.format(bucket,prefix)

# instantiate LinearLearner

# NOTE(review): train_instance_count / train_instance_type look like SageMaker Python
# SDK v1 argument names (the training logs below warn about v2 renames) - confirm
# before upgrading the SDK.
ll = LinearLearner(role = role,
                   train_instance_count = 1,
                   train_instance_type = 'ml.c4.xlarge',
                   output_path = outputpath,
                   sagemaker_session = sagemaker_session,
                   predictor_type = 'binary_classifier',
                   epochs = 25)


#### Convert the training data to recordset format

In [19]:
# first convert to float
# (float32 versions of the training arrays; these are what record_set receives in the next cell)
train_features_f = train_features.astype('float32')
train_labels_f = train_labels.astype('float32')

# take a look at first item and see that it aligns with first row of data
# (i.e. the values should match the first training item printed earlier, up to float32 rounding)
print('First item: \n', train_features_f[0])
print('Label: ', train_labels_f[0])
print()

First item: 
 [-1.9884622  -2.316048   -1.2363039  -1.7697761   0.14399576 -0.13359593
 -0.365437   -0.49859563  0.7123681  -0.00523187]
Label:  0.0



In [20]:
# create RecordSet of training data
# (the same formatted_train_data object is passed to fit() for both estimators in this notebook)
formatted_train_data = ll.record_set(train_features_f, labels = train_labels_f)

#### Train the estimator

In [23]:
%%time 
# train the estimator on formatted training data
# (launches a remote training job; the captured log below shows it took a few minutes)
ll.fit(formatted_train_data)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-10-09 12:10:36 Starting - Starting the training job...
2020-10-09 12:10:38 Starting - Launching requested ML instances......
2020-10-09 12:11:54 Starting - Preparing the instances for training......
2020-10-09 12:13:06 Downloading - Downloading input data...
2020-10-09 12:13:32 Training - Downloading the training image...
2020-10-09 12:14:05 Uploading - Uploading generated training model
2020-10-09 12:14:05 Completed - Training job completed
[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[10/09/2020 12:13:55 INFO 140311437846336] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler

#### Deploy the trained model

In [24]:
%%time 
# deploy and create a predictor
# (the recorded run took about 8.5 minutes of wall time; the endpoint is only
#  removed when delete_endpoint(linear_predictor) is called at the end of the notebook)
linear_predictor = ll.deploy(initial_instance_count = 1,
                             instance_type = 'ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!CPU times: user 299 ms, sys: 18.3 ms, total: 317 ms
Wall time: 8min 33s


#### Evaluate the model

In [25]:
# code to evaluate the endpoint on test data
# returns a variety of model metrics
def evaluate(predictor, test_features, test_labels, verbose=True, batch_size=100):
    """
    Evaluate a model on a test set given the prediction endpoint.  
    Return binary classification metrics.
    :param predictor: A prediction endpoint; predictor.predict(batch) must return one
                      record per row, each exposing label['predicted_label'].float32_tensor.values
    :param test_features: Test features, one row per data point
    :param test_labels: Class labels (0/1) for test data, same length as test_features
    :param verbose: If True, prints a table of all performance metrics
    :param batch_size: Maximum number of rows sent to the endpoint per request
    :return: A dictionary of performance metrics with keys
             'TP', 'FP', 'FN', 'TN', 'Precision', 'Recall', 'Accuracy'.
    :raises ValueError: if there is no test data, the features and labels differ in
                        length, or batch_size is smaller than 1
    """
    test_features = np.asarray(test_features)
    test_labels = np.asarray(test_labels)

    n_points = len(test_features)
    if n_points == 0:
        raise ValueError('test_features is empty; nothing to evaluate.')
    if len(test_labels) != n_points:
        raise ValueError('test_features has {} rows but test_labels has {} entries.'
                         .format(n_points, len(test_labels)))
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got {}'.format(batch_size))

    # np.array_split takes the NUMBER of sections, not the section size, so passing
    # 100 directly produced 100 tiny batches (and empty ones for fewer than 100 rows).
    # Derive the section count so that each batch holds at most `batch_size` rows.
    n_batches = -(-n_points // batch_size)  # ceiling division
    prediction_batches = [predictor.predict(batch)
                          for batch in np.array_split(test_features, n_batches)]
    
    # LinearLearner produces a `predicted_label` for each data point in a batch
    # get the 'predicted_label' for every point in a batch
    test_preds = np.concatenate([np.array([x.label['predicted_label'].float32_tensor.values[0] for x in batch]) 
                                 for batch in prediction_batches])
    
    # calculate true positives, false positives, true negatives, false negatives
    tp = np.logical_and(test_labels, test_preds).sum()
    fp = np.logical_and(1-test_labels, test_preds).sum()
    tn = np.logical_and(1-test_labels, 1-test_preds).sum()
    fn = np.logical_and(test_labels, 1-test_preds).sum()
    
    # calculate binary classification metrics
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    
    # printing a table of metrics
    if verbose:
        print(pd.crosstab(test_labels, test_preds, rownames=['actual (row)'], colnames=['prediction (col)']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()
        
    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn, 
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy}

Run a prediction on the Training data as an initial measure

In [27]:
# Sanity check on the data the model was trained on; the test-set evaluation follows in a later cell
print('Training Metrics for simple, LinearLearner.\n')

# get metrics for linear predictor
# (features are cast to float32, matching the dtype used to build the training RecordSet)
metrics = evaluate(linear_predictor, 
                   train_features.astype('float32'), 
                   train_labels, 
                   verbose=True) # verbose means we'll print out the metrics

Training Metrics for simple, LinearLearner.

prediction (col)  0.0  1.0
actual (row)              
0.0               246    3
1.0                 6  143

Recall:     0.960
Precision:  0.979
Accuracy:   0.977



Evaluation: Overall Metrics

In [28]:
# Evaluation on the held-out test split
print('Test Metrics for simple, LinearLearner.\n')

# get metrics for linear predictor
# (note: this overwrites the `metrics` dictionary produced for the training data)
metrics = evaluate(linear_predictor, 
                   test_features.astype('float32'), 
                   test_labels, 
                   verbose=True) # verbose means we'll print out the metrics


Test Metrics for simple, LinearLearner.

prediction (col)  0.0  1.0
actual (row)              
0.0               107    1
1.0                 2   61

Recall:     0.968
Precision:  0.984
Accuracy:   0.982



#### Define a helper to delete an endpoint

In [29]:
# Deletes a predictor's endpoint
def delete_endpoint(predictor):
    '''Delete the SageMaker endpoint behind a predictor and report the outcome.

       Deletion is best-effort: a failure is printed together with its cause instead
       of being raised, so clean-up cells can keep running.
       :param predictor: A deployed predictor; its `endpoint` attribute names the endpoint
       :return: None
    '''
    # Read the name outside the try block so a predictor without an `endpoint`
    # attribute fails loudly instead of raising again inside the error handler.
    endpoint_name = predictor.endpoint
    try:
        boto3.client('sagemaker').delete_endpoint(EndpointName=endpoint_name)
    except Exception as err:
        # `Exception` rather than a bare except: KeyboardInterrupt/SystemExit still
        # propagate, and the real cause is shown rather than assuming the endpoint
        # had already been deleted.
        print('Could not delete {} (it may already be deleted): {}'.format(endpoint_name, err))
    else:
        print('Deleted {}'.format(endpoint_name))

#### Retrain the model to focus on avoiding False Negatives

A false negative in this context means interpreting an FNA result as benign when the tumor is in fact malignant.

Reducing false negatives has the effect of increasing recall:

\begin{equation*}
Recall = \frac{TP}{TP+FN}
\end{equation*}

In [30]:
# tune the model for a higher recall
# Same instance settings and output location as the first estimator; the difference is the
# model selection criterion, which asks for the best precision at the target recall below.
# NOTE(review): epochs is 15 here but 25 for the first estimator - confirm this is intentional.
linear_recall = LinearLearner(role=role,
                              train_instance_count=1, 
                              train_instance_type='ml.c4.xlarge',
                              predictor_type='binary_classifier',
                              output_path=outputpath,
                              sagemaker_session=sagemaker_session,
                              epochs=15,
                              binary_classifier_model_selection_criteria='precision_at_target_recall', # target recall
                              target_recall=0.985) # 98.5% recall

#### Train the recall estimator

In [31]:
%%time 
# train the estimator on formatted training data
# (reuses the RecordSet built earlier from the float32 training arrays)
linear_recall.fit(formatted_train_data)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-10-09 14:29:32 Starting - Starting the training job...
2020-10-09 14:29:33 Starting - Launching requested ML instances......
2020-10-09 14:30:51 Starting - Preparing the instances for training............
2020-10-09 14:33:01 Downloading - Downloading input data
2020-10-09 14:33:01 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[10/09/2020 14:33:05 INFO 139774039336768] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_schedul

#### Deploy the recall predictor

In [32]:
%%time 
# deploy and create a predictor
# (a second endpoint, separate from linear_predictor; both are deleted in the final cell)
recall_predictor = linear_recall.deploy(initial_instance_count = 1,
                                     instance_type = 'ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!CPU times: user 287 ms, sys: 22 ms, total: 309 ms
Wall time: 8min 32s


#### Evaluate the Recall model

Check the performance on the training data

In [33]:
# Sanity check of the recall-tuned model on the data it was trained on
print('Training Metrics for Recall, LinearLearner.\n')

# get metrics for recall predictor
# (note: this overwrites the `metrics` dictionary from the previous evaluation)
metrics = evaluate(recall_predictor, 
                   train_features.astype('float32'), 
                   train_labels, 
                   verbose=True) # verbose means we'll print out the metrics

Training Metrics for Recall, LinearLearner.

prediction (col)  0.0  1.0
actual (row)              
0.0               154   95
1.0                 2  147

Recall:     0.987
Precision:  0.607
Accuracy:   0.756



Recall model performance on the Test data

In [34]:
# Evaluation of the recall-tuned model on the held-out test split
print('Test Metrics for Recall, LinearLearner.\n')

# get metrics for recall predictor
# (note: this overwrites the `metrics` dictionary from the previous evaluation)
metrics = evaluate(recall_predictor, 
                   test_features.astype('float32'), 
                   test_labels, 
                   verbose=True) # verbose means we'll print out the metrics

Test Metrics for Recall, LinearLearner.

prediction (col)  0.0  1.0
actual (row)              
0.0                78   30
1.0                 1   62

Recall:     0.984
Precision:  0.674
Accuracy:   0.819



#### Delete the endpoints

In [35]:
# clean up: remove both endpoints deployed in this notebook (the simple and the recall-tuned model)
delete_endpoint(linear_predictor)
delete_endpoint(recall_predictor)

Deleted linear-learner-2020-10-09-12-10-36-470
Deleted linear-learner-2020-10-09-14-29-32-039
