## BREAST CANCER PREDICTION - NEURAL NETWORK MODEL

#### Load the required libraries

#### Also ensure the notebook kernel is `conda_pytorch_p36`

In [1]:
# data managing and display libs
import pandas as pd
import numpy as np
import os
import io

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [2]:
# sagemaker libraries
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

#### Set up the Sagemaker Environment

In [3]:
# Create the SageMaker session and look up the IAM execution role of this
# notebook; both objects are reused by later cells (e.g. for the S3 uploads).
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
# printed for reference: the output below shows the role as an IAM role ARN
print(role)

# Name of the session's default S3 bucket (the output below shows a
# 'sagemaker-<region>-<account id>' style name)
bucket = sagemaker_session.default_bucket()
print(bucket)

arn:aws:iam::396358375665:role/service-role/AmazonSageMaker-ExecutionRole-20200814T112856
sagemaker-eu-west-1-396358375665


#### Read the PCA data

In [4]:
# Path (relative to the notebook's working directory) of the CSV file that
# holds the PCA data: id columns, the diag_value label and components c_1..c_10
local_data = 'data/pca.csv'

# Load the file into a dataframe, report its dimensions and preview it
pca_bc_df = pd.read_csv(local_data)
print('Data shape (rows, cols): ', pca_bc_df.shape)
print()
# last expression of the cell, so the notebook renders the first five rows
pca_bc_df.head()

Data shape (rows, cols):  (569, 13)



Unnamed: 0,id,id.1,diag_value,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10
0,842302,842302,1,-9.192838,-1.948583,1.123166,-3.633731,-1.19511,-1.411424,-2.15937,-0.398406,0.157118,0.877402
1,842517,842517,1,-2.387802,3.768172,0.529293,-1.118264,0.621775,-0.028657,-0.013358,0.240989,0.711905,-1.106994
2,84300903,84300903,1,-5.733896,1.075174,0.551748,-0.912083,-0.177086,-0.541452,0.668167,0.097373,-0.024066,-0.454275
3,84348301,84348301,1,-7.122953,-10.275589,3.23279,-0.152547,-2.960879,-3.053421,-1.42991,1.059565,1.405438,1.116976
4,84358402,84358402,1,-3.935302,1.948071,-1.389767,-2.940639,0.546748,1.226494,0.936212,0.636376,0.263806,-0.377705


In [5]:
# Re-index the frame by its 'id' column: discard the duplicated 'id.1' column,
# then promote 'id' from a regular column to the row index (set_index removes
# it from the columns), leaving diag_value followed by c_1..c_10.
pca_bc_df = pca_bc_df.drop(columns='id.1').set_index('id')
pca_bc_df.head()

id,diag_value,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10
842302,1,-9.192838,-1.948583,1.123166,-3.633731,-1.19511,-1.411424,-2.15937,-0.398406,0.157118,0.877402
842517,1,-2.387802,3.768172,0.529293,-1.118264,0.621775,-0.028657,-0.013358,0.240989,0.711905,-1.106994
84300903,1,-5.733896,1.075174,0.551748,-0.912083,-0.177086,-0.541452,0.668167,0.097373,-0.024066,-0.454275
84348301,1,-7.122953,-10.275589,3.23279,-0.152547,-2.960879,-3.053421,-1.42991,1.059565,1.405438,1.116976
84358402,1,-3.935302,1.948071,-1.389767,-2.940639,0.546748,1.226494,0.936212,0.636376,0.263806,-0.377705


#### Split the data into Train and Test

In [7]:
# split into train/test
def train_test_split(df, train_frac=0.7, seed=1):
    '''Shuffle the rows of df and randomly split them into train and test sets;
       separate the class labels (the FIRST column of df) from the features
       (all remaining columns).

       The caller's dataframe is never modified: the shuffle runs on a private copy.
       Note that this function reseeds numpy's global random generator with `seed`.
       :param df: Dataframe whose first column is the class label and whose other
                  columns are the features (all values must be numeric)
       :param train_frac: The decimal fraction (between 0 and 1) of rows that should be training data
       :param seed: Random seed for shuffling and reproducibility, default = 1
       :return: Two tuples (in order): (train_features, train_labels), (test_features, test_labels);
                features are 2-D arrays, labels are 1-D arrays
       :raises ValueError: if train_frac is not within [0, 1]
       '''
    # a negative fraction would silently slice from the wrong end of the data
    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be between 0 and 1, got {!r}'.format(train_frac))

    # convert dataframe to a matrix in order to use numpy shuffle;
    # copy=True guarantees the array does not share memory with df, because
    # to_numpy() may otherwise hand back a view and the in-place shuffle below
    # would then scramble the caller's dataframe (or fail on a read-only view)
    shuffled = df.to_numpy(copy=True)

    # shuffle the rows in place, reproducibly
    np.random.seed(seed)
    np.random.shuffle(shuffled)

    # define the training cut off from the number of rows
    n_train = int(shuffled.shape[0] * train_frac)
    train_rows, test_rows = shuffled[:n_train], shuffled[n_train:]

    # column 0 is the label, everything after it is a feature
    return (train_rows[:, 1:], train_rows[:, 0]), (test_rows[:, 1:], test_rows[:, 0])

In [8]:
# Split the tidied PCA frame into train and test sets: 70% of the rows go to
# training, the rest to testing. No seed is passed, so the function's default
# (seed=1) applies. Labels come from the frame's first column (diag_value),
# features from the remaining columns (c_1..c_10).
(train_features, train_labels), (test_features, test_labels) = train_test_split(pca_bc_df, train_frac=0.7)

#### Create csv files for use with modelling

In [12]:
data_dir = 'data'


def _write_headerless_csv(labels, features, filename):
    '''Write labels and features side by side (label column first) to
       data_dir/filename, without a header row and without an index column.'''
    table = pd.concat([pd.DataFrame(labels), pd.DataFrame(features)], axis=1)
    table.to_csv(os.path.join(data_dir, filename), header=False, index=False)


# Persist the train and test sets as plain csv files for the modelling step
_write_headerless_csv(train_labels, train_features, 'nntrain.csv')
_write_headerless_csv(test_labels, test_features, 'nntest.csv')

#### Upload data to S3

In [14]:
# Key prefix ("folder") under which both csv files are stored in S3
prefix = 'nn'

# Upload the local csv files through the SageMaker session. No bucket is passed,
# so the session presumably falls back to its default bucket; the returned
# values are kept as the S3 locations of the uploaded files (confirm against
# the SageMaker SDK documentation for Session.upload_data).
test_location = sagemaker_session.upload_data(os.path.join(data_dir, 'nntest.csv'), key_prefix=prefix)
train_location = sagemaker_session.upload_data(os.path.join(data_dir, 'nntrain.csv'), key_prefix=prefix)