# Stacking ensemble

Let's start by getting our SageMaker environment ready.

In [None]:
# Notebook-wide AWS setup: imports first, then the bucket name and the
# identity/region this notebook runs under.
import boto3
import re
from sagemaker import get_execution_role

# S3 bucket the data set is read from and all artifacts are written to.
bucket = 'stacking-ensemble'
#prefix = 'sagemaker/DEMO-stacking-ensemble'

# IAM role of this notebook; handed to the estimators and models below.
role = get_execution_role()

# Region of the default boto3 session and the account id of the caller.
region = boto3.Session().region_name
caller_identity = boto3.client('sts').get_caller_identity()
account = caller_identity['Account']

Now let's bring in the Python libraries that we'll use throughout the analysis

In [None]:
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
from sagemaker.serializers import CSVSerializer
from sklearn.compose import ColumnTransformer
# NOTE(review): `sklearn.externals.joblib` was deprecated in scikit-learn 0.21
# and removed in 0.23; on newer releases this line raises ImportError and
# `import joblib` is the replacement — TODO confirm the installed version.
from sklearn.externals import joblib
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder

Let's read our data from our S3 bucket.

In [None]:
# Object key of the raw data set and its full S3 URI inside `bucket`.
data_key = 'UCI_Credit_Card.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

# Load the CSV straight from S3 into a DataFrame and list its columns.
data = pd.read_csv(filepath_or_buffer=data_location)
#data.info()
print(data.columns)

Now let's read this into a pandas DataFrame and take a look.

### Exploration
Let's start exploring the data.  First, let's understand how the features are distributed.

In [None]:
# Frequency tables for each categorical feature
for column in data.select_dtypes(include=['object']).columns:
    display(pd.crosstab(index=data[column], columns='% observations', normalize='columns'))

# Histograms for each numeric features
display(data.describe())
%matplotlib inline
hist = data.hist(bins=30, sharey=True, figsize=(15, 15))

## Data processing and cleaning

In [None]:
# Copy of the data without the `ID` column and without the target column.
model_data = data.drop(columns=['ID', 'default.payment.next.month'])


Standardise our numerical features and one-hot encode our categorical features.

In [None]:
# NOTE(review): both transformers below are fitted on the complete data set,
# i.e. before the train/validation/test split performed later in the
# notebook, so statistics from held-out rows leak into training — confirm
# whether this is acceptable for the demo.

# Columns that are scaled with StandardScaler.
numeric_column_names = [
    'LIMIT_BAL', 'PAY_0',
    'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
    'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
    'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
numeric_features = data[numeric_column_names]

numeric_transformer = StandardScaler()
num_array = numeric_transformer.fit_transform(numeric_features)
print(num_array.shape)


# Columns that are one-hot encoded (note that AGE is treated as categorical).
categorical_column_names = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE']
categorical_features = data[categorical_column_names]

# Categories unseen at fit time are encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
cat_array = categorical_transformer.fit_transform(categorical_features).toarray()
print(cat_array.shape)

# Final feature matrix: scaled numeric columns followed by the one-hot columns.
processed_array = np.hstack((num_array, cat_array))
print(processed_array.shape)

In [None]:
# Feature frame: the processed array wrapped in a DataFrame with default
# integer column labels.
processed_matirx = np.asmatrix(processed_array)
X_data = pd.DataFrame(data=processed_matirx)

# Target frame: kept two-dimensional (single column) for the split below.
Y_data = data.loc[:, ['default.payment.next.month']]
X_data.head()

Split into train, validation and test sets.

In [None]:
# Step 1: hold out 30% of all rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.30,
    random_state=42,
)

# Step 2: take 30% of the remaining rows as the validation set; what is left
# becomes the final training set.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Write the training and validation sets as header-less CSV files with the
# label in the first column, followed by the features.
csv_exports = (
    (Y_train, X_train, 'train.csv'),
    (Y_val, X_val, 'validation.csv'),
)
for labels, features, file_name in csv_exports:
    labelled = pd.concat([labels, features], axis=1)
    labelled.to_csv(file_name, index=False, header=False)


In [None]:
# Upload each local CSV to its own key prefix in the bucket.
for object_key, local_file in (
    ('train/train.csv', 'train.csv'),
    ('validation/validation.csv', 'validation.csv'),
):
    target_bucket = boto3.Session().resource('s3').Bucket(bucket)
    target_bucket.Object(object_key).upload_file(local_file)

In [None]:
# S3 prefixes holding the CSV files uploaded above.
train_prefix = 's3://{}/train'.format(bucket)
validation_prefix = 's3://{}/validation/'.format(bucket)

# Training channels, both declared as CSV so the algorithms parse them as such.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=train_prefix,
    content_type='text/csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=validation_prefix,
    content_type='text/csv',
)

In [None]:
# Image URI of the built-in XGBoost algorithm for the session's region.
container_1 = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)

In [None]:
# Model 1: XGBoost binary classifier trained on the CSV channels.
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(
    container_1,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/output'.format(bucket),
    sagemaker_session=sess,
)

# Hyperparameters collected in one mapping and applied in a single call.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Runs the training job against the train and validation channels.
xgb_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(xgb_channels)

In [None]:
# S3 location of the trained XGBoost model artifact; reused later as the
# `ModelDataUrl` of the XGBoost container in the endpoint model.
xgb_model_url= xgb.model_data
print(xgb_model_url)

# Model 2: Linear learner

In [None]:
# `get_image_uri` is imported but not called in this notebook; the image is
# resolved through `sagemaker.image_uris.retrieve`, as for XGBoost.
from sagemaker.amazon.amazon_estimator import get_image_uri

# Image URI of the built-in Linear Learner algorithm for the session's region.
container_2 = sagemaker.image_uris.retrieve(
    framework='linear-learner',
    region=boto3.Session().region_name,
)


In [None]:
# Model 2: Linear Learner binary classifier trained on the same channels
# as the XGBoost model.
# NOTE(review): `content_type` is not a documented Estimator argument; the
# CSV content type is already declared on the TrainingInput channels —
# confirm this keyword has any effect.
linear = sagemaker.estimator.Estimator(
    container_2,
    role,
    instance_count=1,
    instance_type="ml.c4.xlarge",
    output_path='s3://{}/output'.format(bucket),
    sagemaker_session=sess,
    content_type='text/csv'
)

# feature_dim is derived from the training frame instead of being hard-coded
# (it was 88), so it stays correct if the one-hot encoding yields a different
# number of columns.
linear.set_hyperparameters(
    feature_dim=X_train.shape[1],
    predictor_type="binary_classifier",
    mini_batch_size=200,
)

linear.fit({'train': s3_input_train, 'validation': s3_input_validation})


In [None]:
# S3 location of the trained Linear Learner model artifact; reused later as
# the `ModelDataUrl` of the Linear Learner container in the endpoint model.
linear_model_url=linear.model_data

## Deploy a multi-container endpoint that hosts both of our base learners; later we will add our meta learner model to it as well

In [None]:
# Container definitions: one per base learner. The hostname is the value
# passed as TargetContainerHostname when invoking the endpoint.
container1 = {
    'Image': container_1,
    'ContainerHostname': 'xgbContainer',
    'ModelDataUrl': xgb_model_url,
}

container2 = {
    'Image': container_2,
    'ContainerHostname': 'LinearlearnerContainer',
    'ModelDataUrl': linear_model_url,
}

# 'Direct' mode: each request addresses one container by hostname.
inferenceExecutionConfig = {'Mode': 'Direct'}

sm_client = boto3.Session().client('sagemaker')

# Timestamp suffix so that re-running the cell produces a new model name.
model_timestamp = strftime("%Y-%m-%d-%H-%M-%S")
model_name = 'my-direct-model' + model_timestamp

response = sm_client.create_model(
    ModelName=model_name,
    InferenceExecutionConfig=inferenceExecutionConfig,
    ExecutionRoleArn=role,
    Containers=[container1, container2],
)

In [None]:
# Single production variant that receives all traffic for the two-container model.
base_variant = {
    'InstanceType': 'ml.m4.xlarge',
    'InitialInstanceCount': 2,
    'InitialVariantWeight': 1,
    'ModelName': model_name,
    'VariantName': 'AllTraffic',
}

response = sm_client.create_endpoint_config(
    EndpointConfigName='my-epc',
    ProductionVariants=[base_variant],
)

In [None]:
# Create the endpoint from the 'my-epc' configuration.
response = sm_client.create_endpoint(
    EndpointName='my-endpoint',
    EndpointConfigName='my-epc',
)

# create_endpoint only starts the deployment. Block until the endpoint is
# InService so that the invocation cells that follow do not fail when the
# notebook is run top to bottom.
sm_client.get_waiter('endpoint_in_service').wait(EndpointName='my-endpoint')

Invoke the endpoint to make predictions using each of the models we deployed.

In [None]:
import json
import io
from io import StringIO

# Score every test row with the XGBoost container of the endpoint.
runtime_sm_client = boto3.Session().client('sagemaker-runtime')

# Serialise the test frame once; every line (trailing newline included) is
# one header-less CSV request body, identical to serialising row by row.
xgb_payloads = X_test.to_csv(sep=",", header=False, index=False).splitlines(keepends=True)

results_XGB = []
for payload in xgb_payloads:
    response = runtime_sm_client.invoke_endpoint(
        EndpointName='my-endpoint',
        ContentType='text/csv',
        TargetContainerHostname='xgbContainer',
        Body=payload,
    )
    # The response body is parsed as JSON and stored as returned.
    results_XGB.append(json.loads(response['Body'].read().decode()))
#print(results_XGB)

Generate confusion matrix and calculate accuracy for our XGB model

In [None]:
# Flatten the test labels to a 1-D array for the confusion matrices below.
# np.asarray (rather than .to_numpy()) keeps this cell re-runnable: it also
# accepts Y_test once it has already been converted to an ndarray.
Y_test = np.asarray(Y_test).reshape(-1)

In [None]:
# Round the XGBoost scores to 0/1 class predictions.
predicted_xgb = np.round(results_XGB)

# Confusion matrix: actual labels in rows, predicted labels in columns.
cm_xgb = pd.crosstab(index=Y_test, columns=predicted_xgb, rownames=['actuals'], colnames=['predictions'])
print(cm_xgb)

# Accuracy as the share of matching predictions. Unlike indexing the
# confusion matrix diagonal, this also works when the model predicts only
# one class and the matrix therefore has a single column.
np.mean(predicted_xgb == np.asarray(Y_test))

In [None]:
import json
import io
from io import StringIO

# Score every test row with the Linear Learner container of the endpoint.
runtime_sm_client = boto3.Session().client('sagemaker-runtime')

# Serialise the test frame once; every line (trailing newline included) is
# one header-less CSV request body, identical to serialising row by row.
ll_payloads = X_test.to_csv(sep=",", header=False, index=False).splitlines(keepends=True)

results_LL = []
for payload in ll_payloads:
    response = runtime_sm_client.invoke_endpoint(
        EndpointName='my-endpoint',
        ContentType='text/csv',
        TargetContainerHostname='LinearlearnerContainer',
        Body=payload,
    )
    parsed = json.loads(response['Body'].read().decode())
    # Keep only the score of the single prediction in the response.
    results_LL.append(parsed['predictions'][0]['score'])
#print(results_LL)

Generate confusion matrix and calculate accuracy for our Linear learner model

In [None]:
# Round the Linear Learner scores to 0/1 class predictions.
predicted_ll = np.round(results_LL)

# Confusion matrix: actual labels in rows, predicted labels in columns.
cm_ll = pd.crosstab(index=Y_test, columns=predicted_ll, rownames=['actuals'], colnames=['predictions'])
# A bare expression in the middle of a cell is not rendered, so print it
# explicitly (same as for the XGBoost matrix).
print(cm_ll)

# Accuracy as the share of matching predictions. Unlike indexing the
# confusion matrix diagonal, this also works when the model predicts only
# one class and the matrix therefore has a single column.
np.mean(predicted_ll == np.asarray(Y_test))

## Create level 2 (meta learner model) input data from outputs of the other two models

In [None]:
from pandas import DataFrame
# XGB_features= 1- DataFrame(results_XGB)
# XGB_features=pd.concat([XGB_features, DataFrame(results_XGB)], axis=1)
def dataset_feature(input):
    """Build a two-column feature frame from a sequence of scores.

    Args:
        input: Data accepted by the ``DataFrame`` constructor; in this
            notebook, a list with one model score per row.

    Returns:
        A DataFrame whose first column(s) hold ``1 - score`` and whose
        following column(s) hold the score itself. Column labels are the
        default integer labels of each part, so they repeat.
    """
    scores = DataFrame(input)
    complement = 1 - scores
    return pd.concat([complement, scores], axis=1)

    
# Level-2 features: for each base model, the complement of the score and
# the score itself.
XGB_features = dataset_feature(results_XGB)
#print(XGB_features)
LL_features = dataset_feature(results_LL)

# Side-by-side frame of both models' features; this is the meta learner input.
level_two_parts = [XGB_features, LL_features]
MetaLearner_data = pd.concat(level_two_parts, axis=1)
print(MetaLearner_data)

# Sanity checks: column dtypes and missing-value counts per column.
MetaLearner_data.dtypes
MetaLearner_data.isna().sum()

Upload the data back to S3 to train the meta learner. We will use an AdaBoost model from the scikit-learn library, trained with cross-validation (note: the training script referenced below is `randomforest.py`).


In [None]:
# Wrap the test labels in a Series with a default integer index —
# presumably so they line up row by row with MetaLearner_data in the
# column-wise concat of the next cell (verify).
Y_test=pd.Series(Y_test)

In [None]:
# Meta learner training file: label first, then the level-2 features,
# written without header or index.
meta_training_frame = pd.concat([Y_test, MetaLearner_data], axis=1)
meta_training_frame.to_csv('meta_train.csv', index=False, header=False)

# Upload the file under the metalearner/train/ prefix of the bucket.
meta_bucket = boto3.Session().resource('s3').Bucket(bucket)
meta_bucket.Object('metalearner/train/meta_train.csv').upload_file('meta_train.csv')


Define the environment parameters passed on to our training script.

In [None]:
# S3 URI of the meta learner training file uploaded above.
csv_s3_uri = 's3://{}/metalearner/train/meta_train.csv'.format(bucket)

# S3 prefix used as the estimator's output path.
model_dir = 's3://{}/output/'.format(bucket)


## Define the meta learner and add it to the endpoint we already have

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Hyperparameters passed through to the training script.
hyperparameters = dict(max_depth=10, K=5)

train_instance_type = "ml.c5.xlarge"

# Single 'train' channel pointing at the meta learner CSV on S3.
inputs = {"train": csv_s3_uri}

# Keyword arguments for the scikit-learn estimator, kept in one mapping.
estimator_parameters = dict(
    entry_point="randomforest.py",
    dependencies=["my_custom_library"],
    instance_type=train_instance_type,
    instance_count=1,
    hyperparameters=hyperparameters,
    role=role,
    base_job_name="randomforest-model",
    framework_version="0.23-1",
    py_version="py3",
    output_path=model_dir,
)

estimator = SKLearn(**estimator_parameters)
estimator.fit(inputs)

# S3 location of the trained meta learner artifact, used for the third container.
meta_learner_url = estimator.model_data

In [None]:
# I saw a similar error on GitHub where using the attach function was suggested, but this doesn't seem to help
# #estimator = SKLearn.attach('training job name')
# training_job_name = estimator.latest_training_job.name
# training_job_name
# estimator = SKLearn.attach(training_job_name)

In [None]:
# Resolve the scikit-learn inference image through the SDK instead of
# hard-coding an ECR registry account id: the account differs between
# regions, so the previous literal only produced a valid URI in regions
# served by that one account. Version and Python version match the
# estimator the meta learner was trained with.
container_3 = sagemaker.image_uris.retrieve(
    framework='sklearn',
    region=region,
    version='0.23-1',
    py_version='py3',
    instance_type='ml.m4.xlarge',
    image_scope='inference',
)

# Container definition for the meta learner; the hostname is the value to
# pass as TargetContainerHostname when invoking it.
# NOTE(review): a script-mode scikit-learn container usually also needs to be
# told which inference script to run (e.g. via container environment
# variables) — confirm whether that is the cause of the deployment error
# mentioned in the cell above.
container3 = {
    'Image': container_3,
    'ContainerHostname': 'SklearnContainer',
    'ModelDataUrl': meta_learner_url,
}




Create the new model and update the endpoint to include the meta learner as well.

In [None]:
# Three-container model: both base learners plus the meta learner, using the
# same execution config as the two-container model.
stacked_containers = [container1, container2, container3]

response = sm_client.create_model(
    ModelName='my-direct-modelML',
    InferenceExecutionConfig=inferenceExecutionConfig,
    ExecutionRoleArn=role,
    Containers=stacked_containers,
)

In [None]:
# Single production variant that receives all traffic for the three-container model.
stacked_variant = {
    'InstanceType': 'ml.m4.xlarge',
    'InitialInstanceCount': 2,
    'InitialVariantWeight': 1,
    'ModelName': 'my-direct-modelML',
    'VariantName': 'AllTraffic',
}

response = sm_client.create_endpoint_config(
    EndpointConfigName='my-epc-ml',
    ProductionVariants=[stacked_variant],
)

In [None]:
# Switch the existing endpoint over to the three-container configuration.
response = sm_client.update_endpoint(
    EndpointName='my-endpoint',
    EndpointConfigName='my-epc-ml',
)

# update_endpoint only starts the rollout. Block until the endpoint is back
# InService so that later cells talk to the updated configuration.
sm_client.get_waiter('endpoint_in_service').wait(EndpointName='my-endpoint')

In [None]:
#test it on the entire test data
# Reload the header-less meta learner file and drop its first column (the
# label), leaving only the level-2 features.
meta_with_label = pd.read_csv("meta_train.csv", header=None)
meta_data = meta_with_label.iloc[:, 1:]
meta_data.head()

In [None]:
#Test on the full test dataset

In [None]:
# import json
# runtime_sm_client = boto3.Session().client('sagemaker-runtime')
# import io
# from io import StringIO
# csv_file = io.StringIO()
# results_ML=list()

# for i in range(len(meta_data)):
#     body=meta_data.iloc[[i]]
#     csv_file=io.StringIO()
#     body.to_csv(csv_file, sep=",", header=False, index=False)
#     payload = csv_file.getvalue()
#     response = runtime_sm_client.invoke_endpoint(EndpointName ='my-endpoint',ContentType = 'text/csv', TargetContainerHostname='thirdContainer', Body = payload)
#     result = json.loads(response['Body'].read().decode())
#     result=result['predictions'][0]['score']
#     results_ML.append(result)
# #print(results_LL)

In [None]:
# NOTE(review): this matrix is still built from the Linear Learner scores
# (results_LL). The cell that would collect meta learner predictions is
# commented out above, so no meta learner results exist yet; swap in those
# predictions once that invocation works, otherwise the numbers below simply
# repeat the Linear Learner evaluation.
cm_ml = pd.crosstab(index=Y_test, columns=np.round(results_LL), rownames=['actuals'], colnames=['predictions'])
# A bare expression in the middle of a cell is not rendered, so print it.
print(cm_ml)

# Accuracy from the diagonal of cm_ml only; the previous expression mixed in
# a cell of cm_ll (copy-paste slip).
(cm_ml.iloc[0, 0] + cm_ml.iloc[1, 1]) / len(Y_test)