# This notebook sends randomly selected records to a SageMaker endpoint for prediction.

In [1]:
# !pip install -q SQLAlchemy==1.3.13
# !pip install psycopg2-binary pyathena
# !pip install -U pip
import base64
import json
import random
import time
from urllib.parse import quote_plus

import boto3
import numpy as np
import pandas as pd
import sagemaker
from botocore.exceptions import ClientError
from pyathena import connect
from sagemaker.serializers import CSVSerializer
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [2]:
# Default boto3 session and the region it is configured for
session = boto3.session.Session()
region_name = session.region_name

# SageMaker session and the default S3 bucket it reports
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Low-level boto3 clients for the services referenced in this notebook
s3, redshift, secretsmanager = (
    boto3.client(service_name)
    for service_name in ('s3', 'redshift', 'secretsmanager')
)

### Variables

In [3]:
# Identifier of the secret that is looked up when fetching credentials
secret_name = 'bankdemo_redshift_login'  ## replace the secret name with yours

# Database and schema names, Redshift first and Athena second
database_name_redshift, database_name_athena = 'bankdemo', 'bankdemo'
schema_redshift, schema_athena = 'dm', 'athena'

# Table names, Glue first and Redshift second
table_name_glue, table_name_redshift = 'bankdemo_glue', 'data'

# endpoint name is case sensitive
endpoint_name = 'BankDemo-staging'

### Get credentials

In [4]:
# Fetch the secret named by `secret_name` from Secrets Manager and unpack the
# connection details (user, password, port, cluster identifier, host) from it.
session = boto3.session.Session()
region = session.region_name

client = session.client(
    service_name='secretsmanager',
    region_name=region
)

try:
    get_secret_value_response = client.get_secret_value(
        SecretId=secret_name
    )
except ClientError as e:
    # Report the service message, then stop here: the rest of this cell needs
    # the secret, and continuing would only fail below with a NameError on
    # `secret` that hides the real cause.
    print("Error retrieving secret. Error: " + e.response['Error']['Message'])
    raise

secret_arn = get_secret_value_response['ARN']

# Depending on whether the secret is a string or binary, one of these fields will be populated.
if 'SecretString' in get_secret_value_response:
    secret = get_secret_value_response['SecretString']
else:
    secret = base64.b64decode(get_secret_value_response['SecretBinary'])

# The secret payload is a JSON document; pull out the fields used later on.
secret_json = json.loads(secret)
master_user_name = secret_json['username']
master_user_pw = secret_json['password']
redshift_port = secret_json['port']
redshift_cluster_identifier = secret_json['dbClusterIdentifier']
redshift_endpoint_address = secret_json['host']
print(master_user_name)

bankdemo


# Redshift

## Connect to Redshift

In [5]:
# Look up the cluster and keep the ARN of the first IAM role listed for it
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
iam_role = response['Clusters'][0]['IamRoles'][0]['IamRoleArn']

# Build the connection URL. The user name and password are URL-encoded so that
# characters such as '@', ':' or '/' in a password are not misread as URL
# delimiters; plain alphanumeric credentials are left unchanged by the encoding.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        quote_plus(str(master_user_name)),
        quote_plus(str(master_user_pw)),
        redshift_endpoint_address,
        redshift_port,
        database_name_redshift,
    )
)

# Session factory bound to the engine.
# Note: this rebinds the name `session`, which previously held the boto3 session.
session = sessionmaker()
session.configure(bind=engine)


## Get data from Redshift

In [6]:
# Select every row of the configured table, returned in random order
statement = (
    "\n"
    "select *  \n"
    f"    FROM {schema_redshift}.{table_name_redshift} order by random()\n"
)

# Run the query through the SQLAlchemy engine and preview the first rows
data = pd.read_sql_query(statement, engine)
data.head()

Unnamed: 0,age,job,marital,education,defaulted,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,34,admin.,married,university.degree,no,yes,no,cellular,may,mon,...,1,999,1,failure,-2.0,93.0,-46.0,1.0,,no
1,42,blue-collar,married,basic.9y,no,yes,no,cellular,aug,fri,...,1,6,1,success,-2.0,94.0,-38.0,1.0,,yes
2,76,housemaid,married,basic.4y,no,no,no,cellular,jun,tue,...,5,999,0,nonexistent,-2.0,94.0,-40.0,1.0,,yes
3,57,blue-collar,divorced,unknown,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.0,94.0,-36.0,5.0,,no
4,52,entrepreneur,married,basic.9y,no,yes,yes,telephone,may,fri,...,2,999,0,nonexistent,1.0,94.0,-36.0,5.0,,no


In [7]:
# Indicator variable to capture when pdays takes a value of 999
data['no_previous_contact'] = np.where(data['pdays'] == 999, 1, 0)

# Indicator for individuals not actively employed.
# np.isin is used instead of np.in1d, which is deprecated as of NumPy 2.0.
data['not_working'] = np.where(np.isin(data['job'], ['student', 'retired', 'unemployed']), 1, 0)

# Convert categorical variables to sets of indicators. The dtype is pinned to
# uint8 because pandas >= 2.0 defaults to bool indicator columns, which could
# end up as True/False rather than 1/0 once rows are serialized as CSV.
model_data = pd.get_dummies(data, dtype=np.uint8)

# Remove the columns that are not used as model inputs
model_data = model_data.drop(['duration', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m'], axis=1)

# Put the label column first, followed by every remaining feature column
df = pd.concat([model_data['y_yes'], model_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)
df.columns

Index(['y_yes', 'age', 'campaign', 'pdays', 'previous', 'no_previous_contact',
       'not_working', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'defaulted_no', 'defaulted_unknown',
       'defaulted_yes', 'housing_no', 'housing_unknown', 'housing_yes',
       'loan_no', 'loan_unknown', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'month_apr', 'month_aug', 'month_dec', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'day_of_week_fri', 'd

In [8]:
# Separate the label from the features. Both are derived from `model_data`
# instead of dropping 'y_yes' from `df` in place, so re-running this cell does
# not raise a KeyError on a column that was already removed. The resulting
# columns and their order are the same as dropping 'y_yes' from `df`.
df_y_yes = model_data['y_yes']
df = model_data.drop(['y_no', 'y_yes'], axis=1)
df.columns


Index(['age', 'campaign', 'pdays', 'previous', 'no_previous_contact',
       'not_working', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'defaulted_no', 'defaulted_unknown',
       'defaulted_yes', 'housing_no', 'housing_unknown', 'housing_yes',
       'loan_no', 'loan_unknown', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'month_apr', 'month_aug', 'month_dec', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'day_of_week_fri', 'day_of_wee

In [9]:
# Preview the first five labels
df_y_yes.head(n=5)

0    0
1    1
2    1
3    0
4    0
Name: y_yes, dtype: uint8

In [10]:
# Feature values as a NumPy array: one row per record, one column per feature
arr_test = df.loc[:, df.columns].values
arr_test

array([[ 34,   1, 999, ...,   1,   0,   0],
       [ 42,   1,   6, ...,   0,   0,   1],
       [ 76,   5, 999, ...,   0,   1,   0],
       ...,
       [ 30,   1, 999, ...,   0,   1,   0],
       [ 29,   2, 999, ...,   0,   1,   0],
       [ 31,   4, 999, ...,   0,   1,   0]])

In [11]:
# Predictor for the configured endpoint; request payloads are serialized as CSV
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
)

In [12]:
# Randomly select a user and predict.
# randrange returns a valid 0-based row position. arr_test holds data rows only
# (there is no header row to skip), and randint's inclusive upper bound could
# return len(arr_test), which is out of range.
i = random.randrange(len(arr_test))
print(i)
result = predictor.predict(arr_test[i])
result

880


b'0.1475665271282196'

In [13]:
# actual value
# `i` is a row position in arr_test, so look the label up by position as well
# rather than by index label.
df_y_yes.iloc[i]

0