In [1]:
import sagemaker
import boto3
from sys import getsizeof
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session

In [2]:
bucket_name = 'insurancee' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
my_region = boto3.session.Session().region_name # set the region of the instance
print(my_region)

us-east-1


In [3]:
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [4]:
# set an output path where the trained model will be saved
prefix = 'xgboost-as-a-built-in-algo'
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://insurancee/xgboost-as-a-built-in-algo/output


In [5]:
# Downloading The Dataset And Storing in S3

In [6]:
import pandas as pd
try:
    model_data = pd.read_csv('train.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: Data loaded into dataframe.


In [7]:
model_data.shape

(381109, 11)

In [8]:
model_data.dtypes

Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [9]:
model_data.Age = model_data.Age.astype('uint8')
model_data.Driving_License = model_data.Driving_License.astype('uint8')
model_data.Previously_Insured = model_data.Previously_Insured.astype('uint8')
model_data.Vintage = model_data.Vintage.astype('uint8')
model_data.Response = model_data.Response.astype('uint8')

In [10]:
new_size = getsizeof(model_data)/(1024.0**3)
print('dataframe size:%2.2fGB'%new_size)

dataframe size:0.08GB


In [11]:
data1= model_data.sample(frac = 0.5, replace=False, random_state=2)

In [12]:
data1.shape

(190554, 11)

In [13]:
data1

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
243787,Male,27,1,30.0,0,< 1 Year,Yes,29379.0,156.0,120,0
32318,Female,24,1,11.0,1,< 1 Year,No,26518.0,152.0,46,0
156564,Male,35,1,37.0,1,1-2 Year,No,25232.0,152.0,150,0
75118,Female,41,1,28.0,0,1-2 Year,Yes,25483.0,124.0,54,0
341724,Female,51,1,28.0,0,> 2 Years,Yes,48981.0,26.0,42,1
...,...,...,...,...,...,...,...,...,...,...,...
370613,Male,39,1,8.0,1,1-2 Year,No,27369.0,124.0,0,0
254734,Female,25,1,37.0,1,< 1 Year,No,36262.0,152.0,27,0
34821,Female,67,1,8.0,1,1-2 Year,No,31842.0,154.0,204,0
125819,Female,52,1,40.0,0,1-2 Year,Yes,52597.0,26.0,253,0


In [14]:
pd.crosstab(data1["Age"],"count")

col_0,count
Age,Unnamed: 1_level_1
20,3229
21,8272
22,10456
23,12131
24,12901
...,...
81,30
82,12
83,10
84,9


In [15]:
data2=data1.groupby("Age").sample(frac=.3)

In [16]:
data2.Age.unique()

array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85],
      dtype=uint8)

In [17]:
data2

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10772,Male,20,1,42.0,0,< 1 Year,No,2630.0,160.0,112,0
11081,Male,20,1,8.0,1,< 1 Year,Yes,24518.0,160.0,37,0
62188,Male,20,1,2.0,0,< 1 Year,Yes,30123.0,160.0,23,0
52122,Male,20,1,46.0,1,< 1 Year,No,2630.0,157.0,212,0
351888,Male,20,1,3.0,0,< 1 Year,No,37243.0,160.0,68,0
...,...,...,...,...,...,...,...,...,...,...,...
132044,Female,83,1,28.0,1,1-2 Year,No,36539.0,122.0,70,0
236773,Female,84,1,28.0,1,1-2 Year,No,51879.0,26.0,125,0
49956,Male,84,1,16.0,1,1-2 Year,No,26748.0,26.0,94,0
95729,Female,84,1,45.0,1,1-2 Year,No,31835.0,13.0,191,0


In [18]:
{x: len(data2[x].unique()) for x in data2.select_dtypes('number').columns}

{'Age': 66,
 'Driving_License': 2,
 'Region_Code': 53,
 'Previously_Insured': 2,
 'Annual_Premium': 25757,
 'Policy_Sales_Channel': 130,
 'Vintage': 256,
 'Response': 2}

In [19]:
final_size = getsizeof(data2)/(1024.0**3)
print('dataframe size:%2.2fGB'%final_size)

dataframe size:0.01GB


In [20]:
def preprocessing(data2):
    data2 = data2.copy()
    data2.drop(columns = ['id'], inplace=True)
    data2['male_id'] = np.where(data2['Gender'] == 'Male', 1, 0)

    data2['veh_damage_id'] = np.where(data2['Vehicle_Damage'] == 'Yes', 1, 0)

    for name, val in zip(['lt_1_yr', 'btwn_1_2_years', 'plus2yrs'], ['< 1 Year', '1-2 Year', '> 2 Years']):
        data2[f'vehicle_age__{name}'] = np.where(data2['Vehicle_Age'] == val, 1, 0)
        
    daat2.drop(columns = ['Gender', 'Vehicle_Damage', 'Vehicle_Age'], inplace=True)


    return data2.drop(columns = ['Response']), data2['Response']

In [22]:
X = data2.loc[data2.Response==1].sample(frac = .5, random_state=0)
Y = data2.loc[data2.Response==0].sample(frac = .5, random_state=0)

In [23]:
data_train = pd.concat([X.iloc[:int(len(X)*0.8)], Y.iloc[:int(len(Y)*0.8)]], axis=0)
data_test = pd.concat([X.iloc[int(len(X)*0.8):], Y.iloc[int(len(Y)*0.8):]], axis=0)

In [24]:
data_train.shape

(22865, 11)

In [25]:
data_test.shape

(5717, 11)

In [26]:
### Saving Train And Test Into Buckets

In [27]:
import os
data_train = pd.concat([X.iloc[:int(len(X)*0.8)], Y.iloc[:int(len(Y)*0.8)]], axis=0).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [28]:
data_test = pd.concat([X.iloc[int(len(X)*0.8):], Y.iloc[int(len(Y)*0.8):]], axis=0).to_csv('test.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

In [29]:
## Building Models Xgboot- Inbuilt Algorithm

In [30]:
container = get_image_uri(boto3.Session().region_name,
                          'xgboost', 
                          repo_version='1.0-1')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [41]:
# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":1000
        }

In [42]:
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.2xlarge', 
                                          train_volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          train_use_spot_instances=True,
                                          train_max_run=300,
                                          train_max_wait=600)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [46]:
estimator.fit({"train":s3_input_train, "validation":s3_input_test}, wait=True)

2022-08-17 14:54:54 Starting - Starting the training job...
2022-08-17 14:55:19 Starting - Preparing the instances for trainingProfilerReport-1660748094: InProgress
.........
2022-08-17 14:56:54 Downloading - Downloading input data
2022-08-17 14:56:54 Training - Downloading the training image.....[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[14:57:28] 22865x10 matrix with 228650 entries loaded from /opt/ml/input/data/train?format=csv&label_colum

In [None]:
# to specify the S3 bucket URI where the Debugger training reports are generated and check if the reports exist.

In [49]:
rule_output_path = estimator.output_path + "/" + estimator.latest_training_job.job_name + "/rule-output"
! aws s3 ls {rule_output_path} --recursive

2022-08-17 14:59:31     329715 xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-report.html
2022-08-17 14:59:31     171084 xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-report.ipynb
2022-08-17 14:59:27        191 xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-reports/BatchSize.json
2022-08-17 14:59:27        199 xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-reports/CPUBottleneck.json
2022-08-17 14:59:27        126 xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-reports/Dataloader.json
2022-08-17 14:59:27        127 xgboost-as-a-bu

In [None]:
# Download the Debugger XGBoost training and profiling reports to the current workspace:

In [50]:
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://insurancee/xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-reports/CPUBottleneck.json to ProfilerReport-1660748094/profiler-output/profiler-reports/CPUBottleneck.json
download: s3://insurancee/xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-reports/BatchSize.json to ProfilerReport-1660748094/profiler-output/profiler-reports/BatchSize.json
download: s3://insurancee/xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-reports/OverallFrameworkMetrics.json to ProfilerReport-1660748094/profiler-output/profiler-reports/OverallFrameworkMetrics.json
download: s3://insurancee/xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-08-17-14-54-54-044/rule-output/ProfilerReport-1660748094/profiler-output/profiler-repo

In [None]:
# Deploy Machine Learning Model As Endpoints

In [54]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

------!

In [61]:
xgb_predictor.endpoint_name

'sagemaker-xgboost-2022-08-17-15-05-26-459'