## Imports

In [35]:
import pandas as pd
import xgboost as xgb
import numpy as np
import collections

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import shuffle

from google.cloud import aiplatform

In [37]:
GCP_PROJECT = 'dt-tu-sandbox-dev'
MODEL_BUCKET = 'gs://ovo-xgboost-demo'
VERSION_NAME = 'v1'
MODEL_NAME = 'mortgage_approval_xgboost'
REGION = "europe-west2"

In [40]:
aiplatform.init(project=GCP_PROJECT, location=REGION)

# Training the Model

## Download Data

In [3]:
!gsutil cp 'gs://mortgage_dataset_files/mortgage-small.csv' .

Copying gs://mortgage_dataset_files/mortgage-small.csv...
- [1 files][330.8 MiB/330.8 MiB]                                                
Operation completed over 1 objects/330.8 MiB.                                    


## Load Data

In [11]:
COLUMN_NAMES = collections.OrderedDict({
 'as_of_year': np.int16,
 'agency_code': 'category',
 'loan_type': 'category',
 'property_type': 'category',
 'loan_purpose': 'category',
 'occupancy': np.int8,
 'loan_amt_thousands': np.float64,
 'preapproval': 'category',
 'county_code': np.float64,
 'applicant_income_thousands': np.float64,
 'purchaser_type': 'category',
 'hoepa_status': 'category',
 'lien_status': 'category',
 'population': np.float64,
 'ffiec_median_fam_income': np.float64,
 'tract_to_msa_income_pct': np.float64,
 'num_owner_occupied_units': np.float64,
 'num_1_to_4_family_units': np.float64,
 'approved': np.int8
})

In [12]:
data = pd.read_csv(
 'mortgage-small.csv',
 index_col=False,
 dtype=COLUMN_NAMES
)
data = data.dropna()
data = shuffle(data, random_state=2)
data.head(2)

Unnamed: 0,as_of_year,agency_code,loan_type,property_type,loan_purpose,occupancy,loan_amt_thousands,preapproval,county_code,applicant_income_thousands,purchaser_type,hoepa_status,lien_status,population,ffiec_median_fam_income,tract_to_msa_income_pct,num_owner_occupied_units,num_1_to_4_family_units,approved
310650,2016,Consumer Financial Protection Bureau (CFPB),"Conventional (any loan other than FHA, VA, FSA...",One to four-family (other than manufactured ho...,Refinancing,1,110.0,Not applicable,119.0,55.0,Freddie Mac (FHLMC),Not a HOEPA loan,Secured by a first lien,5930.0,64100.0,98.81,1305.0,1631.0,1
630129,2016,Department of Housing and Urban Development (HUD),"Conventional (any loan other than FHA, VA, FSA...",One to four-family (other than manufactured ho...,Home purchase,1,480.0,Not applicable,33.0,270.0,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Secured by a first lien,4791.0,90300.0,144.06,1420.0,1450.0,0


## Data Validation

In [13]:
# Class labels - 0: denied, 1: approved
print(data['approved'].value_counts())

labels = data['approved'].values
data = data.drop(columns=['approved'])

approved
1    665389
0    334610
Name: count, dtype: int64


## Feature Engineering

In [14]:
dummy_columns = list(data.dtypes[data.dtypes == 'category'].index)
data = pd.get_dummies(data, columns=dummy_columns)

data.head(2)

Unnamed: 0,as_of_year,occupancy,loan_amt_thousands,county_code,applicant_income_thousands,population,ffiec_median_fam_income,tract_to_msa_income_pct,num_owner_occupied_units,num_1_to_4_family_units,...,"purchaser_type_Life insurance company, credit union, mortgage bank, or finance company",purchaser_type_Loan was not originated or was not sold in calendar year covered by register,purchaser_type_Other type of purchaser,purchaser_type_Private securitization,hoepa_status_HOEPA loan,hoepa_status_Not a HOEPA loan,lien_status_Not applicable (purchased loans),lien_status_Not secured by a lien,lien_status_Secured by a first lien,lien_status_Secured by a subordinate lien
310650,2016,1,110.0,119.0,55.0,5930.0,64100.0,98.81,1305.0,1631.0,...,False,False,False,False,False,True,False,False,True,False
630129,2016,1,480.0,33.0,270.0,4791.0,90300.0,144.06,1420.0,1450.0,...,False,True,False,False,False,True,False,False,True,False


## Train-Test Split

In [15]:
x,y = data.values,labels
x_train,x_test,y_train,y_test = train_test_split(x,y)

## Modelling

In [16]:
model = xgb.XGBClassifier(
    objective='reg:logistic'
)

In [17]:
model.fit(x_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='reg:logistic', ...)

## Evaluation

In [18]:
y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred.round())
print(acc, '\n')

0.873904 



## Save Model

In [19]:
model.save_model('model.bst')



# Deploy the model to Vertex Endpoints

In [41]:
PRE_BUILT_IMAGE = "europe-docker.pkg.dev/vertex-ai/training/xgboost-cpu.1-6:latest"

In [48]:
model = aiplatform.Model.upload_xgboost_model_file(
    display_name = MODEL_NAME,
    model_file_path="./model.bst",
    sync=True
)

print(model)

Creating Model
Create Model backing LRO: projects/435046587974/locations/europe-west2/models/5716660019970179072/operations/1834784740083761152
Model created. Resource name: projects/435046587974/locations/europe-west2/models/5716660019970179072@1
To use this Model in another session:
model = aiplatform.Model('projects/435046587974/locations/europe-west2/models/5716660019970179072@1')
<google.cloud.aiplatform.models.Model object at 0x7fc5bfbb16f0> 
resource name: projects/435046587974/locations/europe-west2/models/5716660019970179072


In [56]:
endpoint = model.deploy()

Creating Endpoint
Create Endpoint backing LRO: projects/435046587974/locations/europe-west2/endpoints/4554432248945836032/operations/310316266218848256
Endpoint created. Resource name: projects/435046587974/locations/europe-west2/endpoints/4554432248945836032
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/435046587974/locations/europe-west2/endpoints/4554432248945836032')
Deploying model to Endpoint : projects/435046587974/locations/europe-west2/endpoints/4554432248945836032
Using default machine_type: n1-standard-2
Deploy Endpoint model backing LRO: projects/435046587974/locations/europe-west2/endpoints/4554432248945836032/operations/321575265287274496
Endpoint model deployed. Resource name: projects/435046587974/locations/europe-west2/endpoints/4554432248945836032


In [57]:
example_input = [
    [2016.0, 1.0, 346.0, 27.0, 211.0, 4530.0, 86700.0, 132.13, 1289.0, 1408.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0]
  ]

In [58]:
endpoint.predict(example_input)

Prediction(predictions=[0.9999957084655762], deployed_model_id='8846890459411251200', model_version_id='1', model_resource_name='projects/435046587974/locations/europe-west2/models/5716660019970179072', explanations=None)

endpoint.delete(force=True)
model.delete()