In [1]:

def project_2_scoring_loans(data, model_path="./artifacts_SBALoans/gbm_grid1_model_120"):
    """
    Score a loans dataset with the saved H2O GBM model.

    The input frame is never modified: all cleaning and feature engineering
    happens on an internal copy, so the function can be called repeatedly on
    the same DataFrame.

    Requirements:
        - ``h2o`` must be imported at module level and a cluster must be
          running (``h2o.init(...)``) before this function is called.

    Input:
        data: Pandas DataFrame of loan records. It must contain the currency
            columns ('DisbursementGross', 'BalanceGross', 'GrAppv',
            'SBA_Appv') and the categorical columns listed in ``cat_columns``
            below. The 'MIS_Status' label column is optional; when present it
            is excluded from the columns sent to the model.
        model_path: location of the saved H2O model artifact to upload to the
            cluster. Defaults to the GBM grid model shipped with the project.

    Output:
        The 'predict' column of the model's prediction frame, one label per
        input record, in input order. This is an H2O frame, not a Python
        list (NOTE(review): convert with e.g. ``.as_data_frame()`` if a plain
        list is required by the caller).

    Flow:
        - Copy and clean the dataset (missing values, currency strings)
        - Add engineered features
        - Convert to an H2OFrame and mark categorical columns
        - Upload the model artifact and score
        - Return labels
    """
    import pandas as pd

    response = "MIS_Status"
    currency_columns = ['DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv']
    cat_columns = ['City', 'State', 'Bank', 'BankState', 'RevLineCr', 'LowDoc',
                   'Zip', 'UrbanRural', 'FranchiseCode', 'NewExist']

    # Work on a private copy so the caller's DataFrame is left untouched.
    loans = data.copy()
    has_response = response in loans.columns

    # Replace NA/null values: 0 for numeric columns, "Missing" otherwise.
    # The label is never filled, and currency columns are handled separately
    # below (a "Missing" marker there could not be converted to float).
    fill_values = {}
    for col in loans.columns:
        if col == response or col in currency_columns:
            continue
        fill_values[col] = 0 if pd.api.types.is_numeric_dtype(loans[col]) else "Missing"
    if fill_values:
        loans = loans.fillna(value=fill_values)

    # Convert strings styled as '$X,XXX.XX ' to floats. Missing amounts become
    # 0, and columns that are already numeric pass through unchanged.
    for col in currency_columns:
        amounts = loans[col].astype(object)
        amounts = amounts.where(amounts.notna(), "0").astype(str)
        loans[col] = amounts.str.replace(r"[ ,$]", "", regex=True).astype(float)

    # Engineered features expected by the model.
    loans['LoanDisbursedPerCity'] = loans.groupby('City')['DisbursementGross'].transform('sum')
    loans['LoanPaid'] = loans['DisbursementGross'] - loans['BalanceGross']

    loans_hold_out_data = h2o.H2OFrame(loans)

    # Mark categorical predictors (and the label, when supplied) as factors.
    factor_columns = cat_columns + ([response] if has_response else [])
    loans_hold_out_data[factor_columns] = loans_hold_out_data[factor_columns].asfactor()

    # Upload the previously downloaded model artifact to the H2O cluster.
    loans_gbm_model = h2o.upload_model(model_path)

    # Score on every column except the label.
    columns_to_score = [col for col in loans_hold_out_data.columns if col != response]
    y_pred = loans_gbm_model.predict(loans_hold_out_data[:, columns_to_score])['predict']

    return y_pred

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 

import shap

import h2o
from h2o.estimators import H2OTargetEncoderEstimator

# Best-effort shutdown of any H2O cluster this session is attached to, so the
# next h2o.init() starts from a known state. Failures are ignored on purpose,
# but only ordinary exceptions: a bare `except:` would also swallow
# KeyboardInterrupt and SystemExit.
try:
    h2o.cluster().shutdown()
except Exception:
    pass

In [3]:
# Start (or connect to) the H2O cluster, capped at 3 threads and 8 GB of memory.
h2o.init(max_mem_size=8, nthreads=3)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,2 hours 54 mins
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.1
H2O_cluster_version_age:,4 months and 6 days !!!
H2O_cluster_name:,H2O_from_python_General_1x6yvg
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.413 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,3


In [7]:
# Read the hold-out loans file (path is relative to the notebook's working directory).
data_hold_out = pd.read_csv(
    'sbaLoansTest.csv',
)

In [8]:
# Score the hold-out loans; `ans` holds the model's `predict` column.
ans = project_2_scoring_loans(data_hold_out)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [9]:
# Display the predicted labels returned by the scoring function.
ans

predict
0
0
0
0
0
0
0
0
1
0


