In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
%load_ext blackcellmagic

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Extract

In [None]:
import pandas as pd
import multiprocessing as mp

In [None]:
from extract import *

## for Training

In [None]:
df = pd.read_csv('training_data.csv')

In [None]:
df = get_examples("2018-01-01", "2019-12-31")

In [None]:
BV_status_list = ['Bank Validation Uncertain', 'Bank Validation Approved']

In [None]:
df = df[df['final_decision'].isin(BV_status_list)]

## for Evaluation

In [None]:
df_eval = pd.read_csv('testing_data.csv')

In [None]:
df_eval = get_examples("2020-01-01","2020-04-19")

In [None]:
BV_status_list = ['Bank Validation Uncertain', 'Bank Validation Approved']

In [None]:
df_eval = df_eval[df_eval['final_decision'].isin(BV_status_list)]

# EDA

In [None]:
from preprocess import *

## Feature engineering

### Primary account

In [None]:
# Worker count for the multiprocessing pools: leave two cores free,
# but never drop below a single worker.
NCPU = max(mp.cpu_count() - 2, 1)

In [None]:
with mp.Pool(processes=NCPU) as pool:
    res_primary_accts = pool.map(get_primary_account, df['BankReportData'])

In [None]:
df['primary_account'] = res_primary_accts

In [None]:
df = df.loc[df['primary_account'].notnull(),:]

### Filter loans having transaction days >= 60 in primary account

In [None]:
with mp.Pool(processes=NCPU) as pool:
    txn_days_count = pool.starmap(get_transaction_days_count, zip(df['primary_account'],df['BankReportData']))

In [None]:
df['txn_days_count'] = txn_days_count

In [None]:
df = df.loc[df['txn_days_count'] == True, :]

### Calculate Age

In [None]:
df['Age'] = df.apply(lambda x: calculate_age(x['OriginationDate'],x['DateOfBirth']), axis = 1)

### New or Reloan

In [None]:
df['Reloan'] = df['LoanCount'].apply(lambda x:True if x>1 else False)

### Lead Provider

In [None]:
df = df.loc[df['Campaign'].notnull(),:]

In [None]:
lead_provider_list = [
    "MarketBullet",
    "StopNGo",
    "Nimbus",
    "EPCVIP",
    "PingBid",
    "LeapThry",
    "Acquir",
    "RoundSky",
    "Zero",
    "LeadPie",
    "ITMedia",
    "LeadsMarket",
]

In [None]:
# `re` is imported here explicitly rather than relying on a star import to provide it.
import re

# One alternation group over the known providers; names are escaped so they are
# always matched literally. Matching is case-insensitive, and the extracted text
# keeps the casing found in `Campaign`. Rows with no match get NaN.
lead_provider_pattern = "(" + "|".join(re.escape(name) for name in lead_provider_list) + ")"
df['LeadProvider'] = df['Campaign'].str.extract(
    lead_provider_pattern, flags=re.IGNORECASE, expand=False
)

In [None]:
df['LeadProvider'] = df['LeadProvider'].fillna('Freedom')

### Lender vars

#### If the lender vars are needed for funded loans between 2018-01-01 and 2019-12-31, do not run the cell below; instead, download them from S3 (see the "download lender vars from s3" section)

In [None]:
# Build the lender variables for every loan in parallel, one call per row.
# The per-loan results are then stacked into a single frame with a fresh index
# (pd.concat requires each result to be a DataFrame/Series).
with mp.Pool(processes=NCPU) as pool:
    df_lender_vars_temp = pool.starmap(
        create_lender_vars,
        zip(df['LoanId'], df['BankReportData'], df['ReportTimeAdded'], df['primary_account']),
    )
df_lender_vars = pd.concat(df_lender_vars_temp, ignore_index=True)

In [None]:
df_lender_vars.reset_index(drop=True,inplace=True)

#### download lender vars from s3

In [None]:
import boto3

In [None]:
#provide access keys if needed
s3 = boto3.client('s3')

In [None]:
# download_file(Bucket, Key, Filename): the third argument is the local path to write.
# Save under the same name that the following pd.read_csv cell opens.
s3.download_file('predicon-bucket', 'lender_vars.csv', 'lender_vars.csv')

In [None]:
df_lender_vars = pd.read_csv('lender_vars.csv')

In [None]:
df = pd.merge(df,df_lender_vars,how='left',on='LoanId')

### Number of days positive and negative

In [None]:
with mp.Pool(processes = NCPU) as pool:
    df_gc_vars_temp = pool.starmap(gc_summary_vars, zip(df['LoanId'], df['BankReportData'], df['primary_account']))
df_gc_vars = pd.concat(df_gc_vars_temp, ignore_index = True)

In [None]:
df_gc_vars['diff_positive_negative_days'] = df_gc_vars['noOfDaysPositive']  - df_gc_vars['noOfDaysNegative']

In [None]:
df = pd.merge(df,df_gc_vars[['LoanId','diff_positive_negative_days']],how = 'left',on = 'LoanId')

### Outlier Treatment and Cleaning

In [None]:
from imputations import *

In [None]:
df['EsigTimeSignedDiff_In_SEC'] = df['EsigTimeSignedDiff_In_SEC'].map(esisgn_outlier_treat)

In [None]:
df.replace({'in1_is_direct_deposite': {'': 'Unknown'}}, inplace = True)

In [None]:
df['dti_percentage'] = df['dti_percentage'].map(dti_outlier_treat)

In [None]:
df.replace({'LeadProvider': {'Roundsky': 'RoundSky'}}, inplace = True)

### Imputation

In [None]:
df_bankapp_imp = fetch_imputation_examples_bankapp("2018-01-01", "2019-12-31")

In [None]:
df_esign_imp = fetch_imputation_examples_esign("2018-01-01", "2019-12-31")

In [None]:
df_esign_imp['EsigTimeSignedDiff_In_SEC'] = df_esign_imp['EsigTimeSignedDiff_In_SEC'].map(esisgn_outlier_treat)

In [None]:
df_bankapp_imp['dti_percentage'] = df_bankapp_imp['dti_percentage'].map(dti_outlier_treat)

In [None]:
imp_dti = impute(df_bankapp_imp, ['dti_percentage'], 'median')

In [None]:
imp_pay_day = impute(df_bankapp_imp, ['pay_day_test_result_amount'], 'median')

In [None]:
imp_esign = impute(df_esign_imp, ['EsigTimeSignedDiff_In_SEC'], 'median')

In [None]:
df[['dti_percentage']] = imp_dti.transform(df[['dti_percentage']])

In [None]:
df[['pay_day_test_result_amount']] = imp_pay_day.transform(df[['pay_day_test_result_amount']])

In [None]:
df[['EsigTimeSignedDiff_In_SEC']] = imp_esign.transform(df[['EsigTimeSignedDiff_In_SEC']])

In [None]:
imp_dti.statistics_, imp_pay_day.statistics_, imp_esign.statistics_

### Pycaret preprocessing

In [None]:
from pycaret.classification import *

In [None]:
df_loans = df[['MonthlyGrossIncome', 'Age', 'Reloan', 'LeadProvider', 'LenderCountCred30', 
                'UniqLenderCount', 'LenderAmountDeb', 'LenderAmountCred', 'LenderAmountDeb30',
                'LenderAmountCred30', 'LenderCountDeb', 'LenderCountCred', 'LenderCountDeb30',
                'AccessCount', 'IsFirstDefault', 'dti_percentage',
                'pay_day_test_result_amount', 'diff_positive_negative_days']]

In [None]:
from dtypes import *

In [None]:
df_loans = changing_bool_dtypes_to_str(df_loans)

In [None]:
df_loans.info()

In [None]:
clf = setup(data = df_loans, train_size = .99, target = 'IsFirstDefault', session_id = 69,
            normalize = True,
            transformation = True,
            remove_outliers = True)

# Train

## useful links
https://docs.databricks.com/_static/notebooks/mlflow/mlflow-quick-start-deployment-aws.html

https://towardsdatascience.com/deploying-models-to-production-with-mlflow-and-amazon-sagemaker-d21f67909198

https://www.h2o.ai/blog/a-deep-dive-into-h2os-automl/

## specify features

In [None]:
features_drop = ['LoanCount',
'OriginationDate',             
'BankReportData',                   
'ReportTimeAdded',                  
'Campaign',
'primary_account',
'txn_days_count', 'DateOfBirth',]

In [None]:
# df_loans is already a narrowed feature selection, so names listed in
# features_drop may be absent from it; errors="ignore" drops the ones that
# exist instead of raising KeyError on the missing ones.
df_train = df_loans.drop(columns=features_drop, errors="ignore")

### H2O

In [None]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(max_mem_size='16G')

In [None]:
df_h20_train =  h2o.H2OFrame(df_loans)

In [None]:
y = "IsFirstDefault" 
x = df_h20_train.columns
x.remove(y)

In [None]:
aml = H2OAutoML(max_runtime_secs=300, seed=7)
aml.train(x=x, y=y, training_frame=df_h20_train)

In [None]:
lb = aml.leaderboard
lb.head()

### pycaret

In [None]:
tuned_lr = tune_model('lr', optimize = 'F1')

In [None]:
bagged_lr = ensemble_model(tuned_lr, method = 'Bagging')

In [None]:
final_model = finalize_model(bagged_lr)

## Saving model

### Pycaret

In [None]:
save_model(final_model, 'refactored_model_without_esign')

### h2o

In [None]:
# download_mojo is a method and must be called to export the leader model as a
# MOJO archive. It returns the path of the file it wrote, shown as the cell output.
# The previous pycaret save_model call received the uncalled method, not a model,
# so it is dropped.
temp = aml.leader.download_mojo(path="h2o_model_mojo.zip")
temp

### Saving the imputer objects

In [None]:
import joblib

In [None]:
# Persist the fitted DTI imputer. Passing the path lets joblib open and close
# the file itself, so no file handle is left open.
joblib.dump(imp_dti, "imp_dti.p")

In [None]:
# Persist the fitted pay-day imputer. Passing the path lets joblib open and
# close the file itself, so no file handle is left open.
joblib.dump(imp_pay_day, "imp_pay_day.p")

In [None]:
# Persist the fitted e-sign imputer with joblib, like the other two imputers.
# `pickle` is not explicitly imported in the visible cells, and passing the path
# lets joblib open and close the file itself.
joblib.dump(imp_esign, "imp_esign.p")

# Predict

### primary account

In [None]:
with mp.Pool(processes=NCPU) as pool:
    res_primary_accts = pool.map(get_primary_account, df_eval['BankReportData'])

In [None]:
df_eval['primary_account'] = res_primary_accts

In [None]:
df_eval = df_eval.loc[df_eval['primary_account'].notnull(),:]

### filter loans having transaction days >= 60 in primary account

In [None]:
with mp.Pool(processes=NCPU) as pool:
    txn_days_count = pool.starmap(get_transaction_days_count, zip(df_eval['primary_account'],df_eval['BankReportData']))

In [None]:
df_eval['txn_days_count'] = txn_days_count

In [None]:
df_eval = df_eval.loc[df_eval['txn_days_count'] == True, :]

### Calculate Age

In [None]:
df_eval['Age'] = df_eval.apply(lambda x: calculate_age(x['OriginationDate'],x['DateOfBirth']), axis = 1)

### New or Reloan

In [None]:
df_eval['Reloan'] = df_eval['LoanCount'].apply(lambda x:True if x>1 else False)

### Lead Provider

In [None]:
df_eval = df_eval.loc[df_eval['Campaign'].notnull(),:]

In [None]:
lead_provider_list = [
    "MarketBullet",
    "StopNGo",
    "Nimbus",
    "EPCVIP",
    "PingBid",
    "LeapThry",
    "Acquir",
    "RoundSky",
    "Zero",
    "LeadPie",
    "ITMedia",
    "LeadsMarket",
]

In [None]:
# `re` is imported here explicitly rather than relying on a star import to provide it.
import re

# One alternation group over the known providers; names are escaped so they are
# always matched literally. Matching is case-insensitive, and the extracted text
# keeps the casing found in `Campaign`. Rows with no match get NaN.
lead_provider_pattern = "(" + "|".join(re.escape(name) for name in lead_provider_list) + ")"
df_eval['LeadProvider'] = df_eval['Campaign'].str.extract(
    lead_provider_pattern, flags=re.IGNORECASE, expand=False
)

In [None]:
df_eval['LeadProvider'] = df_eval['LeadProvider'].fillna('Freedom')

### lender vars

#### If the lender vars are needed for funded loans between 2018-01-01 and 2019-12-31, do not run the cell below; instead, download them from S3 (see the "download lender vars from s3" section)

In [None]:
# Build the lender variables for every evaluation loan in parallel, one call per row.
# The per-loan results are then stacked into a single frame with a fresh index
# (pd.concat requires each result to be a DataFrame/Series).
with mp.Pool(processes=NCPU) as pool:
    df_lender_vars_temp = pool.starmap(
        create_lender_vars,
        zip(
            df_eval['LoanId'],
            df_eval['BankReportData'],
            df_eval['ReportTimeAdded'],
            df_eval['primary_account'],
        ),
    )
df_lender_vars = pd.concat(df_lender_vars_temp, ignore_index=True)

In [None]:
df_lender_vars.reset_index(drop=True,inplace=True)

#### download lender vars from s3

In [None]:
import boto3

In [None]:
#provide access keys if needed
s3 = boto3.client('s3')

In [None]:
# download_file(Bucket, Key, Filename): the third argument is the local path to write.
# Save under the same name that the following pd.read_csv cell opens.
s3.download_file('predicon-bucket', 'lender_vars.csv', 'lender_vars.csv')

In [None]:
df_lender_vars = pd.read_csv('lender_vars.csv')

In [None]:
df_eval = pd.merge(df_eval,df_lender_vars,how='left',on='LoanId')

### Number of days positive and negative

In [None]:
with mp.Pool(processes = NCPU) as pool:
    df_gc_vars_temp = pool.starmap(gc_summary_vars, zip(df_eval['LoanId'], df_eval['BankReportData'], df_eval['primary_account']))
df_gc_vars = pd.concat(df_gc_vars_temp, ignore_index = True)

In [None]:
df_gc_vars['diff_positive_negative_days'] = df_gc_vars['noOfDaysPositive']  - df_gc_vars['noOfDaysNegative']

In [None]:
df_eval = pd.merge(df_eval,df_gc_vars[['LoanId','diff_positive_negative_days']],how = 'left',on = 'LoanId')

In [None]:
df_eval['EsigTimeSignedDiff_In_SEC'] = df_eval['EsigTimeSignedDiff_In_SEC'].map(esisgn_outlier_treat)

In [None]:
df_eval.replace({'in1_is_direct_deposite': {'': 'Unknown'}}, inplace = True)

In [None]:
df_eval['dti_percentage'] = df_eval['dti_percentage'].map(dti_outlier_treat)

In [None]:
df_eval.replace({'LeadProvider': {'Roundsky': 'RoundSky'}}, inplace = True)

### Imputation

In [None]:
df_eval[['dti_percentage']] = imp_dti.transform(df_eval[['dti_percentage']])

In [None]:
df_eval[['pay_day_test_result_amount']] = imp_pay_day.transform(df_eval[['pay_day_test_result_amount']])

In [None]:
df_eval[['EsigTimeSignedDiff_In_SEC']] = imp_esign.transform(df_eval[['EsigTimeSignedDiff_In_SEC']])

### run prediction

## Pycaret

In [None]:
df_loans_eval = df_eval[['MonthlyGrossIncome', 'Age', 'Reloan', 'LeadProvider', 'LenderCountCred30', 
                         'UniqLenderCount', 'LenderAmountDeb', 'LenderAmountCred', 'LenderAmountDeb30',
                         'LenderAmountCred30', 'LenderCountDeb', 'LenderCountCred', 'LenderCountDeb30',
                         'AccessCount', 'IsFirstDefault', 'dti_percentage',
                         'pay_day_test_result_amount', 'diff_positive_negative_days']]

In [None]:
df_loans_eval = changing_bool_dtypes_to_str(df_loans_eval)

## To load the pycaret model, run the cell below, if required

In [None]:
new_model = load_model('refactored_model_without_esign')

In [None]:
predictions_pycaret = predict_model(new_model, data = df_loans_eval)

## H2O

In [None]:
# Use df_loans_eval, the evaluation feature frame built earlier; `df_loan_eval`
# was an undefined name. errors="ignore" skips entries of features_drop that
# are not columns of the frame instead of raising KeyError.
df_loan_predict = df_loans_eval.drop(columns=features_drop, errors="ignore")

In [None]:
h2o_eval = h2o.H2OFrame(df_loans_eval)

In [None]:
pred = aml.leader.predict(h2o_eval)
pred.head()

In [None]:
#convert to pandas dataframe
predictions = h2o.as_list(pred)

In [None]:
predictions.head()

In [None]:
df_loans_eval.reset_index(drop=True,inplace=True)

In [None]:
predictions['target'] = df_loans_eval['IsFirstDefault']  

In [None]:
predictions = predictions.rename(columns={'True':'prob'})

In [None]:
# Keep only the label and the positive-class probability. Take an explicit copy,
# because later cells assign into and modify this frame in place, which on a
# slice triggers SettingWithCopyWarning.
predictions_h2o = predictions[['target', 'prob']].copy()

In [None]:
predictions_h2o['target'] = predictions_h2o['target'].astype(str)

In [None]:
# Class balance of the evaluation labels. `df_predictions` was never defined;
# the frame holding the 'target' column is predictions_h2o.
predictions_h2o['target'].value_counts(normalize = True)

# Evaluate

### compute KS

In [None]:
from evaluate import *

In [None]:
get_KS(predictions_h2o, 'target', 'prob')

In [None]:
get_KS(predictions_pycaret, 'IsFirstDefault', 'Score')

### quantiling & get bins for quantile assignment

### Pycaret

In [None]:
predictions_pycaret.replace({'IsFirstDefault' : {'True' : 1, 'False' : 0}}, inplace = True)

In [None]:
quant, bins = quantile_table_and_score_bins(predictions_pycaret, 'IsFirstDefault', 'Score')

In [None]:
quant

In [None]:
bins

In [None]:
# Persist the pycaret quantile table. Passing the path lets joblib open and
# close the file itself, so no file handle is left open.
joblib.dump(quant, "quanttiles_pycaret.p")

In [None]:
# Persist the pycaret score bins. Passing the path lets joblib open and
# close the file itself, so no file handle is left open.
joblib.dump(bins, "bins_pycaret.p")

### h2o

In [None]:
predictions_h2o.replace({'target' : {'True' : 1, 'False' : 0}}, inplace = True)

In [None]:
quant, bins = quantile_table_and_score_bins(predictions_h2o, 'target', 'prob')

In [None]:
quant

In [None]:
bins

In [None]:
# Persist the H2O quantile table. Passing the path lets joblib open and
# close the file itself, so no file handle is left open.
joblib.dump(quant, "quanttiles_h2o.p")

In [None]:
# Persist the H2O score bins. Passing the path lets joblib open and
# close the file itself, so no file handle is left open.
joblib.dump(bins, "bins_h2o.p")

### upload to sagemaker

In [None]:
import mlflow

In [None]:
import mlflow.h2o as mh2o

In [None]:
import mlflow.sagemaker as mfs

In [None]:
mh2o.save_model(aml.leader,path="path/to/trained/model")

In [None]:
# Settings passed to mlflow.sagemaker.deploy in the next cell.
region = "us-east-1"
arn = "arn:aws:iam::757719720041:role/Sagemaker"
appname = "h20-mlflow-deploy"
# Must be the same directory that mh2o.save_model wrote the model to above;
# the two paths previously differed, so deploy would not find the saved model.
modeluri = "path/to/trained/model"
image_url = "757719720041.dkr.ecr.us-east-1.amazonaws.com/freedom-pyfunc:latest"

In [None]:
mfs.deploy(app_name=appname, model_path=modeluri, instance_type='ml.t2.medium',region_name=region, mode="create",execution_role_arn=arn,image_url=image_url)

In [None]:
import boto3

def check_status(app_name, region_name="us-east-1"):
    """Return the current status of a SageMaker endpoint.

    Args:
        app_name: Name of the endpoint, passed as ``EndpointName`` to
            ``describe_endpoint``.
        region_name: AWS region in which to create the SageMaker client.
            Defaults to "us-east-1", the value that was previously hard-coded.

    Returns:
        The ``EndpointStatus`` field of the ``describe_endpoint`` response.

    Errors raised by boto3 (for example, for an unknown endpoint) are not
    caught and propagate to the caller.
    """
    sage_client = boto3.client('sagemaker', region_name=region_name)
    endpoint_description = sage_client.describe_endpoint(EndpointName=app_name)
    return endpoint_description["EndpointStatus"]

In [None]:
check_status(appname)

### Deploy pycaret

In [None]:
deploy_model(new_model, model_name = 'pycaret_aws', platform = 'aws',
             authentication = {'bucket' : 'pycaret-predicon'})

In [None]:
predictions_pycaret_deployed = predict_model('pycaret_aws', data = df_loans_eval, platform = 'aws', authentication = {'bucket' : 'pycaret-predicon'})