In [None]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Enable the autoreload extension in mode 2 so edits to imported modules are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
# Load the blackcellmagic extension (used for formatting cells with black).
%load_ext blackcellmagic

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
# Project-local pipeline modules used throughout the notebook.
# NOTE(review): sys.path is only extended in the following cell, i.e. after
# these imports have already run. If any of these modules live under
# /home/shared/utils, this cell fails on a fresh kernel -- confirm where they
# are located and move the path setup ahead of the imports if needed.
import extract
import preprocess as prep
import feature_engineering as fe
import imputations as imp
import evaluate as eval_
import EDA as eda
%matplotlib inline

In [None]:
# Put the shared utilities directory at the front of the module search path.
# NOTE(review): hard-coded absolute path; consider making it configurable.
import sys
sys.path.insert(0,'/home/shared/utils')

## Training Data Preparation

In [None]:
loans = extract.fetch_funded_mature_loans("'2018-01-01'", "'2019-12-31'")

In [None]:
loans = prep.preprocess_iloans(loans)

In [None]:
loans = fe.fe_iloans(loans)

In [None]:
loans = imp.iloans_impute(loans)

In [None]:
access_count = extract.get_esign_time_diff("'2018-01-01'", "'2019-12-31'")

In [None]:
eda_access_count = eda.intitial_eda_checks(access_count)

In [None]:
access_count = prep.preprocess_esign(access_count)

In [None]:
access_count, imp_acc_count = imp.impute_esign(access_count)

In [None]:
imp_acc_count.statistics_

In [None]:
bank_reports = extract.fetch_bank_reports("'2018-01-01'", "'2019-12-31'")

In [None]:
bank_reports = prep.preprocess_bank_reports(bank_reports)

In [None]:
bank_reports = fe.fe_bank_reports(bank_reports)

In [None]:
# Bank-app data for the training window (2018-01-01 .. 2019-12-31).
bank_app = extract.fetch_bank_app_loans("'2018-01-01'", "'2019-12-31'")

In [None]:
# Exploratory checks on the raw bank-app frame; `cols` is displayed two cells below.
cols = eda.counting_unique_values(bank_app)

In [None]:
# Column dtypes and non-null counts.
bank_app.info()

In [None]:
cols

In [None]:
# Frequency of each value in the direct-deposit flag column.
bank_app['in1_is_direct_deposite'].value_counts()

In [None]:
eda.countplot_categorical_columns(bank_app)

In [None]:
# NOTE(review): the result is unpacked into three names here, but into two
# names and into a single name in other cells -- confirm the helper's return
# shape. `a`, `b`, `c` are not used again in the visible notebook.
a, b, c = eda.intitial_eda_checks(bank_app)

In [None]:
eda.boxplot_numerical_columns(bank_app)

In [None]:
bank_app = prep.preprocess_bank_app(bank_app)

In [None]:
bank_app = fe.fe_bank_app(bank_app)

In [None]:
bank_app, imp_dti, imp_pay_day, imp_in1_cycle, imp_miss_loan_pay, imp_is_latest_sal_least, imp_rolling_sal, imp_income_type, imp_net_sal_change = imp.impute_bankapp(bank_app)

In [None]:
imp_dti.statistics_, imp_pay_day.statistics_, imp_in1_cycle.fill_value, imp_miss_loan_pay.statistics_, imp_is_latest_sal_least.fill_value, imp_rolling_sal.statistics_, imp_income_type.fill_value, imp_net_sal_change.statistics_

In [None]:
learning = pd.merge(loans, access_count, how = 'left',on = 'loan_id')

In [None]:
learning = pd.merge(learning, bank_reports, how = 'left',on = 'loan_id')

In [None]:
learning = pd.merge(learning, bank_app, how = 'left', on = 'loan_id')

In [None]:
BV_status_list = ['Bank Validation Uncertain', 'Bank Validation Approved']

In [None]:
learning = learning[learning['bank_app_decision'].isin(BV_status_list)]

In [None]:
learning = learning.loc[learning['primary_account'].notnull(), :]

In [None]:
# Exploratory checks on the merged training frame.
# NOTE(review): the result is unpacked into two names here, but into three
# names and into a single name in other cells -- confirm the helper's return
# shape. `g` is not used again in the visible notebook.
eda_learning, g = eda.intitial_eda_checks(learning)

In [None]:
eda_learning

In [None]:
dendro = eda.dendrogram(learning)

In [None]:
eda.histograms_numeric_columns(learning)

In [None]:
# Heatmap helper called with 'IsFirstDefault' as the dependent variable.
eda.heatmap_numeric_w_dependent_variable(learning, 'IsFirstDefault')

## Test Data Preparation

In [None]:
loans_eval = extract.fetch_funded_mature_loans("'2020-01-01'", "'2020-04-19'")

In [None]:
loans_eval = prep.preprocess_iloans(loans_eval)

In [None]:
loans_eval = fe.fe_iloans(loans_eval)

In [None]:
loans_eval = imp.iloans_impute(loans_eval)

In [None]:
access_count_eval = extract.get_esign_time_diff("'2020-01-01'", "'2020-04-19'")

In [None]:
access_count_eval = prep.preprocess_esign(access_count_eval)

In [None]:
access_count_eval[['AccessCount']] = imp_acc_count.transform(access_count_eval[['AccessCount']])

In [None]:
imp_acc_count.statistics_

In [None]:
bank_reports_eval = extract.fetch_bank_reports("'2020-01-01'", "'2020-04-19'")

In [None]:
bank_reports_eval = prep.preprocess_bank_reports(bank_reports_eval)

In [None]:
bank_reports_eval = fe.fe_bank_reports(bank_reports_eval)

In [None]:
bank_app_eval = extract.fetch_bank_app_loans("'2020-01-01'", "'2020-04-19'")

In [None]:
bank_app_eval = prep.preprocess_bank_app(bank_app_eval)

In [None]:
bank_app_eval = fe.fe_bank_app(bank_app_eval)

In [None]:
bank_app_eval.info()

In [None]:
# Apply the imputers obtained on the training data to the evaluation frame.
# The column -> imputer pairing is declared once and applied in a loop, so a
# column can no longer be paired with the wrong imputer through a copy-paste
# slip. Order matches the original cell order.
eval_imputers = {
    'dti': imp_dti,
    'pay_day_test_result_amount': imp_pay_day,
    'in1_income_cycle': imp_in1_cycle,
    'missing_loan_payment': imp_miss_loan_pay,
    'is_latest_sal_least': imp_is_latest_sal_least,
    'rolling_sal_mean': imp_rolling_sal,
    'income_type': imp_income_type,
    'net_sal_change': imp_net_sal_change,
}

for column, imputer in eval_imputers.items():
    # A one-column frame is passed in and assigned back, exactly as each
    # original cell did.
    bank_app_eval[[column]] = imputer.transform(bank_app_eval[[column]])

In [None]:
testing = pd.merge(loans_eval, access_count_eval, how = 'left',on = 'loan_id')

In [None]:
testing = pd.merge(testing, bank_reports_eval, how = 'left',on = 'loan_id')

In [None]:
testing = pd.merge(testing, bank_app_eval, how = 'left', on = 'loan_id')

In [None]:
BV_status_list = ['Bank Validation Uncertain', 'Bank Validation Approved']

In [None]:
testing = testing[testing['bank_app_decision'].isin(BV_status_list)]

In [None]:
testing = testing.loc[testing['primary_account'].notnull(), :]

In [None]:
testing.info()

## Training

In [None]:
df_loans = learning[['MonthlyGrossIncome', 'Age', 'Reloan', 'LeadProvider', 'LenderCountCred30', 
                     'UniqLenderCount', 'LenderAmountDeb', 'LenderAmountCred', 'LenderAmountDeb30',
                     'LenderAmountCred30', 'LenderCountDeb', 'LenderCountCred', 'LenderCountDeb30',
                     'median_weekly_credit', 'dev_weekly_credit_count', 'median_daily_debit', 'AccessCount',
                     'IsFirstDefault', 'dti', 'pay_day_test_result_amount', 'diff_pos_neg_days', 'median_daily_balance',]]

In [None]:
df_loans = prep.changing_bool_dtypes_to_str(df_loans)

## PyCaret

In [None]:
# Import only the PyCaret functions this notebook calls, instead of a star
# import that floods the namespace and hides where each name comes from.
from pycaret.classification import (
    ensemble_model,
    finalize_model,
    load_model,
    predict_model,
    save_model,
    setup,
    tune_model,
)

In [None]:
clf = setup(data = df_loans, train_size = .99, target = 'IsFirstDefault', session_id = 69,
            normalize = True,
            transformation = True,
            remove_outliers = True)

In [None]:
tuned_lr = tune_model('lr', optimize = 'F1')

In [None]:
bagged_lr = ensemble_model(tuned_lr, method = 'Bagging')

In [None]:
final_model = finalize_model(bagged_lr)

In [None]:
save_model(final_model, 'new_model')

## H2O

In [None]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(max_mem_size='16G')

In [None]:
df_h20_train =  h2o.H2OFrame(df_loans)

In [None]:
y = "IsFirstDefault" 
x = df_h20_train.columns
x.remove(y)

In [None]:
aml = H2OAutoML(max_runtime_secs=300, seed=7)
aml.train(x=x, y=y, training_frame=df_h20_train)

In [None]:
lb = aml.leaderboard
lb.head()

In [None]:
# Persist the best AutoML model. h2o.save_model expects a model object;
# `aml.leaderboard` is the ranking table rather than a model, so the
# top-ranked model `aml.leader` is passed instead.
h2o.save_model(aml.leader, path = 'refactorin_pipeline/new_features_h2o_model')

## Evaluation

## PyCaret

In [None]:
df_eval = testing[['MonthlyGrossIncome', 'Age', 'Reloan', 'LeadProvider', 'LenderCountCred30', 
                     'UniqLenderCount', 'LenderAmountDeb', 'LenderAmountCred', 'LenderAmountDeb30',
                     'LenderAmountCred30', 'LenderCountDeb', 'LenderCountCred', 'LenderCountDeb30',
                     'median_weekly_credit', 'dev_weekly_credit_count', 'median_daily_debit', 'AccessCount',
                     'IsFirstDefault', 'dti', 'pay_day_test_result_amount', 'diff_pos_neg_days', 'median_daily_balance']]

In [None]:
df_eval = prep.changing_bool_dtypes_to_str(df_eval)

In [None]:
new_model = load_model('refactored_model_without_esign')

In [None]:
predictions_pycaret = predict_model(new_model, data = df_eval)

In [None]:
eval_.get_KS(predictions_pycaret, 'IsFirstDefault', 'Score')

## H2O

In [None]:
h2o_eval = h2o.H2OFrame(df_eval)

In [None]:
pred = aml.leader.predict(h2o_eval)
pred.head()

In [None]:
predictions = h2o.as_list(pred)

In [None]:
df_eval.reset_index(drop=True,inplace=True)

In [None]:
predictions['target'] = df_eval['IsFirstDefault']

In [None]:
predictions.rename(columns={'True':'prob'}, inplace = True)

In [None]:
predictions_h2o = predictions[['target','prob']]

In [None]:
predictions_h2o['target'] = predictions_h2o['target'].astype(str)

In [None]:
eval_.get_KS(predictions_h2o, 'target', 'prob')