In [1]:
# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import HTML, display

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
%load_ext blackcellmagic

In [4]:
import warnings
warnings.filterwarnings('ignore')

# Extract

In [5]:
import pandas as pd
import multiprocessing as mp

In [6]:
import extract

In [7]:
import importlib
importlib.reload(extract)

<module 'extract' from '/home/vishal/refactoring_pipeline/extract.py'>

## for Training

In [8]:
df = extract.get_examples("2018-01-01", "2019-12-31")

In [9]:
BV_status_list = ['Bank Validation Uncertain', 'Bank Validation Approved']

In [10]:
df = df[df['final_decision'].isin(BV_status_list)]

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5208 entries, 0 to 5526
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   LoanId                      5208 non-null   object        
 1   LoanCount                   5208 non-null   int64         
 2   OriginationDate             5208 non-null   datetime64[ns]
 3   BankReportData              5208 non-null   object        
 4   ReportTimeAdded             5208 non-null   datetime64[ns]
 5   Campaign                    5207 non-null   object        
 6   MonthlyGrossIncome          5208 non-null   float64       
 7   DateOfBirth                 5208 non-null   datetime64[ns]
 8   IsFirstDefault              5208 non-null   bool          
 9   AccessCount                 5207 non-null   float64       
 10  EsigTimeSignedDiff_In_SEC   5207 non-null   float64       
 11  final_decision              5208 non-null   object      

## for Evaluation

In [12]:
df_eval = extract.get_examples("2020-01-01","2020-03-31")

In [13]:
BV_status_list = ['Bank Validation Uncertain', 'Bank Validation Approved']

In [14]:
df_eval = df_eval[df_eval['final_decision'].isin(BV_status_list)]

# EDA

## check missing values

## outlier detection

## check data distribution

# Preprocess

In [15]:
from preprocess import *

## Useful links
https://github.com/pycaret/pycaret/blob/master/Tutorials/Binary%20Classification%20Tutorial%20Level%20Intermediate%20-%20CLF102.ipynb

## Feature engineering

### primary account

In [16]:
# Number of worker processes: all cores minus two, but never fewer than one.
NCPU = max(mp.cpu_count() - 2, 1)

In [17]:
# Apply get_primary_account to every BankReportData value using a pool of NCPU
# worker processes. pool.map returns results in input order, which is what lets
# the next cell assign them back onto df as a column.
with mp.Pool(processes=NCPU) as pool:
    res_primary_accts = pool.map(get_primary_account, df['BankReportData'])

In [18]:
df['primary_account'] = res_primary_accts

In [19]:
# Keep only the loans for which a primary account was found.
has_primary_account = df['primary_account'].notna()
df = df[has_primary_account]

### filter loans having transaction days >= 60 in primary account

In [20]:
with mp.Pool(processes=NCPU) as pool:
    txn_days_count = pool.starmap(get_transaction_days_count, zip(df['primary_account'],df['BankReportData']))

In [21]:
df['txn_days_count'] = txn_days_count

In [22]:
# Keep only the rows whose txn_days_count flag is exactly True.
df = df[df['txn_days_count'].eq(True)]

### Calculate Age

In [23]:
df['Age'] = df.apply(lambda x: calculate_age(x['OriginationDate'],x['DateOfBirth']), axis = 1)

### New or Reloan

In [24]:
# Flag rows whose LoanCount is greater than 1. The vectorised comparison gives
# the same booleans as the former per-row lambda without the Python-level loop.
df['Reloan'] = df['LoanCount'] > 1

### Lead Provider

In [25]:
df = df.loc[df['Campaign'].notnull(),:]

In [26]:
lead_provider_list = [
    "MarketBullet",
    "StopNGo",
    "Nimbus",
    "EPCVIP",
    "PingBid",
    "LeapThry",
    "Acquir",
    "RoundSky",
    "Zero",
    "LeadPie",
    "ITMedia",
    "LeadsMarket",
]

In [27]:

# Local import: no cell visible in this notebook imports `re` explicitly
# (presumably it used to leak in through a star import — confirm).
import re

# Build one alternation group from the known lead providers. Names are escaped so
# that a provider containing a regex metacharacter is still matched literally.
lead_provider_pattern = "(" + "|".join(re.escape(name) for name in lead_provider_list) + ")"

# expand=False returns a Series (there is a single capture group), which is what
# a column assignment expects. The match is case-insensitive and the extracted
# text keeps the casing found in Campaign.
df['LeadProvider'] = df['Campaign'].str.extract(
    lead_provider_pattern, flags=re.IGNORECASE, expand=False
)

In [28]:
df['LeadProvider'] = df['LeadProvider'].fillna('Freedom')

### lender vars

#### If lender vars are needed for loans funded between 2018-01-01 and 2019-12-31, do not run the cell below; download them from S3 instead (see the "download lender vars from s3" section)

In [29]:
# Compute lender variables for every loan in parallel. Each create_lender_vars
# call receives (LoanId, BankReportData, ReportTimeAdded, primary_account); the
# per-loan results are then concatenated into a single frame.
# The former `df_lender_vars = pd.DataFrame()` initialisation was dead code: the
# name is unconditionally rebound by pd.concat below.
with mp.Pool(processes=NCPU) as pool:
    df_lender_vars_temp = pool.starmap(
        create_lender_vars,
        zip(df['LoanId'], df['BankReportData'], df['ReportTimeAdded'], df['primary_account']),
    )
df_lender_vars = pd.concat(df_lender_vars_temp, ignore_index=True)

In [30]:
df_lender_vars.reset_index(drop=True,inplace=True)

#### download lender vars from s3

In [None]:
import boto3

In [None]:
#provide access keys if needed
s3 = boto3.client('s3')

In [None]:
# download_file(Bucket, Key, Filename): the third argument is the local path.
# It must be 'lender_vars.csv' because that is the file the following cell reads;
# the previous placeholder 'FILE_NAME' left read_csv pointing at a missing file.
s3.download_file('predicon-bucket', 'lender_vars.csv', 'lender_vars.csv')

In [None]:
df_lender_vars = pd.read_csv('lender_vars.csv')

In [52]:
df = pd.merge(df,df_lender_vars,how='left',on='LoanId')

### Number of days positive and negative

In [31]:
with mp.Pool(processes = NCPU) as pool:
    df_gc_vars_temp = pool.starmap(gc_summary_vars, zip(df['LoanId'], df['BankReportData'], df['primary_account']))
df_gc_vars = pd.concat(df_gc_vars_temp, ignore_index = True)

In [32]:
df_gc_vars['diff_positive_negative_days'] = df_gc_vars['noOfDaysPositive']  - df_gc_vars['noOfDaysNegative']

In [33]:
df = pd.merge(df,df_gc_vars[['LoanId','diff_positive_negative_days']],how = 'left',on = 'LoanId')

In [41]:
from imputations import *

In [35]:
df['EsigTimeSignedDiff_In_SEC'] = df['EsigTimeSignedDiff_In_SEC'].map(esisgn_outlier_treat)

In [36]:
# Map empty strings in in1_is_direct_deposite to the explicit label 'Unknown'.
blank_to_unknown = {'in1_is_direct_deposite': {'': 'Unknown'}}
df = df.replace(blank_to_unknown)

In [37]:
df['dti_percentage'] = df['dti_percentage'].map(dti_outlier_treat)

In [38]:
df.replace({'LeadProvider': {'Roundsky': 'RoundSky'}}, inplace = True)

### Imputation

In [42]:
df = impute(df, 'dti_percentage', 'median')

In [44]:
df = impute(df, 'in1_is_direct_deposite', 'mode')

In [45]:
df = impute(df, 'pay_day_test_result_amount', 'median')

In [46]:
df = impute(df, 'is_pds_history_found', 'mode')

In [47]:
df = impute(df, 'diff_positive_negative_days', 'median')

In [49]:
df = impute(df, 'EsigTimeSignedDiff_In_SEC', 'median')

### Pycaret preprocessing

In [63]:
from pycaret.classification import *

In [67]:
# Subset of df passed to PyCaret: the model inputs plus IsFirstDefault,
# which is used as the target in setup().
df_loans = df.loc[
    :,
    [
        'MonthlyGrossIncome',
        'Age',
        'Reloan',
        'LeadProvider',
        'LenderCountCred30',
        'UniqLenderCount',
        'LenderAmountDeb',
        'LenderAmountCred',
        'LenderAmountDeb30',
        'LenderAmountCred30',
        'LenderCountDeb',
        'LenderCountCred',
        'LenderCountDeb30',
        'AccessCount',
        'EsigTimeSignedDiff_In_SEC',
        'IsFirstDefault',
        'dti_percentage',
        'pay_day_test_result_amount',
        'diff_positive_negative_days',
    ],
]

In [55]:
from dtypes import *

In [68]:
df_loans = changing_bool_dtypes_to_str(df_loans)

In [71]:
# Initialise the PyCaret classification experiment on df_loans with
# IsFirstDefault as the target; session_id fixes the random seed.
# NOTE(review): train_size = .99 leaves only about 1% of the rows outside the
# training split — confirm this is intentional.
clf = setup(data = df_loans, train_size = .99, target = 'IsFirstDefault', session_id = 69,
            normalize = True,
            transformation = True,
            remove_outliers = True)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,69
1,Target Type,Binary
2,Label Encoded,"False: 0, True: 1"
3,Original Data,"(4518, 19)"
4,Missing Values,False
5,Numeric Features,16
6,Categorical Features,2
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


# Train

## useful links
https://docs.databricks.com/_static/notebooks/mlflow/mlflow-quick-start-deployment-aws.html

https://towardsdatascience.com/deploying-models-to-production-with-mlflow-and-amazon-sagemaker-d21f67909198

https://www.h2o.ai/blog/a-deep-dive-into-h2os-automl/

## specify features

In [None]:
# Columns that are removed before a frame is handed to H2O.
features_drop = [
    'LoanCount',
    'OriginationDate',
    'BankReportData',
    'ReportTimeAdded',
    'Campaign',
    'primary_account',
    'txn_days_count',
    'DateOfBirth',
]

In [None]:
# NOTE(review): df_loans, as built in the "Pycaret preprocessing" section, contains
# none of the columns listed in features_drop and no 'LoanId' column, so on a fresh
# top-to-bottom run this drop would raise KeyError. Presumably this was meant to
# start from the wider `df` — confirm.
df_train = df_loans.drop(columns=features_drop,axis=1)

### H2O

In [None]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(max_mem_size='16G')

In [None]:
df_h20_train =  h2o.H2OFrame(df_train)

In [None]:
y = "IsFirstDefault" 
x = df_h20_train.columns
x.remove(y)
x.remove('LoanId')

In [None]:
aml = H2OAutoML(max_runtime_secs=120, seed=1)
aml.train(x=x, y=y, training_frame=df_h20_train)

In [None]:
lb = aml.leaderboard
lb.head()

### pycaret

In [72]:
tuned_lr = tune_model('lr', optimize = 'F1')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5953,0.7041,0.7125,0.2767,0.3986,0.1749
1,0.5647,0.6415,0.65,0.2488,0.3599,0.1204
2,0.5506,0.6693,0.7,0.2511,0.3696,0.128
3,0.5929,0.6625,0.675,0.2687,0.3843,0.1575
4,0.5506,0.6196,0.6375,0.2394,0.3481,0.1025
5,0.5647,0.6951,0.8125,0.2766,0.4127,0.1833
6,0.5482,0.6466,0.679,0.2489,0.3642,0.1183
7,0.5741,0.6671,0.7531,0.2748,0.4026,0.1712
8,0.5812,0.6529,0.642,0.2587,0.3688,0.1333
9,0.5731,0.6359,0.6375,0.2512,0.3604,0.123


In [73]:
bagged_lr = ensemble_model(tuned_lr, method = 'Bagging')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5835,0.6962,0.6875,0.2657,0.3833,0.1534
1,0.5647,0.6407,0.65,0.2488,0.3599,0.1204
2,0.5553,0.662,0.6625,0.2465,0.3593,0.1171
3,0.5953,0.6584,0.65,0.2653,0.3768,0.1494
4,0.5506,0.6197,0.625,0.237,0.3436,0.0972
5,0.5529,0.6921,0.8,0.2689,0.4025,0.1681
6,0.5718,0.6541,0.7284,0.2694,0.3933,0.1594
7,0.5859,0.6589,0.7284,0.277,0.4014,0.173
8,0.5929,0.6615,0.6296,0.2629,0.3709,0.1395
9,0.5825,0.6416,0.65,0.2587,0.3701,0.1372


In [74]:
final_model = finalize_model(bagged_lr)

# Predict

### primary account

In [None]:
# Worker count: all cores minus two, with a floor of one.
NCPU = max(mp.cpu_count() - 2, 1)

# NOTE(review): df_loan_eval is not assigned in any cell visible above this one
# (the evaluation extract earlier in the notebook is named df_eval) — confirm
# where it is supposed to come from.
with mp.Pool(processes=NCPU) as pool:
    res_primary_accts = pool.map(get_primary_account, df_loan_eval['BankReportData'])

In [None]:
df_loan_eval['primary_account'] = res_primary_accts

In [None]:
df_loan_eval = df_loan_eval.loc[df_loan_eval['primary_account'].notnull(),:]

### filter loans having transaction days >= 60 in primary account

In [None]:
NCPU = mp.cpu_count() - 2 if mp.cpu_count() > 2 else 1
with mp.Pool(processes=NCPU) as pool:
    txn_days_count = pool.starmap(get_transaction_days_count, zip(df_loan_eval['primary_account'],df_loan_eval['BankReportData']))

In [None]:
df_loan_eval['txn_days_count'] = txn_days_count

In [None]:
df_loan_eval = df_loan_eval.loc[df_loan_eval['txn_days_count']==True,:]

### calculate age

In [None]:
df_loan_eval['Age'] = df_loan_eval.apply(lambda x: calculate_age(x['OriginationDate'],x['DateOfBirth']), axis = 1)

### lead provider

In [None]:
df_loan_eval = df_loan_eval.loc[df_loan_eval['Campaign'].notnull(),:]

In [None]:
# Local import: no cell visible in this notebook imports `re` explicitly
# (presumably it used to leak in through a star import — confirm).
import re

# One alternation group over the known lead providers; names are escaped so a
# provider containing a regex metacharacter is still matched literally.
lead_provider_pattern = "(" + "|".join(re.escape(name) for name in lead_provider_list) + ")"

# expand=False returns a Series for the single capture group. Matching is
# case-insensitive; the extracted text keeps the casing found in Campaign.
df_loan_eval['LeadProvider'] = df_loan_eval['Campaign'].str.extract(
    lead_provider_pattern, flags=re.IGNORECASE, expand=False
)

In [None]:
df_loan_eval['LeadProvider']=df_loan_eval['LeadProvider'].fillna('Freedom')

### new or reloan

In [None]:
# Flag rows whose LoanCount is greater than 1. The vectorised comparison gives
# the same booleans as the former per-row lambda without the Python-level loop.
df_loan_eval['Reloan'] = df_loan_eval['LoanCount'] > 1

### lender vars

In [None]:
# Compute lender variables for every evaluation loan in parallel and concatenate
# the per-loan results. The former `df_lender_vars_eval = pd.DataFrame()`
# initialisation was dead code: the name is unconditionally rebound below.
NCPU = max(mp.cpu_count() - 2, 1)  # all cores minus two, never fewer than one
with mp.Pool(processes=NCPU) as pool:
    df_lender_vars_temp = pool.starmap(
        create_lender_vars,
        zip(
            df_loan_eval['LoanId'],
            df_loan_eval['BankReportData'],
            df_loan_eval['ReportTimeAdded'],
            df_loan_eval['primary_account'],
        ),
    )
df_lender_vars_eval = pd.concat(df_lender_vars_temp, ignore_index=True)

In [None]:
df_loan_eval = pd.merge(df_loan_eval,df_lender_vars_eval,on='LoanId',how='left')

### esign variables

In [None]:
df_loan_eval= pd.merge(df_loan_eval,df_esign_eval,on='LoanId',how='left')

### run prediction

## H2O

In [None]:
# Remove the non-feature columns before scoring; `columns=` already implies the
# column axis, so no separate axis argument is passed.
df_loan_predict = df_loan_eval.drop(columns=features_drop)

In [None]:
h2o_eval = h2o.H2OFrame(df_loan_predict)

In [None]:
pred = aml.leader.predict(h2o_eval)
pred.head()

In [None]:
#convert to pandas dataframe
df_predictions = h2o.as_list(pred)

In [None]:
df_loan_eval.reset_index(drop=True,inplace=True)

In [None]:
df_predictions['target'] = df_loan_eval['IsFirstDefault']  

In [None]:
df_predictions = df_predictions.rename(columns={'True':'prob'})

In [None]:
df_predictions = df_predictions[['target','prob']]

In [None]:
df_predictions['target'].value_counts(normalize = True)

## Pycaret

# Evaluate

## get BV uncertain and BV Approved loans for model evaluation

In [14]:
import os
from getpass import getpass

# Connection settings for the bank-review MySQL database.
# Secrets must not live in the notebook: the password is read from the
# BANK_APP_PASSWORD environment variable, falling back to an interactive prompt.
# The value that was previously committed here should be treated as leaked and
# rotated. Non-secret settings keep their former values as defaults and can be
# overridden through the environment.
username_bank_app = os.environ.get('BANK_APP_USER', 'bankreview')
password_bank_app = os.environ.get('BANK_APP_PASSWORD') or getpass('Bank app DB password: ')
host_bank_app = os.environ.get('BANK_APP_HOST', '192.168.4.115')
port_bank_app = int(os.environ.get('BANK_APP_PORT', 3306))  # connect() expects an int port
db_bank_app = os.environ.get('BANK_APP_DB', 'bankreviewdb')

In [17]:
# Open a connection to the bank-review MySQL database using the *_bank_app
# settings defined in the previous cell.
# NOTE(review): no cell visible in this notebook imports pymysql — presumably it
# arrives through a star import; confirm or add an explicit import.
bank_app_conn = pymysql.connect(host=host_bank_app,
                                port=port_bank_app,
                                db=db_bank_app,
                                user=username_bank_app,
                                password=password_bank_app)

In [None]:
query_evaluation_loans = '''select loan_id, 
                                final_decision,
                                reasons_for_decision,
                                entered_date
                                
                            from loan 
                            where campaign like '%Production%'
                            and STR_TO_DATE(entered_date ,'%m/%d/%Y') >= STR_TO_DATE('01/01/2020','%m/%d/%Y')
                            and STR_TO_DATE(entered_date ,'%m/%d/%Y') < STR_TO_DATE('04/01/2020','%m/%d/%Y')
                            and final_decision in ('Bank Validation Uncertain','Bank Validation Approved') '''

In [None]:
df_eval_loans = pd.read_sql_query(query_evaluation_loans, con = bank_app_conn)

## get funded and mature loans for the same period

In [None]:
# Funded loans with a known IsFirstDefault outcome, originated in Q1 2020, for
# merchants 15 and 18.
# The upper bound is the half-open "< '2020-04-01'": an inclusive
# "<= '2020-03-31'" compares against midnight and would drop any loan originated
# later on 31 March if OriginationDate carries a time component. This also matches
# the half-open window used by the evaluation-loans query above.
query_funded_mature_loans = ''' select LoanId, 
                                IsFirstDefault
                        from view_FCL_Loan
                        where OriginationDate >= '2020-01-01' 
                        and OriginationDate < '2020-04-01'
                        and IsFirstDefault IS NOT NULL
                        and MerchantId IN (15, 18)
                     '''

In [None]:
# Run the funded-loans query and load the result into a DataFrame.
# NOTE(review): iloans_conn is not created in any cell visible in this notebook —
# confirm where this connection is opened before relying on a fresh run.
df_funded_mature_loans = pd.read_sql_query(query_funded_mature_loans,con = iloans_conn)

In [None]:
df_funded_mature_loans['LoanId'] = df_funded_mature_loans['LoanId'].astype(int).astype(str)

In [None]:
df_eval = pd.merge(df_funded_mature_loans,df_eval_loans,how = 'inner',left_on = 'LoanId',right_on = 'loan_id')

In [None]:
df_eval.info()

In [None]:
loan_id_list = list(df_eval['LoanId'])

### compute KS

In [None]:
import numpy as np

In [None]:
def get_KS(df_pred):
    """
    Returns the Kolmogorov-Smirnov (KS) statistic for a set of model scores.

    KS is the largest absolute gap between the cumulative share of "good"
    rows (target == False) and "bad" rows (target == True) as the score
    threshold moves from the lowest to the highest value of `prob`.

    The cumulative shares are evaluated once per *distinct* score. Evaluating
    them row by row (as an earlier version did) makes the result depend on the
    arbitrary ordering of rows that share the same score and can overstate KS,
    e.g. two rows with identical scores and opposite targets reported KS = 1
    although the scores carry no information.

    Parameters:
    df_pred (pandas df): DataFrame containing target variable ('target') and
        model score ('prob')

    Returns:
    float: KS value; NaN when the frame is empty or contains only one class,
        since one of the cumulative shares is then undefined
    """
    # Integer indicator columns so that group sums are plain counts.
    good_flag = (df_pred['target'] == False).astype(int)
    bad_flag = (df_pred['target'] == True).astype(int)
    total_good = good_flag.sum()
    total_bad = bad_flag.sum()

    # Group positionally by score (array key, so the frame's index is irrelevant);
    # groupby sorts the distinct scores ascending, then cumsum gives the number of
    # goods / bads with score <= each distinct value.
    scores = df_pred['prob'].values
    cum_good_perc = good_flag.groupby(scores).sum().cumsum() / total_good
    cum_bad_perc = bad_flag.groupby(scores).sum().cumsum() / total_bad

    cum_diff = (cum_good_perc - cum_bad_perc).abs()
    return cum_diff.max()

In [None]:
# get_KS requires the frame holding the 'target' and 'prob' columns; calling it
# without an argument raises TypeError.
get_KS(df_predictions)

### quantiling

In [None]:
def quantile_table(df_pred, n=10):
    """
    Returns a quantile table given model scores (default is decile)

    Parameters:
    df_pred (pandas df): DataFrame containing target variable ('target') and
        model score ('prob')
    n (int): number of equal-frequency score buckets to build (default 10).
        This argument used to be ignored; it now controls both the number of
        buckets and their labels 1..n.

    Returns:
    pandas DataFrame: one row per bucket with columns
        decile, count, min_score, max_score, mean_score, bad_count, perc_bad
        (the bucket column keeps the name 'decile' for any n so existing
        callers are unaffected)
    numpy array: the n + 1 score bin edges produced by pd.qcut

    Raises:
    ValueError: propagated from pd.qcut when the scores have too few distinct
        values to form n unique bin edges
    """
    df_scores = df_pred.sort_values(by='prob')
    # Equal-frequency binning; bucket 1 holds the lowest scores, bucket n the highest.
    df_scores['decile'], score_bin = pd.qcut(
        df_scores['prob'], n, labels=list(range(1, n + 1)), retbins=True
    )
    # Cast the target to 0/1 so that summing it counts the "bad" rows.
    df_scores['target'] = df_scores['target'].astype(int)
    df_scores_deciles = (
        df_scores.groupby('decile', observed=False)
        .agg(
            count=('prob', 'count'),
            min_score=('prob', 'min'),
            max_score=('prob', 'max'),
            mean_score=('prob', 'mean'),
            bad_count=('target', 'sum'),
        )
        .reset_index()
    )
    df_scores_deciles['perc_bad'] = (df_scores_deciles['bad_count'] / df_scores_deciles['count']) * 100
    return df_scores_deciles, score_bin

In [None]:
# Bind the result to its own name. Assigning it to `quantile_table` replaced the
# function with a DataFrame, so running this cell a second time failed with
# "'DataFrame' object is not callable".
df_quantiles, score_bins = quantile_table(df_predictions)
df_quantiles

### get bins for quantile assignment

In [None]:
score_bins

In [None]:
# Add open-ended outer edges so that any score, including ones outside the range
# seen here, falls into some bin. Guarded so that re-running the cell does not
# keep stacking additional infinite edges onto score_bins.
if not (np.isneginf(score_bins[0]) and np.isposinf(score_bins[-1])):
    score_bins = np.concatenate(([-np.inf], score_bins, [np.inf]))

### upload to sagemaker

In [None]:
import mlflow

In [None]:
import mlflow.h2o as mh2o

In [None]:
import mlflow.sagemaker as mfs

In [None]:
mh2o.save_model(aml.leader,path="path/to/trained/model")

In [None]:
# Settings for the SageMaker deployment performed in the next cell.
region = "us-east-1"
arn = "arn:aws:iam::757719720041:role/Sagemaker"
appname = "h20-mlflow-deploy"
# Must be the directory written by mh2o.save_model(...) in the cell above; the two
# placeholders previously differed, so deploy pointed at a path nothing was saved to.
modeluri = "path/to/trained/model"
image_url = "757719720041.dkr.ecr.us-east-1.amazonaws.com/freedom-pyfunc:latest"

In [None]:
# Create the SageMaker endpoint from the saved MLflow model.
deploy_kwargs = dict(
    app_name=appname,
    model_path=modeluri,
    instance_type='ml.t2.medium',
    region_name=region,
    mode="create",
    execution_role_arn=arn,
    image_url=image_url,
)
mfs.deploy(**deploy_kwargs)

In [None]:
import boto3

def check_status(app_name, region_name="us-east-1"):
    """
    Returns the current status of a SageMaker endpoint.

    Parameters:
    app_name (str): name of the SageMaker endpoint to look up
    region_name (str): AWS region the endpoint lives in. Defaults to
        "us-east-1", the value that used to be hard-coded, so existing
        single-argument calls behave as before.

    Returns:
    str: the "EndpointStatus" field of the describe_endpoint response
    """
    sage_client = boto3.client('sagemaker', region_name=region_name)
    endpoint_description = sage_client.describe_endpoint(EndpointName=app_name)
    return endpoint_description["EndpointStatus"]

In [None]:
check_status(appname)