In [None]:
!jt -r

# Extract

In [None]:
import datetime
import getpass
import json
import multiprocessing as mp
import os
import re

import pandas as pd
import pymssql
import pymysql

In [None]:
# Connection settings are read from the environment so that no secret lives in the
# notebook. Server, database and user keep their previous values as defaults.
server = os.environ.get('FCL_DB_SERVER', '192.168.4.117')
database = os.environ.get('FCL_DB_NAME', 'FreedomCashLenders')
username = os.environ.get('FCL_DB_USER', 'FreedomCashLendersAll')
# The password is never stored in the file: it comes from FCL_DB_PASSWORD or, when
# that variable is unset/empty, from an interactive prompt.
# NOTE(review): the password that used to be committed here should be rotated.
mssql_password = os.environ.get('FCL_DB_PASSWORD') or getpass.getpass('MSSQL password: ')

In [None]:
iloans_conn = pymssql.connect(server, username, mssql_password, database, port = 1433)

In [None]:
# Origination-date window used by the training extracts below. Each value carries
# its own single quotes because it is interpolated verbatim into the SQL text.
start_date = "'2018-01-01'"
end_date = "'2019-12-31'"

In [None]:
query_loan = f'''select LN.LoanId,
                       LC.LoanCount,
                       LN.OriginationDate,
                       GC.BankReportData,
                       GC.TimeAdded as ReportTimeAdded,
                       LN.Campaign,
                       LN.MonthlyGrossIncome,
                       LN.DateOfBirth,
                       LN.IsFirstDefault
                       
                from view_FCL_Loan LN
                LEFT JOIN view_FCL_CustomerLoanCount LC ON LC.CustomerId = LN.CustomerId
                LEFT JOIN view_FCL_GetCreditDataLoan GCD ON LN.LoanId = GCD.LoanId
                LEFT JOIN view_FCL_GetCreditData GC ON GC.BankTransactionId = GCD.BankTransactionId
                
                
                where LN.OriginationDate >= {start_date}
                and LN.OriginationDate <= {end_date} 
                and LN.IsFirstDefault IS NOT NULL
                and LN.MerchantId IN (15, 18)
                and GC.ReportStatus = 'COMPLETE' '''

In [None]:
df_loans = pd.read_sql_query(query_loan,con = iloans_conn)

In [None]:
df_loans = df_loans.drop_duplicates('LoanId')

In [None]:
query_esign = f'''
SELECT
    LN.LoanId,
    ESIG.AccessCount,
    ESIG.EsigTimeSignedDiff_In_SEC
FROM
    view_FCL_Loan LN
    LEFT JOIN view_FCL_EsignatureCustomerData ESIG ON LN.LoanId = ESIG.LoanId
WHERE
    LN.OriginationDate >= {start_date} 
    and LN.OriginationDate <= {end_date}
    and LN.IsFirstDefault IS NOT NULL
    and LN.MerchantId IN (15, 18)

'''


In [None]:
df_esign = pd.read_sql_query(query_esign,con=iloans_conn)

## Extract Data for Evaluation

In [None]:
def stringify_account_ids(loan_id_list):
    """
    Render a collection of ids as a parenthesised, comma-separated string.

    The result has the shape ``(1, 2, 3)`` so it can be placed directly after
    an SQL ``IN`` keyword.

    :param loan_id_list: iterable of ids; every element is passed through ``str``
    :return: string such as ``'(1, 2, 3)'``; an empty iterable yields ``'()'``
    """
    joined_ids = ', '.join(map(str, loan_id_list))
    return f'({joined_ids})'

In [None]:
# NOTE(review): `loan_id_list` is not defined in any of the cells above; it has to
# exist in the kernel before this cell runs — TODO define/load it explicitly.
loanid_string=stringify_account_ids(loan_id_list)

In [None]:
query_loan_eval = '''select LN.LoanId,
                       LC.LoanCount,
                       LN.OriginationDate,
                       GC.BankReportData,
                       GC.TimeAdded as ReportTimeAdded,
                       LN.Campaign,
                       LN.MonthlyGrossIncome,
                       LN.DateOfBirth,
                       LN.IsFirstDefault
                       
                from view_FCL_Loan LN
                LEFT JOIN view_FCL_CustomerLoanCount LC ON LC.CustomerId = LN.CustomerId
                LEFT JOIN view_FCL_GetCreditDataLoan GCD ON LN.LoanId = GCD.LoanId
                LEFT JOIN view_FCL_GetCreditData GC ON GC.BankTransactionId = GCD.BankTransactionId
                
                
                WHERE
                    GC.ReportStatus = 'COMPLETE'
                    AND LN.LoanId IN %s'''%(loanid_string)

In [None]:
df_loan_eval = pd.read_sql_query(query_loan_eval,con=iloans_conn)

In [None]:
df_loan_eval=df_loan_eval.drop_duplicates('LoanId')

In [None]:
query_esign_eval = '''
SELECT
    LN.LoanId,
    ESIG.AccessCount,
    ESIG.EsigTimeSignedDiff_In_SEC
FROM
    view_FCL_Loan LN
    LEFT JOIN view_FCL_EsignatureCustomerData ESIG ON LN.LoanId = ESIG.LoanId
WHERE
    LN.LoanId IN %s'''%(loanid_string)

In [None]:
df_esign_eval = pd.read_sql_query(query_esign_eval,con=iloans_conn)

In [None]:
df_esign_eval=df_esign_eval.drop_duplicates('LoanId')

# EDA and ML Modelling

## Checking the importance of the new feature

We will be using the following methodology to check whether a new engineered feature is meaningful or not:<br>
1> Calculate the correlation of all the features with the target (Spearman correlation).<br>
2> Check for the significance of that correlation.<br>
3> Check if the feature causes multicollinearity in the dataset.<br>
4> Check the Predictive Power Score between all the features and the target.<br>

In [None]:
def feature_importance(dataframe, new_feature, target, threshold = 0.9, return_corr_matrix = True, return_pps_matrix = True):
    """
    Print and plot how every feature of `dataframe` relates to `target` and to the other features.

    The report consists of the Predictive Power Score (PPS, via the ppscore module)
    of each feature w.r.t. the target, a significance test of each feature against
    the target, optional feature-to-feature heatmaps, and the list of feature pairs
    whose PPS exceeds `threshold`.
    ----------
    Parameters:
       dataframe(pandas df) : The dataframe consisting the whole dataset along with the new feature
       new_feature(string) : Name of the new feature, as in the dataframe. Kept for
                             interface compatibility; the report covers every feature.
       target(string) : Name of the target, as in the dataframe
       threshold(float) : default = 0.9; feature pairs whose absolute PPS is above this
                          value are reported as potentially multicollinear.
       return_corr_matrix : default = True; show the spearman correlation heatmap of the
                            feature-space (not including the target).
       return_pps_matrix : default = True; show the PPS heatmap of the feature-space
                           (not including the target).
    ----------
    Returns:
       int : always 0. Everything else is printed or plotted:
             - heatmaps of the spearman correlation and PPS between features (if requested),
             - a dict of the PPS of each feature with the target,
             - a dict of (statistic, p-value) per feature: chi-square for bool/object
               features, point-biserial correlation for int64/float64 features,
             - the feature pairs whose PPS exceeds `threshold`.
    """

    import warnings
    # NOTE: this silences warnings for the whole kernel, not only for this call.
    warnings.filterwarnings('ignore')
    import ppscore as pps
    import pandas as pd
    import seaborn as sns
    from scipy.stats import pointbiserialr, chi2_contingency
    import matplotlib.pyplot as plt

    df = dataframe
    features = df.drop([target], axis = 1)

    # PPS of every feature w.r.t. the target
    pps_score = pps.matrix(df)
    pps_feat_tar = {feature: pps_score.loc[target, feature] for feature in features.columns}

    # Feature-to-feature PPS is always needed for the threshold scan below,
    # whether or not its heatmap is requested.
    pps_feat_2_feat = pps.matrix(features)

    # Feature-to-feature spearman correlation is only used for its heatmap.
    spearman_feat_2_feat = features.corr(method = 'spearman') if return_corr_matrix else None

    # Significance test of each feature against the target. The loop variable is
    # deliberately not called `new_feature`, so the argument is not overwritten.
    r = {}
    for column in features.columns:
        column_dtype = df[column].dtype
        if column_dtype == 'bool' or column_dtype == 'O':
            chi2_result = chi2_contingency(pd.crosstab(df[column], df[target]))
            r[column] = (chi2_result[0], chi2_result[1])
        elif column_dtype == 'int64' or column_dtype == 'float64':
            pb_result = pointbiserialr(df[column], df[target])
            r[column] = (pb_result[0], pb_result[1])
        else:
            print("check you new feature data type---should be one among [int64, bool, object]")

    # Feature pairs whose PPS exceeds the caller-supplied threshold
    corr_tuplist = []
    corr = pps_feat_2_feat
    for i in range(corr.shape[0]):
        for j in range(corr.shape[1]):
            if (i != j) and (abs(corr.iloc[i, j]) > threshold):
                corr_tuplist.append((corr.index[i], corr.columns[j]))

    if return_corr_matrix:
        plt.figure(figsize = (16, 8))
        print("Correlation heatmap")
        sns.heatmap(spearman_feat_2_feat, robust = 1, linewidth = 2, annot = True)
        plt.show()

    if return_pps_matrix:
        plt.figure(figsize = (16, 8))
        print("PPS Score heatmap")
        sns.heatmap(pps_feat_2_feat, robust = 1, linewidth = 2, annot = True)
        plt.show()

    print("PPS score details\n", pps_feat_tar)
    print("\n")

    print("statistics-value & p-value of features wrt target(if feature was of boolean or object type, chi-square value shown) : ", r)
    print("\n")

    if len(corr_tuplist) != 0:
        print("Scores of feature_pairs exceeding the pps threshold : ", corr_tuplist)
    else:
        print("No feature pairs exceed the pps threshold")

    return 0

## Importing Libraries

In [1]:
# Optional cosmetic tweak: widen the notebook cells to the full browser width.
# `display` and `HTML` are imported from IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd # for reading the data into dataframes 
from pandas_profiling import ProfileReport # for in-depth eda
import numpy as np # for faster numeric based calculations
from pycaret.classification import * # for machine learning modelling
from matplotlib import pyplot as plt # for plotting
import seaborn as sns # for a high level plotting
from sklearn.impute import KNNImputer as knn # for imputing missing or erraneous values in the dataset
import cufflinks as cf #for annotating the plots
from sklearn.ensemble import RandomForestClassifier # a random forest classifier
from sklearn.tree import DecisionTreeClassifier # a decision tree based classifier
from sklearn.model_selection import GridSearchCV# for hyperparameter tuning
from sklearn.metrics import confusion_matrix as cm # for visualizing the predicted results
from sklearn.naive_bayes import GaussianNB # for probailistic modelling

## Setting up some styles for plotting

In [None]:
sns.set(style = "darkgrid", font = 'fantasy') # for seaborn based plotting
cf.set_config_file(theme = 'ggplot', sharing = 'public', offline = True) # to show the graphs inside the notebook

## Importing the training as well as testing dataset

In [None]:
df_train = pd.read_csv('Downloads/training_data_2020-04-16.csv', header = 0, index_col = None)
df_test = pd.read_csv('Downloads/eval_19-04-2020.csv', header = 0, index_col = None)

## Merging both the datasets

In [None]:
df = pd.concat([df_train, df_test], ignore_index = True)

## Lets see the properties of training as well as testing datasets

In [None]:
df_train.shape
# training data consists of 14216 samples and 17 features

In [None]:
df_test.shape
# testing data consists of 941 samples and 17 features

## Dropping LoanID

 As we can see that LoanID has a very large size and is of no use for prediction as well<br>
 So lets drop this feature before moving ahead from the joined dataset<br>

In [None]:
df_train = df_train.drop(['LoanId'], axis = 1)# dropping the LoanId feature

## Converting boolean data types to string

In [None]:
# Boolean data types are not well supported by the modules used below, so the
# columns at positions 1 and 4 (counted after LoanId was dropped) are converted to
# strings. The conversion is done per column instead of cell by cell.
for bool_col in df_train.columns[[1, 4]]:
    df_train[bool_col] = df_train[bool_col].astype(str)

# Machine Learning

We will be using the famous pycaret module for preprocessing, training, evaluation and predictions.<br>
We will be following the mentioned workflow:<br>
1. Preprocessing<br>
2. Model training<br>
3. Model hyperparameter tuning<br>
4. Blending the top performing models<br>
5. Plot-Evaluation<br>
6. Prediction<br>
7. Metric-Evaluation<br>
8. Saving the model

## Preprocessing

Data preprocessing is an important step in the data mining process. The phrase "garbage in, garbage out" is particularly applicable to data mining and machine learning projects. Data-gathering methods are often loosely controlled, resulting in out-of-range values (e.g., Income: −100), impossible data combinations (e.g., Sex: Male, Pregnant: Yes), missing values, etc.<br>

Analyzing data that has not been carefully screened for such problems can produce misleading results. Thus, the representation and quality of data is first and foremost before running an analysis. Often, data preprocessing is the most important phase of a machine learning project, especially in computational biology.<br>

If there is much irrelevant and redundant information present or noisy and unreliable data, then knowledge discovery during the training phase is more difficult. Data preparation and filtering steps can take considerable amount of processing time. Data preprocessing includes cleaning, Instance selection, normalization, transformation, feature extraction and selection, etc. The product of data preprocessing is the final training set.<br>

Data pre-processing may affect the way in which outcomes of the final data processing can be interpreted.<br>

On that note, we can easily imagine how tedious and time-consuming this step could be. In order to solve this dilemma, we are going to use pycaret's setup function, which does all the preprocessing.

Let's summarize how the profile discussed in section **2.8** has helped make critical pre-processing choices with the data.<br>

**Missing Values**: There are no missing values in the data. However, we still need imputers in our pipeline just in case the new unseen data has missing values (not applicable in this case). When you execute the setup() function, imputers are created and stored in the pipeline automatically. By default, it uses a mean imputer for numeric values and a constant imputer for categorical. This can be changed using the numeric_imputation and categorical_imputation parameters in setup().<br>

**Multicollinearity**: There are high correlations between LenderAmountDeb30 and LenderAmountDeb as well as between LenderCountDeb and LenderCountDeb30, which introduces multicollinearity into the data. We will remove multi-collinearity by using the remove_multicollinearity and multicollinearity_threshold parameters in setup.<br>

**Data Scale / Range**: Notice how the scale / range of numeric features are different. For example the Age feature ranges from between 25 to 87, EsigTimeSignedDiff_In_SEC ranges from -264 to 2,62,83,332 and MonthlyGrossIncome ranges from 400 to 1,00,000. This may cause problems for algorithms that assume all features have variance within the same order. In this case, the order of magnitude for all these variables is very different from each other. We will deal with this problem by using the normalize parameter in setup.<br>

**Distribution of Feature Space**: Numeric features are not normally distributed. Look at the distributions of Age, EsigTimeSignedDiff_In_SEC and MonthlyGrossIncome. A few features are also highly skewed such as AccessCount and EsigTimeSignedDiff_In_SEC. This may cause problems for algorithms that assume normal or approximate normal distributions of the data. Examples include Logistic Regression, Linear Discriminant Analysis (LDA) and Naive Bayes. We will deal with this problem by using the transformation parameter in setup.<br>

**Group Features**: From the data description we know that certain features are related with each other such as LenderAmountCred and LenderAmountCred30 and more features which are related from a sample level. We will use the group_features parameter in setup to extract statistical information from these features.<br>

**Bin Numeric Features**: When looking at the correlations between the numeric features and the target variable, we see that the correlation for Age is weak. We will use the bin_numeric_features parameter to remove the noise from these variables which may help linear algorithms.<br>

**Combine Rare Levels**: Sometimes a dataset can have a categorical feature (or multiple categorical features) that has a very high number of levels (i.e. high cardinality features). If such feature (or features) are encoded into numeric values, then the resultant matrix is a sparse matrix. This not only makes experiment slow due to manifold increment in the number of features and hence the size of the dataset, but also introduces noise in the experiment. Sparse matrix can be avoided by combining the rare levels in the feature(or features) having high cardinality.<br>

**Ignore Low Variance**: Sometimes a dataset may have a categorical feature with multiple levels, where distribution of such levels are skewed and one level may dominate over other levels. This means there is not much variation in the information provided by such feature.  For a ML model, such feature may not add a lot of information and thus can be ignored for modeling.Both conditions below must be met for a feature to be considered a low variance feature:<br>
    Count of unique values in a feature  / sample size < 10%<br>
    Count of most common value / Count of second most common value > 20 times.<br>

**Feature Interaction**: It is often seen in machine learning experiments that two features combined through an arithmetic operation become more significant in explaining variances in the data than the same two features separately. Creating a new feature through interaction of existing features is known as feature interaction. It can be achieved in PyCaret using the feature_interaction and feature_ratio parameters within setup. Feature interaction creates new features by multiplying two variables (a * b), while feature ratios create new features by calculating the ratios of existing features (a / b).<br>

**Polynomial Features**: In machine learning experiments, the relationship between the dependent and independent variable is often assumed as linear, however this is not always the case. Sometimes the relationship between dependent and independent variables is more complex. Creating new polynomial features sometimes might help in capturing that relationship which otherwise may go unnoticed.<br>

**Remove Outliers**: The Remove Outliers function in PyCaret allows you to identify and remove outliers from the dataset before training the model. Outliers are identified through PCA linear dimensionality reduction using the Singular Value Decomposition technique.

### Imputation

As we saw previously, EsigTimeSignedDiff_In_SEC contains many negative as well as positive extreme value which need to be capped with something sensible. Hence, for the time being, we will be using KNN Imputer which imputes on the basis of nearest neighbors in the high dimensional feature space.

In [None]:
imputer_knn = knn(missing_values = np.nan, n_neighbors = 7)# initiating the imputer with parameters

# Values outside [0, 432000] seconds (432000 s = 5 days) make no sense for this
# feature, so they are blanked out in one vectorized step and re-imputed below.
esig_col = 'EsigTimeSignedDiff_In_SEC'
out_of_range = (df_train[esig_col] < 0) | (df_train[esig_col] > 432000)
df_train.loc[out_of_range, esig_col] = np.nan

# finally, imputing all the blanked-out values
# NOTE(review): the imputer is fitted on this single column only, so there are no
# other features to measure neighbour distance on — presumably this degenerates to
# filling with the column mean; confirm against the KNNImputer documentation.
df_train[[esig_col]] = imputer_knn.fit_transform(df_train[[esig_col]])

### Over-sampling the minority class to reduce the imbalance to a certain extent, using the Adaptive Synthetic (ADASYN) sampling approach for imbalanced datasets. You can read the documentation here : <br>https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.ADASYN.html#imblearn.over_sampling.ADASYN

In [None]:
df_new = pd.get_dummies(df_train, columns = ['Reloan', 'LeadProvider'])

In [None]:
y = df_new['IsFirstDefault']
X = df_new.drop(['IsFirstDefault'], axis = 1)

In [None]:
# NOTE(review): ADASYN is not imported in any of the cells above; presumably
# `from imblearn.over_sampling import ADASYN` is required first — confirm.
oversmp = ADASYN(random_state = 42)

In [None]:
X_res, y_res = oversmp.fit_resample(X, y)

In [None]:
df_new = X_res
df_new['IsFirstDefault'] = y_res

In order to feed the dataset inside the pipeline, all the uint type datatypes need to be converted into int32

In [None]:
# Cast every one-hot column created by get_dummies for 'Reloan' and 'LeadProvider'
# to a plain int dtype. The columns are found by prefix rather than by a hard-coded
# list, so a provider that is missing from (or new in) the data does not raise a KeyError.
dummy_cols = [col for col in df_new.columns if col.startswith(('Reloan_', 'LeadProvider_'))]
if dummy_cols:
    df_new[dummy_cols] = df_new[dummy_cols].astype(int)

### Building the preprocessing pipeline

The setup() function initializes the environment in pycaret and creates the transformation pipeline to prepare the data for modeling and deployment. setup() must be called before executing any other function in pycaret. It takes two mandatory parameters: a pandas dataframe and the name of the target column.<br>

When setup() is executed, PyCaret's inference algorithm will automatically infer the data types for all features based on certain properties. The data type should be inferred correctly but this is not always the case. To account for this, PyCaret displays a table containing the features and their inferred data types after setup() is executed. If all of the data types are correctly identified, Enter can be pressed to continue, or `quit` can be typed to end the experiment. Ensuring that the data types are correct is of fundamental importance in PyCaret as it automatically performs a few pre-processing tasks which are imperative to any machine learning experiment. These tasks are performed differently for each data type, which means it is very important for them to be correctly configured.

In [None]:
clf = setup(data = df_new, train_size = .99, target = 'IsFirstDefault', session_id = 769,
            normalize = True,
            combine_rare_levels = True,
            transformation = True,
            ignore_low_variance = True, 
            remove_multicollinearity = True,
            bin_numeric_features = ['Age'],
            group_features = [['LenderAmountCred', 'LenderAmountCred30'], ['LenderCountCred', 'LenderCountCred30']],
            feature_interaction = True, feature_ratio = True,
            feature_selection = True,
            polynomial_features = True,
            remove_outliers = True)

Saving the preprocessed data from the pipeline

In [None]:
# NOTE(review): these positions depend on the tuple layout returned by setup() in the
# installed pycaret version. Presumably positions 0/1 hold the full transformed X/y
# and the train split sits at other positions — confirm before relying on these names.
X_train = clf[0]
y_train = clf[1]

In [None]:
X_test = clf[3]
y_test = clf[5]

In [None]:
df.shape

In [None]:
clf[0].shape

## Model Training and Tuning

When a model is created using the create_model() function it uses the default hyperparameters. In order to tune hyperparameters, the tune_model() function is used. This function automatically tunes the hyperparameters of a model on a pre-defined search space and scores it using stratified cross validation. The output prints a score grid that shows Accuracy, Recall, Precision, F1 and Kappa by fold. 

In [None]:
tuned_rf = tune_model('rf', optimize = 'F1')

In [None]:
tuned_lgbm = tune_model('lightgbm', optimize = 'F1')

In [None]:
tuned_nb = tune_model('nb', optimize = 'F1')

In [None]:
bagged_nb = ensemble_model(tuned_nb, method = 'Bagging')

## Blending the top performing models

Blending is another common technique for ensembling that can be used in PyCaret. It uses predictions from multiple models to generate a final set of predictions using voting / majority consensus from all of the models passed in the estimator_list parameter. If no list is passed, PyCaret uses all of the models available in the model library by default. The method parameter can be used to define the type of voting. When set to hard, it uses labels for majority rule voting. When set to soft, it uses the sum of predicted probabilities instead of the label. We will stick to the soft method. We will be passing our three handpicked models (tuned random forest, bagged naive Bayes and tuned LightGBM) for blending.

In [None]:
blend_specific_soft = blend_models(estimator_list = [tuned_rf, bagged_nb, tuned_lgbm], method = 'soft')

Now, our final model is ready for further processing.

## Plot-Evaluation

Let's look at some plots to see how our model is performing on the test data, i.e. the hold-out split set aside by the setup function in section 3.1 (train_size = .99 there keeps 1% of the passed dataset for testing).

Confusion Matrix

In [None]:
plot_model(blend_specific_soft, plot = 'confusion_matrix')

AUROC

In [None]:
plot_model(blend_specific_soft, plot = 'auc')

Precision-Recall curve

In [None]:
plot_model(blend_specific_soft, plot = 'pr')

## Prediction

The output dataframe is the test data separated earlier by the setup function which has additional 2 columns, viz the predicted Label and the confidence.

Lets finalize the model and see how it performs on our complete as well as personal holdout set viz df and df_test respectively

In [None]:
final_model = finalize_model(blend_specific_soft)

In [None]:
# on complete dataset
predict_model(final_model)

We need to get the test data in the same format as well in-order to get a prediction

In [None]:
df_test_new = pd.get_dummies(df_test, columns = ['Reloan', 'LeadProvider'])

In [None]:
# on the test or holdout set
# NOTE(review): this scores with `blend_specific_soft`, not with the `final_model`
# returned by finalize_model above — confirm which one is intended.
predictions = predict_model(blend_specific_soft, data = df_test_new)

## Metric-Evaluation

Lets look at the KS metric score

First we need to modify the predictions dataframe in order to calculate the KS score

In [None]:
# First loop: the value at column position 2 is replaced by its string form.
# Second loop: column position 1 becomes 1 where it equals the string 'True', else 0.
# NOTE(review): the two loops touch different column positions (2 vs 1) — confirm
# this is intended and that position 1 really holds 'True'/'False' strings.
for i in range(predictions.shape[0]):
    predictions.iloc[i, 2] = str(predictions.iloc[i, 2])
    
for i in range(predictions.shape[0]):
    if predictions.iloc[i, 1] == 'True':
        predictions.iloc[i, 1] = 1
    else:
        predictions.iloc[i, 1] = 0

In [None]:
# KS score: the largest absolute gap between the cumulative share of goods
# (IsFirstDefault == 0) and of bads (IsFirstDefault == 1) when the rows are
# ordered by ascending Score.
df_scores = predictions.sort_values(by = 'Score')
good_mask = df_scores['IsFirstDefault'] == 0
bad_mask = df_scores['IsFirstDefault'] == 1
total_good = good_mask.sum()
total_bad = bad_mask.sum()
df_scores['cum_good_perc'] = good_mask.cumsum() / total_good
df_scores['cum_bad_perc'] = bad_mask.cumsum() / total_bad
df_scores['cum_diff'] = (df_scores['cum_good_perc'] - df_scores['cum_bad_perc']).abs()
df_scores['cum_diff'].max()

## Saving the model

Lets now save the model so that it could be loaded next time and could be used for predictions

In [None]:
# NOTE(review): the file name mentions 'nb+knc', while the blend built above combines
# tuned_rf, bagged_nb and tuned_lgbm — confirm the intended artefact name.
save_model(final_model, 'final_blend-nb+knc')

# Preprocess

## utility functions

In [None]:
def parse_dates(json_date):
    '''
    Converts an epoch timestamp given in milliseconds to a calendar date (UTC).

    Parameters:
    json_date (int or str): milliseconds since the Unix epoch; anything int() accepts.

    Returns:
    datetime.date: the UTC calendar date of that instant.

    '''

    # fromtimestamp(..., tz=utc) yields the same UTC date as utcfromtimestamp,
    # which is deprecated in recent Python versions.
    return datetime.datetime.fromtimestamp(int(json_date) / 1000, tz=datetime.timezone.utc).date()


def fetch_checking_acct_txns(json_string):
    """
    Parse all checking account transactions in the bank report

    Every account that has a non-empty 'transactions' list, has an 'accountType'
    equal to 'checking' (ignoring case and surrounding whitespace) and whose
    'accountNumber' was not already taken contributes its transactions. Rows with
    a truthy 'pending' flag are dropped when that column exists.

    Parameters:
    json_string(json): json containing bank report

    Returns:
    dataframe: one row per transaction, with the extra columns 'account_number',
               'posted_date' (via parse_dates) and 'category' (categoryName of the
               first entry in 'contexts', NaN when 'contexts' is empty).
               An empty dataframe without columns when no account qualifies.

    """
    report = json.loads(json_string)

    # Frames are collected and concatenated once: DataFrame.append no longer exists
    # in pandas 2.x and growing a frame inside the loop is quadratic.
    frames = []
    seen_accounts = set()
    for account in report['accounts']:
        transactions = account.get('transactions')
        if not transactions:
            continue
        if account['accountNumber'] in seen_accounts:
            # the same account can be listed more than once; keep the first occurrence
            continue
        if account['accountType'].strip().lower() != 'checking':
            continue

        frame = pd.DataFrame(transactions)
        frame['account_number'] = account['accountNumber']
        frames.append(frame)
        seen_accounts.add(account['accountNumber'])

    if not frames:
        return pd.DataFrame()

    df_txn = pd.concat(frames, ignore_index=True, sort=False)

    # Derived columns are computed once over the combined frame.
    df_txn['posted_date'] = df_txn['postedDate'].map(parse_dates)
    df_txn['category'] = df_txn['contexts'].map(lambda x: x[0]['categoryName'] if len(x) > 0 else np.nan)

    if 'pending' in df_txn.columns:
        df_txn = df_txn[df_txn['pending'] == False]
    return df_txn

## primary account

In [None]:
def get_primary_account(bankreport):
    """
    Pick the primary checking account of a bank report.

    The primary account is the checking account with the most transactions
    among those returned by fetch_checking_acct_txns.

    Parameters:
    bankreport (json): bank report, handed on to fetch_checking_acct_txns

    Returns:
    The account number with the highest transaction count, or None when the
    report yields no checking transactions.
    """
    checking_txns = fetch_checking_acct_txns(bankreport)
    if checking_txns.empty:
        return None
    txns_per_account = checking_txns['account_number'].value_counts()
    return txns_per_account.idxmax()

In [None]:
# Leave two cores free, but always run with at least one worker process.
NCPU = max(mp.cpu_count() - 2, 1)
with mp.Pool(processes=NCPU) as pool:
    # one primary-account lookup per bank report, results in input order
    res_primary_accts = pool.map(get_primary_account, df_loans['BankReportData'])

In [None]:
df_loans['primary_account'] = res_primary_accts

In [None]:
# Keep only the loans for which a primary account was found. The explicit copy
# makes df_loans an independent frame, so the column assignments in later cells
# do not raise SettingWithCopyWarning.
df_loans = df_loans.loc[df_loans['primary_account'].notnull(), :].copy()

## filter loans having transaction days >= 60 in primary account

In [None]:
def get_transaction_days_count(primary_account,bank_report):
    """
    Tell whether the primary account's transaction history spans at least 60 days.

    Despite its name, the function returns a flag rather than a day count.

    Parameters:
    primary_account: account number to look at
    bank_report (json): bank report, handed on to fetch_checking_acct_txns

    Returns:
    True when the number of days between the earliest and the latest posted_date
    of the primary account is >= 60, otherwise False; None when the report
    yields no checking transactions.
    """
    checking_txns = fetch_checking_acct_txns(bank_report)
    if checking_txns.empty:
        return None
    is_primary = checking_txns['account_number'] == primary_account
    posted_dates = checking_txns[is_primary].sort_values(by='posted_date')['posted_date']
    span_days = (posted_dates.iloc[-1] - posted_dates.iloc[0]).days
    return span_days >= 60

In [None]:
# Leave two cores free, but always run with at least one worker process.
NCPU = max(mp.cpu_count() - 2, 1)
with mp.Pool(processes=NCPU) as pool:
    # each task receives (primary_account, bank_report) for one loan
    txn_days_count = pool.starmap(get_transaction_days_count, zip(df_loans['primary_account'],df_loans['BankReportData']))

In [None]:
df_loans['txn_days_count'] = txn_days_count

In [None]:
# Keep only the loans flagged True by get_transaction_days_count (False and None are
# dropped). The explicit copy makes df_loans an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_loans = df_loans.loc[df_loans['txn_days_count'] == True, :].copy()

## Calculate Age

In [None]:
def calculate_age(current_date, dob):
    """
    Age in completed years on `current_date` for someone born on `dob`.

    The previous implementation counted the year-ends lying between the two
    dates, which overstates the age by one whenever the birthday has not yet
    been reached in the year of `current_date`.
    NOTE(review): this changes the Age feature for such rows; a model trained on
    the old values should be retrained or re-validated.

    Parameters:
    current_date: anything pd.Timestamp accepts (string, date, datetime, Timestamp)
    dob: date of birth, same accepted types

    Returns:
    int: completed years, never below 0 (a `dob` after `current_date` gives 0).

    Raises:
    ValueError: when either date is missing (None / NaN / NaT).
    """
    current = pd.Timestamp(current_date)
    born = pd.Timestamp(dob)
    if pd.isna(current) or pd.isna(born):
        raise ValueError("Neither current_date nor dob can be missing")

    years = current.year - born.year
    # the birthday has not happened yet in the current year
    if (current.month, current.day) < (born.month, born.day):
        years -= 1
    return max(years, 0)

In [None]:
df_loans['Age'] = df_loans.apply(lambda x: calculate_age(x['OriginationDate'],x['DateOfBirth']), axis = 1)

## New or Reloan

In [None]:
# A loan is a reloan when the customer's loan count is above one. The vectorized
# comparison replaces the per-row lambda; a missing LoanCount (NaN) still gives False.
df_loans['Reloan'] = df_loans['LoanCount'] > 1

### Lead Provider

In [None]:
# Keep only the loans that have a Campaign value. The explicit copy makes df_loans
# an independent frame, so the LeadProvider assignments in later cells do not raise
# SettingWithCopyWarning.
df_loans = df_loans.loc[df_loans['Campaign'].notnull(), :].copy()

In [None]:
lead_provider_list=['MarketBullet','StopNGo','Nimbus','EPCVIP','PingBid','LeapThry',
'Acquir','RoundSky','Zero','LeadPie',
'ITMedia','LeadsMarket']

In [None]:
# Extract the first lead-provider name from lead_provider_list found in Campaign,
# matching case-insensitively; rows without a match get NaN.
# NOTE(review): the extracted text keeps the casing used in Campaign, so one provider
# can presumably appear under several spellings (the dummy columns used elsewhere in
# this notebook include both 'LeadProvider_RoundSky' and 'LeadProvider_Roundsky') —
# confirm and normalise the casing if so.
df_loans['LeadProvider'] = df_loans['Campaign'].str.extract("(" + "|".join(lead_provider_list) +")",flags = re.IGNORECASE)

In [None]:
df_loans['LeadProvider']=df_loans['LeadProvider'].fillna('Freedom')

### lender vars

In [None]:
def create_lender_vars(loanid,report_string,time_added,pr_acct):
    """
    Function to generate lender variables
    from primary account transactions

    Transactions of the primary account whose memo matches any entry of the
    lending-company list (loaded from ./lend_cos.pkl) are treated as lender
    transactions. Amounts > 0 feed the "Cred" variables and amounts < 0 the
    "Deb" variables; the "30" variants only use transactions with
    days_diff <= 30, where days_diff = time_added.date() - posted_date.

    Parameters:
        loanid: loan identifier, copied into the 'LoanId' column of the result
        report_string: bank report passed to fetch_checking_acct_txns
        time_added: time the bank report was added (must support .date())
        pr_acct: account number of the primary checking account

    Returns:
        pandas DataFrame: single-row frame holding 'LoanId' and all lender
        variables; every variable is 0.0 when no lender transaction is found
    """

    # Defaults used when no lender transaction is found. Insertion order is
    # kept as-is because it determines the column order of the result.
    lender_vars = dict()
    lender_vars['LoanId'] = loanid
    lender_vars['LenderAmountDeb'] = 0.0
    lender_vars['LenderCountCred'] = 0.0
    lender_vars['LenderAmountCred30'] = 0.0
    lender_vars['LenderCountDeb'] = 0.0
    lender_vars['LenderAmountDeb30'] = 0.0
    lender_vars['LenderCountCred30'] = 0.0
    lender_vars['LenderCountDeb30'] = 0.0
    lender_vars['LenderAmountCred'] = 0.0
    lender_vars['UniqLenderCount'] = 0.0

    #load lending company list
    # NOTE(review): joblib.load unpickles the file - only use a lend_cos.pkl
    # that comes from a trusted source.
    lend_cos=joblib.load('./lend_cos.pkl')
    # The entries are used as regex alternatives, exactly as stored.
    lender_regex = '|'.join(lend_cos)

    #get primary checking account transactions
    df_checking_txns = fetch_checking_acct_txns(report_string)
    df_pr_acct_txns = df_checking_txns[df_checking_txns['account_number']==pr_acct]

    #prepare lender transactions dataframe
    # .copy() yields an independent frame, so the column assignments below do
    # not write into a slice of df_pr_acct_txns (SettingWithCopyWarning).
    is_lender_txn = df_pr_acct_txns['memo'].str.contains(lender_regex,case=False,na=False)
    df_lender_txns = df_pr_acct_txns.loc[is_lender_txn].copy()

    #check for empty transactions
    if not df_lender_txns.empty:
        df_lender_txns['lenderName'] = df_lender_txns['memo'].str.extract("(" + lender_regex +")",flags = re.IGNORECASE)
        df_lender_txns['days_diff'] = (time_added.date()-df_lender_txns['posted_date']).dt.days
        df_lender_txns['amount'] = df_lender_txns['amount'].round(2)

        #conditions to determine lender variables
        cond1 = (df_lender_txns['amount']>0)
        cond2 = cond1 & (df_lender_txns['days_diff']<=30)
        cond3 = (df_lender_txns['amount']<0)
        cond4 = cond3 & (df_lender_txns['days_diff']<=30)

        #prepare lender variables
        lender_vars['LenderAmountDeb'] = float(df_lender_txns.loc[cond3,'amount'].sum())
        lender_vars['LenderCountCred'] = float(df_lender_txns[cond1].shape[0])
        lender_vars['LenderAmountCred30'] = float(df_lender_txns.loc[cond2,'amount'].sum())
        lender_vars['LenderCountDeb'] = float(df_lender_txns[cond3].shape[0])
        lender_vars['LenderAmountDeb30'] = float(df_lender_txns.loc[cond4,'amount'].sum())
        lender_vars['LenderCountCred30'] = float(df_lender_txns.loc[cond2].shape[0])
        lender_vars['LenderCountDeb30'] = float(df_lender_txns.loc[cond4].shape[0])
        lender_vars['LenderAmountCred'] = float(df_lender_txns.loc[cond1,'amount'].sum())
        # NOTE(review): lenderName keeps the memo's own casing, so the same
        # lender written in different cases is counted more than once here -
        # confirm whether that is intended.
        lender_vars['UniqLenderCount'] = float(df_lender_txns['lenderName'].nunique())

    return pd.DataFrame(lender_vars,index=[0])
    

#### If the lender vars are needed for funded loans between 2018-01-01 and 2019-12-31, do not run the cell below; download them from S3 instead (see the "download lender vars from s3" section).

In [None]:
df_lender_vars = pd.DataFrame()
# Use all but two cores, and never fewer than one worker.
NCPU = max(mp.cpu_count() - 2, 1)
with mp.Pool(processes=NCPU) as pool:
    df_lender_vars_temp = pool.starmap(
        create_lender_vars,
        zip(
            df_loans['LoanId'],
            df_loans['BankReportData'],
            df_loans['ReportTimeAdded'],
            df_loans['primary_account'],
        ),
    )
# One single-row frame per loan -> one frame with a fresh 0..n-1 index.
df_lender_vars = pd.concat(df_lender_vars_temp, ignore_index=True)

In [None]:
df_lender_vars.reset_index(drop=True,inplace=True)

#### download lender vars from s3

In [None]:
import boto3

In [None]:
#provide access keys if needed
s3 = boto3.client('s3')

In [None]:
# download_file(Bucket, Key, Filename): the third argument is the local path.
# It has to be 'lender_vars.csv' because that is the file read right after
# the download; the former 'FILE_NAME' placeholder wrote to a file literally
# named FILE_NAME.
s3.download_file('predicon-bucket', 'lender_vars.csv', 'lender_vars.csv')

In [None]:
df_lender_vars = pd.read_csv('lender_vars.csv')

In [None]:
df_loans = pd.merge(df_loans,df_lender_vars,how='left',on='LoanId')

### esign variables

In [None]:
df_loans= pd.merge(df_loans,df_esign,on='LoanId',how='left')

# Train

## useful links
https://docs.databricks.com/_static/notebooks/mlflow/mlflow-quick-start-deployment-aws.html

https://towardsdatascience.com/deploying-models-to-production-with-mlflow-and-amazon-sagemaker-d21f67909198

https://www.h2o.ai/blog/a-deep-dive-into-h2os-automl/

## specify features

In [None]:
# Columns removed from the loan frame before it is handed to the model.
features_drop = [
    'LoanCount',
    'OriginationDate',
    'BankReportData',
    'ReportTimeAdded',
    'Campaign',
    'primary_account',
    'txn_days_count',
    'DateOfBirth',
]

In [None]:
# `columns=` already selects the column axis, so no `axis` argument is needed.
df_train = df_loans.drop(columns=features_drop)

In [None]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(max_mem_size='16G')

In [None]:
df_h20_train =  h2o.H2OFrame(df_train)

In [None]:
# Response column for AutoML.
y = "IsFirstDefault" 
# Predictors: every column of the training frame except the response and
# the loan identifier (list.remove raises ValueError if either is missing).
x = df_h20_train.columns
x.remove(y)
x.remove('LoanId')

In [None]:
aml = H2OAutoML(max_runtime_secs=120, seed=1)
aml.train(x=x, y=y, training_frame=df_h20_train)

In [None]:
lb = aml.leaderboard
lb.head()

# Predict

### primary account

In [None]:
# Use all but two cores, and never fewer than one worker.
NCPU = max(mp.cpu_count() - 2, 1)
with mp.Pool(processes=NCPU) as pool:
    res_primary_accts = pool.map(
        get_primary_account,
        df_loan_eval['BankReportData'],
    )

In [None]:
df_loan_eval['primary_account'] = res_primary_accts

In [None]:
# Keep only the evaluation loans for which a primary account value is present.
df_loan_eval = df_loan_eval[df_loan_eval['primary_account'].notna()]

### filter loans having transaction days >= 60 in primary account

In [None]:
# Use all but two cores, and never fewer than one worker.
NCPU = max(mp.cpu_count() - 2, 1)
with mp.Pool(processes=NCPU) as pool:
    txn_days_count = pool.starmap(
        get_transaction_days_count,
        zip(df_loan_eval['primary_account'], df_loan_eval['BankReportData']),
    )

In [None]:
df_loan_eval['txn_days_count'] = txn_days_count

In [None]:
# Keep only the evaluation loans whose 60-day history check came back True.
df_loan_eval = df_loan_eval[df_loan_eval['txn_days_count'].eq(True)]

### calculate age

In [None]:
df_loan_eval['Age'] = df_loan_eval.apply(lambda x: calculate_age(x['OriginationDate'],x['DateOfBirth']), axis = 1)

### lead provider

In [None]:
df_loan_eval = df_loan_eval.loc[df_loan_eval['Campaign'].notnull(),:]

In [None]:
df_loan_eval['LeadProvider'] = df_loan_eval['Campaign'].str.extract("(" + "|".join(lead_provider_list) +")",flags = re.IGNORECASE)

In [None]:
df_loan_eval['LeadProvider']=df_loan_eval['LeadProvider'].fillna('Freedom')

### new or reloan

In [None]:
# A loan is flagged as a reloan when LoanCount is greater than one.
df_loan_eval['Reloan'] = df_loan_eval['LoanCount'].apply(lambda loan_count: bool(loan_count > 1))

### lender vars

In [None]:
df_lender_vars_eval = pd.DataFrame()
# Use all but two cores, and never fewer than one worker.
NCPU = max(mp.cpu_count() - 2, 1)
with mp.Pool(processes=NCPU) as pool:
    df_lender_vars_temp = pool.starmap(
        create_lender_vars,
        zip(
            df_loan_eval['LoanId'],
            df_loan_eval['BankReportData'],
            df_loan_eval['ReportTimeAdded'],
            df_loan_eval['primary_account'],
        ),
    )
# One single-row frame per loan -> one frame with a fresh 0..n-1 index.
df_lender_vars_eval = pd.concat(df_lender_vars_temp, ignore_index=True)

In [None]:
df_loan_eval = pd.merge(df_loan_eval,df_lender_vars_eval,on='LoanId',how='left')

### esign variables

In [None]:
df_loan_eval= pd.merge(df_loan_eval,df_esign_eval,on='LoanId',how='left')

### run prediction

In [None]:
# `columns=` already selects the column axis, so no `axis` argument is needed.
df_loan_predict = df_loan_eval.drop(columns=features_drop)

In [None]:
h2o_eval = h2o.H2OFrame(df_loan_predict)

In [None]:
pred = aml.leader.predict(h2o_eval)
pred.head()

In [None]:
#convert to pandas dataframe
df_predictions = h2o.as_list(pred)

In [None]:
df_loan_eval.reset_index(drop=True,inplace=True)

In [None]:
df_predictions['target'] = df_loan_eval['IsFirstDefault']  

In [None]:
df_predictions = df_predictions.rename(columns={'True':'prob'})

In [None]:
df_predictions = df_predictions[['target','prob']]

In [None]:
df_predictions['target'].value_counts(normalize = True)

# Evaluate

## get BV uncertain and BV Approved loans for model evaluation

In [None]:
import os
from getpass import getpass

# Connection settings for the bank review MySQL database. Each value can be
# overridden through an environment variable; the non-secret ones fall back
# to the values previously hard-coded here.
username_bank_app = os.environ.get('BANK_APP_USER', 'bankreview')
# The password must not live in the notebook: read it from the environment
# or prompt for it interactively.
# NOTE(review): the password that used to be committed here should be rotated.
password_bank_app = os.environ.get('BANK_APP_PASSWORD') or getpass('Password for the bank review database: ')
host_bank_app = os.environ.get('BANK_APP_HOST', '192.168.4.115')
port_bank_app = int(os.environ.get('BANK_APP_PORT', 3306))
db_bank_app = os.environ.get('BANK_APP_DB', 'bankreviewdb')

In [None]:
bank_app_conn = pymysql.connect(host=host_bank_app,
                                port=port_bank_app,
                                db=db_bank_app,
                                user=username_bank_app,
                                password=password_bank_app)

In [None]:
query_evaluation_loans = '''select loan_id, 
                                final_decision,
                                reasons_for_decision,
                                entered_date
                                
                            from loan 
                            where campaign like '%Production%'
                            and STR_TO_DATE(entered_date ,'%m/%d/%Y') >= STR_TO_DATE('01/01/2020','%m/%d/%Y')
                            and STR_TO_DATE(entered_date ,'%m/%d/%Y') < STR_TO_DATE('04/01/2020','%m/%d/%Y')
                            and final_decision in ('Bank Validation Uncertain','Bank Validation Approved') '''

In [None]:
df_eval_loans = pd.read_sql_query(query_evaluation_loans, con = bank_app_conn)

## get funded and mature loans for the same period

In [None]:
# Funded and matured loans originated in Q1 2020. The upper bound is
# exclusive (< 2020-04-01) so that, if OriginationDate carries a time
# component, loans originated during 2020-03-31 are still included; this also
# matches the half-open window used when selecting the bank-review loans.
query_funded_mature_loans = ''' select LoanId, 
                                IsFirstDefault
                        from view_FCL_Loan
                        where OriginationDate >= '2020-01-01' 
                        and OriginationDate < '2020-04-01'
                        and IsFirstDefault IS NOT NULL
                        and MerchantId IN (15, 18)
                        
                     '''

In [None]:
df_funded_mature_loans = pd.read_sql_query(query_funded_mature_loans,con = iloans_conn)

In [None]:
df_funded_mature_loans['LoanId'] = df_funded_mature_loans['LoanId'].astype(int).astype(str)

In [None]:
df_eval = pd.merge(df_funded_mature_loans,df_eval_loans,how = 'inner',left_on = 'LoanId',right_on = 'loan_id')

In [None]:
df_eval.info()

In [None]:
loan_id_list = list(df_eval['LoanId'])

### compute KS

In [None]:
import numpy as np

In [None]:
def get_KS(df_pred):
    """
    Returns KS given scores
    Parameters:
    df_pred (pandas df): DataFrame containing target variable ('target')
                         and model score ('prob')

    Returns:
    float: KS value, i.e. the largest absolute gap between the cumulative
           share of goods (target == False) and of bads (target == True)
           over the distinct score values; NaN when either class is absent
    """
    is_good = (df_pred['target'] == False).values.astype(int)
    is_bad = (df_pred['target'] == True).values.astype(int)
    total_good = is_good.sum()
    total_bad = is_bad.sum()

    # Aggregate per distinct score before cumulating. Cumulating row by row
    # evaluates the gap in the middle of a group of tied scores, which is not
    # a valid threshold and can overstate KS.
    per_score = (
        pd.DataFrame({'prob': df_pred['prob'].values, 'good': is_good, 'bad': is_bad})
        .groupby('prob', sort=True)[['good', 'bad']]
        .sum()
    )
    cum_good_perc = per_score['good'].cumsum() / total_good
    cum_bad_perc = per_score['bad'].cumsum() / total_bad
    return np.abs(cum_good_perc - cum_bad_perc).max()

In [None]:
# get_KS needs the frame holding the 'target' and 'prob' columns.
get_KS(df_predictions)

### quantiling

In [None]:
def quantile_table(df_pred,n = 10):
    """
    Returns a quantile table given model scores (default is decile)

    Parameters:
    df_pred (pandas df): DataFrame containing target variable ('target') and model score ('prob')
    n (int): number of quantiles to cut the scores into (default 10)

    Returns:
    pandas DataFrame: Pandas dataframe containing quantiles; the quantile
                      label column is named 'decile' whatever the value of n
    numpy array: the n + 1 score bin edges returned by pd.qcut
    """
    # sort_values returns a new frame, so the caller's df_pred is not modified.
    df_scores = df_pred.sort_values(by='prob')
    # Honour `n` (it used to be ignored in favour of a hard-coded 10).
    quantile_labels = list(range(1, n + 1))
    df_scores['decile'],score_bin = pd.qcut(df_scores['prob'],n,labels=quantile_labels,retbins = True)
    df_scores['target'] = df_scores['target'].astype(int)
    df_scores_deciles = df_scores.groupby('decile',as_index=False).agg({'prob':['count','min','max','mean'],'target':'sum'})
    df_scores_deciles.columns = ['decile','count','min_score','max_score','mean_score','bad_count']
    df_scores_deciles['perc_bad'] = (df_scores_deciles['bad_count']/df_scores_deciles['count']) * 100
    return df_scores_deciles,score_bin

In [None]:
# Store the result under its own name: assigning it to `quantile_table`
# replaces the function, so the cell fails when it is run a second time.
df_quantile_table, score_bins = quantile_table(df_predictions)
df_quantile_table

### get bins for quantile assignment

In [None]:
score_bins

In [None]:
# Surround the quantile edges with -inf / +inf, presumably so that scores
# outside the observed range still fall into a bin.
# NOTE(review): the infinities are added around the existing edges rather than
# replacing the first/last one, and re-running this cell adds another pair -
# confirm both are intended.
score_bins = np.concatenate(([-np.inf], score_bins, [np.inf]))

### upload to sagemaker

In [None]:
import mlflow

In [None]:
import mlflow.h2o as mh2o

In [None]:
import mlflow.sagemaker as mfs

In [None]:
mh2o.save_model(aml.leader,path="path/to/trained/model")

In [None]:
# Settings for the SageMaker deployment.
region = "us-east-1"
arn = "arn:aws:iam::757719720041:role/Sagemaker"
appname = "h20-mlflow-deploy"
# Must be the directory the model was written to by mh2o.save_model(...);
# it previously pointed at a different placeholder path.
modeluri = "path/to/trained/model"
image_url = "757719720041.dkr.ecr.us-east-1.amazonaws.com/freedom-pyfunc:latest"

In [None]:
mfs.deploy(app_name=appname, model_path=modeluri, instance_type='ml.t2.medium',region_name=region, mode="create",execution_role_arn=arn,image_url=image_url)

In [None]:
import boto3

def check_status(app_name, region_name="us-east-1"):
    """
    Returns the status of a SageMaker endpoint

    Parameters:
    app_name (str): name of the endpoint to describe
    region_name (str): AWS region of the endpoint (default "us-east-1")

    Returns:
    str: value of "EndpointStatus" in the describe_endpoint response
    """
    sage_client = boto3.client('sagemaker', region_name=region_name)
    endpoint_description = sage_client.describe_endpoint(EndpointName=app_name)
    return endpoint_description["EndpointStatus"]

In [None]:
check_status(appname)