# Extract

In [2]:
import datetime
import getpass
import json
import multiprocessing as mp
import os
import re

import numpy as np
import pandas as pd
import pymssql
import pymysql

In [1]:
# Connection settings for the iLoans SQL Server. Each value can be overridden
# through an environment variable; the non-secret defaults are unchanged.
server = os.environ.get('ILOANS_DB_SERVER', '192.168.4.117')
database = os.environ.get('ILOANS_DB_NAME', 'FreedomCashLenders')
username = os.environ.get('ILOANS_DB_USER', 'FreedomCashLendersAll')
# The password must not live in the notebook: take it from the environment,
# otherwise ask for it interactively.
mssql_password = os.environ.get('ILOANS_DB_PASSWORD') or getpass.getpass('iLoans SQL Server password: ')


In [3]:
iloans_conn = pymssql.connect(server, username, mssql_password, database, port = 1433)

In [None]:
start_date = "'2020-01-01'"
end_date = "'2020-12-31'"

In [None]:
query_loan = f'''select LN.LoanId,
                       LC.LoanCount,
                       LN.OriginationDate,
                       GC.BankReportData,
                       GC.TimeAdded as ReportTimeAdded,
                       LN.Campaign,
                       LN.MonthlyGrossIncome,
                       LN.DateOfBirth,
                       LN.IsFirstDefault
                       
                from view_FCL_Loan LN
                LEFT JOIN view_FCL_CustomerLoanCount LC ON LC.CustomerId = LN.CustomerId
                LEFT JOIN view_FCL_GetCreditDataLoan GCD ON LN.LoanId = GCD.LoanId
                LEFT JOIN view_FCL_GetCreditData GC ON GC.BankTransactionId = GCD.BankTransactionId
                
                
                where LN.OriginationDate >= {start_date}
                and LN.OriginationDate <= {end_date} 
                and LN.IsFirstDefault IS NOT NULL
                and LN.MerchantId IN (15, 18)
                and GC.ReportStatus = 'COMPLETE' '''

In [None]:
df_loans = pd.read_sql_query(query_loan,con = iloans_conn)

In [None]:
df_loans = df_loans.drop_duplicates('LoanId')

In [None]:
query_esign = f'''
SELECT
    LN.LoanId,
    ESIG.AccessCount,
    ESIG.EsigTimeSignedDiff_In_SEC
FROM
    view_FCL_Loan LN
    LEFT JOIN view_FCL_EsignatureCustomerData ESIG ON LN.LoanId = ESIG.LoanId
WHERE
    LN.OriginationDate >= {start_date} 
    and LN.OriginationDate <= {end_date}
    and LN.IsFirstDefault IS NOT NULL
    and LN.MerchantId IN (15, 18)

'''


In [None]:
df_esign = pd.read_sql_query(query_esign,con=iloans_conn)

## Extract Data for Evaluation

In [4]:
def stringify_account_ids(loan_id_list):
    """
    Convert an iterable of ids into a parenthesised, comma-separated string
    suitable for a SQL ``IN`` clause, e.g. [1, 2] -> '(1, 2)'.

    An empty input returns '(NULL)' instead of '()': '()' is a SQL syntax
    error, whereas ``IN (NULL)`` is valid and matches no rows.

    NOTE(review): ids are interpolated with str() and are not escaped, so
    this must only be fed trusted (numeric) ids.

    :param loan_id_list: iterable of ids
    :return: string containing the comma-separated ids wrapped in parentheses
    """
    ids = [str(i) for i in loan_id_list]
    if not ids:
        return '(NULL)'
    return '(' + ', '.join(ids) + ')'

In [None]:
loanid_string=stringify_account_ids(loan_id_list)

In [None]:
query_loan_eval = '''select LN.LoanId,
                       LC.LoanCount,
                       LN.OriginationDate,
                       GC.BankReportData,
                       GC.TimeAdded as ReportTimeAdded,
                       LN.Campaign,
                       LN.MonthlyGrossIncome,
                       LN.DateOfBirth,
                       LN.IsFirstDefault
                       
                from view_FCL_Loan LN
                LEFT JOIN view_FCL_CustomerLoanCount LC ON LC.CustomerId = LN.CustomerId
                LEFT JOIN view_FCL_GetCreditDataLoan GCD ON LN.LoanId = GCD.LoanId
                LEFT JOIN view_FCL_GetCreditData GC ON GC.BankTransactionId = GCD.BankTransactionId
                
                
                WHERE
                    GC.ReportStatus = 'COMPLETE'
                    AND LN.LoanId IN %s'''%(loanid_string)

In [None]:
df_loan_eval = pd.read_sql_query(query_loan_eval,con=iloans_conn)

In [None]:
df_loan_eval=df_loan_eval.drop_duplicates('LoanId')

In [None]:
query_esign_eval = '''
SELECT
    LN.LoanId,
    ESIG.AccessCount,
    ESIG.EsigTimeSignedDiff_In_SEC
FROM
    view_FCL_Loan LN
    LEFT JOIN view_FCL_EsignatureCustomerData ESIG ON LN.LoanId = ESIG.LoanId
WHERE
    LN.LoanId IN %s'''%(loanid_string)

In [None]:
df_esign_eval = pd.read_sql_query(query_esign_eval,con=iloans_conn)

In [None]:
df_esign_eval=df_esign_eval.drop_duplicates('LoanId')

# EDA

## check missing values

## outlier detection

## check data distribution

# Preprocess

## utility functions

In [None]:
def parse_dates(json_date):
    '''
    Convert an epoch-milliseconds timestamp to a calendar date (UTC).

    Parameters:
    json_date (int or str): milliseconds since the Unix epoch.

    Returns:
    datetime.date: UTC calendar date of the timestamp.

    '''
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # UTC datetime gives the same calendar date.
    return datetime.datetime.fromtimestamp(
        int(json_date) / 1000, tz=datetime.timezone.utc
    ).date()


def _first_category(contexts):
    """Return categoryName of the first context entry, or NaN when there is none."""
    # A transaction lacking the field shows up as NaN after concatenation,
    # so check the type before indexing.
    if isinstance(contexts, (list, tuple)) and contexts:
        return contexts[0]['categoryName']
    return np.nan


def fetch_checking_acct_txns(json_string):
    """
    Parse all checking account transactions in the bank report

    Accounts without transactions, non-checking accounts and repeated
    account numbers are skipped. Two columns are derived: 'posted_date'
    (from 'postedDate') and 'category' (first entry of 'contexts'). When a
    'pending' column exists, only rows whose value is exactly False are kept.

    Parameters:
    json_string(json): json containing bank report

    Returns:
    dataframe: containing transactions; an empty, column-less frame when no
    checking account has transactions

    """
    report = json.loads(json_string)

    frames = []
    seen_accounts = set()
    for acct in report['accounts']:
        transactions = acct.get('transactions')
        if not transactions:
            continue
        acct_number = acct['accountNumber']
        if acct_number in seen_accounts:
            continue
        if str(acct.get('accountType') or '').strip().lower() != 'checking':
            continue

        frame = pd.DataFrame(transactions)
        frame['account_number'] = acct_number
        frames.append(frame)
        seen_accounts.add(acct_number)

    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenate once and derive
    # the extra columns a single time instead of once per account.
    df_txn = pd.concat(frames, ignore_index=True)
    df_txn['posted_date'] = df_txn['postedDate'].map(parse_dates)
    df_txn['category'] = df_txn['contexts'].map(_first_category)

    if 'pending' in df_txn.columns:
        # Same semantics as `== False`: rows with a missing flag are dropped too.
        df_txn = df_txn[df_txn['pending'].eq(False)]
    return df_txn

## primary account

In [None]:
def get_primary_account(bankreport):
    """
    Identify the primary checking account of a bank report, i.e. the
    checking account with the highest transaction count.

    Parameters:
    bankreport (json): bank report to parse

    Returns:
    The account number with the most checking transactions, or None when
    the report contains no checking account transactions.
    """
    checking_txns = fetch_checking_acct_txns(bankreport)
    if checking_txns.empty:
        return None
    txns_per_account = checking_txns['account_number'].value_counts()
    return txns_per_account.idxmax()

In [None]:
NCPU = mp.cpu_count() - 2 if mp.cpu_count() > 2 else 1
with mp.Pool(processes=NCPU) as pool:
    res_primary_accts = pool.map(get_primary_account, df_loans['BankReportData'])

In [None]:
df_loans['primary_account'] = res_primary_accts

In [None]:
df_loans = df_loans.loc[df_loans['primary_account'].notnull(),:]

## filter loans having transaction days >= 60 in primary account

In [None]:
def get_transaction_days_count(primary_account,bank_report):
    """
    Check whether the primary account's transactions span at least 60 days.

    Parameters:
    primary_account: account number of the primary checking account
    bank_report (json): bank report to parse

    Returns:
    True/False depending on whether the gap between the earliest and latest
    posted date is >= 60 days; None when the report has no checking
    account transactions.
    """
    checking_txns = fetch_checking_acct_txns(bank_report)
    if checking_txns.empty:
        return None

    is_primary = checking_txns['account_number'] == primary_account
    posted_dates = checking_txns[is_primary].sort_values(by='posted_date')['posted_date']
    earliest = posted_dates.iloc[0]
    latest = posted_dates.iloc[-1]
    return (latest - earliest).days >= 60

In [None]:
NCPU = mp.cpu_count() - 2 if mp.cpu_count() > 2 else 1
with mp.Pool(processes=NCPU) as pool:
    txn_days_count = pool.starmap(get_transaction_days_count, zip(df_loans['primary_account'],df_loans['BankReportData']))

In [None]:
df_loans['txn_days_count'] = txn_days_count

In [None]:
df_loans = df_loans.loc[df_loans['txn_days_count'] == True, :]

## Calculate Age

In [None]:
def calculate_age(current_date, dob):
    """
    Return the age in completed years on current_date for someone born on dob.

    The previous implementation counted the year-ends between the two dates,
    which overstates the age by one whenever the birthday has not yet
    occurred in the year of current_date.

    Parameters:
    current_date: date-like value (date, datetime, Timestamp or ISO string)
    dob: date-like date of birth

    Returns:
    int: completed years, or NaN when either date is missing
    """
    if pd.isna(current_date) or pd.isna(dob):
        return float('nan')
    as_of = pd.Timestamp(current_date)
    born = pd.Timestamp(dob)
    # Subtract one year if this year's birthday is still ahead of as_of.
    had_birthday = (as_of.month, as_of.day) >= (born.month, born.day)
    return as_of.year - born.year - (0 if had_birthday else 1)

In [None]:
df_loans['Age'] = df_loans.apply(lambda x: calculate_age(x['OriginationDate'],x['DateOfBirth']), axis = 1)

## New or Reloan

In [None]:
df_loans['Reloan'] = df_loans['LoanCount'].apply(lambda x:True if x>1 else False)

### Lead Provider

In [None]:
df_loans = df_loans.loc[df_loans['Campaign'].notnull(),:]

In [None]:
lead_provider_list=['MarketBullet','StopNGo','Nimbus','EPCVIP','PingBid','LeapThry',
'Acquir','RoundSky','Zero','LeadPie',
'ITMedia','LeadsMarket']

In [None]:
df_loans['LeadProvider'] = df_loans['Campaign'].str.extract("(" + "|".join(lead_provider_list) +")",flags = re.IGNORECASE)

In [None]:
df_loans['LeadProvider']=df_loans['LeadProvider'].fillna('Freedom')

### lender vars

In [None]:
def create_lender_vars(loanid,report_string,time_added,pr_acct):
    """
    Function to generate lender variables
    from primary account transactions

    A transaction is treated as a lender transaction when its memo matches
    (case-insensitively) one of the names loaded from ./lend_cos.pkl.
    Amounts > 0 feed the *Cred variables and amounts < 0 the *Deb variables;
    the *30 variants additionally require days_diff <= 30, where days_diff
    is time_added's date minus the posted date.

    Parameters:
        loanid: loan identifier, copied into the output row
        report_string (json): bank report
        time_added: report timestamp; must provide .date()
        pr_acct: account number of the primary checking account

    Returns:
        pandas DataFrame: single row holding LoanId and the lender
        variables (all 0.0 when there are no lender transactions).
        The LenderNames column is only present when lender transactions
        were found.
    """
    lender_vars = {
        'LoanId': loanid,
        'LenderAmountDeb': 0.0,
        'LenderCountCred': 0.0,
        'LenderAmountCred30': 0.0,
        'LenderCountDeb': 0.0,
        'LenderAmountDeb30': 0.0,
        'LenderCountCred30': 0.0,
        'LenderCountDeb30': 0.0,
        'LenderAmountCred': 0.0,
        'UniqLenderCount': 0.0,
    }

    #load lending company list
    # NOTE(review): joblib is not imported in any visible cell - confirm it
    # is imported before this function runs.
    lend_cos = joblib.load('./lend_cos.pkl')
    # NOTE(review): names are joined into a regex unescaped; presumably they
    # contain no regex metacharacters - verify against the pickle contents.
    lender_alternation = '|'.join(lend_cos)

    #get primary checking account transactions
    df_checking_txns = fetch_checking_acct_txns(report_string)
    # An empty result has no columns at all, so indexing it would raise KeyError.
    if df_checking_txns.empty or 'memo' not in df_checking_txns.columns:
        return pd.DataFrame(lender_vars, index=[0])
    df_pr_acct_txns = df_checking_txns[df_checking_txns['account_number'] == pr_acct]

    #prepare lender transactions dataframe
    is_lender_txn = df_pr_acct_txns['memo'].str.contains(lender_alternation, case=False, na=False)
    # .copy() so the column assignments below do not write into a slice.
    df_lender_txns = df_pr_acct_txns.loc[is_lender_txn].copy()

    #check for empty transactions
    if not df_lender_txns.empty:
        df_lender_txns['lenderName'] = df_lender_txns['memo'].str.extract(
            "(" + lender_alternation + ")", flags=re.IGNORECASE, expand=False
        )
        # Convert explicitly to datetime64 so the subtraction yields a
        # timedelta Series (posted_date holds plain datetime.date objects).
        report_day = pd.Timestamp(time_added.date())
        df_lender_txns['days_diff'] = (report_day - pd.to_datetime(df_lender_txns['posted_date'])).dt.days
        df_lender_txns['amount'] = df_lender_txns['amount'].round(2)

        #conditions to determine lender variables
        is_credit = df_lender_txns['amount'] > 0
        is_debit = df_lender_txns['amount'] < 0
        within_30 = df_lender_txns['days_diff'] <= 30
        is_credit_30 = is_credit & within_30
        is_debit_30 = is_debit & within_30

        #prepare lender variables
        lender_vars['LenderAmountDeb'] = float(df_lender_txns.loc[is_debit, 'amount'].sum())
        lender_vars['LenderCountCred'] = float(is_credit.sum())
        lender_vars['LenderAmountCred30'] = float(df_lender_txns.loc[is_credit_30, 'amount'].sum())
        lender_vars['LenderCountDeb'] = float(is_debit.sum())
        lender_vars['LenderAmountDeb30'] = float(df_lender_txns.loc[is_debit_30, 'amount'].sum())
        lender_vars['LenderCountCred30'] = float(is_credit_30.sum())
        lender_vars['LenderCountDeb30'] = float(is_debit_30.sum())
        lender_vars['LenderAmountCred'] = float(df_lender_txns.loc[is_credit, 'amount'].sum())
        lender_vars['UniqLenderCount'] = float(df_lender_txns['lenderName'].nunique())
        # Wrapped in an outer list so the single row holds the whole name list.
        lender_vars['LenderNames'] = [list(df_lender_txns['lenderName'].unique())]

    return pd.DataFrame(lender_vars, index=[0])
    

#### If the lender vars are needed for funded loans originated between 2018-01-01 and 2019-12-31, do not run the cell below; instead download them from S3 (see the "download lender vars from s3" section)

In [None]:
df_lender_vars = pd.DataFrame()
NCPU = mp.cpu_count() - 2 if mp.cpu_count() > 2 else 1
with mp.Pool(processes=NCPU) as pool:
    df_lender_vars_temp = pool.starmap(create_lender_vars, zip(df_loans['LoanId'],df_loans['BankReportData'],df_loans['ReportTimeAdded'],df_loans['primary_account']))
df_lender_vars=pd.concat(df_lender_vars_temp,ignore_index=True)

In [None]:
df_lender_vars.reset_index(drop=True,inplace=True)

#### download lender vars from s3

In [None]:
import boto3

In [None]:
#provide access keys if needed
s3 = boto3.client('s3')

In [None]:
# download_file(Bucket, Key, Filename): save under the name the next cell reads.
s3.download_file('predicon-bucket', 'lender_vars.csv', 'lender_vars.csv')

In [None]:
df_lender_vars = pd.read_csv('lender_vars.csv')

In [None]:
df_loans = pd.merge(df_loans,df_lender_vars,how='left',on='LoanId')

### esign variables

In [None]:
df_loans= pd.merge(df_loans,df_esign,on='LoanId',how='left')

# Train

## useful links
https://docs.databricks.com/_static/notebooks/mlflow/mlflow-quick-start-deployment-aws.html

https://towardsdatascience.com/deploying-models-to-production-with-mlflow-and-amazon-sagemaker-d21f67909198

https://www.h2o.ai/blog/a-deep-dive-into-h2os-automl/

## specify features

In [None]:
features_drop = ['LoanCount',
'OriginationDate',             
'BankReportData',                   
'ReportTimeAdded',                  
'Campaign',
'primary_account',
'txn_days_count', 'DateOfBirth',]

In [None]:
df_train = df_loans.drop(columns=features_drop,axis=1)

In [None]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(max_mem_size='16G')

In [None]:
df_h20_train =  h2o.H2OFrame(df_train)

In [None]:
y = "IsFirstDefault" 
x = df_h20_train.columns
x.remove(y)
x.remove('LoanId')

In [None]:
aml = H2OAutoML(max_runtime_secs=120, seed=1)
aml.train(x=x, y=y, training_frame=df_h20_train)

In [None]:
lb = aml.leaderboard
lb.head()

# Predict

### primary account

In [None]:
NCPU = mp.cpu_count() - 2 if mp.cpu_count() > 2 else 1
with mp.Pool(processes=NCPU) as pool:
    res_primary_accts = pool.map(get_primary_account, df_loan_eval['BankReportData'])

In [None]:
df_loan_eval['primary_account'] = res_primary_accts

In [None]:
df_loan_eval = df_loan_eval.loc[df_loan_eval['primary_account'].notnull(),:]

### filter loans having transaction days >= 60 in primary account

In [None]:
NCPU = mp.cpu_count() - 2 if mp.cpu_count() > 2 else 1
with mp.Pool(processes=NCPU) as pool:
    txn_days_count = pool.starmap(get_transaction_days_count, zip(df_loan_eval['primary_account'],df_loan_eval['BankReportData']))

In [None]:
df_loan_eval['txn_days_count'] = txn_days_count

In [None]:
df_loan_eval = df_loan_eval.loc[df_loan_eval['txn_days_count']==True,:]

### calculate age

In [None]:
df_loan_eval['Age'] = df_loan_eval.apply(lambda x: calculate_age(x['OriginationDate'],x['DateOfBirth']), axis = 1)

### lead provider

In [None]:
df_loan_eval = df_loan_eval.loc[df_loan_eval['Campaign'].notnull(),:]

In [None]:
df_loan_eval['LeadProvider'] = df_loan_eval['Campaign'].str.extract("(" + "|".join(lead_provider_list) +")",flags = re.IGNORECASE)

In [None]:
df_loan_eval['LeadProvider']=df_loan_eval['LeadProvider'].fillna('Freedom')

### new or reloan

In [None]:
df_loan_eval['Reloan'] = df_loan_eval['LoanCount'].apply(lambda x:True if x>1 else False)

### lender vars

In [None]:
df_lender_vars_eval = pd.DataFrame()
NCPU = mp.cpu_count() - 2 if mp.cpu_count() > 2 else 1
with mp.Pool(processes=NCPU) as pool:
    df_lender_vars_temp = pool.starmap(create_lender_vars, zip(df_loan_eval['LoanId'],df_loan_eval['BankReportData'],df_loan_eval['ReportTimeAdded'],df_loan_eval['primary_account']))
df_lender_vars_eval=pd.concat(df_lender_vars_temp,ignore_index=True)

In [None]:
df_loan_eval = pd.merge(df_loan_eval,df_lender_vars_eval,on='LoanId',how='left')

### esign variables

In [None]:
df_loan_eval= pd.merge(df_loan_eval,df_esign_eval,on='LoanId',how='left')

### run prediction

In [None]:
df_loan_predict = df_loan_eval.drop(columns=features_drop,axis=1)

In [None]:
h2o_eval = h2o.H2OFrame(df_loan_predict)

In [None]:
pred = aml.leader.predict(h2o_eval)
pred.head()

In [None]:
#convert to pandas dataframe
df_predictions = h2o.as_list(pred)

In [None]:
df_loan_eval.reset_index(drop=True,inplace=True)

In [None]:
df_predictions['target'] = df_loan_eval['IsFirstDefault']  

In [None]:
df_predictions = df_predictions.rename(columns={'True':'prob'})

In [None]:
df_predictions = df_predictions[['target','prob']]

In [None]:
df_predictions['target'].value_counts(normalize = True)

# Evaluate

## get BV uncertain and BV Approved loans for model evaluation

In [None]:
# Connection settings for the bank review MySQL database. Each value can be
# overridden through an environment variable; non-secret defaults are unchanged.
username_bank_app = os.environ.get('BANK_APP_DB_USER', 'bankreview')
# The password must not live in the notebook: take it from the environment,
# otherwise ask for it interactively.
password_bank_app = os.environ.get('BANK_APP_DB_PASSWORD') or getpass.getpass('Bank review MySQL password: ')
host_bank_app = os.environ.get('BANK_APP_DB_HOST', '192.168.4.115')
port_bank_app = int(os.environ.get('BANK_APP_DB_PORT', 3306))
db_bank_app = os.environ.get('BANK_APP_DB_NAME', 'bankreviewdb')

In [None]:
bank_app_conn = pymysql.connect(host=host_bank_app,
                                port=port_bank_app,
                                db=db_bank_app,
                                user=username_bank_app,
                                password=password_bank_app)

In [None]:
query_evaluation_loans = '''select loan_id, 
                                final_decision,
                                reasons_for_decision,
                                entered_date
                                
                            from loan 
                            where campaign like '%Production%'
                            and STR_TO_DATE(entered_date ,'%m/%d/%Y') >= STR_TO_DATE('01/01/2020','%m/%d/%Y')
                            and STR_TO_DATE(entered_date ,'%m/%d/%Y') < STR_TO_DATE('04/01/2020','%m/%d/%Y')
                            and final_decision in ('Bank Validation Uncertain','Bank Validation Approved') '''

In [None]:
df_eval_loans = pd.read_sql_query(query_evaluation_loans, con = bank_app_conn)

## get funded and mature loans for the same period

In [None]:
query_funded_mature_loans = ''' select LoanId, 
                                IsFirstDefault
                        from view_FCL_Loan
                        where OriginationDate >= '2020-01-01' 
                        and OriginationDate <= '2020-03-31'
                        and IsFirstDefault IS NOT NULL
                        and MerchantId IN (15, 18)
                        
                     '''

In [None]:
df_funded_mature_loans = pd.read_sql_query(query_funded_mature_loans,con = iloans_conn)

In [None]:
df_funded_mature_loans['LoanId'] = df_funded_mature_loans['LoanId'].astype(int).astype(str)

In [None]:
df_eval = pd.merge(df_funded_mature_loans,df_eval_loans,how = 'inner',left_on = 'LoanId',right_on = 'loan_id')

In [None]:
df_eval.info()

In [None]:
loan_id_list = list(df_eval['LoanId'])

### compute KS

In [None]:
import numpy as np

In [None]:
def get_KS(df_pred):
    """
    Compute the KS statistic of a set of model scores.

    Rows are ordered by ascending 'prob'; the statistic is the largest
    absolute gap between the cumulative share of goods (target == False)
    and the cumulative share of bads (target == True).

    Parameters:
    df_pred (pandas df): DataFrame with columns 'target' and 'prob'

    Returns:
    float: KS value
    """
    ordered = df_pred.sort_values(by='prob')
    is_good = ordered['target'] == False
    is_bad = ordered['target'] == True
    good_cdf = is_good.cumsum() / is_good.sum()
    bad_cdf = is_bad.cumsum() / is_bad.sum()
    return np.abs(good_cdf - bad_cdf).max()

In [None]:
# get_KS requires the frame holding the 'target' and 'prob' columns.
get_KS(df_predictions)

### quantiling

In [None]:
def quantile_table(df_pred,n = 10):
    """
    Returns a quantile table given model scores (default is decile)

    Scores are split into n equal-frequency bins labelled 1..n (1 = lowest
    scores). The bin column is named 'decile' for every n so existing
    callers keep working.

    Parameters:
    df_pred (pandas df): DataFrame containing target variable and model score
    n (int): number of quantile bins (previously ignored; 10 was hard-coded)

    Returns:
    pandas DataFrame: one row per bin with count, min/max/mean score,
    bad_count and perc_bad
    numpy array: the n + 1 bin edges returned by pd.qcut

    """
    df_scores = df_pred.sort_values(by='prob')
    bin_labels, score_bin = pd.qcut(
        df_scores['prob'], n, labels=list(range(1, n + 1)), retbins=True
    )
    df_scores['decile'] = bin_labels
    df_scores['target'] = df_scores['target'].astype(int)
    # observed=False keeps one row per label, matching the historical default
    # for grouping on a categorical column.
    df_scores_deciles = df_scores.groupby('decile', as_index=False, observed=False).agg(
        count=('prob', 'count'),
        min_score=('prob', 'min'),
        max_score=('prob', 'max'),
        mean_score=('prob', 'mean'),
        bad_count=('target', 'sum'),
    )
    df_scores_deciles['perc_bad'] = (df_scores_deciles['bad_count']/df_scores_deciles['count']) * 100
    return df_scores_deciles,score_bin

In [None]:
quantile_table, score_bins = quantile_table(df_predictions)

In [None]:
quantile_table

### get bins for quantile assignment

In [None]:
score_bins

In [None]:
score_bins = np.concatenate(([-np.inf], score_bins, [np.inf]))

### upload to sagemaker

In [None]:
import mlflow

In [None]:
import mlflow.h2o as mh2o

In [None]:
import mlflow.sagemaker as mfs

In [None]:
mh2o.save_model(aml.leader,path="path/to/trained/model")

In [None]:
region = "us-east-1"
arn = "arn:aws:iam::757719720041:role/Sagemaker"
appname = "h20-mlflow-deploy"
modeluri = "path/to/saved/model" 
image_url = "757719720041.dkr.ecr.us-east-1.amazonaws.com/freedom-pyfunc:latest"

In [None]:
mfs.deploy(app_name=appname, model_path=modeluri, instance_type='ml.t2.medium',region_name=region, mode="create",execution_role_arn=arn,image_url=image_url)

In [None]:
import boto3

def check_status(app_name, region_name="us-east-1"):
    """
    Return the status of a SageMaker endpoint.

    Parameters:
    app_name (str): endpoint name passed to describe_endpoint
    region_name (str): AWS region of the endpoint (default "us-east-1",
        the value that used to be hard-coded)

    Returns:
    str: the "EndpointStatus" field of the describe_endpoint response
    """
    sage_client = boto3.client('sagemaker', region_name=region_name)
    endpoint_description = sage_client.describe_endpoint(EndpointName=app_name)
    return endpoint_description["EndpointStatus"]

In [None]:
check_status(appname)

### negative esign and extreme positive esign analysis

In [5]:
df_neg_pos_loanids = pd.read_csv('faulty_loanids.csv')

In [6]:
df_neg_pos_loanids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 828 entries, 0 to 827
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   LoanId  828 non-null    int64
dtypes: int64(1)
memory usage: 6.6 KB


### query all required data

In [7]:
loanid_list_negative_postive = list(df_neg_pos_loanids['LoanId'])

In [8]:
loanid_string = stringify_account_ids(loanid_list_negative_postive) 

In [9]:
query='''SELECT
    LA.LoanID,
    LA.TimeAdded AS LeadTimeAdded,
    T2.NoteDescription,
    T2.NoteTimeAdded,
    T2.LoanStatus,
    ESIG.TimeSigned,
    ESIG.LeadTimeAdded AS LeadTimeAddedEsign,
    ESIG.EsigTimeSignedDiff_In_SEC,
    GC.TimeAdded AS GCTimeAdded,
    GC.ReportStatus,
    LC.LoanCount
FROM
    view_FCL_LeadAccepted LA
LEFT JOIN
    view_FCL_EsignatureCustomerData ESIG ON LA.LoanId=ESIG.LoanId
LEFT JOIN 
    view_FCL_GetCreditDataLoan GCD ON LA.LoanId = GCD.LoanId
LEFT JOIN 
    view_FCL_GetCreditData GC ON GC.BankTransactionId = GCD.BankTransactionId
LEFT JOIN 
    view_FCL_CustomerLoanCount LC ON LA.Customerid = LC.CustomerID
LEFT JOIN(
    SELECT 
        *
    FROM
    (
    SELECT 
        LoanId,
        NoteDescription,
        LoanStatus,
        TimeAdded AS NoteTimeAdded,
        ROW_NUMBER() OVER (PARTITION BY LoanId ORDER BY TimeAdded DESC) AS RowNumber
    FROM 
        view_FCL_Notes
    WHERE 
        NoteDescription LIKE '%%ESignature%%'
    ) T 
    WHERE
        T.RowNumber=1) T2 ON LA.LoanId = T2.LoanId
WHERE
    ESIG.LoanId IN %s'''%(loanid_string)

In [10]:
df_esign_analysis = pd.read_sql_query(query,con=iloans_conn)

In [11]:
df_esign_analysis.shape

(832, 11)

### Negative esigns


#### From the data below it is observed that the 'LeadTimeAdded' recorded in the Esign table is not accurate: it is closer to the GC submit time. Hence it satisfies the assumed flow, i.e. lead gets added ----> customer e-signs ----> customer submits bank report. But that also raises a question about the positive esign time differences (in that case it would mean the customer submits the bank report first and e-signs afterwards)


#### To calculate this field accurately, we first have to determine the actual flow: does the GC (bank report) submission come first, or the esign?

In [12]:
# .copy() so a column can be added later without writing into a slice of df_esign_analysis.
df_esign_negatives = df_esign_analysis[df_esign_analysis['EsigTimeSignedDiff_In_SEC']<0].copy()

In [13]:
df_esign_negatives.head()

Unnamed: 0,LoanID,LeadTimeAdded,NoteDescription,NoteTimeAdded,LoanStatus,TimeSigned,LeadTimeAddedEsign,EsigTimeSignedDiff_In_SEC,GCTimeAdded,ReportStatus,LoanCount
2,11640980000.0,2019-05-20 12:08:40.780,[Freedom2 Cash Lenders] ESignature Document,2019-05-20 22:40:41.233,Charged Off(NCM),2019-05-20 20:08:05.510,2019-05-20 20:08:40.780,-35,2019-05-20 20:08:40.200,COMPLETE,1
4,13633520000.0,2019-03-04 03:11:10.823,[Freedom2 Cash Lenders] ESignature Document,2019-03-04 22:36:42.473,Charged Off(NCM),2019-03-04 11:09:57.683,2019-03-04 11:11:10.823,-73,2019-03-04 11:11:10.590,COMPLETE,1
6,18614070000.0,2018-09-25 08:29:31.357,[Freedom Cash Lenders] ESignature Document,2018-09-25 23:53:06.030,Charged Off(MCS),2018-09-25 16:29:03.500,2018-09-25 16:29:31.357,-28,2018-09-25 16:29:30.677,COMPLETE,1
8,18633460000.0,2019-03-01 08:32:08.297,[Freedom Cash Lenders] ESignature Document,2019-03-01 20:42:50.177,Charged Off(NCM),2019-03-01 16:31:07.997,2019-03-01 16:32:08.297,-61,2019-03-01 16:32:07.587,COMPLETE,1
10,23642520000.0,2019-06-05 09:55:15.623,,NaT,,2019-06-05 17:53:19.290,2019-06-05 17:55:15.623,-116,2019-06-05 17:55:15.057,COMPLETE,5


#### Difference between esign table's leadTimeAdded and GC TimeAdded

In [None]:
df_esign_negatives['time_diff_gc_esignLeadTimeAdded'] = (df_esign_negatives['LeadTimeAddedEsign'] - df_esign_negatives['GCTimeAdded']).dt.total_seconds()

In [15]:
df_esign_negatives['time_diff_gc_esignLeadTimeAdded'].describe()

count    484.000000
mean       0.470229
std        1.140833
min      -13.307000
25%        0.230000
50%        0.263000
75%        0.427750
max       13.804000
Name: time_diff_gc_esignLeadTimeAdded, dtype: float64

### extreme positive cases

#### Agents are sending out the esign document days after the GC submission, which can mean that they are tracking down good customers and making sure they are funded, or that customers are approaching the agents directly after many days to request a loan (maybe denied by other lenders, or desperate)

In [16]:
df_esign_positives = df_esign_analysis[df_esign_analysis['EsigTimeSignedDiff_In_SEC']>0]

In [17]:
df_esign_positives.head(10)

Unnamed: 0,LoanID,LeadTimeAdded,NoteDescription,NoteTimeAdded,LoanStatus,TimeSigned,LeadTimeAddedEsign,EsigTimeSignedDiff_In_SEC,GCTimeAdded,ReportStatus,LoanCount
0,10632030000.0,2019-02-14 02:42:34.883,[Freedom2 Cash Lenders] ESignature Document,2019-02-19 20:30:53.047,Charged Off(NCM),2019-02-19 21:01:12.637,2019-02-14 10:42:34.883,469118,2019-02-14 10:42:34.653,COMPLETE,1
1,10637050000.0,2019-04-09 06:09:38.770,[Freedom Cash Lenders] ESignature Document,2019-04-16 15:09:17.257,Charged Off(NCM),2019-04-16 13:11:38.373,2019-04-09 14:09:38.770,601320,2019-04-09 14:09:38.543,COMPLETE,1
3,12597090000.0,2018-05-11 10:12:36.410,[Freedom2 Cash Lenders] ESignature Document,2018-05-17 22:38:21.330,Charged Off(NCM),2018-05-17 22:41:57.813,2018-05-11 18:12:36.410,534561,2018-05-11 18:12:36.110,COMPLETE,1
5,15639390000.0,2019-05-03 10:33:56.917,[Freedom2 Cash Lenders] ESignature Document,2019-05-08 20:42:20.400,No ACH actions on this loan,2019-05-08 20:41:57.540,2019-05-03 18:33:56.917,439681,2019-05-03 18:33:56.700,COMPLETE,1
7,18631900000.0,2019-02-12 10:12:18.133,[Freedom Cash Lenders] ESignature Document,2019-02-26 17:53:48.067,Customer sent to collections,2019-02-26 17:31:32.177,2019-02-12 18:12:18.133,1207154,2019-02-12 18:12:17.900,COMPLETE,1
9,19597940000.0,2018-05-18 07:19:28.333,[Freedom2 Cash Lenders] ESignature Document,2018-05-23 22:19:33.113,Bankruptcy,2018-05-23 22:33:58.097,2018-05-18 15:19:28.333,458070,2018-05-18 15:19:28.047,COMPLETE,1
12,25661540000.0,2019-11-14 13:37:03.483,[Freedom2 Cash Lenders] ESignature Document,2019-11-22 19:11:17.133,Loan Paid Off,2019-11-22 19:13:25.270,2019-11-14 21:37:03.483,682582,2019-11-14 21:37:02.653,COMPLETE,2
13,26623360000.0,2018-12-06 09:20:42.760,[Freedom Cash Lenders] ESignature Document,2018-12-19 22:32:14.227,Charged Off(NCM),2018-12-19 18:02:09.470,2018-12-06 17:20:42.760,1125687,2018-12-06 17:20:42.307,COMPLETE,1
14,29616930000.0,2018-10-10 05:24:44.210,[Freedom2 Cash Lenders] ESignature Document,2018-10-25 13:09:55.090,Charged Off(NCM),2018-10-25 13:10:38.283,2018-10-10 13:24:44.210,1295154,2018-10-10 13:24:43.833,COMPLETE,1
18,39619070000.0,2018-10-25 07:55:38.977,[Freedom Cash Lenders] ESignature Document,2018-10-31 20:14:00.690,Loan Paid Off by Re-Up,2018-10-31 21:30:38.220,2018-10-25 15:55:38.977,538500,2018-10-25 15:55:38.757,COMPLETE,3


#### The loan id below has a time-signed difference of a year; its funded date is 2019-05-06. Maybe they reused the same esignature document (esignature id 'ca67b4c3-cd83-4d26-9087-9c38562005c4'), and no further actions were taken after the esign

In [18]:
df_esign_positives[df_esign_positives['EsigTimeSignedDiff_In_SEC']==26283332]

Unnamed: 0,LoanID,LeadTimeAdded,NoteDescription,NoteTimeAdded,LoanStatus,TimeSigned,LeadTimeAddedEsign,EsigTimeSignedDiff_In_SEC,GCTimeAdded,ReportStatus,LoanCount
743,53639410000.0,2019-05-03 13:10:45.757,[Freedom2 Cash Lenders] ESignature Document,2019-05-03 22:47:22.653,Loan Paid Off by Re-Up,2020-03-03 02:06:17.567,2019-05-03 21:10:45.757,26283332,2019-05-03 21:10:45.547,COMPLETE,2


#### One interesting observation - looking at these loanids and their loan status only 17.94% have loans that are paid off, rest indicate that Freedom Cash has lost money on these customers (charge offs, declared bankruptcy, sent to collections etc)

In [20]:
df_esign_analysis['LoanStatus'].value_counts(normalize=True)

Charged Off(NCM)                          0.466921
Loan Paid Off                             0.179389
No ACH actions on this loan               0.123410
Loan Paid Off by Re-Up                    0.052163
Charged Off                               0.039440
Charged Off(MCS)                          0.030534
Renewed Loan Issued, Good Status          0.026718
Customer sent to collections              0.022901
Returned Item - Partially Paid Off        0.022901
Charged Off (Settled)                     0.017812
Bankruptcy                                0.008906
Returned Debit Item - Paid Off            0.007634
Returned Debit Item (Collections Loan)    0.001272
Name: LoanStatus, dtype: float64