This notebook identifies reputed lenders that appear in a customer's bank report. Currently, for loans originated over a two-year window, it computes the total credits and debits per lender in each report.

In [None]:
import datetime
import json
import multiprocessing as mp
import os
import re

import joblib
import numpy as np
import pandas as pd
import pymssql


In [None]:
%load_ext blackcellmagic

In [None]:
# Connection settings for the MSSQL loan database.
# Each value can be overridden with an environment variable so that
# credentials do not have to live in the notebook; the literals below are only
# fallbacks that keep existing runs working.
# NOTE(security): the fallback password is still committed in source. Rotate it
# and drop the literal once FCL_MSSQL_PASSWORD is provisioned everywhere.
server = os.environ.get('FCL_MSSQL_SERVER', '192.168.4.117')
database = os.environ.get('FCL_MSSQL_DATABASE', 'FreedomCashLenders')
username = os.environ.get('FCL_MSSQL_USERNAME', 'FreedomCashLendersAll')
mssql_password = os.environ.get('FCL_MSSQL_PASSWORD', 'Freedom123$')

In [None]:
# Open the database connection used by the loan query below (TCP port 1433).
iloans_conn = pymssql.connect(
    server=server,
    user=username,
    password=mssql_password,
    database=database,
    port=1433,
)

In [None]:
# Origination-date window (both ends inclusive in the query's WHERE clause).
# The values keep their own single quotes because they are inserted verbatim
# into the SQL text.
start_date, end_date = (f"'{day}'" for day in ("2018-01-01", "2019-12-31"))

In [None]:
# SQL for the loan extract: loan attributes joined to the customer's loan count
# and to the bank report (credit data) linked to the loan.
# Filters: origination date within [start_date, end_date], a known
# IsFirstDefault outcome, MerchantId 15 or 18, and a COMPLETE bank report.
# start_date / end_date are spliced in as pre-quoted text by the f-string.
# NOTE(review): the WHERE condition on GC.ReportStatus rejects rows where the
# LEFT JOINs to GCD/GC found nothing, so those joins effectively act as inner
# joins.
query_loan = f'''select LN.LoanId,
                       LC.LoanCount,
                       LN.OriginationDate,
                       GC.BankReportData,
                       GC.BankTransactionId,
                       GC.TimeAdded as ReportTimeAdded,
                       LN.Campaign,
                       LN.MonthlyGrossIncome,
                       LN.DateOfBirth,
                       LN.IsFirstDefault
                       
                from view_FCL_Loan LN
                LEFT JOIN view_FCL_CustomerLoanCount LC ON LC.CustomerId = LN.CustomerId
                LEFT JOIN view_FCL_GetCreditDataLoan GCD ON LN.LoanId = GCD.LoanId
                LEFT JOIN view_FCL_GetCreditData GC ON GC.BankTransactionId = GCD.BankTransactionId
                
                
                where LN.OriginationDate >= {start_date}
                and LN.OriginationDate <= {end_date} 
                and LN.IsFirstDefault IS NOT NULL
                and LN.MerchantId IN (15, 18)
                and GC.ReportStatus = 'COMPLETE' '''

In [None]:
# Run the loan query and load the full result set into a dataframe.
df_loans = pd.read_sql_query(sql=query_loan, con=iloans_conn)

In [None]:
# A loan can come back on several rows (one per joined report); keep only the
# first row seen for each LoanId.
df_loans = df_loans.drop_duplicates(subset=["LoanId"], keep="first")

## utility functions

In [None]:
def parse_dates(json_date):
    '''
    Converts a JSON epoch-millisecond timestamp to a calendar date (UTC).
    
    Parameters:
    json_date (int or str): milliseconds since 1970-01-01 UTC; anything
        accepted by int() works.
    
    Returns:
    datetime.date: the UTC calendar date of the timestamp.
    
    '''
    # Timezone-aware conversion; datetime.utcfromtimestamp is deprecated and
    # yields the same UTC date.
    seconds = int(json_date) / 1000
    return datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc).date()


def fetch_checking_acct_txns(json_string):
    """
    Parse all checking account transactions in the bank report
    
    Parameters:
    json_string(json): json text of the bank report; must contain an
        'accounts' list
    
    Returns:
    dataframe: one row per non-pending transaction of every checking account
        that has transactions (each account number is taken once), with the
        added columns 'account_number', 'posted_date' and 'category'.
        An empty dataframe is returned when no account qualifies.
    
    """
    report = json.loads(json_string)

    seen_acct_numbers = set()
    frames = []
    for acct in report['accounts']:
        # Skip accounts without transactions before touching any other key.
        if not acct.get('transactions'):
            continue
        # The same account can be listed more than once; keep its first entry.
        if acct['accountNumber'] in seen_acct_numbers:
            continue
        if acct['accountType'].strip().lower() != 'checking':
            continue

        df_acct = pd.DataFrame(acct['transactions'])
        df_acct['account_number'] = acct['accountNumber']
        frames.append(df_acct)
        seen_acct_numbers.add(acct['accountNumber'])

    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df_txn = pd.concat(frames, ignore_index=True)

    # Derived columns are computed once over the combined frame rather than
    # re-derived after every appended account.
    df_txn['posted_date'] = df_txn['postedDate'].map(parse_dates)
    df_txn['category'] = df_txn['contexts'].map(
        lambda x: x[0]['categoryName'] if len(x) > 0 else np.nan
    )

    # Pending transactions are excluded when the report flags them.
    if 'pending' in df_txn.columns:
        df_txn = df_txn[df_txn['pending'] == False]
    return df_txn

## primary account

In [None]:
def get_primary_account(bankreport):
    """
    Find the primary checking account of a bank report, i.e. the checking
    account with the highest number of transactions.
    
    Parameters:
    bankreport (json): bank report text passed to fetch_checking_acct_txns
    
    Returns:
    Account number of the primary checking account, or None when the report
    has no checking account transactions.
    """
    checking_txns = fetch_checking_acct_txns(bankreport)
    if checking_txns.empty:
        return None

    txns_per_account = checking_txns['account_number'].value_counts()
    primary_account = txns_per_account.idxmax()
    return primary_account

In [None]:
# Use all but two cores, and never fewer than one worker.
NCPU = max(mp.cpu_count() - 2, 1)
# Determine the primary account of every bank report in parallel; results
# come back in the same order as the rows of df_loans.
with mp.Pool(NCPU) as workers:
    res_primary_accts = workers.map(
        get_primary_account,
        df_loans['BankReportData'],
    )

In [None]:
# Attach the per-report primary account number as a new column.
df_loans = df_loans.assign(primary_account=res_primary_accts)

In [None]:
# Keep only loans whose report yielded a primary checking account.
has_primary_account = df_loans['primary_account'].notna()
df_loans = df_loans[has_primary_account]

## filter loans having transaction days >= 60 in primary account

In [None]:
def get_transaction_days_count(primary_account, bank_report, min_days=60):
    """
    Check whether the primary account's transaction history spans enough days.

    Parameters:
    primary_account: account number to inspect
    bank_report (json): bank report text passed to fetch_checking_acct_txns
    min_days (int): minimum number of days required between the first and the
        last posted transaction (default 60)

    Returns:
    bool: True when (last posted date - first posted date) >= min_days.
    None when the report has no checking transactions or none of them belong
    to primary_account.
    """
    df_checking_txns = fetch_checking_acct_txns(bank_report)
    if df_checking_txns.empty:
        return None

    is_primary = df_checking_txns["account_number"] == primary_account
    posted_dates = df_checking_txns.loc[is_primary, "posted_date"]
    # Guard against an account number that is absent from this report; taking
    # the first/last element of an empty selection would raise.
    if posted_dates.empty:
        return None

    # Only the extremes are needed, so min/max replaces a full sort.
    txn_days_count = (posted_dates.max() - posted_dates.min()).days
    return txn_days_count >= min_days

In [None]:
# Use all but two cores, and never fewer than one worker.
NCPU = max(mp.cpu_count() - 2, 1)
# Pair each loan's primary account with its bank report and evaluate the
# transaction-history check in parallel (results keep row order).
account_report_pairs = zip(df_loans["primary_account"], df_loans["BankReportData"])
with mp.Pool(NCPU) as workers:
    txn_days_count = workers.starmap(get_transaction_days_count, account_report_pairs)

In [None]:
# df_loans is a filtered selection at this point, so assigning a column in
# place would raise SettingWithCopyWarning; build a new frame instead.
df_loans = df_loans.assign(txn_days_count=txn_days_count)

In [None]:
# Keep loans whose history check returned True (False and None are dropped).
passes_history_check = df_loans["txn_days_count"].eq(True)
df_loans = df_loans[passes_history_check]

In [None]:
def get_loan_amount_by_lender(reportid, report_string, pr_acct):
    """
    Extract the lender-related transactions of the primary account.

    A transaction is lender-related when its memo matches, case-insensitively,
    any entry of the module-level list `lend_cos`. The entries are joined with
    '|' and used as a regular expression without escaping.

    Parameters:
    reportid: identifier stored in the 'report_id' column of the result
    report_string (json): bank report text passed to fetch_checking_acct_txns
    pr_acct: primary account number whose transactions are inspected

    Returns:
    dataframe with columns lenderName, amount, posted_date, memo, report_id,
    or None when the account has no lender-related transactions.
    """
    df_checking_txns = fetch_checking_acct_txns(report_string)
    # An empty frame has no columns, so bail out before indexing into it.
    if df_checking_txns.empty:
        return None

    # Build the alternation once; it is reused for matching and extraction.
    lender_alternation = "|".join(lend_cos)

    df_pr_acct_txns = df_checking_txns[df_checking_txns['account_number'] == pr_acct]
    is_lender_txn = df_pr_acct_txns['memo'].str.contains(
        lender_alternation, case=False, na=False
    )
    # Copy the selection so the column assignments below act on an independent
    # frame instead of a slice of the parsed report.
    df_lender_txns = df_pr_acct_txns.loc[
        is_lender_txn, ['amount', 'posted_date', 'memo']
    ].copy()
    if df_lender_txns.empty:
        return None

    df_lender_txns['lenderName'] = df_lender_txns['memo'].str.extract(
        "(" + lender_alternation + ")", flags=re.IGNORECASE, expand=False
    )
    df_lender_txns['report_id'] = reportid
    return df_lender_txns[['lenderName', 'amount', 'posted_date', 'memo', 'report_id']]

In [None]:
# Load the lender names that get_loan_amount_by_lender matches against
# transaction memos.
# NOTE(review): joblib.load unpickles the file, so './lend_cos.pkl' must come
# from a trusted source.
lend_cos=joblib.load('./lend_cos.pkl')
# Drop the 'VBS' entry before matching. NOTE(review): assumes lend_cos is a
# list that contains 'VBS' (list.remove raises ValueError otherwise) — confirm.
lend_cos.remove('VBS')

In [None]:
df_lender_vars = pd.DataFrame()
# Use all but two cores, and never fewer than one worker.
NCPU = max(mp.cpu_count() - 2, 1)
# Collect the lender transactions of every report in parallel; each task gets
# (report id, report json, primary account).
lender_tasks = zip(
    df_loans['BankTransactionId'],
    df_loans['BankReportData'],
    df_loans['primary_account'],
)
with mp.Pool(NCPU) as workers:
    df_lender_vars_temp = workers.starmap(get_loan_amount_by_lender, lender_tasks)
# Stack the per-report frames into one table with a fresh 0..n-1 index.
df_lender_vars = pd.concat(df_lender_vars_temp, ignore_index=True)

In [None]:
# Renumber the rows 0..n-1 in place, discarding the old index.
# NOTE(review): the frame was built with pd.concat(..., ignore_index=True),
# which already yields such an index, so this looks redundant — confirm
# before removing.
df_lender_vars.reset_index(drop=True,inplace=True)

In [None]:
def clean_string(x):
    """
    Normalise a lender name so that spelling variants collapse to one key.

    The name is lower-cased and stripped of every non-word character, then a
    fixed sequence of substitutions is applied in order.

    Parameters:
    x (str): raw lender name

    Returns:
    str: the normalised name (may be empty)
    """
    removal_patterns = (
        "electronicwithdrawal",          # "electronic withdrawal" marker, anywhere
        "vbs",                           # "vbs" marker, anywhere
        r"loa[n]*[s]*$",                 # trailing loa / loan / loans
        r"cas[h]*$",                     # trailing cas / cash
        r"fin[a]+[a-z]*[laeiou]+$",      # trailing finance / financial style suffix
        r"fin$",                         # trailing fin
        r"lendin[g]*$",                  # trailing lendin / lending
        r"\d*$",                         # trailing digits
    )

    name = re.sub(r"\W+", "", x.lower())
    # Order matters: each pattern sees the output of the previous one.
    for pattern in removal_patterns:
        name = re.sub(pattern, "", name)
    return name

In [None]:
# Normalised lender key used to group spelling variants of the same lender.
df_lender_vars['lender_name_clean'] = df_lender_vars['lenderName'].apply(clean_string)

In [None]:
# Label each transaction by the sign of its amount: negative -> 'debit',
# zero or positive -> 'credit'. Rows with a missing amount keep ''.
df_lender_vars['type'] = ''
df_lender_vars.loc[df_lender_vars['amount'].ge(0), 'type'] = 'credit'
df_lender_vars.loc[df_lender_vars['amount'].lt(0), 'type'] = 'debit'

In [None]:
# Total amount per (report, lender), split into one column per transaction
# type ('credit' / 'debit'). A combination with no transactions of a given
# type gets NaN in that column.
# The string alias "sum" replaces np.sum: passing the NumPy callable is
# deprecated in recent pandas and produces the same grouped sum.
lender_amounts = pd.pivot_table(
    df_lender_vars,
    values="amount",
    index=["report_id", "lender_name_clean"],
    columns=["type"],
    aggfunc="sum",
).reset_index()

In [None]:
# Keep only (report, lender) rows with no missing values, i.e. lenders that
# have both a credit total and a debit total in the report.
lender_amounts = lender_amounts.dropna(axis="index", how="any")