In [1]:
# Stretch the notebook container to the full browser width.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Importing useful libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import sys
import os
import pandas as pd
import numpy as np
import joblib
import multiprocessing as mp
import json

### Adding the paths from which we import helper modules. For example, '/home/shared/utils' holds the db_utils module, which is used to connect to the server without exposing the credentials.

In [2]:
# Put the project-local module directories at the front of the import path.
# Each directory is inserted at position 0, so the last one listed ends up first.
# NOTE(review): the absolute /home/... paths tie this notebook to one machine.
for module_dir in (os.getcwd(), '/home/shared/utils', '/home/vishal/refactoring_pipeline'):
    sys.path.insert(0, module_dir)

### The helper module provides fetch_checking_acct_txns, which identifies which of a candidate's available accounts are checking accounts and returns their transactions. Similarly, the EDA module provides plots that are useful for analysis.

### The query module is now available, so we don't even have to add credentials to the Python file to establish a connection. It uses a YAML config file to establish the connection.

In [12]:
from helper import fetch_checking_acct_txns
import EDA as eda
import query as q

### NCPU is the number of worker processes used for multiprocessing. The expression below uses all the CPU cores available in the system except 2; if the system has only 2 cores (or fewer), it uses a single core. (Fun fact: the server we are working on has 16 CPU cores!)

In [13]:
# Leave two cores free for the rest of the system, but always keep at least one worker.
NCPU = max(1, mp.cpu_count() - 2)

### Query to fetch required data from iloans!!

In [14]:
def fetch_required_bank_reports(start, end):
    """Fetch loan details together with their bank report data from iloans.

    Selects loans whose OriginationDate lies between ``start`` and ``end``
    (both inclusive), that have a non-null IsFirstDefault, belong to
    MerchantId 15 or 18, and whose bank report status is 'COMPLETE'.

    Args:
    start: lower bound for OriginationDate. It is interpolated into the SQL
        text verbatim, so it must already be a quoted SQL literal
        (e.g. "'2020-01-07'").
    end: upper bound for OriginationDate, same format as ``start``.

    Returns:
    Whatever ``q.iloans`` returns for the query (used as a pandas dataframe
    elsewhere in this notebook).

    NOTE(review): the bounds are spliced into the SQL string; only pass
    trusted values.
    """
    sql = f'''
                SELECT
                    LN.LoanId,
                    GCD.TimeAdded,
                    LN.OriginationDate,
                    LN.FirstName,
                    LN.LastName,
                    LN.Campaign,
                    LN.OriginalPrincipal,
                    LN.ReUppedPrincipal,
                    LN.DateOfBirth,
                    LN.BankName,
                    LN.MonthlyGrossIncome,
                    LN.EmployerName,
                    LN.IsFirstDefault,
                    GCD.BankTransactionId,
                    GCD.BankReportData
                FROM view_FCL_Loan LN
                LEFT JOIN view_FCL_GetCreditDataLoan GCDL on LN.LoanId = GCDL.LoanId
                LEFT JOIN view_FCL_GetCreditData GCD on GCD.BankTransactionId = GCDL.BankTransactionId
                WHERE LN.OriginationDate >= {start}
                AND LN.OriginationDate <= {end}
                AND LN.IsFirstDefault IS NOT NULL
                AND LN.MerchantId IN (15, 18)
                AND GCD.ReportStatus  = 'COMPLETE' 
            '''
    return q.iloans(sql)

### Query to fetch the json data which has all the income related information from bankapp

In [15]:
def fetch_required_bank_app(start, end):
    """Fetch the per-loan JSON documents from the bankapp ``loan`` table.

    Only rows whose campaign contains 'Production' and whose entered_date
    (stored as m/d/Y text) falls between ``start`` and ``end`` (inclusive)
    are selected.

    Args:
    start: lower date bound. It is interpolated into the SQL text verbatim,
        so it must already be a quoted 'YYYY-MM-DD' literal
        (e.g. "'2020-01-07'").
    end: upper date bound, same format as ``start``.

    Returns:
    Whatever ``q.bankapp`` returns for the query, with columns LoanId and
    json (used as a pandas dataframe elsewhere in this notebook).

    NOTE(review): the bounds are spliced into the SQL string; only pass
    trusted values.
    """
    sql = f'''
                SELECT
                    loan_id as LoanId,
                    json
                FROM loan
                WHERE campaign like '%Production%'
                AND STR_TO_DATE(entered_date, '%m/%d/%Y') >= STR_TO_DATE({start}, '%Y-%m-%d')
                AND STR_TO_DATE(entered_date, '%m/%d/%Y') <= STR_TO_DATE({end}, '%Y-%m-%d')
             '''
    return q.bankapp(sql)

### Fetching the primary checking account, because a candidate may have multiple checking accounts

In [16]:
def get_primary_account(bankreport):
    """
    Pick the primary checking account: the account number that occurs on the
    largest number of checking-account transactions.

    Args:
    bankreport (json): bank report handed to fetch_checking_acct_txns

    Returns:
    account number (str) : account number of the primary account, or None
    when the report yields no checking-account transactions
    """
    checking_txns = fetch_checking_acct_txns(bankreport)
    if checking_txns.empty:
        return None
    txns_per_account = checking_txns['account_number'].value_counts()
    return txns_per_account.idxmax()

### We will work only on those applicants whose primary account has at least 60 days between its first and last transaction

In [17]:
def get_transaction_days_count(primary_account, bank_report, min_days=60):
    """Check whether the primary account's transactions span at least ``min_days`` days.

    The span is the number of days between the earliest and the latest
    ``posted_date`` among the checking transactions of ``primary_account``.

    Args:
    primary_account (str): Account number of primary account
    bank_report (str): bank report string
    min_days (int): minimum required span in days (default 60)

    Returns:
    True or False (bool). False is also returned when the report has no
    checking transactions, or none of them belong to ``primary_account``
    (e.g. when no primary account could be determined).
    """
    df_checking_txns = fetch_checking_acct_txns(bank_report)
    if df_checking_txns.empty:
        return False

    on_primary = df_checking_txns['account_number'] == primary_account
    posted_dates = df_checking_txns.loc[on_primary, 'posted_date']
    if posted_dates.empty:
        # No row matches the account; indexing the first/last row would raise IndexError.
        return False

    # min/max give the first and last transaction dates without sorting the frame.
    txn_days_count = (posted_dates.max() - posted_dates.min()).days
    return bool(txn_days_count >= min_days)

### The function below returns all the transactions for a candidate

In [18]:
def get_transaction_time_series(primary_account, bank_report, loan_id):
    """Return the primary account's checking transactions with parsed dates.

    Args:
    primary_account (str): account number whose transactions are kept
    bank_report (str): bank report handed to fetch_checking_acct_txns
    loan_id: loan identifier written into a ``LoanId`` column on every row

    Returns:
    df_txns (pandas dataframe): rows of ``primary_account`` with
    ``posted_date`` converted by ``pd.to_datetime`` and a ``LoanId`` column
    added. None is returned when the report has no checking transactions.
    """
    df_checking_txns = fetch_checking_acct_txns(bank_report)
    if df_checking_txns.empty:
        return None

    on_primary = df_checking_txns['account_number'] == primary_account
    # .assign builds a new frame, so no column is ever set on a slice of the
    # fetched data (the pattern that triggers SettingWithCopyWarning).
    return df_checking_txns.loc[on_primary].assign(
        posted_date=lambda txns: pd.to_datetime(txns['posted_date']),
        LoanId=loan_id,
    )

### This function returns the number of income sources and the income cycle of each income source separated by +. For ex, if income sources are 2, the income cycles would be like in_cycle1+in_cycle2, where in_cycle1 is the income cycle of the first income source and similarly the second part for the second income respectively

In [19]:
def get_income_sources_and_cycle(json_string, loan_id):
    """Extract the number of income sources and their income cycles from the bankapp JSON.

    Args:
    json_string (str): JSON document holding ``incomeReview -> data`` with
        ``incomeSources`` (the count) and ``sources`` (one entry per source,
        each with an ``incomeCycle``)
    loan_id: loan identifier echoed back in the result

    Returns:
    list: ``[loan_id, no_sources, cycles]`` where ``no_sources`` is the
    ``incomeSources`` value as stored in the JSON and ``cycles`` joins the
    income cycle of the first ``no_sources`` sources with '+'
    (e.g. 'BiWeekly+Monthly'). ``[0, 0, 0]`` is returned when the document
    cannot be parsed or does not have the expected structure.
    """
    try:
        # Parse the document once instead of once per income source.
        income_review = json.loads(json_string)['incomeReview']['data']
        no_sources = income_review['incomeSources']
        cycles = '+'.join(
            income_review['sources'][position]['incomeCycle']
            for position in range(int(no_sources))
        )
    except (TypeError, ValueError, KeyError, IndexError, OverflowError):
        # Malformed or incomplete document: keep the sentinel row the caller
        # expects, but let unrelated errors (e.g. KeyboardInterrupt) propagate.
        return [0, 0, 0]
    return [loan_id, no_sources, cycles]

### The function below returns every credit transaction (amount > 0) on the candidate's primary account, regardless of category. The amount > 0 check matters because transactions tagged Payroll, Paycheck or Income often carry a negative amount. Splitting the credits by category (Payroll, Paycheck, Income or the rest) is done afterwards by get_diff_inc_cat.

In [83]:
def get_income(primary_acct, bank_report, loanid):
    """Return the credit transactions (amount > 0) of the primary account.

    Args:
    primary_acct (str): account number of the primary account
    bank_report (str): bank report handed to get_transaction_time_series
    loanid: loan identifier handed to get_transaction_time_series

    Returns:
    list of dict: one record per credit transaction, ordered by
    ``posted_date``, with keys posted_date, amount, category, type and memo.
    An empty list is returned when there are no transactions at all.
    """
    txns = get_transaction_time_series(primary_acct, bank_report, loanid)
    if txns is None or txns.empty:
        # get_transaction_time_series yields None for a report without checking transactions.
        return []

    txns = txns.sort_values(by='posted_date').reset_index(drop=True)
    # The previous predicate OR-ed "credit and Payroll/Income/Paycheck" with a
    # plain "credit" test, which is the same as keeping every credit.
    is_credit = txns['amount'] > 0
    credits = txns.loc[is_credit, ['posted_date', 'amount', 'category', 'type', 'memo']]
    return credits.to_dict('records')

### The function below cleans up the bank statement data: the loan ids arrive as floats, so it converts them to strings (dropping the decimal part), and it strips the time-of-day from the TimeAdded column, keeping only the date

In [21]:
def modify_statement(df):
    """Normalise the LoanId and TimeAdded columns of the bank statement data.

    LoanId is turned into text and cut at the first '.', so a float id such
    as 79669686732.0 becomes '79669686732'. TimeAdded keeps only its date
    part and is stored back as a datetime column.

    Args:
        df (pandas df): Dataframe with at least the LoanId and TimeAdded columns
    Returns:
        pandas df: the same dataframe object, modified in place
    """
    loan_ids_as_text = df['LoanId'].astype(str)
    df['LoanId'] = loan_ids_as_text.str.split('.').str[0]

    added_dates = df['TimeAdded'].apply(lambda stamp: stamp.date())
    df['TimeAdded'] = pd.to_datetime(added_dates)
    return df

### The function below splits an applicant's credit transactions into 4 buckets based on the category of each transaction, viz. Payroll, Paycheck, Income or Rest

In [22]:
def get_diff_inc_cat(k, income_records=None, top_n=8):
    """Split one applicant's credit transactions into Payroll / Paycheck / Income / rest.

    A Payroll, Paycheck or Income bucket is only filled when that category is
    among the ``top_n`` categories with the most transactions; otherwise the
    bucket is an empty dataframe. The rest bucket always holds every
    transaction whose category is none of the three.

    Args:
    k (int): position of the applicant in ``income_records``
    income_records (list): per-applicant lists of transaction records (dicts
        with at least ``category`` and ``posted_date``). Defaults to the
        notebook-level ``income_temp``.
    top_n (int): how many of the most frequent categories are considered
        (default 8)

    Returns:
    tuple of pandas dataframes: (txn_proll, txn_pcheq, txn_inc, txn_rest)
    """
    if income_records is None:
        income_records = income_temp  # notebook-level result of the get_income pool run

    # Build the frame once; every bucket below is a filtered copy of it.
    txns = pd.DataFrame(income_records[k])
    if txns.empty:
        # No credit transactions at all: there is no 'category' column to group on.
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    category_counts = txns.groupby('category', as_index=False)['posted_date'].count()
    top_categories = set(category_counts.nlargest(top_n, 'posted_date')['category'])

    def _bucket(category):
        # Transactions of `category` when it is a top category, else an empty frame.
        if category in top_categories:
            return txns[txns['category'] == category]
        return pd.DataFrame()

    txn_proll = _bucket('Payroll')
    txn_pcheq = _bucket('Paycheck')
    txn_inc = _bucket('Income')
    txn_rest = txns[~txns['category'].isin(['Income', 'Paycheck', 'Payroll'])]
    return txn_proll, txn_pcheq, txn_inc, txn_rest

### Fetching bankreports and modifying them, then fetching bankapp data

In [23]:
df_bankreports = fetch_required_bank_reports("'2020-01-07'", "'2020-02-07'")

In [24]:
df_bankapp = fetch_required_bank_app("'2020-01-07'", "'2020-02-07'")

In [25]:
df_bankreports = modify_statement(df_bankreports)

In [None]:
df_bankreports.sample(5)

In [None]:
df_bankapp.sample(5)

### Merging the bankapp data with the bankreports

In [26]:
# Keep only the loans present in both sources (inner join on LoanId).
df = df_bankapp.merge(df_bankreports, how='inner', on='LoanId')

In [None]:
df.sample(5)

### Using the multiprocessing function to fetch the name of primary checking accounts

In [27]:
# One task per bank report; pool.map returns the results in input order,
# so they line up with the rows of df.
with mp.Pool(NCPU) as pool:
    result_primary_accts = pool.map(get_primary_account, df['BankReportData'])

df['primary_account'] = result_primary_accts

In [None]:
df.info()

### Similarly, keeping only those candidates whose transactions span at least 60 days

In [28]:
# Evaluate the 60-day history rule for every row in parallel.
with mp.Pool(NCPU) as pool:
    txn_days_count = pool.starmap(
        get_transaction_days_count,
        zip(df['primary_account'], df['BankReportData']),
    )

df['txn_days_count'] = txn_days_count

# Rows whose flag is anything other than True (False or missing) are dropped.
has_gt_60_days_txns = df['txn_days_count'].eq(True)
df = df[has_gt_60_days_txns]

In [None]:
df.info()

In [29]:
df = df.reset_index(drop = True)

### Fetching the income sources and their cycles respectively

In [30]:
# Parse each row's bankapp JSON in parallel; every result is a
# [LoanId, #sources, in_cycles] triple.
with mp.Pool(NCPU) as pool:
    source_and_cycle = pool.starmap(
        get_income_sources_and_cycle,
        zip(df['json'], df['LoanId']),
    )

df_source_and_cycle = pd.DataFrame(
    source_and_cycle,
    columns=['LoanId', '#sources', 'in_cycles'],
)

In [None]:
df_source_and_cycle.sample(5)

### Merging the sources and cycle data we got to our previously available dataset

In [31]:
# df_source_and_cycle has one entry per row of df, so a loan that appears on
# several rows of df is repeated there as well. Dropping exact duplicate rows
# first keeps the left join from multiplying those loans' rows.
df = pd.merge(
    df,
    df_source_and_cycle.drop_duplicates(),
    on='LoanId',
    how='left',
)

In [None]:
df.sample(5)

### Taking a look at how income sources and cycles are distributed across the loan ids

In [None]:
eda.countplot_categorical_columns(df, cols = ['#sources', 'in_cycles'], force = True)

### Fetching required data for a particular index from the dataframe to crossverify things

In [122]:
df[df['LoanId'] == '79669686732']

Unnamed: 0,LoanId,json,TimeAdded,OriginationDate,FirstName,LastName,Campaign,OriginalPrincipal,ReUppedPrincipal,DateOfBirth,BankName,MonthlyGrossIncome,EmployerName,IsFirstDefault,BankTransactionId,BankReportData,primary_account,txn_days_count,#sources,in_cycles
75,79669686732,"{""basicValidations"":{""data"":{""priorLoans"":[{""i...",2020-01-07,2020-01-15,Rebecca,Escobedo,F2-StopNGo Select:211,300.0,0.0,1971-10-08,TEXAS TRUST CREDIT UNION,3000.0,Bison,False,49a69016-cd2e-4c24-821b-9c5515316efe,"{""token"":""49a69016-cd2e-4c24-821b-9c5515316efe...",EASY-CHK,True,1,BiWeekly


In [124]:
df.loc[75, ['BankTransactionId', 'primary_account', 'EmployerName', 'LoanId']]

BankTransactionId    49a69016-cd2e-4c24-821b-9c5515316efe
primary_account                                  EASY-CHK
EmployerName                                        Bison
LoanId                                        79669686732
Name: 75, dtype: object

### Checking the json document to see the income status of an applicant. Here, 75 is the index

In [123]:
json.loads(df.loc[75, 'json'])['incomeReview']['data']

{'que': 'How many income sources are visible?',
 'incomeSources': '1',
 'sources': [{'incomeCycle': 'BiWeekly',
   'sourceName': 'PRODUCERS SERVIC ACH Dep: PRODUCERS SERVIC',
   'directDeposit': 'Yes',
   'records': [{'amount': 1462.03, 'date': '12/20/2019', 'balance': 1351.34},
    {'amount': 3044.91, 'date': '12/6/2019', 'balance': 2645.54},
    {'amount': 637.89, 'date': '11/22/2019', 'balance': 696.51}],
   'estimatedMonthly': 3430,
   'noSaleInc': 4507}],
 'incomeTransactions': [{'postedDate': '12/3/2019',
   'memo': 'APPLE.COM/BILL One Apple Park Wa ADJ Debit Card W/D',
   'categoryName': 'Electronics & Software',
   'amount': 24.93,
   'endingBalance': 645.38,
   'source': '',
   'isSelected': False,
   'isRemove': False},
  {'postedDate': '12/3/2019',
   'memo': 'ATWOOD 01 ENID 5400 W. OWEN K. G ADJ Debit Card W/D',
   'categoryName': 'Clothing',
   'amount': 62.87,
   'endingBalance': 645.38,
   'source': '',
   'isSelected': False,
   'isRemove': False},
  {'postedDate': '12/

### Using the get_income function to fetch all the required income types

In [84]:
# Collect each applicant's credit transactions in parallel; income_temp[i]
# is the list of records for row i of df.
with mp.Pool(NCPU) as pool:
    income_temp = pool.starmap(
        get_income,
        zip(df['primary_account'], df['BankReportData'], df['LoanId']),
    )

### An example of how our fetched data looks like

In [None]:
income_temp[317]

### Loan ids that appear to have been misjudged by the manual agents (I think so because the agents could have a different definition of how an income should behave in the bank statement; they may be correct, or they may not)

<br>81672524189</br>
<br>20669855168</br>
<br>75671610930</br>
<br>86669962867</br>
<br>79669686732</br>

### The income_temp variable is a list of lists of dictionaries. Converting it to dataframe to analyse it rigorously

In [140]:
pd.DataFrame(income_temp[75]).set_index('posted_date')

Unnamed: 0_level_0,amount,category,type,memo
posted_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-10-12,450.0,Transfer,transfer,"HB XFR Dep From ESCOBEDO,ANTHONY XXXXXXXXXX"
2019-10-15,410.0,Income,deposit,Cash Dep DEPOSITOR: ESTELA MIRELES
2019-10-15,100.0,Income,deposit,KABBAGE ACH Dep: KABBAGE
2019-10-16,829.0,Income,deposit,TWC-BENEFITS ACH Dep: TWC-BENEFITS
2019-10-21,24.0,Transfer,transfer,From Share 01 ACH XFR Dep: From Share 01
...,...,...,...,...
2020-01-07,75.0,Income,deposit,STASH CAPITAL (S ACH Dep: STASH CAPITAL (S
2020-01-07,30.0,Income,deposit,PAYPAL ACH Dep: PAYPAL
2020-01-07,225.0,Transfer,transfer,"HB XFR Dep: half of 450 From ESCOBEDO,ANTHONY ..."
2020-01-08,400.0,Loan Principal,deposit,FFLC ACH Dep: FFLC


### Analysing the buckets returned by get_diff_inc_cat

In [127]:
txn_proll, txn_pcheq, txn_inc, txn_rest = get_diff_inc_cat(75)

In [128]:
txn_proll

In [129]:
txn_inc

Unnamed: 0,posted_date,amount,category,type,memo
1,2019-10-15,410.0,Income,deposit,Cash Dep DEPOSITOR: ESTELA MIRELES
2,2019-10-15,100.0,Income,deposit,KABBAGE ACH Dep: KABBAGE
3,2019-10-16,829.0,Income,deposit,TWC-BENEFITS ACH Dep: TWC-BENEFITS
5,2019-10-21,2.5,Income,deposit,PAYPAL ACH Dep: PAYPAL
9,2019-10-22,193.11,Income,deposit,Square Inc ACH Dep: Square Inc
10,2019-10-22,20.02,Income,deposit,"Dave, Inc ACH Dep: Dave, Inc"
12,2019-10-25,1849.89,Income,deposit,BOFS MANAGEMENT ACH Dep: BOFS MANAGEMENT
18,2019-11-04,130.43,Income,deposit,Square Inc ACH Dep: Square Inc
20,2019-11-04,359.31,Income,deposit,Square Inc ACH Dep: Square Inc
21,2019-11-05,4.67,Income,deposit,Square Inc ACH Dep: Square Inc


In [130]:
txn_pcheq

In [131]:
txn_rest

Unnamed: 0,posted_date,amount,category,type,memo
0,2019-10-12,450.0,Transfer,transfer,"HB XFR Dep From ESCOBEDO,ANTHONY XXXXXXXXXX"
4,2019-10-21,24.0,Transfer,transfer,From Share 01 ACH XFR Dep: From Share 01
6,2019-10-21,75.0,Transfer,transfer,HB XFR Dep: From Share 01 Mobile Banking Transfer
7,2019-10-21,25.0,Transfer,transfer,HB XFR Dep: From Share 01 Mobile Banking Transfer
8,2019-10-21,30.0,Transfer,transfer,"HB XFR Dep From ESCOBEDO,JOSHUA XXXXXXXXXX"
11,2019-10-25,400.0,Loan Principal,deposit,Dep FFLC
13,2019-10-30,125.0,Transfer,transfer,HB XFR Dep: From Share 01 Mobile Banking Transfer
14,2019-10-31,175.0,Transfer,transfer,HB XFR Dep: From Share 01 Mobile Banking Transfer
15,2019-11-01,75.0,Transfer,transfer,HB XFR Dep: From Share 01 Mobile Banking Transfer
16,2019-11-01,0.05,Investments,dividend,DIVIDEND POSTING Div Dep: DIVIDEND POSTING


<br>Occurrences of categories like business services and music are causing problems (index 425)</br>
<br>Occurrences where the income comes from the 'Income' category with electronic deposits are hard to isolate (index 152)</br>
<br>Paychecks are always correct</br>
<br>Mostly, whenever there are Payroll transactions, Income transactions are absent, but the converse is not true</br>

### Analysing in which categories do most of the income fall!!

In [35]:
# Applicants for whom exactly one income source was recorded ('#sources' holds text).
has_single_source = df['#sources'].eq('1')
df_1_inc = df.loc[has_single_source].reset_index(drop=True)

In [36]:
def analyse_cat_from_json(json_string):
    """Collect the categories of the transactions that match the first income source.

    A transaction matches when its ``memo`` equals the ``sourceName`` of the
    first entry in ``incomeReview -> data -> sources``.

    Args:
    json_string (str): bankapp JSON document of one loan

    Returns:
    set: the ``categoryName`` values of the matching ``incomeTransactions``.
    When the transaction list is missing or an entry is malformed, whatever
    was collected up to that point is returned.

    Raises:
    Errors from parsing the document or from reading the first source's
    ``sourceName`` are not handled here and propagate to the caller.
    """
    json_data = json.loads(json_string)['incomeReview']['data']
    source_name = json_data['sources'][0]['sourceName']
    categories = set()
    try:
        for txn in json_data['incomeTransactions']:
            if txn['memo'] == source_name:
                categories.add(txn['categoryName'])
    except (KeyError, TypeError):
        # Best effort on incomplete transaction data. Unlike a `return` inside
        # `finally`, this does not swallow unrelated exceptions.
        pass
    return categories

In [133]:
df_1_inc.loc[64, ['BankTransactionId', 'primary_account', 'EmployerName', 'LoanId', 'TimeAdded']]

BankTransactionId    49a69016-cd2e-4c24-821b-9c5515316efe
primary_account                                  EASY-CHK
EmployerName                                        Bison
LoanId                                        79669686732
TimeAdded                             2020-01-07 00:00:00
Name: 64, dtype: object

In [134]:
json.loads(df_1_inc.loc[64, 'json'])['incomeReview']['data']

{'que': 'How many income sources are visible?',
 'incomeSources': '1',
 'sources': [{'incomeCycle': 'BiWeekly',
   'sourceName': 'PRODUCERS SERVIC ACH Dep: PRODUCERS SERVIC',
   'directDeposit': 'Yes',
   'records': [{'amount': 1462.03, 'date': '12/20/2019', 'balance': 1351.34},
    {'amount': 3044.91, 'date': '12/6/2019', 'balance': 2645.54},
    {'amount': 637.89, 'date': '11/22/2019', 'balance': 696.51}],
   'estimatedMonthly': 3430,
   'noSaleInc': 4507}],
 'incomeTransactions': [{'postedDate': '12/3/2019',
   'memo': 'APPLE.COM/BILL One Apple Park Wa ADJ Debit Card W/D',
   'categoryName': 'Electronics & Software',
   'amount': 24.93,
   'endingBalance': 645.38,
   'source': '',
   'isSelected': False,
   'isRemove': False},
  {'postedDate': '12/3/2019',
   'memo': 'ATWOOD 01 ENID 5400 W. OWEN K. G ADJ Debit Card W/D',
   'categoryName': 'Clothing',
   'amount': 62.87,
   'endingBalance': 645.38,
   'source': '',
   'isSelected': False,
   'isRemove': False},
  {'postedDate': '12/

In [37]:
# For every single-source applicant, collect the categories under which the
# income source's transactions were tagged.
with mp.Pool(NCPU) as pool:
    cat_temp_1_inc = pool.map(analyse_cat_from_json, df_1_inc['json'])

In [120]:
cat_temp_1_inc[64]

{'Groceries'}

In [44]:
# One representative category per applicant, skipping applicants with no match.
# A set has no defined order, so taking "the first element" of it could change
# between runs; picking the alphabetically first category is deterministic.
cat_1_inc = [sorted(cats, key=str)[0] for cats in cat_temp_1_inc if cats]

In [45]:
cat_1_inc

['Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Income',
 'Payroll',
 'Payroll',
 'Income',
 'Payroll',
 'Income',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Income',
 'Home Improvement',
 'Income',
 'Payroll',
 'Payroll',
 'Payroll',
 'Paycheck',
 'Payroll',
 'Payroll',
 'Payroll',
 'Income',
 'Payroll',
 'Payroll',
 'Gifts & Donations',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Gas & Fuel',
 'Payroll',
 'Payroll',
 'Paycheck',
 'Payroll',
 'Income',
 'Income',
 'Payroll',
 'Payroll',
 'uncategorized',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Income',
 'Payroll',
 'Payroll',
 'Payroll',
 'Groceries',
 'Income',
 'Payroll',
 'Payroll',
 'Payroll',
 'Paycheck',
 'Payroll',
 'Payroll',
 'Payroll',
 'Income',
 'Payroll',
 'Payroll',
 'Payroll',
 'Income',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payroll',
 'Payro