In [21]:
# Inject a CSS rule that sets the notebook's `.container` element to 100% width.
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Importing useful libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import sys
import os
import pandas as pd
import numpy as np
import json
import joblib
import re
import multiprocessing as mp
import datetime

### Adding paths from which useful modules are imported; for example, '/home/shared/utils' contains the query module, which is used to connect to the server without exposing the credentials.

In [2]:
# Put the current working directory at the front of the module search path so
# that modules located there take precedence on import.
# Note: every re-run of this cell inserts the directory again.
sys.path.insert(0, os.getcwd())

In [57]:
%reload_ext autoreload
%autoreload 2

In [13]:
import extract
import preprocess_inc as prep
import utility as util

### NCPU is the number of worker processes used for multiprocessing. The expression below uses all the CPU cores available in the system except 2; if the system has 2 or fewer cores, it uses only one core. (Fun fact: the server we are working on has 16 CPU cores!)

In [5]:
# Worker-process count: leave two cores free, but never drop below one worker.
NCPU = max(1, mp.cpu_count() - 2)

### Fetching bank reports and bank app data

In [6]:
# Fetch the bank reports through the project-local `extract` module.
# Each argument is a string whose value itself includes single quotes.
# NOTE(review): presumably a start/end date spliced into a query as a quoted
# literal — confirm in extract.fetch_required_bank_reports.
df_bankreports = extract.fetch_required_bank_reports("'2020-01-07'", "'2020-02-07'")

In [None]:
# Fetch the bank app data through the project-local `extract` module.
# Each argument is a string whose value itself includes single quotes.
# NOTE(review): presumably a start/end date spliced into a query as a quoted
# literal — confirm in extract.fetch_required_bank_app.
df_bankapp = extract.fetch_required_bank_app("'2020-01-07'", "'2020-02-07'")

### Once the data is saved, there is no need to extract it every time; the saved data can simply be loaded

In [14]:
# Load the bank reports from bankreports.csv (path relative to the working
# directory), parsing the TimeAdded column as datetimes.
df_bankreports = pd.read_csv(
    'bankreports.csv',
    parse_dates=['TimeAdded'],
)

In [15]:
# Load the bank app data from bankapp.csv (path relative to the working directory).
df_bankapp = pd.read_csv('bankapp.csv')

### Preprocessing the bank reports and bank app accounts

In [16]:
# Run the project-local bank-report preprocessing step.
# The result is bound back to df_bankreports, so re-running this cell feeds the
# already-preprocessed frame into the function again.
df_bankreports = prep.preprocess_bank_reports(df_bankreports)

In [17]:
# Run the project-local bank-app preprocessing step. The result gets its own
# name (df_bankapp_mod); df_bankapp stays bound to the loaded frame.
df_bankapp_mod = prep.preprocess_bank_app_accounts(df_bankapp)

In [18]:
# Replace the index with a fresh 0..n-1 range; drop = True discards the old
# index instead of keeping it as a column.
df_bankreports = df_bankreports.reset_index(drop = True)

In [19]:
# Replace the index with a fresh 0..n-1 range; drop = True discards the old
# index instead of keeping it as a column.
df_bankapp_mod = df_bankapp_mod.reset_index(drop = True)

### Fetching all checking account transactions

In [20]:
# Spread the per-report extraction over NCPU worker processes. starmap unpacks
# each (BankReportData, LoanId) pair into positional arguments and returns the
# results as a list in input order.
with mp.Pool(processes=NCPU) as pool:
    temp_checking_txns = pool.starmap(
        util.fetch_checking_acct_txns,
        zip(df_bankreports['BankReportData'], df_bankreports['LoanId']),
    )

In [21]:
# Stack the per-report results into a single frame and keep the six columns
# listed below. ignore_index=True already gives the result a fresh 0..n-1
# index, so no extra reset_index call is needed.
df_checking_txns = pd.concat(temp_checking_txns, ignore_index=True)[
    ['LoanId', 'account_number', 'posted_date', 'amount', 'memo', 'category']
]

### Fetching all primary checking account transactions by matching each transaction's account number against the account given by the applicant

In [22]:
def return_primary_acct(loanid, pr_acct):
    """Return the checking transactions of one loan's given account.

    Reads the notebook-level ``df_checking_txns`` frame and keeps the rows
    whose ``LoanId`` equals ``loanid`` and whose ``account_number`` equals
    ``pr_acct``.

    Args:
        loanid: Value matched against the ``LoanId`` column.
        pr_acct: Value matched against the ``account_number`` column.

    Returns:
        A DataFrame holding the matching rows in their original order, with
        the index reset to 0..n-1 (empty when nothing matches).
    """
    same_loan = df_checking_txns['LoanId'] == loanid
    same_account = df_checking_txns['account_number'] == pr_acct
    return df_checking_txns[same_loan & same_account].reset_index(drop=True)

In [23]:
# Run return_primary_acct once per (LoanId, account_auto) pair across NCPU
# worker processes; the result is a list of DataFrames in input order.
with mp.Pool(processes=NCPU) as pool:
    df_primary_txns_temp = pool.starmap(
        return_primary_acct,
        zip(df_bankapp_mod['LoanId'], df_bankapp_mod['account_auto']),
    )

In [24]:
# Concatenate the list of per-pair frames produced by the pool into one frame
# with a fresh 0..n-1 index.
df_primary_txns = pd.concat(df_primary_txns_temp, ignore_index = True)

### Taking all positive transactions

In [25]:
# Keep only the rows whose amount is strictly greater than zero; the original
# index labels of the surviving rows are retained.
df_cred_primary_txns = df_primary_txns.loc[df_primary_txns['amount'].gt(0)]

### Getting ground truth data from bankapp

In [60]:
# Call util.get_ground_truth_income once per (json, LoanId) pair across NCPU
# worker processes; starmap returns the results as a list in input order.
with mp.Pool(processes=NCPU) as pool:
    df_income_temp = pool.starmap(
        util.get_ground_truth_income,
        zip(df_bankapp_mod['json'], df_bankapp_mod['LoanId']),
    )

In [91]:
# Merge the per-pair income results into one frame with a fresh index, then
# discard the 'balance' column.
df_income = pd.concat(df_income_temp, ignore_index=True).drop(columns='balance')