In [None]:
# Inject a CSS rule that forces `.container` elements to 100% width.
# `display` comes from the public IPython.display module; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Importing useful libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import sys
import os
import pandas as pd
import numpy as np
import json
import joblib
import re
import multiprocessing as mp

### Adding module search paths. `/home/shared/utils` contains the `query` module, which is used to connect to the database servers without exposing credentials in the notebook.

In [None]:
# Put the shared utils directory first on the module search path, followed by
# the current working directory, ahead of everything that was already there.
sys.path[:0] = ['/home/shared/utils', os.getcwd()]
#sys.path.insert(0, '/home/vishal/refactoring_pipeline')

In [None]:
#from helper import fetch_checking_acct_txns
#import EDA as eda
import query as q

### NCPU is the number of workers for multiprocessing. The expression below uses all the CPU cores available in the system except 2; if the system has only 2 cores (or fewer), it uses a single core. (Fun fact: the server we are working on has 16 CPU cores!)

In [None]:
# Leave two cores free for the rest of the system, but never drop below one worker.
NCPU = max(1, mp.cpu_count() - 2)

### Query to fetch the credit statements of all accepted leads for FreedomCash from iloans

In [None]:
def fetch_required_bank_reports(start, end):
    """Fetch completed bank reports added between ``start`` and ``end``.

    Joins the credit-data view to its loan mapping and to the accepted-lead
    view, keeping rows whose ``ReportStatus`` is 'COMPLETE' and whose
    ``MerchantId`` is 15 or 18. Both bounds are inclusive.

    Args:
        start (str): Lower bound for ``GCD.TimeAdded``. It is pasted into the
            SQL text verbatim, so it must already be a valid SQL literal
            (e.g. "'2020-01-07'", including the single quotes).
        end (str): Upper bound for ``GCD.TimeAdded``, same format as ``start``.
    Returns:
        Whatever ``q.iloans`` returns for the query (assigned to a dataframe
        variable by the callers in this notebook).
    """
    # NOTE: start/end are interpolated as-is; only pass trusted, pre-quoted values.
    sql = f'''
                SELECT  
                        GCDL.LoanId,
                        GCD.BankTransactionId,
                        GCD.BankReportData,
                        LA.EmployerName,
                        GCD.TimeAdded
                FROM view_FCL_GetCreditData GCD
                LEFT JOIN view_FCL_GetCreditDataLoan GCDL ON GCD.BankTransactionId = GCDL.BankTransactionId
                LEFT JOIN view_FCL_LeadAccepted  LA ON LA.LoanId = GCDL.LoanId
                WHERE GCD.TimeAdded >= {start}
                AND GCD.TimeAdded <= {end}
                AND GCD.ReportStatus  = 'COMPLETE'
                AND LA.MerchantId in (15, 18)
            '''
    return q.iloans(sql)

### Query to fetch, from bankapp, the JSON data that holds all the income-related information

In [None]:
def fetch_required_bank_app(start, end):
    """Fetch loan ids and their JSON payload from the bankapp ``loan`` table.

    Keeps rows whose ``campaign`` contains 'Production' and whose
    ``entered_date`` (stored as m/d/Y text) falls between ``start`` and
    ``end``, both inclusive.

    Args:
        start (str): Lower date bound in Y-m-d form. It is pasted into the SQL
            text verbatim, so it must already carry its own single quotes
            (e.g. "'2020-01-07'").
        end (str): Upper date bound, same format as ``start``.
    Returns:
        Whatever ``q.bankapp`` returns for the query (assigned to a dataframe
        variable by the callers in this notebook).
    """
    # NOTE: start/end are interpolated as-is; only pass trusted, pre-quoted values.
    sql = f'''
                SELECT
                    loan_id as LoanId,
                    json
                FROM loan
                WHERE campaign like '%Production%'
                AND STR_TO_DATE(entered_date, '%m/%d/%Y') >= STR_TO_DATE({start}, '%Y-%m-%d')
                AND STR_TO_DATE(entered_date, '%m/%d/%Y') <= STR_TO_DATE({end}, '%Y-%m-%d')
             '''
    return q.bankapp(sql)

### The function below cleans up the bank statement data: the loan ids arrive as floats, so it converts them to strings (dropping the decimal part), and it strips the time part from the `TimeAdded` feature, keeping only the date

In [None]:
def modify_statement(df):
    """Modifies extracted bank statement data.

    ``LoanId`` is rendered as a string with everything from the first '.'
    onwards dropped (``12345.0`` -> ``'12345'``). ``TimeAdded`` is truncated
    to midnight so only the calendar date remains; the column ends up as a
    timezone-naive datetime column.

    The dataframe is modified in place and the same object is returned.

    Args:
        df (pandas df): Dataframe holding at least the ``LoanId`` and
            ``TimeAdded`` columns. ``TimeAdded`` may contain Timestamps,
            datetimes, dates, parseable strings or missing values.
    Returns:
        pandas df: Modified dataframe (the same object that was passed in)
    """
    # Casting a float id to str yields '12345.0'; keep only the part before the dot.
    df['LoanId'] = df['LoanId'].astype(str).str.split('.', n=1).str[0]

    # Parse first so string / None / date values work instead of failing on `.date()`.
    time_added = pd.to_datetime(df['TimeAdded'])
    if time_added.dt.tz is not None:
        # Drop the timezone but keep the local wall-clock date, matching the
        # result of taking `.date()` of each value.
        time_added = time_added.dt.tz_localize(None)
    df['TimeAdded'] = time_added.dt.normalize()
    return df

### Fetching bankreports and modifying them, then fetching bankapp data

In [None]:
# Pull the bank reports whose TimeAdded lies between 2020-01-07 and 2020-02-07 (both bounds inclusive).
# The dates carry their own single quotes because the function pastes them verbatim into the SQL text.
df_bankreports = fetch_required_bank_reports("'2020-01-07'", "'2020-02-07'")

In [None]:
# Pull the bankapp loans whose entered_date lies between 2020-01-07 and 2020-02-07 (both bounds inclusive).
# The dates carry their own single quotes because the function pastes them verbatim into the SQL text.
df_bankapp = fetch_required_bank_app("'2020-01-07'", "'2020-02-07'")

In [None]:
# Convert LoanId to string and reduce TimeAdded to the date only.
# modify_statement rewrites the columns of the frame it receives and returns that same frame.
df_bankreports = modify_statement(df_bankreports)

### Merging the bankapp data with the bankreports

In [None]:
# Keep only the loans present in both sources, matched on LoanId.
df = df_bankapp.merge(df_bankreports, how='inner', on='LoanId')

In [None]:
# Save the first 1000 merged rows (all columns) as a sample file, without the index column.
df.head(1000).to_csv('credit_statements_1000.csv', index=False)