In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Importing useful libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import sys
import os
import pandas as pd
import numpy as np
import json
import multiprocessing as mp
from sklearn.metrics import confusion_matrix as cm

### Adding the paths from which we fetch helper modules; for example, '/home/shared/utils' contains the query module, which is used to connect to the server without exposing the credentials

In [None]:
# Put the current working directory first on the module search path
# (assumption: the local modules imported below live there — TODO confirm).
sys.path.insert(0, os.getcwd())

In [None]:
# Load (or reload) the autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to the helper modules are picked up.
%reload_ext autoreload
%autoreload 2

In [None]:
import extract
import preprocess_inc as prep
import utility as util

### NCPU is the number of worker processes used for multiprocessing. The expression below uses all CPU cores available in the system except 2; if the system has 2 cores or fewer, it uses only one core. (Fun fact: the server we are working on has 16 CPU cores!)

In [None]:
# Worker count for multiprocessing: leave two cores free, but never drop below one.
NCPU = max(mp.cpu_count() - 2, 1)

### Fetching bankreports

In [None]:
# Fetch bank reports for the analysis window. The date arguments keep their
# embedded single quotes exactly as the extractor is called with them.
# NOTE(review): a later cell rebinds `df_bankreports` from 'bankreports.csv',
# so on a full top-to-bottom run this result is overwritten.
df_bankreports = extract.fetch_required_bank_reports(
    "'2020-01-07'",
    "'2020-02-07'",
)

In [None]:
# Fetch bank-app records for the same window as the bank reports. The date
# arguments keep their embedded single quotes.
# NOTE(review): a later cell rebinds `df_bankapp` from 'bankapp.csv',
# so on a full top-to-bottom run this result is overwritten.
df_bankapp = extract.fetch_required_bank_app(
    "'2020-01-07'",
    "'2020-02-07'",
)

### Once saved, there is no need to extract every time; the saved data can simply be loaded

In [None]:
# Load the saved bank reports from disk; 'TimeAdded' is parsed as datetimes.
df_bankreports = pd.read_csv(
    'bankreports.csv',
    parse_dates=['TimeAdded'],
)

In [None]:
# Load the saved bank-app data from disk with pandas' default parsing.
df_bankapp = pd.read_csv('bankapp.csv')

### Modifying

In [None]:
# Preprocess the bank reports, then renumber the rows from 0.
df_bankreports_mod = (
    prep.preprocess_bank_reports(df_bankreports)
    .reset_index(drop=True)
)

In [None]:
# Preprocess the bank-app accounts, then renumber the rows from 0.
df_bankapp_mod = (
    prep.preprocess_bank_app_accounts(df_bankapp)
    .reset_index(drop=True)
)

### Merging bankapp columns with bankreports to have employer names in our dataset

In [None]:
# Inner-join the preprocessed bank-app accounts with the bank reports on
# 'LoanId', keeping only loans present in both frames.
# NOTE(review): the right-hand side is the raw `df_bankreports`, not
# `df_bankreports_mod` — confirm that is intended.
# NOTE(review): this cell rebinds `df_bankapp_mod` from itself, so running it
# twice merges the report columns in a second time.
df_bankapp_mod = df_bankapp_mod.merge(df_bankreports, on='LoanId', how='inner')

### Taking all Loan Ids which have an account

In [None]:
# Keep only the rows whose 'account_auto' is populated, then renumber from 0.
df_bankapp_mod = (
    df_bankapp_mod
    .loc[df_bankapp_mod['account_auto'].notna()]
    .reset_index(drop=True)
)

### Taking, for each loan id, all positive transactions above a particular threshold as returned by the model; these include the actual incomes as well (flagged as isSelected = 1) from bankapp

In [None]:
# Fan the per-loan extraction out over NCPU worker processes. Each call to
# util.fetch_txns_above_50_from_bankapp receives one row's
# (json, LoanId, EmployerName, account_auto) values, in that order.
with mp.Pool(processes=NCPU) as pool:
    cred_txns_temp = pool.starmap(
        util.fetch_txns_above_50_from_bankapp,
        zip(
            df_bankapp_mod['json'],
            df_bankapp_mod['LoanId'],
            df_bankapp_mod['EmployerName'],
            df_bankapp_mod['account_auto'],
        ),
    )

In [None]:
# Stack the per-loan results into one frame with a fresh 0..n-1 index and keep
# only the columns used by the rest of the notebook.
df_cred_primary_txns = pd.concat(cred_txns_temp, ignore_index=True)[
    ['LoanId', 'posted_date', 'memo', 'amount', 'EmployerName', 'isSelected']
]

### Extracting all loan ids

In [None]:
# Distinct loan ids, in order of first appearance in the transactions frame.
loan_ids = df_cred_primary_txns['LoanId'].unique()

60673127419 -> Employer name and income name different

23672880789 -> Issue with subset checking

### Preprocessing memos

### Converting all the memos to lower case

In [None]:
# Lower-case every memo via the pandas string accessor; the 'memo' column is
# overwritten with the result.
df_cred_primary_txns['memo'] = df_cred_primary_txns['memo'].str.lower()

### Removing punctuations from the memos

In [None]:
# Apply prep.remove_punctuation to each memo; the helper is passed directly
# because it already takes the memo text as its only argument here.
df_cred_primary_txns['memo'] = df_cred_primary_txns['memo'].apply(prep.remove_punctuation)

### Removing stopwords

In [None]:
# Apply prep.remove_stopwords to each memo; the helper is passed directly
# because it already takes the memo text as its only argument here.
df_cred_primary_txns['memo'] = df_cred_primary_txns['memo'].apply(prep.remove_stopwords)

### Removing numbers

In [None]:
# Apply prep.remove_nums to each memo; the helper is passed directly
# because it already takes the memo text as its only argument here.
df_cred_primary_txns['memo'] = df_cred_primary_txns['memo'].apply(prep.remove_nums)

### Checking if the employer name is present in txn memos using a fuzzy match score (flagged as isHit = 1)

In [None]:
# Row-wise apply (axis=1): util.is_hit_or_miss receives each transaction row
# and its return value is stored in the new 'isHit' column.
df_cred_primary_txns['isHit'] = df_cred_primary_txns.apply(util.is_hit_or_miss, axis = 1)

### Fetching confusion matrix for each loan id

In [None]:
def stats_for_each_loanid(loan_id):
    """Compute the confusion-matrix counts for one loan id.

    The transactions of ``loan_id`` are taken from the module-level
    ``df_cred_primary_txns`` frame; ``isSelected`` is used as the ground
    truth and ``isHit`` as the prediction.

    Args:
        loan_id (str): loan id of the applicant

    Returns:
        list | None: ``[loan_id, tn, fp, fn, tp]``, or ``None`` when the
        confusion matrix cannot be computed for this loan id (for example
        because the flags contain missing values).
    """
    txns = df_cred_primary_txns[df_cred_primary_txns['LoanId'] == loan_id]
    y_test = txns['isSelected']
    y_pred = txns['isHit']
    try:
        # Pin the label order to [0, 1] so the matrix is always 2x2. Without
        # it, a loan whose flags are all 0 (or all 1) yields a 1x1 matrix,
        # the four-way unpacking fails and the loan is silently dropped.
        tn, fp, fn, tp = cm(y_test, y_pred, labels=[0, 1]).ravel()
    except (ValueError, TypeError):
        # Best effort: skip loan ids that cannot be scored (e.g. NaN flags);
        # the caller filters the resulting None values out.
        return None
    return [loan_id, tn, fp, fn, tp]

In [None]:
# One entry per loan id: either [loan_id, tn, fp, fn, tp] or None.
stats = [stats_for_each_loanid(loan_id) for loan_id in loan_ids]

### Removing all nones

In [None]:
# Drop the loan ids for which no stats were produced. An identity check
# (`is not None`) is used instead of `!= None`, which goes through the
# element's own comparison method.
stats = [entry for entry in stats if entry is not None]

In [None]:
# Tabulate the per-loan confusion-matrix counts, one row per loan id.
df_checks = pd.DataFrame(
    stats,
    columns=['LoanId', 'TN', 'FP', 'FN', 'TP'],
)