### Import packages

In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import operator

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

import sparse_dot_topn.sparse_dot_topn as ct

In [9]:
fraud_df = pd.read_csv('Fraud Identity.csv')
dataset_df = pd.read_csv('ppp_transformed.csv')
print(fraud_df.shape)
print(dataset_df.shape)

(887, 1)
(961899, 68)


### Clean Company Names

In [5]:
def clean_name(df, col):
    df[col] = df[col].str.replace(r'[^\w\s]+', '')
    df[col] = df[col].str.lower()
    df[col] = df[col].str.strip()

    return df[col]

### Clean fraud_df

In [10]:
fraud_df.drop_duplicates(inplace=True, ignore_index=True)
dataset_df.dropna(axis=0, subset=['BorrowerName'], inplace=True)

fraud_df['Company Name'] = clean_name(df = fraud_df, col = 'Company Name')
print(fraud_df.shape)
fraud_df.head(10)

(802, 1)


Unnamed: 0,Company Name
0,idesignbuild llc
1,1stellar health llc
2,bestways2 health llc
3,joyous-health4u llc
4,rk painting co.
5,mickies auto and tire llc
6,blankenship rv finance solutions llc
7,rsgg properties llc
8,rsgg holdings llc
9,rsgg investments llc


### Creating a String Match Class
#### Source: https://medium.com/tim-black/fuzzy-string-matching-at-scale-41ae6ac452c2

In [7]:
class StringMatch():
    
    def __init__(self, source_names, target_names):
        self.source_names = source_names
        self.target_names = target_names
        self.ct_vect      = None
        self.tfidf_vect   = None
        self.vocab        = None
        self.sprse_mtx    = None
        
        
    def tokenize(self, analyzer='char_wb', n=3):
        '''
        Tokenizes the list of strings, based on the selected analyzer
        :param str analyzer: Type of analyzer ('char_wb', 'word'). Default is trigram
        :param str n: If using n-gram analyzer, the gram length
        '''
        # Create initial count vectorizer & fit it on both lists to get vocab
        self.ct_vect = CountVectorizer(analyzer=analyzer, ngram_range=(n, n))
        self.vocab   = self.ct_vect.fit(self.source_names + self.target_names).vocabulary_
        
        # Create tf-idf vectorizer
        self.tfidf_vect  = TfidfVectorizer(vocabulary=self.vocab, analyzer=analyzer, ngram_range=(n, n))
        
        
    def match(self, ntop=1, lower_bound=0, output_fmt='df'):
        '''
        Main match function. Default settings return only the top candidate for every source string.
        
        :param int ntop: The number of top-n candidates that should be returned
        :param float lower_bound: The lower-bound threshold for keeping a candidate, between 0-1.
                                    Default set to 0, so consider all canidates
        :param str output_fmt: The output format. Either dataframe ('df') or dict ('dict')
        '''
        self._awesome_cossim_top(ntop, lower_bound)
        
        if output_fmt == 'df':
            match_output = self._make_matchdf()
        elif output_fmt == 'dict':
            match_output = self._make_matchdict()
            
        return match_output
        
        
    def _awesome_cossim_top(self, ntop, lower_bound):
        ''' https://gist.github.com/ymwdalex/5c363ddc1af447a9ff0b58ba14828fd6#file-awesome_sparse_dot_top-py '''
        # To CSR Matrix, if needed
        A = self.tfidf_vect.fit_transform(self.source_names).tocsr()
        B = self.tfidf_vect.fit_transform(self.target_names).transpose().tocsr()
        M, _ = A.shape
        _, N = B.shape

        idx_dtype = np.int32

        nnz_max = M * ntop

        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)

        self.sprse_mtx = csr_matrix((data,indices,indptr), shape=(M,N))
    
    
    def _make_matchdf(self):
        ''' Build dataframe for result return '''
        # CSR matrix -> COO matrix
        cx = self.sprse_mtx.tocoo()

        # COO matrix to list of tuples
        match_list = []
        for row,col,val in zip(cx.row, cx.col, cx.data):
            match_list.append((row, self.source_names[row], col, self.target_names[col], val))

        # List of tuples to dataframe
        colnames = ['Row Idx', 'Title', 'Candidate Idx', 'Candidate Title', 'Score']
        match_df = pd.DataFrame(match_list, columns=colnames)

        return match_df

    
    def _make_matchdict(self):
        ''' Build dictionary for result return '''
        # CSR matrix -> COO matrix
        cx = self.sprse_mtx.tocoo()

        # dict value should be tuple of values
        match_dict = {}
        for row,col,val in zip(cx.row, cx.col, cx.data):
            if match_dict.get(row):
                match_dict[row].append((col,val))
            else:
                match_dict[row] = [(col, val)]

        return match_dict  

### Matching Company Name in fraud_df to BorrowerName in dataset

In [11]:
fraud_list = fraud_df['Company Name'].tolist()
dataset_list = dataset_df['BorrowerName'].tolist()

# Match the company names to dataset and time it
t0 = datetime.now()
name_match = StringMatch(fraud_list, dataset_list)
if name_match:
    name_match.tokenize()
    match_df = name_match.match()
t1 = datetime.now()
print("Matching is completed.")
full_time_tfidf = (t1-t0).total_seconds()
print("Time taken to complete matching: {} seconds".format(full_time_tfidf))

Matching is completed.
Time taken to complete matching: 49.375798 seconds


In [12]:
print(match_df.shape)
match_df.head(10)

(797, 5)


Unnamed: 0,Row Idx,Title,Candidate Idx,Candidate Title,Score
0,0,idesignbuild llc,424526,idesignbuild llc,0.985083
1,1,1stellar health llc,49797,1stellar health llc,0.982882
2,2,bestways2 health llc,49803,bestways2 health llc,0.968716
3,3,joyous-health4u llc,49825,joyoushealth4u llc,0.799247
4,4,rk painting co.,451086,rk painting company,0.75512
5,5,mickies auto and tire llc,776913,mickies auto and tires llc,0.953805
6,6,blankenship rv finance solutions llc,380231,blakenship rv finance solutions llc,0.883167
7,7,rsgg properties llc,374896,rsgg properties llc,0.971342
8,8,rsgg holdings llc,377880,rsgg holdings llc,0.976505
9,9,rsgg investments llc,378722,rsgg investments llc,0.97618


### Set isFraud = 1 if the similarity score is >= 0.8

In [13]:
for i in range(0, match_df.shape[0]):
    score = match_df.at[i, 'Score']
    if score >= 0.8:
        dataset_df.at[i, 'isFraud'] = 1

print(dataset_df[dataset_df['isFraud'] == 1].shape)
dataset_df[dataset_df['isFraud'] == 1].head(5)

(494, 68)


Unnamed: 0,LoanNumber,DateApproved,SBAOfficeCode,ProcessingMethod,BorrowerName,BorrowerCity,BorrowerState,BorrowerZip,LoanStatus,Term,...,MORTGAGE_INTEREST_PROCEED_pct_standardised,RENT_PROCEED_pct_standardised,REFINANCE_EIDL_PROCEED_pct_standardised,HEALTH_CARE_PROCEED_pct_standardised,DEBT_INTEREST_PROCEED_pct_standardised,InitialApprovalAmount_scaled_standardised,CurrentApprovalAmount_scaled_standardised,TOTAL_PROCEED_scaled_standardised,PROCEED_Per_Job_scaled_standardised,isFraud
0,9547507704,2020-05-01,464,PPP,sumter coatings inc,sumter,UNK,29150-9662,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,0.316451,0.321404,0.321407,-0.004019,1
1,9777677704,2020-05-01,464,PPP,pleasant places inc,north charleston,UNK,29420-9000,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,0.272954,0.277532,0.277535,-0.118508,1
2,6223567700,2020-05-01,920,PPP,kirtley construction inc,san bernardino,UNK,92407-1740,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,-0.044994,-0.043156,-0.043153,0.559708,1
5,9794577700,2020-05-01,491,PPP,fruit cove baptist church of jacksonville fl inc,saint johns,UNK,32259-2832,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,-0.326794,-0.327386,-0.327383,-0.456867,1
6,9722187702,2020-05-01,101,PPP,miamitown auto parts and recycling inc,unknown,UNK,0,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,-0.350111,-0.350904,-0.350901,0.091307,1


In [14]:
dataset_df.head()

Unnamed: 0,LoanNumber,DateApproved,SBAOfficeCode,ProcessingMethod,BorrowerName,BorrowerCity,BorrowerState,BorrowerZip,LoanStatus,Term,...,MORTGAGE_INTEREST_PROCEED_pct_standardised,RENT_PROCEED_pct_standardised,REFINANCE_EIDL_PROCEED_pct_standardised,HEALTH_CARE_PROCEED_pct_standardised,DEBT_INTEREST_PROCEED_pct_standardised,InitialApprovalAmount_scaled_standardised,CurrentApprovalAmount_scaled_standardised,TOTAL_PROCEED_scaled_standardised,PROCEED_Per_Job_scaled_standardised,isFraud
0,9547507704,2020-05-01,464,PPP,sumter coatings inc,sumter,UNK,29150-9662,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,0.316451,0.321404,0.321407,-0.004019,1
1,9777677704,2020-05-01,464,PPP,pleasant places inc,north charleston,UNK,29420-9000,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,0.272954,0.277532,0.277535,-0.118508,1
2,6223567700,2020-05-01,920,PPP,kirtley construction inc,san bernardino,UNK,92407-1740,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,-0.044994,-0.043156,-0.043153,0.559708,1
3,9662437702,2020-05-01,101,PPP,aero box llc,unknown,UNK,0,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,-0.222618,-0.222312,-0.222309,0.109201,0
4,9774337701,2020-05-01,101,PPP,hudson extrusions inc,unknown,UNK,0,Paid in Full,24,...,-0.117829,-0.259507,-0.024109,-0.175324,-0.069127,-0.274386,-0.274526,-0.274523,0.121559,0


In [12]:
dataset_df.to_csv('ppp_transformed_updated.csv')