In [1]:
import pandas as pd
import re
from fuzzywuzzy import fuzz, process

# Part 1: load the LOC data and identify non-numeric IDs, storing them separately:

In [2]:
file_path = '/Users/twylazhang/Desktop/Econ_banks_Research/LOC.dta'

# Load the .dta file using pandas read_stata function
loc_df = pd.read_stata(file_path)

loc_df

Unnamed: 0,source,lender_TopRSSDID,lender_TopName
0,1.0,1003790,USE CREDIT UNION
1,1.0,100393,FRANKENMUTH CREDIT UNION
2,3.0,100571,DEPARTMENT OF LABOR FCU
3,1.0,1025608,FIRST HAWAIIAN INC
4,3.0,1026801,FREMONT BANCORPORATION
...,...,...,...
1074,1.0,f96580,EXCEL FEDERAL CREDIT UNION
1075,3.0,f968744,FIRST CHOICE BANK
1076,1.0,f97,CAP
1077,1.0,f98,CAPITAL CITY


In total, there are 1079 rows of lenders/data

Since there are many credit unions that cannot be matched with either the `fr_report` or `call_report`, we will **separate banks and credit unions**.
- This ensures that only the **DataFrame containing banks** is used for the name matching process.

In [3]:
# Function to identify credit unions based on the presence of "credit union" or "cu"
def is_credit_union(name):
    # Check for "credit union" as a phrase
    if re.search(r'credit union', name, re.IGNORECASE):
        return True
    
    # Check for the word "cu"
    words = re.split(r'\W+', name)
    if 'cu' in [word.lower() for word in words]:
        return True
    
    return False

# Separate credit unions and banks in loc_df based on the name
credit_union_df = loc_df[loc_df['lender_TopName'].apply(is_credit_union)]
bank_df = loc_df[~loc_df['lender_TopName'].apply(is_credit_union)]

In [4]:
bank_df

Unnamed: 0,source,lender_TopRSSDID,lender_TopName
2,3.0,100571,DEPARTMENT OF LABOR FCU
3,1.0,1025608,FIRST HAWAIIAN INC
4,3.0,1026801,FREMONT BANCORPORATION
5,3.0,1027004,ZIONS BC
6,1.0,1030040,BANK OF CMRC HOLD
...,...,...,...
1073,1.0,f96,CANYON EDGE LLC
1075,3.0,f968744,FIRST CHOICE BANK
1076,1.0,f97,CAP
1077,1.0,f98,CAPITAL CITY


In [5]:
credit_union_df

Unnamed: 0,source,lender_TopRSSDID,lender_TopName
0,1.0,1003790,USE CREDIT UNION
1,1.0,100393,FRANKENMUTH CREDIT UNION
75,3.0,113777,ADVANCIAL FEDERAL CREDIT UNION
106,1.0,129787,FORT LEE FEDERAL CREDIT UNION
107,1.0,131780,HOPEWELL FEDERAL CU
...,...,...,...
1059,1.0,f899987,WSSC FEDERAL CREDIT UNION
1065,1.0,f924375,INTEGRITY FEDERAL CREDIT UNION
1067,1.0,f932594,FIRST AREA CREDIT UNION
1070,1.0,f947392,PAINESVILLE CREDIT UNION


there are 131 credit unions

# Part2: Name Matching Between FR Report(BHC) and bank_df

## Exact Matching

convert the names of the banks to all lower case in both bank_df and fr_report, then perform an exact match to see how many names match

In [6]:
# Load the call report and FR report
call_df = pd.read_csv('/Users/twylazhang/Desktop/Econ_banks_Research/background/call.csv')
fr_report = pd.read_csv('/Users/twylazhang/Desktop/Econ_banks_Research/background/FR_Report.csv')

# Normalize ID columns in all datasets for consistent integer comparison
#fr_report['id'] = pd.to_numeric(fr_report['id'], errors='coerce').fillna(0).astype(int)
#call_df['id'] = pd.to_numeric(call_df['id'], errors='coerce').fillna(0).astype(int)
#call_df['parent_id'] = pd.to_numeric(call_df['parent_id'], errors='coerce').fillna(0).astype(int)

In [7]:
# Make a copy of the bank_df to avoid SettingWithCopyWarning
bank_df_copy = bank_df.copy()

# Standardize names to all lower case
bank_df_copy.loc[:, 'lender_TopName_lower'] = bank_df_copy['lender_TopName'].str.lower()
fr_report.loc[:, 'name_legal_lower'] = fr_report['name_legal'].str.lower()

# Perform exact matching
matched_names = pd.merge(bank_df_copy, fr_report, left_on='lender_TopName_lower', right_on='name_legal_lower', how='inner')

# Count the number of matches
num_matches = matched_names.shape[0]

# Print the number of matches
print(f"Number of exact matches: {num_matches}")

matched_names[['lender_TopName', 'name_legal']]


Number of exact matches: 75


Unnamed: 0,lender_TopName,name_legal
0,FREMONT BANCORPORATION,FREMONT BANCORPORATION
1,TRICO BANCSHARES,TRICO BANCSHARES
2,SVB FINANCIAL GROUP,SVB FINANCIAL GROUP
3,M&T BANK CORPORATION,M&T BANK CORPORATION
4,TRUSTCO BANK CORP NY,TRUSTCO BANK CORP NY
...,...,...
70,RBB BANCORP,RBB BANCORP
71,VIRGINIA NATIONAL BANKSHARES CORPORATION,VIRGINIA NATIONAL BANKSHARES CORPORATION
72,OP BANCORP,OP BANCORP
73,AMERICAN EXPRESS COMPANY,AMERICAN EXPRESS COMPANY


only 75 are matched out of 948 lenders in bank_df

In [8]:
fr_report

Unnamed: 0,id,date,fed_district_code,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state,name_legal_lower
0,1020180,20171231,9,500,SAINT PAUL,UNITED STATES,0,549300TXP74T8NJZJW60,BREMER FINANCIAL CORPORATION,BREMER FNCL CORP,1,FRS,MN,bremer financial corporation
1,1020201,20171231,7,500,NEW YORK,UNITED STATES,3232316,549300LBOHZ4QSIWU288,HSBC USA INC.,HSBC USA,1,FRS,NY,hsbc usa inc.
2,1020395,20171231,6,500,ANDALUSIA,UNITED STATES,0,254900YBC9ZTMHLVQ556,SOUTHERN NATIONAL CORPORATION,SOUTHERN NAT CORP,1,FRS,AL,southern national corporation
3,1020582,20171231,7,500,WISCONSIN RAPIDS,UNITED STATES,0,0,WOODTRUST FINANCIAL CORP,WOODTRUST FC,1,FRS,WI,woodtrust financial corp
4,1020591,20171231,7,500,SIOUX CITY,UNITED STATES,0,0,"PINNACLE BANCORP, INC.",PINNACLE BC,1,FRS,IA,"pinnacle bancorp, inc."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4592,5147115,20171231,10,500,DURANGO,UNITED STATES,0,0,"TIG BANCORP, INC.",TIG BC,1,FRS,CO,"tig bancorp, inc."
4593,5158269,20171231,5,500,SPARTANBURG,UNITED STATES,0,0,CAB FINANCIAL CORPORATION,CAB FC,1,FRS,SC,cab financial corporation
4594,5163003,20171231,12,500,CERRITOS,UNITED STATES,0,54930094NOEN2QWZWW84,FIRST CHOICE BANCORP,FIRST CHOICE BC,1,FRS,CA,first choice bancorp
4595,5163898,20171231,12,500,BURLINGTON,UNITED STATES,0,0,"SAVI FINANCIAL CORPORATION, INC.",SAVI FC,1,FRS,WA,"savi financial corporation, inc."


## Additional matching since exact matching does not work as well. 

#### Standardize the names before matching

In [9]:
# Function to remove punctuation and extra spaces
def remove_punctuation_and_spaces(name):
    name = re.sub(r'[^\w\s]', '', name)  # Remove punctuation
    name = re.sub(r'\s+', ' ', name).strip()  # Remove extra spaces
    return name

# Function to expand common abbreviations
def expand_abbreviations(name):
    abbreviations = {
        "assoc": "association",
        "co": "company",
        "corp": "corporation",
        "inc": "incorporated",
        "ltd": "limited",
        "&": "and"
    }
    for abbr, full in abbreviations.items():
        name = re.sub(r'\b' + abbr + r'\b', full, name)
    return name

# Apply normalization functions
bank_df_copy.loc[:, 'normalized_name'] = bank_df_copy['lender_TopName'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))
fr_report.loc[:, 'normalized_name'] = fr_report['name_legal'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))


In [10]:
bank_df_copy

Unnamed: 0,source,lender_TopRSSDID,lender_TopName,lender_TopName_lower,normalized_name
2,3.0,100571,DEPARTMENT OF LABOR FCU,department of labor fcu,department of labor fcu
3,1.0,1025608,FIRST HAWAIIAN INC,first hawaiian inc,first hawaiian incorporated
4,3.0,1026801,FREMONT BANCORPORATION,fremont bancorporation,fremont bancorporation
5,3.0,1027004,ZIONS BC,zions bc,zions bc
6,1.0,1030040,BANK OF CMRC HOLD,bank of cmrc hold,bank of cmrc hold
...,...,...,...,...,...
1073,1.0,f96,CANYON EDGE LLC,canyon edge llc,canyon edge llc
1075,3.0,f968744,FIRST CHOICE BANK,first choice bank,first choice bank
1076,1.0,f97,CAP,cap,cap
1077,1.0,f98,CAPITAL CITY,capital city,capital city


In [11]:
fr_report

Unnamed: 0,id,date,fed_district_code,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state,name_legal_lower,normalized_name
0,1020180,20171231,9,500,SAINT PAUL,UNITED STATES,0,549300TXP74T8NJZJW60,BREMER FINANCIAL CORPORATION,BREMER FNCL CORP,1,FRS,MN,bremer financial corporation,bremer financial corporation
1,1020201,20171231,7,500,NEW YORK,UNITED STATES,3232316,549300LBOHZ4QSIWU288,HSBC USA INC.,HSBC USA,1,FRS,NY,hsbc usa inc.,hsbc usa incorporated
2,1020395,20171231,6,500,ANDALUSIA,UNITED STATES,0,254900YBC9ZTMHLVQ556,SOUTHERN NATIONAL CORPORATION,SOUTHERN NAT CORP,1,FRS,AL,southern national corporation,southern national corporation
3,1020582,20171231,7,500,WISCONSIN RAPIDS,UNITED STATES,0,0,WOODTRUST FINANCIAL CORP,WOODTRUST FC,1,FRS,WI,woodtrust financial corp,woodtrust financial corporation
4,1020591,20171231,7,500,SIOUX CITY,UNITED STATES,0,0,"PINNACLE BANCORP, INC.",PINNACLE BC,1,FRS,IA,"pinnacle bancorp, inc.",pinnacle bancorp incorporated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4592,5147115,20171231,10,500,DURANGO,UNITED STATES,0,0,"TIG BANCORP, INC.",TIG BC,1,FRS,CO,"tig bancorp, inc.",tig bancorp incorporated
4593,5158269,20171231,5,500,SPARTANBURG,UNITED STATES,0,0,CAB FINANCIAL CORPORATION,CAB FC,1,FRS,SC,cab financial corporation,cab financial corporation
4594,5163003,20171231,12,500,CERRITOS,UNITED STATES,0,54930094NOEN2QWZWW84,FIRST CHOICE BANCORP,FIRST CHOICE BC,1,FRS,CA,first choice bancorp,first choice bancorp
4595,5163898,20171231,12,500,BURLINGTON,UNITED STATES,0,0,"SAVI FINANCIAL CORPORATION, INC.",SAVI FC,1,FRS,WA,"savi financial corporation, inc.",savi financial corporation incorporated


## Fuzzy Matching:

### Explanation:

1. **Fuzzy Matching Function**:
   - `fuzzy_match(row, choices, scorer=fuzz.token_sort_ratio)`: Uses `fuzzywuzzy` to perform fuzzy matching on the normalized names with a similarity score threshold of 80.

2. **Apply Fuzzy Matching**:
   - Apply the `fuzzy_match` function to each row in `bank_df_copy` and add the result to the `fuzzy_matched_name` column.
   - **Fuzzy Matching**: This technique compares strings and evaluates their similarity, even when the strings are not exactly the same. It is useful for matching names that may have slight variations, such as typos, abbreviations, or different spellings. The `fuzz.token_sort_ratio` scorer from the `fuzzywuzzy` library calculates a similarity score between 0 and 100, where a higher score indicates greater similarity.

3. **Merge Based on Fuzzy Matched Names**:
   - Perform an inner join based on the fuzzy matched names and count the number of matches.
   - Print the number of matches and the matched names for verification.

4. **Find Unmatched Names**:
   - Identify and list names in `bank_df_copy` that do not have matches in `fr_report`.

In [None]:
# Fuzzy Matching Function
def fuzzy_match(row, choices, scorer=fuzz.token_sort_ratio):
    match, score = process.extractOne(row['normalized_name'], choices, scorer=scorer)
    return match if score > 80 else None

# Get unique normalized names from fr_report
fr_report_names = fr_report['normalized_name'].unique()

# Apply fuzzy matching
bank_df_copy['fuzzy_matched_name'] = bank_df_copy.apply(fuzzy_match, axis=1, choices=fr_report_names)

bank_df_copy['fuzzy_matched_name']

In [None]:
# Merge based on fuzzy matched names
fuzzy_matched_df = pd.merge(bank_df_copy, fr_report, left_on='fuzzy_matched_name', right_on='normalized_name', how='inner')
num_fuzzy_matches = fuzzy_matched_df.shape[0]

print(f"Number of fuzzy matches: {num_fuzzy_matches}")

# Print the matched names for verification
fuzzy_matched_df[['lender_TopName', 'name_legal']]

In [None]:
# Find the unmatched names
unmatched_names_loc = bank_df_copy[~bank_df_copy['normalized_name'].isin(fuzzy_matched_df['normalized_name_x'])]
unmatched_names_list = unmatched_names_loc['lender_TopName'].tolist()

# Print the unmatched names
print("Unmatched names from LOC:")
print(unmatched_names_list)