In [1]:
import re
import pandas as pd
from fuzzywuzzy import fuzz, process

# Part 1: Load Data and Data Preprocessing

In [2]:
# Load LOC data
loc_df = pd.read_stata('/Users/twylazhang/Desktop/Econ_banks_Research/LOC.dta')

# Load Credit Union data
cu_df = pd.read_excel('/Users/twylazhang/Desktop/Econ_banks_Research/LOC Credit Union Data/THIS_FOICU_Names_ID_excel.xlsx')

# Load FR Report data
fr_report = pd.read_csv('/Users/twylazhang/Desktop/Econ_banks_Research/background/FR_Report.csv')

# Load Call Report data
call_df = pd.read_csv('/Users/twylazhang/Desktop/Econ_banks_Research/background/call.csv')

# Drop rows with non-finite values in RSSD column and ensure RSSD column is integer
cu_df = cu_df.dropna(subset=['RSSD'])
cu_df['RSSD'] = cu_df['RSSD'].astype(int)
cu_df = cu_df[['RSSD', 'CU_NAME']]

### Separate credit unions and banks in loc_df based on the name

In [3]:
# Function to identify credit unions based on the presence of "credit union" or "cu"
def is_credit_union(name):
    # Check for "credit union" as a phrase
    if re.search(r'credit union', name, re.IGNORECASE):
        return True
    
    # Check for the word "cu"
    words = re.split(r'\W+', name)
    if 'cu' in [word.lower() for word in words]:
        return True
    
    return False
    
# Separate credit unions and banks in loc_df based on the name
credit_union_df = loc_df[loc_df['lender_TopName'].apply(is_credit_union)]
bank_df = loc_df[~loc_df['lender_TopName'].apply(is_credit_union)]

# Part 2: Normalize Data

In [4]:
# Function to remove punctuation and extra spaces
def remove_punctuation_and_spaces(name):
    name = re.sub(r'[^\w\s]', '', name)  # Remove punctuation
    name = re.sub(r'\s+', ' ', name).strip()  # Remove extra spaces
    return name

# Function to expand common abbreviations
def expand_abbreviations(name):
    abbreviations = {
        "assoc": "association",
        "co": "company",
        "corp": "corporation",
        "inc": "incorporated",
        "ltd": "limited",
        "&": "and"
    }
    for abbr, full in abbreviations.items():
        name = re.sub(r'\b' + abbr + r'\b', full, name)
    return name

# Normalize names in any DataFrame
def normalize_names(df, column_name):
    df_copy = df.copy()
    df_copy['normalized_name'] = df_copy[column_name].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))
    return df_copy

In [5]:
# Normalize names in LOC, CU, FR, and Call data
credit_union_df = normalize_names(credit_union_df, 'lender_TopName')
bank_df = normalize_names(bank_df, 'lender_TopName')
cu_df = normalize_names(cu_df, 'CU_NAME')
fr_report = normalize_names(fr_report, 'name_legal')
call_df = normalize_names(call_df, 'name_legal')

# Part 3: Name Matching between LOC and Call Report/Credit Union/FR Report(BHC)

### Matching Functions

In [6]:
# Exact Matching
def exact_match(df1, df2, column_name):
    return pd.merge(df1, df2, left_on=column_name, right_on=column_name, how='inner')

# Fuzzy Matching Function
def fuzzy_match(row, choices, scorer=fuzz.token_sort_ratio):
    match, score = process.extractOne(row['normalized_name'], choices, scorer=scorer)
    return match if score > 80 else None

## Perform Matching

### Exact matching: 
- convert the names of the banks to all lower case in both bank_df and fr_report, then perform an exact match to see how many names match

### Fuzzy Matching:

#### Explanation:

1. **Fuzzy Matching Function**:
   - `fuzzy_match(row, choices, scorer=fuzz.token_sort_ratio)`: Uses the `fuzzywuzzy` library to perform fuzzy matching on the normalized names with a similarity score threshold of 80.
   - **Fuzzy Matching**: This technique compares strings and evaluates their similarity, even when the strings are not exactly the same. It is useful for matching names that may have slight variations, such as typos, abbreviations, or different spellings. The `fuzz.token_sort_ratio` scorer from the `fuzzywuzzy` library calculates a similarity score between 0 and 100, where a higher score indicates greater similarity.

2. **Function Definition**:
   - The `fuzzy_match` function uses the `fuzzywuzzy` library to find the best match for a given name from a list of choices (`fr_report_names`).
   - It compares each normalized name in `bank_df_copy` to the normalized names in `fr_report` using the `fuzz.token_sort_ratio` scorer, which calculates a similarity score between 0 and 100.

3. **Applying the Function**:
   - The `fuzzy_match` function is applied to each row in `bank_df_copy` using the `.apply()` method.
   - For each row, it returns the best match from `fr_report_names` if the similarity score is above 80. If no match is found with a score above 80, it returns `None`.

4. **Merge Based on Fuzzy Matched Names**:
   - Perform an inner join based on the fuzzy matched names and count the number of matches.
   - Print the number of matches and the matched names for verification.

5. **Find Unmatched Names**:
   - Identify and list names in `bank_df_copy` that do not have matches in `fr_report`.

In [7]:
# Exact Matching
exact_matched_df_cu = exact_match(credit_union_df, cu_df, 'normalized_name')
exact_matched_df_fr = exact_match(bank_df, fr_report, 'normalized_name')
exact_matched_df_call = exact_match(bank_df, call_df, 'normalized_name')

# Get unique normalized names for fuzzy matching
cu_names = cu_df['normalized_name'].unique()
fr_names = fr_report['normalized_name'].unique()
call_names = call_df['normalized_name'].unique()

# Apply fuzzy matching
credit_union_df['fuzzy_matched_name_cu'] = credit_union_df.apply(fuzzy_match, axis=1, choices=cu_names)
bank_df['fuzzy_matched_name_fr'] = bank_df.apply(fuzzy_match, axis=1, choices=fr_names)
bank_df['fuzzy_matched_name_call'] = bank_df.apply(fuzzy_match, axis=1, choices=call_names)

# Merge based on fuzzy matched names
fuzzy_matched_df_cu = pd.merge(credit_union_df, cu_df, left_on='fuzzy_matched_name_cu', right_on='normalized_name', how='inner')
fuzzy_matched_df_fr = pd.merge(bank_df, fr_report, left_on='fuzzy_matched_name_fr', right_on='normalized_name', how='inner')
fuzzy_matched_df_call = pd.merge(bank_df, call_df, left_on='fuzzy_matched_name_call', right_on='normalized_name', how='inner')

# Drop duplicates
fuzzy_matched_df_cu = fuzzy_matched_df_cu.drop_duplicates(subset=['lender_TopName', 'CU_NAME'])
fuzzy_matched_df_fr = fuzzy_matched_df_fr.drop_duplicates(subset=['lender_TopName', 'name_legal'])
fuzzy_matched_df_call = fuzzy_matched_df_call.drop_duplicates(subset=['lender_TopName', 'name_legal'])

# Combine exact and fuzzy matched DataFrames
combined_matched_df_cu = pd.concat([exact_matched_df_cu, fuzzy_matched_df_cu]).drop_duplicates(subset=['lender_TopName', 'CU_NAME'])
combined_matched_df_fr = pd.concat([exact_matched_df_fr, fuzzy_matched_df_fr]).drop_duplicates(subset=['lender_TopName', 'name_legal'])
combined_matched_df_call = pd.concat([exact_matched_df_call, fuzzy_matched_df_call]).drop_duplicates(subset=['lender_TopName', 'name_legal'])

# Part 4: Match Result Analysis

###  Count Matches

In [16]:
total_cu = credit_union_df.shape[0]
total_fr = bank_df.shape[0]
total_call = bank_df.shape[0]

num_exact_matches_cu = exact_matched_df_cu.shape[0]
num_fuzzy_matches_cu = fuzzy_matched_df_cu.shape[0]
num_combined_matches_cu = combined_matched_df_cu.shape[0]

num_exact_matches_fr = exact_matched_df_fr.shape[0]
num_fuzzy_matches_fr = fuzzy_matched_df_fr.shape[0]
num_combined_matches_fr = combined_matched_df_fr.shape[0]

num_exact_matches_call = exact_matched_df_call.shape[0]
num_fuzzy_matches_call = fuzzy_matched_df_call.shape[0]
num_combined_matches_call = combined_matched_df_call.shape[0]

print("Credit Union")
print(f"Number of exact matches with CU: {num_exact_matches_cu} out of {total_cu}")
print(f"Number of fuzzy matches with CU: {num_fuzzy_matches_cu} out of {total_cu}")
print(f"Total number of combined matches with CU: {num_combined_matches_cu} out of {total_cu}")
print("\n")

print("BHC - FR Report")
print(f"Number of exact matches with FR: {num_exact_matches_fr} out of {total_fr}")
print(f"Number of fuzzy matches with FR: {num_fuzzy_matches_fr} out of {total_fr}")
print(f"Total number of combined matches with FR: {num_combined_matches_fr} out of {total_fr}")
print("\n")

print("Call Report")
print(f"Number of exact matches with Call: {num_exact_matches_call} out of {total_call}")
print(f"Number of fuzzy matches with Call: {num_fuzzy_matches_call} out of {total_call}")
print(f"Total number of combined matches with Call: {num_combined_matches_call} out of {total_call}")


Credit Union
Number of exact matches with CU: 8 out of 131
Number of fuzzy matches with CU: 62 out of 131
Total number of combined matches with CU: 62 out of 131


BHC - FR Report
Number of exact matches with FR: 246 out of 948
Number of fuzzy matches with FR: 283 out of 948
Total number of combined matches with FR: 285 out of 948


Call Report
Number of exact matches with Call: 398 out of 948
Number of fuzzy matches with Call: 216 out of 948
Total number of combined matches with Call: 219 out of 948


In [None]:
# Export matched files to CSV
combined_matched_df_cu.to_csv('LOC_Credit_Union_Name_Matching.csv', index=False)
combined_matched_df_fr.to_csv('_Credit_Union_Name_Matching.csv', index=False)
combined_matched_df_call.to_csv('combined_matched_df_call.csv', index=False)

### Find Unmatched Names

In [9]:
unmatched_names_credit_union = credit_union_df[~credit_union_df['normalized_name'].isin(combined_matched_df_cu['normalized_name_x'])]
unmatched_names_list_cu = unmatched_names_credit_union['lender_TopName'].tolist()

unmatched_names_bank_fr = bank_df[~bank_df['normalized_name'].isin(combined_matched_df_fr['normalized_name_x'])]
unmatched_names_list_fr = unmatched_names_bank_fr['lender_TopName'].tolist()

unmatched_names_bank_call = bank_df[~bank_df['normalized_name'].isin(combined_matched_df_call['normalized_name_x'])]
unmatched_names_list_call = unmatched_names_bank_call['lender_TopName'].tolist()

print("Unmatched names from LOC Credit Unions:")
print(unmatched_names_list_cu)


Unmatched names from LOC Credit Unions:
['FRANKENMUTH CREDIT UNION', 'ADVANCIAL FEDERAL CREDIT UNION', 'FORT LEE FEDERAL CREDIT UNION', 'HOPEWELL FEDERAL CU', 'DOWNRIVER COMMUNITY FEDERAL CU', 'PEOPLE DRIVEN CREDIT UNION', 'SCHOOL EMPL LORAIN CTY CU', 'WINGS FINANCIAL CREDIT UNION', 'WRIGHT PATMAN CONGRESSIONAL FEDERAL CREDIT UNION', 'GUARDIANS CREDIT UNION', 'WESCOM CENTRAL CREDIT UNION', 'WRIGHTPATT CREDIT UNION INC', 'USF FEDERAL CREDIT UNION', 'DESCO FEDERAL CREDIT UNION', 'CREDIT UNION ONE', 'TRUECORE FEDERAL CREDIT UNION', 'UMASSFIVE COLLEGE FEDERAL CREDIT UNION', 'ENTRUST FINANCIAL CREDIT UNION', 'CHIEF FINANCIAL FEDERAL CREDIT UNION', 'DEMOCRACY FEDERAL CREDIT UNION', 'DAY AIR CREDIT UNION', 'COMMUNITYAMERICA CREDIT UNION', 'PREMIER FINANCIAL CREDIT UNION', 'ANDREWS FEDERAL CREDIT UNION', 'AGRICULTURE FEDERAL CREDIT UNION', 'PSE CREDIT UNION INC', 'FIRSTMARK CREDIT UNION', 'FIRST SOURCE FEDERAL CREDIT UNION', 'BLUCURRENT CREDIT UNION', 'TOPSIDE FEDERAL CREDIT UNION', 'COMMONWEA

In [10]:
print("Unmatched names from LOC FR:")
print(unmatched_names_list_fr)

Unmatched names from LOC FR:
['DEPARTMENT OF LABOR FCU', 'FIRST HAWAIIAN INC', 'FREMONT BANCORPORATION', 'ZIONS BC', 'TRICO BANCSHARES', 'SVB FINANCIAL GROUP', 'LEARNER FC', 'M&T BANK CORPORATION', 'JPMORGAN CHASE & CO', 'TRUSTCO BANK CORP NY', 'BALLSTON SPA BANCORP INC', 'VALLEY NATIONAL BANCORP', 'COMMUNITY BANK SYSTEM INC', 'FIRST OF LONG ISLAND CORPORATION THE', 'UMB FINANCIAL CORPORATION', 'QUAIL CREEK BANCSHARES INC', 'ARKANSAS VALLEY BANCSHARES INC', 'SPIRIT BANKCORP INC', 'INTRUST FINANCIAL CORPORATION', 'KEYCORP', 'HUNTINGTON BANCSHARES INCORPORATED', 'PNC FINANCIAL SERVICES GROUP INC THE', 'FIFTH THIRD BANCORP', 'WESBANCO INC', 'COMMUNITY TRUST BANCORP INC', 'BANK OF AMERICA CORPORATION', 'BB&T CORPORATION', 'YORK BANCSHARES INC', 'OLD POINT FINANCIAL CORPORATION', 'VIRGINIA COMMUNITY BANKSHARES INC', 'SYNOVUS FINANCIAL CORP', 'TRUSTMARK CORPORATION', 'VOLUNTEER STATE BANCSHARES INC', 'AMERIS BANCORP', 'COMMUNITY BK OF S FL', 'CLINTON BANCSHARES INC', 'CENTRAL BANCOMPANY INC'

In [11]:
print("Unmatched names from LOC Call:")
print(unmatched_names_list_call)

Unmatched names from LOC Call:
['DEPARTMENT OF LABOR FCU', 'FIRST HAWAIIAN INC', 'FREMONT BANCORPORATION', 'ZIONS BC', 'BANK OF CMRC HOLD', 'TRICO BANCSHARES', 'SVB FINANCIAL GROUP', 'LEARNER FC', 'JPMORGAN CHASE & CO', 'TRUSTCO BANK CORP NY', 'BALLSTON SPA BANCORP INC', 'FIRST OF LONG ISLAND CORPORATION THE', 'UMB FINANCIAL CORPORATION', 'QUAIL CREEK BANCSHARES INC', 'ARKANSAS VALLEY BANCSHARES INC', 'SPIRIT BANKCORP INC', 'INTRUST FINANCIAL CORPORATION', 'KEYCORP', 'HUNTINGTON BANCSHARES INCORPORATED', 'PNC FINANCIAL SERVICES GROUP INC THE', 'FIRSTMERIT CORP', 'BANK OF AMERICA CORPORATION', 'BB&T CORPORATION', 'YORK BANCSHARES INC', 'OLD POINT FINANCIAL CORPORATION', 'VIRGINIA COMMUNITY BANKSHARES INC', 'SYNOVUS FINANCIAL CORP', 'TRUSTMARK CORPORATION', 'VOLUNTEER STATE BANCSHARES INC', 'AMERIS BANCORP', 'COMMUNITY BK OF S FL', 'CLINTON BANCSHARES INC', 'HANCOCK WHITNEY CORPORATION', 'CENTRAL BANCOMPANY INC', 'DELTA BANCSHARES COMPANY', 'FIRST HORIZON CORPORATION', 'SIMMONS FIRST NAT