In [1]:
import pandas as pd
import re
from fuzzywuzzy import fuzz, process

# Part 1: load the LOC data and identify non-numeric IDs, storing them separately:

In [2]:
file_path = '/Users/twylazhang/Desktop/Econ_banks_Research/LOC.dta'

# Load the .dta file using pandas read_stata function
loc_df = pd.read_stata(file_path)

loc_df

Unnamed: 0,source,lender_TopRSSDID,lender_TopName
0,1.0,1003790,USE CREDIT UNION
1,1.0,100393,FRANKENMUTH CREDIT UNION
2,3.0,100571,DEPARTMENT OF LABOR FCU
3,1.0,1025608,FIRST HAWAIIAN INC
4,3.0,1026801,FREMONT BANCORPORATION
...,...,...,...
1074,1.0,f96580,EXCEL FEDERAL CREDIT UNION
1075,3.0,f968744,FIRST CHOICE BANK
1076,1.0,f97,CAP
1077,1.0,f98,CAPITAL CITY


In total, there are 1079 rows of lenders/data

Since there are many credit unions that cannot be matched with either the `fr_report` or `call_report`, we will **separate banks and credit unions**.
- This ensures that only the **DataFrame containing banks** is used for the name matching process.

In [3]:
# Function to identify credit unions based on the presence of "credit union" or "cu"
def is_credit_union(name):
    # Check for "credit union" as a phrase
    if re.search(r'credit union', name, re.IGNORECASE):
        return True
    
    # Check for the word "cu"
    words = re.split(r'\W+', name)
    if 'cu' in [word.lower() for word in words]:
        return True
    
    return False

# Separate credit unions and banks in loc_df based on the name
credit_union_df = loc_df[loc_df['lender_TopName'].apply(is_credit_union)]
bank_df = loc_df[~loc_df['lender_TopName'].apply(is_credit_union)]

In [4]:
bank_df

Unnamed: 0,source,lender_TopRSSDID,lender_TopName
2,3.0,100571,DEPARTMENT OF LABOR FCU
3,1.0,1025608,FIRST HAWAIIAN INC
4,3.0,1026801,FREMONT BANCORPORATION
5,3.0,1027004,ZIONS BC
6,1.0,1030040,BANK OF CMRC HOLD
...,...,...,...
1073,1.0,f96,CANYON EDGE LLC
1075,3.0,f968744,FIRST CHOICE BANK
1076,1.0,f97,CAP
1077,1.0,f98,CAPITAL CITY


In [5]:
credit_union_df

Unnamed: 0,source,lender_TopRSSDID,lender_TopName
0,1.0,1003790,USE CREDIT UNION
1,1.0,100393,FRANKENMUTH CREDIT UNION
75,3.0,113777,ADVANCIAL FEDERAL CREDIT UNION
106,1.0,129787,FORT LEE FEDERAL CREDIT UNION
107,1.0,131780,HOPEWELL FEDERAL CU
...,...,...,...
1059,1.0,f899987,WSSC FEDERAL CREDIT UNION
1065,1.0,f924375,INTEGRITY FEDERAL CREDIT UNION
1067,1.0,f932594,FIRST AREA CREDIT UNION
1070,1.0,f947392,PAINESVILLE CREDIT UNION


there are 131 credit unions

# Part2: Name Matching Between FR Report(BHC) and bank_df

## Exact Matching

convert the names of the banks to all lower case in both bank_df and fr_report, then perform an exact match to see how many names match

In [6]:
# Load the call report and FR report
call_df = pd.read_csv('/Users/twylazhang/Desktop/Econ_banks_Research/background/call.csv')
fr_report = pd.read_csv('/Users/twylazhang/Desktop/Econ_banks_Research/background/FR_Report.csv')

# Normalize ID columns in all datasets for consistent integer comparison
#fr_report['id'] = pd.to_numeric(fr_report['id'], errors='coerce').fillna(0).astype(int)
#call_df['id'] = pd.to_numeric(call_df['id'], errors='coerce').fillna(0).astype(int)
#call_df['parent_id'] = pd.to_numeric(call_df['parent_id'], errors='coerce').fillna(0).astype(int)

In [7]:
# Make a copy of the bank_df to avoid SettingWithCopyWarning
bank_df_copy = bank_df.copy()

# Standardize names to all lower case
bank_df_copy.loc[:, 'lender_TopName_lower'] = bank_df_copy['lender_TopName'].str.lower()
fr_report.loc[:, 'name_legal_lower'] = fr_report['name_legal'].str.lower()

# Perform exact matching
matched_names = pd.merge(bank_df_copy, fr_report, left_on='lender_TopName_lower', right_on='name_legal_lower', how='inner')

# Count the number of matches
num_matches = matched_names.shape[0]

# Print the number of matches
print(f"Number of exact matches: {num_matches}")

matched_names[['lender_TopName', 'name_legal']]


Number of exact matches: 75


Unnamed: 0,lender_TopName,name_legal
0,FREMONT BANCORPORATION,FREMONT BANCORPORATION
1,TRICO BANCSHARES,TRICO BANCSHARES
2,SVB FINANCIAL GROUP,SVB FINANCIAL GROUP
3,M&T BANK CORPORATION,M&T BANK CORPORATION
4,TRUSTCO BANK CORP NY,TRUSTCO BANK CORP NY
...,...,...
70,RBB BANCORP,RBB BANCORP
71,VIRGINIA NATIONAL BANKSHARES CORPORATION,VIRGINIA NATIONAL BANKSHARES CORPORATION
72,OP BANCORP,OP BANCORP
73,AMERICAN EXPRESS COMPANY,AMERICAN EXPRESS COMPANY


only 75 are matched out of 948 lenders in bank_df

In [8]:
fr_report

Unnamed: 0,id,date,fed_district_code,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state,name_legal_lower
0,1020180,20171231,9,500,SAINT PAUL,UNITED STATES,0,549300TXP74T8NJZJW60,BREMER FINANCIAL CORPORATION,BREMER FNCL CORP,1,FRS,MN,bremer financial corporation
1,1020201,20171231,7,500,NEW YORK,UNITED STATES,3232316,549300LBOHZ4QSIWU288,HSBC USA INC.,HSBC USA,1,FRS,NY,hsbc usa inc.
2,1020395,20171231,6,500,ANDALUSIA,UNITED STATES,0,254900YBC9ZTMHLVQ556,SOUTHERN NATIONAL CORPORATION,SOUTHERN NAT CORP,1,FRS,AL,southern national corporation
3,1020582,20171231,7,500,WISCONSIN RAPIDS,UNITED STATES,0,0,WOODTRUST FINANCIAL CORP,WOODTRUST FC,1,FRS,WI,woodtrust financial corp
4,1020591,20171231,7,500,SIOUX CITY,UNITED STATES,0,0,"PINNACLE BANCORP, INC.",PINNACLE BC,1,FRS,IA,"pinnacle bancorp, inc."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4592,5147115,20171231,10,500,DURANGO,UNITED STATES,0,0,"TIG BANCORP, INC.",TIG BC,1,FRS,CO,"tig bancorp, inc."
4593,5158269,20171231,5,500,SPARTANBURG,UNITED STATES,0,0,CAB FINANCIAL CORPORATION,CAB FC,1,FRS,SC,cab financial corporation
4594,5163003,20171231,12,500,CERRITOS,UNITED STATES,0,54930094NOEN2QWZWW84,FIRST CHOICE BANCORP,FIRST CHOICE BC,1,FRS,CA,first choice bancorp
4595,5163898,20171231,12,500,BURLINGTON,UNITED STATES,0,0,"SAVI FINANCIAL CORPORATION, INC.",SAVI FC,1,FRS,WA,"savi financial corporation, inc."


## Other ways to perform matching since exact matching does not work as well. 

#### Standardize the names before matching

In [9]:
# Function to remove punctuation and extra spaces
def remove_punctuation_and_spaces(name):
    name = re.sub(r'[^\w\s]', '', name)  # Remove punctuation
    name = re.sub(r'\s+', ' ', name).strip()  # Remove extra spaces
    return name

# Function to expand common abbreviations
def expand_abbreviations(name):
    abbreviations = {
        "assoc": "association",
        "co": "company",
        "corp": "corporation",
        "inc": "incorporated",
        "ltd": "limited",
        "&": "and"
    }
    for abbr, full in abbreviations.items():
        name = re.sub(r'\b' + abbr + r'\b', full, name)
    return name

# Apply normalization functions
bank_df_copy.loc[:, 'normalized_name'] = bank_df_copy['lender_TopName'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))
fr_report.loc[:, 'normalized_name'] = fr_report['name_legal'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))


In [10]:
bank_df_copy

Unnamed: 0,source,lender_TopRSSDID,lender_TopName,lender_TopName_lower,normalized_name
2,3.0,100571,DEPARTMENT OF LABOR FCU,department of labor fcu,department of labor fcu
3,1.0,1025608,FIRST HAWAIIAN INC,first hawaiian inc,first hawaiian incorporated
4,3.0,1026801,FREMONT BANCORPORATION,fremont bancorporation,fremont bancorporation
5,3.0,1027004,ZIONS BC,zions bc,zions bc
6,1.0,1030040,BANK OF CMRC HOLD,bank of cmrc hold,bank of cmrc hold
...,...,...,...,...,...
1073,1.0,f96,CANYON EDGE LLC,canyon edge llc,canyon edge llc
1075,3.0,f968744,FIRST CHOICE BANK,first choice bank,first choice bank
1076,1.0,f97,CAP,cap,cap
1077,1.0,f98,CAPITAL CITY,capital city,capital city


In [11]:
fr_report

Unnamed: 0,id,date,fed_district_code,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state,name_legal_lower,normalized_name
0,1020180,20171231,9,500,SAINT PAUL,UNITED STATES,0,549300TXP74T8NJZJW60,BREMER FINANCIAL CORPORATION,BREMER FNCL CORP,1,FRS,MN,bremer financial corporation,bremer financial corporation
1,1020201,20171231,7,500,NEW YORK,UNITED STATES,3232316,549300LBOHZ4QSIWU288,HSBC USA INC.,HSBC USA,1,FRS,NY,hsbc usa inc.,hsbc usa incorporated
2,1020395,20171231,6,500,ANDALUSIA,UNITED STATES,0,254900YBC9ZTMHLVQ556,SOUTHERN NATIONAL CORPORATION,SOUTHERN NAT CORP,1,FRS,AL,southern national corporation,southern national corporation
3,1020582,20171231,7,500,WISCONSIN RAPIDS,UNITED STATES,0,0,WOODTRUST FINANCIAL CORP,WOODTRUST FC,1,FRS,WI,woodtrust financial corp,woodtrust financial corporation
4,1020591,20171231,7,500,SIOUX CITY,UNITED STATES,0,0,"PINNACLE BANCORP, INC.",PINNACLE BC,1,FRS,IA,"pinnacle bancorp, inc.",pinnacle bancorp incorporated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4592,5147115,20171231,10,500,DURANGO,UNITED STATES,0,0,"TIG BANCORP, INC.",TIG BC,1,FRS,CO,"tig bancorp, inc.",tig bancorp incorporated
4593,5158269,20171231,5,500,SPARTANBURG,UNITED STATES,0,0,CAB FINANCIAL CORPORATION,CAB FC,1,FRS,SC,cab financial corporation,cab financial corporation
4594,5163003,20171231,12,500,CERRITOS,UNITED STATES,0,54930094NOEN2QWZWW84,FIRST CHOICE BANCORP,FIRST CHOICE BC,1,FRS,CA,first choice bancorp,first choice bancorp
4595,5163898,20171231,12,500,BURLINGTON,UNITED STATES,0,0,"SAVI FINANCIAL CORPORATION, INC.",SAVI FC,1,FRS,WA,"savi financial corporation, inc.",savi financial corporation incorporated


## Fuzzy Matching:

### Explanation:

1. **Fuzzy Matching Function**:
   - `fuzzy_match(row, choices, scorer=fuzz.token_sort_ratio)`: Uses the `fuzzywuzzy` library to perform fuzzy matching on the normalized names with a similarity score threshold of 80.
   - **Fuzzy Matching**: This technique compares strings and evaluates their similarity, even when the strings are not exactly the same. It is useful for matching names that may have slight variations, such as typos, abbreviations, or different spellings. The `fuzz.token_sort_ratio` scorer from the `fuzzywuzzy` library calculates a similarity score between 0 and 100, where a higher score indicates greater similarity.

2. **Function Definition**:
   - The `fuzzy_match` function uses the `fuzzywuzzy` library to find the best match for a given name from a list of choices (`fr_report_names`).
   - It compares each normalized name in `bank_df_copy` to the normalized names in `fr_report` using the `fuzz.token_sort_ratio` scorer, which calculates a similarity score between 0 and 100.

3. **Applying the Function**:
   - The `fuzzy_match` function is applied to each row in `bank_df_copy` using the `.apply()` method.
   - For each row, it returns the best match from `fr_report_names` if the similarity score is above 80. If no match is found with a score above 80, it returns `None`.

4. **Merge Based on Fuzzy Matched Names**:
   - Perform an inner join based on the fuzzy matched names and count the number of matches.
   - Print the number of matches and the matched names for verification.

5. **Find Unmatched Names**:
   - Identify and list names in `bank_df_copy` that do not have matches in `fr_report`.

In [12]:
# Fuzzy Matching Function
def fuzzy_match(row, choices, scorer=fuzz.token_sort_ratio):
    match, score = process.extractOne(row['normalized_name'], choices, scorer=scorer)
    return match if score > 80 else None

# Get unique normalized names from fr_report
fr_report_names = fr_report['normalized_name'].unique()

# Apply fuzzy matching
bank_df_copy['fuzzy_matched_name'] = bank_df_copy.apply(fuzzy_match, axis=1, choices=fr_report_names)

bank_df_copy['fuzzy_matched_name']

2                              None
3       first hawaiian incorporated
4            fremont bancorporation
5                              None
6         bank of commerce holdings
                   ...             
1073                           None
1075           first choice bancorp
1076                           None
1077                           None
1078                           None
Name: fuzzy_matched_name, Length: 948, dtype: object

In [13]:
# Merge based on fuzzy matched names
fuzzy_matched_df = pd.merge(bank_df_copy, fr_report, left_on='fuzzy_matched_name', right_on='normalized_name', how='inner')

# Drop duplicates
fuzzy_matched_df = fuzzy_matched_df.drop_duplicates(subset=['lender_TopName', 'name_legal'])

# Count the number of matches after dropping duplicates
num_fuzzy_matches = fuzzy_matched_df.shape[0]

print(f"Number of fuzzy matches after dropping duplicates: {num_fuzzy_matches}")

# Print the matched names for verification
print(fuzzy_matched_df[['lender_TopName', 'name_legal']])


Number of fuzzy matches after dropping duplicates: 283
                   lender_TopName                          name_legal
0              FIRST HAWAIIAN INC                FIRST HAWAIIAN, INC.
1          FREMONT BANCORPORATION              FREMONT BANCORPORATION
2               BANK OF CMRC HOLD           BANK OF COMMERCE HOLDINGS
3                TRICO BANCSHARES                    TRICO BANCSHARES
4             SVB FINANCIAL GROUP                 SVB FINANCIAL GROUP
..                            ...                                 ...
309        WESTSTAR MORTGAGE CORP  FIRST WESTERN MORTGAGE CORPORATION
310      BANKTRUST FINANCIAL CORP        ALERUS FINANCIAL CORPORATION
311                 BRIGHTON BANK                    BRIGHTON BANCORP
312  CALLIDUS CAPITAL CORPORATION       BANKFIRST CAPITAL CORPORATION
313             FIRST CHOICE BANK                FIRST CHOICE BANCORP

[283 rows x 2 columns]


In [14]:
# Find the unmatched names
unmatched_names_loc = bank_df_copy[~bank_df_copy['normalized_name'].isin(fuzzy_matched_df['normalized_name_x'])]
unmatched_names_list = unmatched_names_loc['lender_TopName'].tolist()

print(f"Number of not matched banks in LOC: {len(unmatched_names_list)} out of 948 banks in LOC")

# Print the unmatched names
print("Unmatched names from LOC:")
print(unmatched_names_list)

Number of not matched banks in LOC: 673 out of 948 banks in LOC
Unmatched names from LOC:
['DEPARTMENT OF LABOR FCU', 'ZIONS BC', 'LEARNER FC', 'COMMUNITY BK OF S FL', 'BANK OF MONTREAL', 'BNP PARIBAS', 'ROYAL BANK OF CANADA', 'TORONTODOMINION BANK THE', 'DEUTSCHE BANK AKTIENGESELLSCHAFT', 'TAMPA BAY BKG CO', 'BANCO BILBAO VIZCAYA ARGENTARIA SA', 'GREAT LAKES FNCL RESRCS ESOP', 'GATEWAY BANK FSB', 'UNITY CATHOLIC FCU', 'MUTUAL OF OMAHA INSURANCE COMPANY', 'CREDIT SUISSE GROUP AG', 'FRONT ROYAL FCU', 'CASCADE BC', 'COMMUNITY BANCSHARES EMPLOYEE STOCK OWNERSHIP PLAN', 'H BC LLC', 'SB ONE BANCORP', 'TAYLOR CAP GRP', 'IRONSTONE BANK', 'ALIKAT INV', 'SHORE COMMUNITY BANK', 'FIRST NIAGARA FNCL GROUP', 'TOWER FC', 'SOMERSET HILLS BANK', 'PACIFIC CONTINENTAL CORP', 'METLIFE BANK NA', 'CENTRIX B&TC', 'EMPRESAS JUAN YARUR SPA', 'FNB BC', 'BARCLAYS BANK DELAWARE', 'NORTHRIM BC', '1ST MARINER BANK', 'PREMIER CMRL BC', 'US CENTURY BANK', 'BILTMORE BK OF ARIZONA', 'CLAYTON HC', 'CALWEST BC', 'DCB FC

In [15]:
#fuzzy_matched_df.to_csv('LOC_BHC_Name_Matching_Fuzzy.csv', index=False)

# Part 3: Name Matching Between Call Report and bank_df

### Exact Matching

In [16]:
# Make a copy of the bank_df to avoid SettingWithCopyWarning
bank_df_copy_call = bank_df.copy()

# Standardize names to all lower case
bank_df_copy_call.loc[:, 'lender_TopName_lower'] = bank_df_copy['lender_TopName'].str.lower()
call_df.loc[:, 'name_legal_lower'] = call_df['name_legal'].str.lower()

# Perform exact matching
matched_names_call = pd.merge(bank_df_copy_call, call_df, left_on='lender_TopName_lower', right_on='name_legal_lower', how='inner')

# Count the number of matches
num_matches_call = matched_names_call.shape[0]

# Print the number of matches
print(f"Number of exact matches with call report: {num_matches_call}")

matched_names_call[['lender_TopName', 'name_legal']]

Number of exact matches with call report: 376


Unnamed: 0,lender_TopName,name_legal
0,SHORE COMMUNITY BANK,SHORE COMMUNITY BANK
1,BARCLAYS BANK DELAWARE,BARCLAYS BANK DELAWARE
2,PACIFIC WEST BANK,PACIFIC WEST BANK
3,ANCHOR COMMERCIAL BANK,ANCHOR COMMERCIAL BANK
4,BANESCO USA,BANESCO USA
...,...,...
371,BRIGHTON BANK,BRIGHTON BANK
372,BRIGHTON BANK,BRIGHTON BANK
373,BUSINESS FIRST BANK,BUSINESS FIRST BANK
374,FIRST CHOICE BANK,FIRST CHOICE BANK


For exact matching, 376 are matched out of 948 lenders in bank_df

### Fuzzy Matching

In [17]:
# Apply normalization functions
bank_df_copy_call.loc[:, 'normalized_name'] = bank_df_copy_call['lender_TopName'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))
call_df.loc[:, 'normalized_name'] = call_df['name_legal'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))

# Get unique normalized names from fr_report
call_df_names = call_df['normalized_name'].unique()

# Apply fuzzy matching
bank_df_copy_call['fuzzy_matched_name'] = bank_df_copy.apply(fuzzy_match, axis=1, choices=call_df_names)

bank_df_copy_call['fuzzy_matched_name']

2                    None
3                    None
4                    None
5                    None
6                    None
              ...        
1073                 None
1075    first choice bank
1076                 None
1077    capital city bank
1078                 None
Name: fuzzy_matched_name, Length: 948, dtype: object

In [18]:
# Merge based on fuzzy matched names
fuzzy_matched_df_call = pd.merge(bank_df_copy_call, call_df, left_on='fuzzy_matched_name', right_on='normalized_name', how='inner')

# Drop duplicates
fuzzy_matched_df_call = fuzzy_matched_df_call.drop_duplicates(subset=['lender_TopName', 'name_legal'])

# Count the number of matches after dropping duplicates
num_fuzzy_matches_call = fuzzy_matched_df_call.shape[0]

print(f"Number of fuzzy matches with call report after dropping duplicates: {num_fuzzy_matches_call}")

# Print the matched names for verification
print(fuzzy_matched_df_call[['lender_TopName', 'name_legal']])


Number of fuzzy matches with call report after dropping duplicates: 216
                  lender_TopName                  name_legal
0           M&T BANK CORPORATION        LCA BANK CORPORATION
1              OCULINA BANC CORP        LCA BANK CORPORATION
2        VALLEY NATIONAL BANCORP        VALLEY NATIONAL BANK
4      COMMUNITY BANK SYSTEM INC  COMMUNITY TRUST BANK, INC.
5    COMMUNITY TRUST BANCORP INC  COMMUNITY TRUST BANK, INC.
..                           ...                         ...
604          BUSINESS FIRST BANK         FIRST BUSINESS BANK
605                 CADENCE BANK          CADENCE BANK, N.A.
606    FIDELITY BANK (MINNESOTA)        FIRST MINNESOTA BANK
607            FIRST CHOICE BANK           FIRST CHOICE BANK
609                 CAPITAL CITY           CAPITAL CITY BANK

[216 rows x 2 columns]


In [19]:
# Find the unmatched names
unmatched_names_loc_call = bank_df_copy_call[~bank_df_copy_call['normalized_name'].isin(fuzzy_matched_df_call['normalized_name_x'])]
unmatched_names_list_call = unmatched_names_loc['lender_TopName'].tolist()

print(f"Number of not matched banks in LOC with call report: {len(unmatched_names_list_call)} out of 948 banks in LOC")

# Print the unmatched names
print("Unmatched names from LOC with call report:")
print(unmatched_names_list_call)

Number of not matched banks in LOC with call report: 673 out of 948 banks in LOC
Unmatched names from LOC with call report:
['DEPARTMENT OF LABOR FCU', 'ZIONS BC', 'LEARNER FC', 'COMMUNITY BK OF S FL', 'BANK OF MONTREAL', 'BNP PARIBAS', 'ROYAL BANK OF CANADA', 'TORONTODOMINION BANK THE', 'DEUTSCHE BANK AKTIENGESELLSCHAFT', 'TAMPA BAY BKG CO', 'BANCO BILBAO VIZCAYA ARGENTARIA SA', 'GREAT LAKES FNCL RESRCS ESOP', 'GATEWAY BANK FSB', 'UNITY CATHOLIC FCU', 'MUTUAL OF OMAHA INSURANCE COMPANY', 'CREDIT SUISSE GROUP AG', 'FRONT ROYAL FCU', 'CASCADE BC', 'COMMUNITY BANCSHARES EMPLOYEE STOCK OWNERSHIP PLAN', 'H BC LLC', 'SB ONE BANCORP', 'TAYLOR CAP GRP', 'IRONSTONE BANK', 'ALIKAT INV', 'SHORE COMMUNITY BANK', 'FIRST NIAGARA FNCL GROUP', 'TOWER FC', 'SOMERSET HILLS BANK', 'PACIFIC CONTINENTAL CORP', 'METLIFE BANK NA', 'CENTRIX B&TC', 'EMPRESAS JUAN YARUR SPA', 'FNB BC', 'BARCLAYS BANK DELAWARE', 'NORTHRIM BC', '1ST MARINER BANK', 'PREMIER CMRL BC', 'US CENTURY BANK', 'BILTMORE BK OF ARIZONA', '

# Part 4: Name Matching Between Credit Union and bank_df

In [20]:
# Load your data here
cu_df = pd.read_excel('/Users/twylazhang/Desktop/Econ_banks_Research/LOC Credit Union Data/THIS_FOICU_Names_ID_excel.xlsx')

# Drop rows with non-finite values in RSSD column
cu_df = cu_df[pd.notnull(cu_df['RSSD'])]

# Ensure RSSD column is integer
cu_df['RSSD'] = cu_df['RSSD'].astype(int)

# Keep only the necessary columns from the credit unions file
cu_df = cu_df[['RSSD', 'CU_NAME']]

In [21]:
# Normalize names
credit_union_df_copy = credit_union_df.copy()
credit_union_df_copy.loc[:, 'normalized_name'] = credit_union_df_copy['lender_TopName'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))
cu_df.loc[:, 'normalized_name'] = cu_df['CU_NAME'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))

# Exact Matching
exact_matched_df_cu = pd.merge(credit_union_df_copy, cu_df, left_on='normalized_name', right_on='normalized_name', how='inner')

num_exact_matches_cu = exact_matched_df_cu.shape[0]
print(f"Number of exact matches: {num_exact_matches_cu}")

exact_matched_df_cu

Number of exact matches: 8


Unnamed: 0,source,lender_TopRSSDID,lender_TopName,normalized_name,RSSD,CU_NAME
0,3.0,368492,WRIGHTPATT CREDIT UNION INC,wrightpatt credit union incorporated,368492,"WRIGHT-PATT CREDIT UNION, INC."
1,1.0,373795,USF FEDERAL CREDIT UNION,usf federal credit union,373795,USF FEDERAL CREDIT UNION
2,3.0,443193,CREDIT UNION ONE,credit union one,443193,CREDIT UNION ONE
3,1.0,574275,ANDREWS FEDERAL CREDIT UNION,andrews federal credit union,574275,ANDREWS FEDERAL CREDIT UNION
4,1.0,662293,PSE CREDIT UNION INC,pse credit union incorporated,662293,"PSE CREDIT UNION, INC."
5,1.0,f651998,CREDIT UNION ADVANTAGE,credit union advantage,651998,CREDIT UNION ADVANTAGE
6,3.0,f733090,WHITEFISH CREDIT UNION ASSOCIATION,whitefish credit union association,733090,WHITEFISH CREDIT UNION ASSOCIATION
7,1.0,f947392,PAINESVILLE CREDIT UNION,painesville credit union,947392,PAINESVILLE CREDIT UNION


In [22]:
# Normalize names
credit_union_df_copy = credit_union_df.copy()
credit_union_df_copy.loc[:, 'normalized_name'] = credit_union_df_copy['lender_TopName'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))
cu_df.loc[:, 'normalized_name'] = cu_df['CU_NAME'].apply(lambda x: expand_abbreviations(remove_punctuation_and_spaces(str(x).lower())))

# Exact Matching
exact_matched_df_cu = pd.merge(credit_union_df_copy, cu_df, left_on='normalized_name', right_on='normalized_name', how='inner')

# Fuzzy Matching Function
def fuzzy_match(row, choices, scorer=fuzz.token_sort_ratio):
    match, score = process.extractOne(row['normalized_name'], choices, scorer=scorer)
    return match if score > 80 else None

# Get unique normalized names from cu_df
cu_names = cu_df['normalized_name'].unique()

# Apply fuzzy matching
credit_union_df_copy['fuzzy_matched_name'] = credit_union_df_copy.apply(fuzzy_match, axis=1, choices=cu_names)

# Merge based on fuzzy matched names
fuzzy_matched_df_cu = pd.merge(credit_union_df_copy, cu_df, left_on='fuzzy_matched_name', right_on='normalized_name', how='inner')
fuzzy_matched_df_cu = fuzzy_matched_df_cu.drop_duplicates(subset=['lender_TopName', 'CU_NAME'])

# Combine exact and fuzzy matched DataFrames
combined_matched_df_cu = pd.concat([exact_matched_df_cu, fuzzy_matched_df_cu]).drop_duplicates(subset=['lender_TopName', 'CU_NAME'])

# Count the number of matches
num_fuzzy_matches_cu = fuzzy_matched_df_cu.shape[0]
num_combined_matches_cu = combined_matched_df_cu.shape[0]

print(f"Total number of combined matches: {num_combined_matches_cu}")

# Print the matched names for verification
print(combined_matched_df_cu[['lender_TopName', 'CU_NAME']])

# Find the unmatched names
unmatched_names_credit_union = credit_union_df_copy[~credit_union_df_copy['normalized_name'].isin(combined_matched_df_cu['normalized_name_x'])]
unmatched_names_list_cu = unmatched_names_credit_union['lender_TopName'].tolist()

# Print the unmatched names
print("Unmatched names from LOC with Credit Unions:")
print(num_combined_matches_cu)

Number of exact matches: 8
Total number of combined matches: 62
                       lender_TopName                            CU_NAME
0         WRIGHTPATT CREDIT UNION INC     WRIGHT-PATT CREDIT UNION, INC.
1            USF FEDERAL CREDIT UNION           USF FEDERAL CREDIT UNION
2                    CREDIT UNION ONE                   CREDIT UNION ONE
3        ANDREWS FEDERAL CREDIT UNION       ANDREWS FEDERAL CREDIT UNION
4                PSE CREDIT UNION INC             PSE CREDIT UNION, INC.
..                                ...                                ...
54                 ADVIA CREDIT UNION             CREDIT UNION ADVANTAGE
56                       CREDIT UNION                     CREDIT UNION 1
58  EASTPOINTE COMMUNITY CREDIT UNION      COMMUNITY SPIRIT CREDIT UNION
60      SCHOOL EMPLOYEES CREDIT UNION  MACON-BIBB EMPLOYEES CREDIT UNION
61           ONE DETROIT CREDIT UNION          DESTINATIONS CREDIT UNION

[62 rows x 2 columns]
Unmatched names from LOC with Credit 

62 out of 131 are matched 