# First, load the LOC data and identify non-numeric IDs, storing them separately:

In [1]:
import pandas as pd
 
# Location of the Stata file that holds the LOC lender list
file_path = '/Users/twylazhang/Desktop/Econ_banks_Research/LOC.dta'

# pandas parses the .dta file directly into a DataFrame
loc_df = pd.read_stata(filepath_or_buffer=file_path)

# Bare expression on the last line so the notebook renders the frame
loc_df

Unnamed: 0,source,lender_TopRSSDID,lender_TopName
0,1.0,1003790,USE CREDIT UNION
1,1.0,100393,FRANKENMUTH CREDIT UNION
2,3.0,100571,DEPARTMENT OF LABOR FCU
3,1.0,1025608,FIRST HAWAIIAN INC
4,3.0,1026801,FREMONT BANCORPORATION
...,...,...,...
1074,1.0,f96580,EXCEL FEDERAL CREDIT UNION
1075,3.0,f968744,FIRST CHOICE BANK
1076,1.0,f97,CAP
1077,1.0,f98,CAPITAL CITY


In total, there are 1079 rows of data

In [2]:
# Rename 'lender_TopRSSDID' to 'id' for uniformity with other datasets.
# rename() returns a new frame, so re-running this cell is harmless.
loc_df = loc_df.rename(columns={'lender_TopRSSDID': 'id'})

# True where the id consists only of ASCII digits, computed once and reused.
# na=False routes missing ids to the non-numeric side instead of leaving NaN
# in the mask (a NaN-containing mask cannot be negated or used for boolean
# indexing). [0-9]+ also rejects digit-like characters such as superscripts,
# which str.isdigit() accepts but int() cannot parse.
is_numeric_id = loc_df['id'].str.fullmatch(r'[0-9]+', na=False)

# Keep non-numeric IDs in a separate DataFrame called non_numeric_ids_loc_df
non_numeric_ids_loc_df = loc_df[~is_numeric_id].copy()
numeric_ids_loc_df = loc_df[is_numeric_id].copy()

# Convert 'id' to integer now that non-numeric values are removed from numeric_ids_loc_df
numeric_ids_loc_df['id'] = numeric_ids_loc_df['id'].astype(int)

# Sanity check: every row lands in exactly one of the two frames
assert len(numeric_ids_loc_df) + len(non_numeric_ids_loc_df) == len(loc_df)

# Now numeric_ids_loc_df contains only numeric IDs, and non_numeric_ids_loc_df contains the non-numeric IDs


After separating numeric and non-numeric IDs, there are 666 rows in non_numeric_ids_loc_df and 413 rows in numeric_ids_loc_df.

In [3]:
# Show the rows whose id is not purely digits (split off in the cell above)
non_numeric_ids_loc_df

Unnamed: 0,source,id,lender_TopName
413,2.0,f1,1/0 HOLDCO LLC
414,1.0,f10,ACRE HOLDINGS LLC
415,1.0,f100,CB LENDING
416,3.0,f101,CB&T SYNOVUS
417,1.0,f102,CBBC
...,...,...,...
1074,1.0,f96580,EXCEL FEDERAL CREDIT UNION
1075,3.0,f968744,FIRST CHOICE BANK
1076,1.0,f97,CAP
1077,1.0,f98,CAPITAL CITY


In [4]:
# Show the rows whose id is all digits; their id column was cast to int in the split cell
numeric_ids_loc_df

Unnamed: 0,source,id,lender_TopName
0,1.0,1003790,USE CREDIT UNION
1,1.0,100393,FRANKENMUTH CREDIT UNION
2,3.0,100571,DEPARTMENT OF LABOR FCU
3,1.0,1025608,FIRST HAWAIIAN INC
4,3.0,1026801,FREMONT BANCORPORATION
...,...,...,...
408,3.0,967699,MIDUSA CREDIT UNION
409,3.0,971986,REACH FEDERAL CREDIT UNION
410,3.0,978471,UNITED STATES SENATE FEDERAL CREDIT UNION
411,1.0,986177,ADVIA CREDIT UNION


# Second

1. Initial Matching with BHC Data: Match LOC IDs to BHC data to see if the LOC entries correspond to BHCs.

2. Secondary Matching with Call Reports: For IDs not matched to BHCs, check against parent and subsidiary IDs in the call reports.

3. Tertiary Matching and Parent Assignment: For IDs identified as subsidiaries, assign the corresponding parent ID from the call reports. (commented out)

In [5]:
# Read the call report and the FR report from the project's background folder
call_df = pd.read_csv('/Users/twylazhang/Desktop/Econ_banks_Research/background/call.csv')
fr_report = pd.read_csv('/Users/twylazhang/Desktop/Econ_banks_Research/background/FR_Report.csv')

# Coerce every ID column to int so IDs compare consistently across datasets.
# Values that cannot be parsed as numbers become NaN and are then stored as 0.
id_columns_by_frame = (
    (fr_report, ('id',)),
    (call_df, ('id', 'parent_id')),
)
for frame, id_columns in id_columns_by_frame:
    for column in id_columns:
        numeric_ids = pd.to_numeric(frame[column], errors='coerce')
        frame[column] = numeric_ids.fillna(0).astype(int)

In [6]:
# Show the FR report table; its id column was normalized to int in the cell above
fr_report

Unnamed: 0,id,date,fed_district_code,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state
0,1020180,20171231,9,500,SAINT PAUL,UNITED STATES,0,549300TXP74T8NJZJW60,BREMER FINANCIAL CORPORATION,BREMER FNCL CORP,1,FRS,MN
1,1020201,20171231,7,500,NEW YORK,UNITED STATES,3232316,549300LBOHZ4QSIWU288,HSBC USA INC.,HSBC USA,1,FRS,NY
2,1020395,20171231,6,500,ANDALUSIA,UNITED STATES,0,254900YBC9ZTMHLVQ556,SOUTHERN NATIONAL CORPORATION,SOUTHERN NAT CORP,1,FRS,AL
3,1020582,20171231,7,500,WISCONSIN RAPIDS,UNITED STATES,0,0,WOODTRUST FINANCIAL CORP,WOODTRUST FC,1,FRS,WI
4,1020591,20171231,7,500,SIOUX CITY,UNITED STATES,0,0,"PINNACLE BANCORP, INC.",PINNACLE BC,1,FRS,IA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4592,5147115,20171231,10,500,DURANGO,UNITED STATES,0,0,"TIG BANCORP, INC.",TIG BC,1,FRS,CO
4593,5158269,20171231,5,500,SPARTANBURG,UNITED STATES,0,0,CAB FINANCIAL CORPORATION,CAB FC,1,FRS,SC
4594,5163003,20171231,12,500,CERRITOS,UNITED STATES,0,54930094NOEN2QWZWW84,FIRST CHOICE BANCORP,FIRST CHOICE BC,1,FRS,CA
4595,5163898,20171231,12,500,BURLINGTON,UNITED STATES,0,0,"SAVI FINANCIAL CORPORATION, INC.",SAVI FC,1,FRS,WA


In [7]:
# Show the call report table; id and parent_id were normalized to int when it was loaded
call_df

Unnamed: 0,date,id,charter_type,name_legal,name,org_type,fed_district_code,city,country,state,fed_regulator2,lei,parent_id
0,20171231.0,37,200.0,BANK OF HANCOCK COUNTY,BANK OF HANCOCK CTY,1.0,6.0,SPARTA,UNITED STATES,GA,FDIC,0,37
1,20171231.0,242,200.0,FIRST COMMUNITY BANK XENIA-FLORA,FIRST CMNTY BK XENIA FLORA,1.0,8.0,XENIA,UNITED STATES,IL,FRS,0,3088643
2,20171231.0,279,300.0,"MINEOLA COMMUNITY BANK, SSB",MINEOLA CMNTY BK SSB,6.0,11.0,MINEOLA,UNITED STATES,TX,FDIC,0,3619720
3,20171231.0,354,200.0,BISON STATE BANK,BISON ST BK,1.0,10.0,BISON,UNITED STATES,KS,FDIC,0,354
4,20171231.0,457,200.0,LOWRY STATE BANK,LOWRY ST BK,1.0,9.0,LOWRY,UNITED STATES,MN,FDIC,0,1127016
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5957,20171231.0,5086072,200.0,BLUE GATE BANK,BLUE GATE BK,1.0,12.0,COSTA MESA,UNITED STATES,CA,FDIC,0,5086072
5958,20171231.0,5087752,0.0,HUDSON BRANCH,ROYAL BK OF CANADA HUDSON BR,0.0,0.0,JERSEY CITY,UNITED STATES,NJ,OCC,0,5087752
5959,20171231.0,5087949,400.0,"MB FINANCIAL INTERNATIONAL, INC.",MB FNCL INTL,1.0,4.0,ROSEMONT,UNITED STATES,IL,FRS,0,1090987
5960,20171231.0,5113866,0.0,NEW YORK BRANCH,TAIWAN BUS BK NY BR,0.0,0.0,NEW YORK,UNITED STATES,NY,FRS,0,5113866


BHC Matching: IDs from numeric_ids_loc_df are checked against BHC IDs from fr_report.

In [8]:
# Each membership test is evaluated once up front and reused by the steps below
loc_ids = numeric_ids_loc_df['id']
in_fr_report = loc_ids.isin(fr_report['id'])
in_call_parent_ids = loc_ids.isin(call_df['parent_id'])
in_call_ids = loc_ids.isin(call_df['id'])

# Step 2: Initial Matching with BHC Data
numeric_ids_loc_df['Match_Type'] = 'No Match'
numeric_ids_loc_df.loc[in_fr_report, 'Match_Type'] = 'BHC'

# Step 3: Secondary Matching with Call Reports
# Only rows that did not match a BHC are eligible for a call-report label.
condition = ~in_fr_report
# The labels are written in this order, so the later one wins: an id found in
# both call_df['parent_id'] and call_df['id'] ends up as 'Call_Subsidiary'.
# NOTE(review): call_df has rows whose parent_id equals their own id; such
# banks are therefore labelled 'Call_Subsidiary' - confirm this is intended.
call_labels = (
    ('Call_Parent', in_call_parent_ids),
    ('Call_Subsidiary', in_call_ids),
)
for label, in_call_column in call_labels:
    numeric_ids_loc_df.loc[condition & in_call_column, 'Match_Type'] = label


numeric_ids_loc_df

Unnamed: 0,source,id,lender_TopName,Match_Type
0,1.0,1003790,USE CREDIT UNION,No Match
1,1.0,100393,FRANKENMUTH CREDIT UNION,No Match
2,3.0,100571,DEPARTMENT OF LABOR FCU,No Match
3,1.0,1025608,FIRST HAWAIIAN INC,BHC
4,3.0,1026801,FREMONT BANCORPORATION,BHC
...,...,...,...,...
408,3.0,967699,MIDUSA CREDIT UNION,No Match
409,3.0,971986,REACH FEDERAL CREDIT UNION,No Match
410,3.0,978471,UNITED STATES SENATE FEDERAL CREDIT UNION,No Match
411,1.0,986177,ADVIA CREDIT UNION,No Match


In [9]:
# Slice the classified LOC rows once per Match_Type label
match_type = numeric_ids_loc_df['Match_Type']
bhc_rows = numeric_ids_loc_df[match_type == 'BHC']
parent_rows = numeric_ids_loc_df[match_type == 'Call_Parent']
subsidiary_rows = numeric_ids_loc_df[match_type == 'Call_Subsidiary']

# Each section prints a heading, the matching rows, then the row count
report_sections = (
    ("Rows classified as BHC:", "Number of BHC rows:", bhc_rows),
    ("Rows classified as Parent:", "Number of Parent rows:", parent_rows),
    ("Rows classified as Subsidiary:", "Number of Subsidiary rows:", subsidiary_rows),
)
for heading, count_label, rows in report_sections:
    print(heading)
    print(rows)
    print(count_label, rows.shape[0])


Rows classified as BHC:
     source       id                    lender_TopName Match_Type
3       1.0  1025608                FIRST HAWAIIAN INC        BHC
4       3.0  1026801            FREMONT BANCORPORATION        BHC
5       3.0  1027004                          ZIONS BC        BHC
6       1.0  1030040                 BANK OF CMRC HOLD        BHC
7       1.0  1030170                  TRICO BANCSHARES        BHC
..      ...      ...                               ...        ...
352     3.0  4876838                    PUGET SOUND BC        BHC
356     2.0  4973353             HARBORONE BANCORP INC        BHC
357     1.0  4980409                        OP BANCORP        BHC
358     1.0  4981648                        MARQUIS BC        BHC
359     1.0  4991076  SMITH & HOOD HOLDING COMPANY LLC        BHC

[242 rows x 4 columns]
Number of BHC rows: 242
Rows classified as Parent:
     source       id                                     lender_TopName  \
105     1.0  1250099              

# Out of 413 banks, 242 match a BHC in the FR report; 5 match a parent ID in the call report; and 18 match a subsidiary ID in the call report.

In [10]:
# # Step 4: Tertiary Matching and Parent Assignment
# # For subsidiaries, assign the parent ID from the call report
# merged_data = numeric_ids_loc_df.merge(call_df[['id', 'parent_id']], left_on='id', right_on='id', how='left')
# numeric_ids_loc_df.loc[numeric_ids_loc_df['Match_Type'] == 'Call_Subsidiary', 'id'] = merged_data['parent_id']

In [11]:
# Show numeric_ids_loc_df with the Match_Type column assigned in the matching cell
numeric_ids_loc_df

Unnamed: 0,source,id,lender_TopName,Match_Type
0,1.0,1003790,USE CREDIT UNION,No Match
1,1.0,100393,FRANKENMUTH CREDIT UNION,No Match
2,3.0,100571,DEPARTMENT OF LABOR FCU,No Match
3,1.0,1025608,FIRST HAWAIIAN INC,BHC
4,3.0,1026801,FREMONT BANCORPORATION,BHC
...,...,...,...,...
408,3.0,967699,MIDUSA CREDIT UNION,No Match
409,3.0,971986,REACH FEDERAL CREDIT UNION,No Match
410,3.0,978471,UNITED STATES SENATE FEDERAL CREDIT UNION,No Match
411,1.0,986177,ADVIA CREDIT UNION,No Match


# Check
1. Check in call_df: Filter the call_df DataFrame to find rows where the 'id' matches the current LOC ID. If matching rows are found, print the details using the display_details function.

2. Check in fr_report if BHC: If the Match Type is 'BHC', filter the fr_report DataFrame to find rows where the 'id' matches the current LOC ID. If matching rows are found, print the details using the display_details function.

In [12]:
# Draw 10 rows for a spot check; the fixed random_state keeps the draw repeatable
sampled_data = numeric_ids_loc_df.sample(n=10, random_state=1)

# Heading first, then the sampled frame as the cell's rendered output
print("Sampled LOC Data for Manual Verification:")
sampled_data

Sampled LOC Data for Manual Verification:


Unnamed: 0,source,id,lender_TopName,Match_Type
201,3.0,306681,MARKET USA FEDERAL CREDIT UNION,No Match
29,1.0,1073757,BANK OF AMERICA CORPORATION,BHC
102,1.0,1245705,WEST SUBURBAN BANCORP INC,BHC
407,1.0,95051,ONE AMERICAN BANK,Call_Subsidiary
186,1.0,2833891,EMPRESAS JUAN YARUR SPA,No Match
222,1.0,3253825,LAKEVIEW BANCORPORATION INC,BHC
242,1.0,3470154,US METRO BK,Call_Subsidiary
291,1.0,3846405,FIRST FEDERAL BANCORP MHC,BHC
171,1.0,2630746,UNITY BANCSHARES LLC,BHC
213,3.0,3188860,CALWEST BC,BHC


In [13]:
# Display the call report table again before the manual check below
call_df

Unnamed: 0,date,id,charter_type,name_legal,name,org_type,fed_district_code,city,country,state,fed_regulator2,lei,parent_id
0,20171231.0,37,200.0,BANK OF HANCOCK COUNTY,BANK OF HANCOCK CTY,1.0,6.0,SPARTA,UNITED STATES,GA,FDIC,0,37
1,20171231.0,242,200.0,FIRST COMMUNITY BANK XENIA-FLORA,FIRST CMNTY BK XENIA FLORA,1.0,8.0,XENIA,UNITED STATES,IL,FRS,0,3088643
2,20171231.0,279,300.0,"MINEOLA COMMUNITY BANK, SSB",MINEOLA CMNTY BK SSB,6.0,11.0,MINEOLA,UNITED STATES,TX,FDIC,0,3619720
3,20171231.0,354,200.0,BISON STATE BANK,BISON ST BK,1.0,10.0,BISON,UNITED STATES,KS,FDIC,0,354
4,20171231.0,457,200.0,LOWRY STATE BANK,LOWRY ST BK,1.0,9.0,LOWRY,UNITED STATES,MN,FDIC,0,1127016
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5957,20171231.0,5086072,200.0,BLUE GATE BANK,BLUE GATE BK,1.0,12.0,COSTA MESA,UNITED STATES,CA,FDIC,0,5086072
5958,20171231.0,5087752,0.0,HUDSON BRANCH,ROYAL BK OF CANADA HUDSON BR,0.0,0.0,JERSEY CITY,UNITED STATES,NJ,OCC,0,5087752
5959,20171231.0,5087949,400.0,"MB FINANCIAL INTERNATIONAL, INC.",MB FNCL INTL,1.0,4.0,ROSEMONT,UNITED STATES,IL,FRS,0,1090987
5960,20171231.0,5113866,0.0,NEW YORK BRANCH,TAIWAN BUS BK NY BR,0.0,0.0,NEW YORK,UNITED STATES,NY,FRS,0,5113866


In [14]:
# Columns shown for every matched record during manual verification
columns_to_display = ['id', 'name_legal', 'parent_id']


def display_details(df, columns):
    """Print the selected columns of ``df`` as plain text without the index.

    Parameters:
        df: DataFrame holding the matched rows.
        columns: column labels to include in the printout.

    Prints "No details found." instead when ``df`` has no rows.
    """
    if df.empty:
        print("No details found.")
        return
    selected = df[columns]
    print(selected.to_string(index=False))

# Manual verification against call_df and fr_report
for row in sampled_data.to_dict(orient='records'):
    print(f"\nReviewing LOC ID: {row['id']} - Match Type: {row['Match_Type']}")

    # Call report: show the entry whose own id equals this LOC id, if any
    id_in_call = call_df['id'] == row['id']
    call_details = call_df.loc[id_in_call, columns_to_display]
    if len(call_details) > 0:
        print("Details from call_df:")
        display_details(call_details, columns_to_display)

    # The FR report is only consulted for rows already labelled as BHC
    if row['Match_Type'] != 'BHC':
        continue
    id_in_fr = fr_report['id'] == row['id']
    bhc_details = fr_report.loc[id_in_fr, columns_to_display]
    if len(bhc_details) > 0:
        print("Details from fr_report (BHC):")
        display_details(bhc_details, columns_to_display)



Reviewing LOC ID: 306681 - Match Type: No Match

Reviewing LOC ID: 1073757 - Match Type: BHC
Details from fr_report (BHC):
     id                  name_legal  parent_id
1073757 BANK OF AMERICA CORPORATION          0

Reviewing LOC ID: 1245705 - Match Type: BHC
Details from fr_report (BHC):
     id                  name_legal  parent_id
1245705 WEST SUBURBAN BANCORP, INC.          0

Reviewing LOC ID: 95051 - Match Type: Call_Subsidiary
Details from call_df:
   id        name_legal  parent_id
95051 ONE AMERICAN BANK      95051

Reviewing LOC ID: 2833891 - Match Type: No Match

Reviewing LOC ID: 3253825 - Match Type: BHC
Details from fr_report (BHC):
     id                    name_legal  parent_id
3253825 LAKEVIEW BANCORPORATION, INC.          0

Reviewing LOC ID: 3470154 - Match Type: Call_Subsidiary
Details from call_df:
     id    name_legal  parent_id
3470154 US METRO BANK    3470154

Reviewing LOC ID: 3846405 - Match Type: BHC
Details from fr_report (BHC):
     id                

In [15]:
# Write the classified numeric-ID LOC rows (including Match_Type) to a CSV in
# the current working directory; index=False leaves the row index out of the file
numeric_ids_loc_df.to_csv('analyzed_LOC_data.csv', index=False)

# A Python snippet (based on ChatGPT) that shows how to keep non-merged rows when merging two DataFrames.

In [16]:
# import pandas as pd

# # Example data for df1
# data1 = {
#     'common_id': [1, 2, 3, 4],
#     'value_df1': ['A', 'B', 'C', 'D']
# }
# df1 = pd.DataFrame(data1)

# # Example data for df2
# data2 = {
#     'common_id': [3, 4, 5, 6],
#     'value_df2': ['X', 'Y', 'Z', 'W']
# }
# df2 = pd.DataFrame(data2)

# # Merge the DataFrames
# merged_df = pd.merge(df1, df2, on='common_id', how='outer')

# # Add a new column to indicate matching status
# merged_df['match_status'] = 'matched'
# merged_df.loc[merged_df['value_df1'].isna() | merged_df['value_df2'].isna(), 'match_status'] = 'notmatched'

# # Optionally, fill missing values
# merged_df.fillna('not available', inplace=True)

# # Display the resulting DataFrame
# print(merged_df)