In [None]:
import os
import zipfile
import pyreadstat
import pandas as pd

# Get a list of variable names (from https://rpubs.com/simaan84/bhc)

In [1]:
# I used the variable name list provided by Majeed Simaan, since I could not find the list.  
data = {
    'Description': ["id", "date", "parent_id", "country", "name", "name_legal", 
                    "fed_district_code", "org_type", "charter_type", "fed_regulator2", 
                    "fed_regulator", "lei", "lei", "city", "state"],
    'Code': ["RSSD9001", "RSSD9999", "RSSD9364", "RSSD9005", "RSSD9010", "RSSD9017", 
             "RSSD9032", "RSSD9047", "RSSD9048", "RSSD9217", "PRIM_FED_REG", "ID.LEI", 
             "ID_LEI", "RSSD9130", "RSSD9200"]
}

# Convert dictionary to DataFrame
var_df = pd.DataFrame(data)



In [2]:
var_df

Unnamed: 0,Description,Code
0,id,RSSD9001
1,date,RSSD9999
2,parent_id,RSSD9364
3,country,RSSD9005
4,name,RSSD9010
5,name_legal,RSSD9017
6,fed_district_code,RSSD9032
7,org_type,RSSD9047
8,charter_type,RSSD9048
9,fed_regulator2,RSSD9217


# import Call Reports

In [3]:


# Set the data directory path 
data_dir = '/Users/twylazhang/Desktop/Econ_DS_Research/background/COM'


# Get a list of names of zip files, since there are four zip files. 
zip_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.zip')]

# Unzip the files
for zip_file in zip_files:
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

# List of names of all .xpt files after unzipping
xpt_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.xpt')]


# Select the variables of interest
variables_of_interest = var_df['Code'].tolist()  

# Since each file have many other columns of data, 
# we only want variables of interest. So we read the .xpt files and filter by variables of interest. 
filtered_data_list = []
for file_path in xpt_files:
    df, meta = pyreadstat.read_xport(file_path, usecols=variables_of_interest)
    filtered_data_list.append(df)

# Combine data from all four files into a single data frame
call = pd.concat(filtered_data_list, ignore_index=True)

# Rename the columns using var_df by mapping 'Code' to 'Description'
var_name_to_label = pd.Series(var_df['Description'].values, index=var_df['Code']).to_dict()
call.rename(columns=var_name_to_label, inplace=True)

# Replace parent_id values that are zero with the corresponding id values
call.loc[call['parent_id'] == 0, 'parent_id'] = call.loc[call['parent_id'] == 0, 'id']

# Drop all rows with any missing values
call.dropna(inplace=True)

# Display the updated DataFrame
call



Unnamed: 0,date,id,charter_type,name_legal,name,org_type,fed_district_code,city,country,state,fed_regulator2,lei,parent_id
0,20170930.0,37.0,200.0,BANK OF HANCOCK COUNTY,BANK OF HANCOCK CTY,1.0,6.0,SPARTA,UNITED STATES,GA,FDIC,0,37.0
1,20170930.0,242.0,200.0,FIRST COMMUNITY BANK XENIA-FLORA,FIRST CMNTY BK XENIA FLORA,1.0,8.0,XENIA,UNITED STATES,IL,FRS,0,3088643.0
2,20170930.0,279.0,300.0,"MINEOLA COMMUNITY BANK, SSB",MINEOLA CMNTY BK SSB,6.0,11.0,MINEOLA,UNITED STATES,TX,FDIC,0,3619720.0
3,20170930.0,354.0,200.0,BISON STATE BANK,BISON ST BK,1.0,10.0,BISON,UNITED STATES,KS,FDIC,0,354.0
4,20170930.0,457.0,200.0,LOWRY STATE BANK,LOWRY ST BK,1.0,9.0,LOWRY,UNITED STATES,MN,FDIC,0,1127016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24204,20170331.0,5050028.0,200.0,INTERNATIONAL BANK OF COMMERCE,INTERNATIONAL BK OF CMRC,1.0,10.0,OKLAHOMA CITY,UNITED STATES,OK,FDIC,0,1104231.0
24205,20170331.0,5083316.0,0.0,DESJARDINS FLORIDA BRANCH,FEDERATION DES CAISSES FL BR,0.0,0.0,HALLANDALE,UNITED STATES,FL,OCC,0,5083316.0
24206,20170331.0,5086072.0,200.0,BLUE GATE BANK,BLUE GATE BK,1.0,12.0,COSTA MESA,UNITED STATES,CA,FDIC,0,5086072.0
24207,20170331.0,5087752.0,0.0,HUDSON BRANCH,ROYAL BK OF CANADA HUDSON BR,0.0,0.0,JERSEY CITY,UNITED STATES,NJ,OCC,0,5087752.0


In [4]:
call.describe()

Unnamed: 0,date,id,charter_type,org_type,fed_district_code,parent_id
count,24207.0,24207.0,24207.0,24207.0,24207.0,24207.0
mean,20170780.0,1010802.0,208.452514,1.301731,7.157847,1970430.0
std,335.4635,1058036.0,54.537115,1.287387,3.198892,1252251.0
min,20170330.0,37.0,0.0,0.0,0.0,37.0
25%,20170330.0,328207.5,200.0,1.0,6.0,1084696.0
50%,20170630.0,653134.0,200.0,1.0,7.0,1249105.0
75%,20170930.0,966731.0,200.0,1.0,10.0,3005332.0
max,20171230.0,5143788.0,400.0,11.0,12.0,5170805.0


# FR Y-9C Reports


In [5]:

# Set the directory paths
data_dir = '/Users/twylazhang/Desktop/Econ_DS_Research/background/HOLD'


# List all CSV files in the directory (assumes you want all CSV files in the directory)
csv_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')]


# Read and combine all CSV files into one DataFrame
combined_FR = pd.concat([pd.read_csv(file, low_memory=False) for file in csv_files], ignore_index=True)



Since there were over 2000 columns in combined_FR, only keep those that are important

In [6]:
len(combined_FR.columns)

2486

In [7]:
# Rename the columns using the dictionary
combined_FR.rename(columns=var_name_to_label, inplace=True)

# Filter the DataFrame to keep only renamed columns
# Create a set of new column names from the dictionary for filtering
new_column_names = set(var_name_to_label.values())

# Keep only the columns that are in new_column_names
combined_FR = combined_FR.loc[:, combined_FR.columns.intersection(new_column_names)]

# There are rows with --------, drop those rows
combined_FR = combined_FR[combined_FR['id'] != '--------']

combined_FR

Unnamed: 0,id,date,fed_district_code,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state
1,1020180,20170930,9,500,SAINT PAUL,UNITED STATES,0,549300TXP74T8NJZJW60,BREMER FINANCIAL CORPORATION,BREMER FNCL CORP,1,FRS,MN
2,1020201,20170930,7,500,NEW YORK,UNITED STATES,3232316,549300LBOHZ4QSIWU288,HSBC USA INC.,HSBC USA,1,FRS,NY
3,1020902,20170930,10,500,OMAHA,UNITED STATES,0,0,"FIRST NATIONAL OF NEBRASKA, INC.",FIRST NAT OF NE,1,FRS,NE
4,1022764,20170930,12,500,HONOLULU,UNITED STATES,0,549300W3YEAOZ4KGG849,CENTRAL PACIFIC FINANCIAL CORP.,CENTRAL PACIFIC FC,1,FRS,HI
5,1023220,20170930,2,500,NEWARK,UNITED STATES,3587146,0,B.N.Y. HOLDINGS (DELAWARE) CORPORATION,BNY HOLD DE CORP,1,FRS,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10848,5110605,20170630,10,500,TOPEKA,UNITED STATES,0,0,"TOPEKA BANCORP, INC.",TOPEKA BC,1,FRS,KS
10849,5113857,20170630,12,500,SANTA MARIA,UNITED STATES,0,549300LRW3N23RUSWN53,COMMUNITY BANCORP OF SANTA MARIA,COMMUNITY BC OF SANTA MARIA,1,FRS,CA
10850,5116344,20170630,12,500,OAKLAND,UNITED STATES,0,549300557SO55H4J8018,CALIFORNIA BANCORP,CALIFORNIA BC,1,FRS,CA
10851,5116353,20170630,12,500,YUBA CITY,UNITED STATES,0,5493007TQPQO80U64N30,RIVER VALLEY COMMUNITY BANCORP,RIVER VALLEY CMNTY BC,1,FRS,CA


In [8]:
combined_FR[combined_FR['parent_id'] == "0"]

Unnamed: 0,id,date,fed_district_code,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state
1,1020180,20170930,9,500,SAINT PAUL,UNITED STATES,0,549300TXP74T8NJZJW60,BREMER FINANCIAL CORPORATION,BREMER FNCL CORP,1,FRS,MN
3,1020902,20170930,10,500,OMAHA,UNITED STATES,0,0,"FIRST NATIONAL OF NEBRASKA, INC.",FIRST NAT OF NE,1,FRS,NE
4,1022764,20170930,12,500,HONOLULU,UNITED STATES,0,549300W3YEAOZ4KGG849,CENTRAL PACIFIC FINANCIAL CORP.,CENTRAL PACIFIC FC,1,FRS,HI
6,1025309,20170930,12,500,HONOLULU,UNITED STATES,0,5493006Q8BQ8AD8M2U33,BANK OF HAWAII CORPORATION,BANK OF HI CORP,1,FRS,HI
7,1025541,20170930,12,500,SAN RAFAEL,UNITED STATES,0,0,WESTAMERICA BANCORPORATION,WESTAMERICA BC,1,FRS,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10847,5109672,20170630,6,500,ANNISTON,UNITED STATES,0,0,"NOBLE BANCSHARES, INC.",NOBLE BSHRS,1,FRS,AL
10848,5110605,20170630,10,500,TOPEKA,UNITED STATES,0,0,"TOPEKA BANCORP, INC.",TOPEKA BC,1,FRS,KS
10849,5113857,20170630,12,500,SANTA MARIA,UNITED STATES,0,549300LRW3N23RUSWN53,COMMUNITY BANCORP OF SANTA MARIA,COMMUNITY BC OF SANTA MARIA,1,FRS,CA
10850,5116344,20170630,12,500,OAKLAND,UNITED STATES,0,549300557SO55H4J8018,CALIFORNIA BANCORP,CALIFORNIA BC,1,FRS,CA


# 10085 parent ids are zeroes. 
# assign the parent id number the same value of the id column, i.e.

In [9]:
# Step 1: Replace 'parent_id' values of 0 with 'id' values from the same rows
combined_FR.loc[combined_FR['parent_id'] == "0", 'parent_id'] = combined_FR.loc[combined_FR['parent_id'] == "0", 'id']

# Drop all rows that contain any missing values
#combined_FR.dropna(inplace=True)

combined_FR[combined_FR['parent_id'] == "0"]


Unnamed: 0,id,date,fed_district_code,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state


# Merging the Databset

merger on parent_id of call and id of FR
1. Renaming Columns: appends "_call" to the names of all columns in the call df, except for the "date" column, which is left unchanged for merging purposes.

2. Creating Common Identifiers: It creates a new column common_id_L1 in both call df and FR df. In call df, common_id_call is set to parent_id_call; in FR df, common_id_call is set to id. This common identifier is used for merging the two datasets.

3. Merging Datasets: The datasets are merged on the common_id_call and date.

In [10]:
# Step 1
new_columns = []
for col in call.columns:
    if col != 'date':
        new_columns.append(col + '_call')
    else:
        new_columns.append(col)
call.columns = new_columns

In [11]:
# Step 2
call['common_id'] = call['parent_id_call']
combined_FR['common_id'] = combined_FR['id']

# Step 3: Merge the two datasets on 'common_id_L1' and 'date'
# call_FR_combined = pd.merge(call, combined_FR, on=['common_id', 'date'])


In [12]:
call

Unnamed: 0,date,id_call,charter_type_call,name_legal_call,name_call,org_type_call,fed_district_code_call,city_call,country_call,state_call,fed_regulator2_call,lei_call,parent_id_call,common_id
0,20170930.0,37.0,200.0,BANK OF HANCOCK COUNTY,BANK OF HANCOCK CTY,1.0,6.0,SPARTA,UNITED STATES,GA,FDIC,0,37.0,37.0
1,20170930.0,242.0,200.0,FIRST COMMUNITY BANK XENIA-FLORA,FIRST CMNTY BK XENIA FLORA,1.0,8.0,XENIA,UNITED STATES,IL,FRS,0,3088643.0,3088643.0
2,20170930.0,279.0,300.0,"MINEOLA COMMUNITY BANK, SSB",MINEOLA CMNTY BK SSB,6.0,11.0,MINEOLA,UNITED STATES,TX,FDIC,0,3619720.0,3619720.0
3,20170930.0,354.0,200.0,BISON STATE BANK,BISON ST BK,1.0,10.0,BISON,UNITED STATES,KS,FDIC,0,354.0,354.0
4,20170930.0,457.0,200.0,LOWRY STATE BANK,LOWRY ST BK,1.0,9.0,LOWRY,UNITED STATES,MN,FDIC,0,1127016.0,1127016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24204,20170331.0,5050028.0,200.0,INTERNATIONAL BANK OF COMMERCE,INTERNATIONAL BK OF CMRC,1.0,10.0,OKLAHOMA CITY,UNITED STATES,OK,FDIC,0,1104231.0,1104231.0
24205,20170331.0,5083316.0,0.0,DESJARDINS FLORIDA BRANCH,FEDERATION DES CAISSES FL BR,0.0,0.0,HALLANDALE,UNITED STATES,FL,OCC,0,5083316.0,5083316.0
24206,20170331.0,5086072.0,200.0,BLUE GATE BANK,BLUE GATE BK,1.0,12.0,COSTA MESA,UNITED STATES,CA,FDIC,0,5086072.0,5086072.0
24207,20170331.0,5087752.0,0.0,HUDSON BRANCH,ROYAL BK OF CANADA HUDSON BR,0.0,0.0,JERSEY CITY,UNITED STATES,NJ,OCC,0,5087752.0,5087752.0


In [13]:
print(call['common_id'].dtype, combined_FR['common_id'].dtype)
print(call['date'].dtype, combined_FR['date'].dtype)


float64 object
float64 object


In [14]:
# Convert 'common_id_L1' and 'date' to string in both DataFrames
call['common_id'] = call['common_id'].astype(float)
combined_FR['common_id'] = combined_FR['common_id'].astype(float)

call['date'] = call['date'].astype(float)
combined_FR['date'] = combined_FR['date'].astype(float)

# Now try merging again
call_FR_combined = pd.merge(call, combined_FR, on=['common_id', 'date'])


In [15]:
call_FR_combined

Unnamed: 0,date,id_call,charter_type_call,name_legal_call,name_call,org_type_call,fed_district_code_call,city_call,country_call,state_call,...,charter_type,city,country,parent_id,lei,name_legal,name,org_type,fed_regulator,state
0,20170930.0,2376.0,300.0,ORITANI BANK,ORITANI BK,1.0,2.0,TOWNSHIP OF WASHINGTON,UNITED STATES,NJ,...,500,TOWNSHIP OF WASHINGTON,UNITED STATES,2692892,0,ORITANI FINANCIAL CORP,ORITANI FNCL CORP,1,FRS,NJ
1,20170930.0,3971.0,300.0,MIDCOUNTRY BANK,MIDCOUNTRY BK,1.0,9.0,BLOOMINGTON,UNITED STATES,MN,...,500,MACON,UNITED STATES,3839902,0,MIDCOUNTRY FINANCIAL CORP,MIDCOUNTRY FNCL CORP,1,FRS,GA
2,20170930.0,5210.0,200.0,ACNB BANK,ACNB BK,1.0,3.0,GETTYSBURG,UNITED STATES,PA,...,500,GETTYSBURG,UNITED STATES,1117464,0,ACNB CORPORATION,ACNB CORP,1,FRS,PA
3,20170930.0,7009.0,200.0,BESSEMER TRUST COMPANY,BESSEMER TC,1.0,2.0,WOODBRIDGE,UNITED STATES,NJ,...,500,WOODBRIDGE,UNITED STATES,1246159,0,"BESSEMER GROUP, INCORPORATED, THE",BESSEMER GRP,1,FRS,NJ
4,20170930.0,976703.0,200.0,"BESSEMER TRUST COMPANY, N.A.",BESSEMER TC NA,1.0,2.0,NEW YORK,UNITED STATES,NY,...,500,WOODBRIDGE,UNITED STATES,1246159,0,"BESSEMER GROUP, INCORPORATED, THE",BESSEMER GRP,1,FRS,NJ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10971,20170331.0,4041421.0,200.0,"FLORIDA COMMUNITY BANK, NATIONAL ASSOCIATION",FLORIDA CMNTY BK NA,1.0,6.0,WESTON,UNITED STATES,FL,...,500,WESTON,UNITED STATES,3944628,0,"FCB FINANCIAL HOLDINGS, INC.",FCB FNCL HOLDS,1,FRS,FL
10972,20170331.0,4155841.0,300.0,"BAY BANK, FSB",BAY BK FSB,1.0,5.0,COLUMBIA,UNITED STATES,MD,...,500,COLUMBIA,UNITED STATES,1469800,0,"BAY BANCORP, INC.",BAY BC,1,FRS,MD
10973,20170331.0,4160667.0,200.0,CAPITAL BANK CORPORATION,CAPITAL BK CORP,1.0,5.0,RALEIGH,UNITED STATES,NC,...,500,CHARLOTTE,UNITED STATES,4160939,0,CAPITAL BANK FINANCIAL CORP.,CAPITAL BK FNCL CORP,1,FRS,NC
10974,20170331.0,4210227.0,200.0,NBH BANK,NBH BK,1.0,10.0,GREENWOOD VILLAGE,UNITED STATES,CO,...,500,GREENWOOD VILLAGE,UNITED STATES,3973888,0,NATIONAL BANK HOLDINGS CORPORATION,NATIONAL BK HOLDS CORP,1,FRS,CO


In [16]:
 

# Compute the number of unique subsidiaries for each group
num_sub = call_FR_combined.groupby(['common_id', 'date']).agg(number_subs_1=('id_call', 'nunique')).reset_index()

# Print the results
print(num_sub)





      common_id        date  number_subs_1
0     1020180.0  20170331.0              2
1     1020180.0  20170630.0              2
2     1020180.0  20170930.0              2
3     1020180.0  20171231.0              2
4     1020395.0  20170630.0              1
...         ...         ...            ...
9747  5147115.0  20171231.0              1
9748  5158269.0  20171231.0              1
9749  5163003.0  20171231.0              1
9750  5163898.0  20171231.0              1
9751  5170805.0  20171231.0              1

[9752 rows x 3 columns]


In [17]:
num_sub.describe()

Unnamed: 0,common_id,date,number_subs_1
count,9752.0,9752.0,9752.0
mean,2211539.0,20170890.0,1.125513
std,1178180.0,317.0263,0.673446
min,1020180.0,20170330.0,1.0
25%,1129487.0,20170630.0,1.0
50%,1947102.0,20170630.0,1.0
75%,3160054.0,20171230.0,1.0
max,5170805.0,20171230.0,17.0


# from https://rpubs.com/simaan84/bhc
We observe that, mainly, there is a one-to-one relationship, with the exception for a number of cases. For instance, we note that in the last quarter of 2017, the Wintrust Financial Corporation had 17 subsidiaries:

In [18]:
# Step 1: Find IDs with 17 subsidiaries
id17 = num_sub.loc[num_sub['number_subs_1'] == 17, 'common_id'].unique()

# Step 2: Get the list of names for these IDs at the specific date
WT_list = call_FR_combined[(call_FR_combined['common_id'].isin(id17)) & (call_FR_combined['date'] == 20171231)]['name_call']

# Step 3: Print sorted list of names
print("\n".join(sorted(WT_list.astype(str))))


BARRINGTON B&TC NA
BEVERLY B&TC NA
CHICAGO TC NA
CRYSTAL LAKE B&TC NA
FIFC EDGE INTL CORP
HINSDALE B&TC
LAKE FOREST B&TC NA
LIBERTYVILLE B&TC
NORTHBROOK B&TC
OLD PLANK TRAIL CMNTY BK NA
SCHAUMBURG B&TC NA
ST CHARLES B&TC
STATE BK OF THE LAKES
TOWN BK
VILLAGE B&TC
WHEATON B&TC
WINTRUST BK


On the other hand, if we look at JP Morgan, for instance, we observe that it has two main subsidiaries at the end of 2017. This is in line with the 10-K report from the SEC (see link). Nevertheless, it is also appears that each subsidiary has a number of other subsidiaries in the annual report, which our data does not seem to convey such complexity in the structure:

In [19]:

# Filtering the DataFrame for JP Morgan entries at the end of 2017
JP_list = call_FR_combined[(call_FR_combined['common_id'] == 1039502) & (call_FR_combined['date'] == 20171231)]['name_legal_call']

# Sorting the names and printing each on a new line
sorted_JP_list = sorted(JP_list.astype(str)) 
print("\n".join(sorted_JP_list))

J.P. MORGAN INTERNATIONAL FINANCE LIMITED
JPMORGAN CHASE BANK, NATIONAL ASSOCIATION


# Final Data



In [20]:
ds_final = call_FR_combined[['date', 'name_legal_call', 'lei_call', 'id_call', 'name_legal', 'lei', 'id']].copy()

# Compute the number of subsidiaries for each BHC in the sample
# Using transform to add the count of subsidiaries directly into the dataframe
ds_final['no_sub'] = ds_final.groupby(['id', 'date'])['id'].transform('count')

# Clean up rows where ID might be NaN (although not explicitly done in the previous example, suggested for robustness)
ds_final = ds_final[ds_final['id'].notna()]


ds_final


Unnamed: 0,date,name_legal_call,lei_call,id_call,name_legal,lei,id,no_sub
0,20170930.0,ORITANI BANK,549300XRBLWUN6J76L65,2376.0,ORITANI FINANCIAL CORP,0,2692892,1
1,20170930.0,MIDCOUNTRY BANK,0,3971.0,MIDCOUNTRY FINANCIAL CORP,0,3839902,1
2,20170930.0,ACNB BANK,0,5210.0,ACNB CORPORATION,0,1117464,1
3,20170930.0,BESSEMER TRUST COMPANY,549300NEQHXUX4MZIH22,7009.0,"BESSEMER GROUP, INCORPORATED, THE",0,1246159,4
4,20170930.0,"BESSEMER TRUST COMPANY, N.A.",549300OTTCSOD5OCZ143,976703.0,"BESSEMER GROUP, INCORPORATED, THE",0,1246159,4
...,...,...,...,...,...,...,...,...
10971,20170331.0,"FLORIDA COMMUNITY BANK, NATIONAL ASSOCIATION",549300SGSDB4HQJSLK07,4041421.0,"FCB FINANCIAL HOLDINGS, INC.",0,3944628,1
10972,20170331.0,"BAY BANK, FSB",0,4155841.0,"BAY BANCORP, INC.",0,1469800,1
10973,20170331.0,CAPITAL BANK CORPORATION,5493005LC6991GWTPI71,4160667.0,CAPITAL BANK FINANCIAL CORP.,0,4160939,1
10974,20170331.0,NBH BANK,549300T7AIY7RU1YDP31,4210227.0,NATIONAL BANK HOLDINGS CORPORATION,0,3973888,1


# Publically Traded Banks




In [21]:
# Load the dataset from the Federal Reserve Bank of New York that lists publicly traded banks
url = "https://www.newyorkfed.org/medialibrary/media/research/banking_research/data/crsp_20161231.csv"
crsp = pd.read_csv(url)

# Filter ds_final to include only IDs present in the CRSP dataset to find publicly traded banks
ds_final_public = ds_final[ds_final['id'].isin(crsp['entity'])]

# Count unique publicly traded companies and print the result
unique_public_companies = len(ds_final_public['id'].unique())
print(f"Total publicly traded companies: {unique_public_companies}")

Total publicly traded companies: 0


In [22]:
# Convert 'id' in ds_final and 'entity' in crsp to integers
ds_final['id'] = pd.to_numeric(ds_final['id'], errors='coerce').astype('Int64')
crsp['entity'] = pd.to_numeric(crsp['entity'], errors='coerce').astype('Int64')

# Filter ds_final to include only IDs present in the crsp dataset
ds_final_public = ds_final[ds_final['id'].isin(crsp['entity'].dropna())]

# Count unique publicly traded companies and print the result
unique_public_companies = len(ds_final_public['id'].unique())
print(f"Total publicly traded companies: {unique_public_companies}")

Total publicly traded companies: 421
