# Notebook 01: Data Cleaning & Shock Identification

**Objective:**
1. Isolate the 16 valid exogenous brokerage shocks from the raw M&A data.
2. Clean the raw I/B/E/S analyst forecast data for downstream matching.

In [61]:
from datetime import timedelta

import pandas as pd

We filter the M&A deals to find significant brokerage closures/mergers that serve as our instrument.

In [62]:
# Read the raw M&A deal export into a DataFrame (pandas' default sheet selection).
file_path = 'data/raw_mergers_and_acquisitions.xlsx'
df = pd.read_excel(io=file_path)

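Keep only deals whose rank value (including the target's net debt) exceeds USD 50 million, so that each deal is large enough to plausibly impact analyst coverage.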

In [63]:
# Retain rows whose deal value column is strictly greater than 50
# (the column header states the unit as USD millions).
df = df.loc[df['Rank Value inc. Net Debt of Target\n(USD, Millions)'].gt(50)]

Apply the manual selection of valid exogenous shocks. The indices below are row positions (used with `.iloc`) in the deal table after the size filter, and correspond to verified closures/mergers affecting analyst coverage.

In [64]:
# Row positions (consumed by .iloc, so they are positional, not index labels)
# of the hand-picked deals inside the already-filtered `df`. Each trailing
# comment names the target and the year noted for that deal.
valid_indices = [
    0,   # Cowen Inc (2022)
    5,   # JMP Group (2021)
    14,  # Ladenburg Thalmann (2019)
    15,  # Sandler O'Neill (2019)
    19,  # Leerink Holdings (2018)
    30,  # Simmons & Co (2015)
    33,  # Sterne Agee (2015)
    36,  # ISI Group (2014)
    38,  # SWS Group (2014)
    42,  # Knight Capital (2012)
    44,  # KBW (2012)
    46,  # Morgan Keegan (2012)
    55,  # Thomas Weisel Partners (2010)
    59,  # Fox-Pitt Kelton (2009)
    63,  # Wachovia (2008)
    67,  # Merrill Lynch (2008)
]

# The announcement date is exposed under the name 'Shock_Date'.
column_mapping = {'Date Announced': 'Shock_Date'}

# Select the shocks, rename the date column and parse it to datetime in one
# chain; the result is a new frame independent of `df`.
df_shocks = (
    df.iloc[valid_indices]
    .rename(columns=column_mapping)
    .assign(Shock_Date=lambda shocks: pd.to_datetime(shocks['Shock_Date']))
)

In [65]:
# Persist the selected shocks (without the row index) and preview the first
# rows' shock date and target name.
output_shocks = 'data/final_brokerage_shocks.csv'
df_shocks.to_csv(path_or_buf=output_shocks, index=False)
print(df_shocks.head()[['Shock_Date', 'Target Full Name']])

    Shock_Date                           Target Full Name
11  2022-08-02                                  Cowen Inc
49  2021-09-08                              JMP Group Inc
108 2019-11-11  Ladenburg Thalmann Financial Services Inc
123 2019-07-09                Sandler O'Neill Partners LP
152 2018-11-13                       Leerink Holdings LLC


Load and clean forecast data (I/B/E/S)

In [66]:
# Load only the I/B/E/S columns this notebook needs.
ibes_path = 'data/raw_ibes_forecasts.csv'
cols = ['ESTIMATOR', 'ANNDATS', 'OFTIC']
df_forecasts = pd.read_csv(ibes_path, usecols=cols, low_memory=False)

# Parse the dates from the frame that was just loaded. The previous version
# read them from `df_ibes`, a name that is not defined anywhere above in this
# notebook, so a fresh Restart & Run All raised NameError (or silently used a
# stale frame left in the kernel). Unparseable dates become NaT.
df_forecasts['ANNDATS'] = pd.to_datetime(df_forecasts['ANNDATS'], errors='coerce')

# Drop rows missing the estimator id or a usable date. Reassigning instead of
# using inplace=True keeps the cell safe to re-run.
df_forecasts = df_forecasts.dropna(subset=['ESTIMATOR', 'ANNDATS'])

Attrition matching algorithm

In [67]:
# Parse the 'Date Effective' column; values that cannot be parsed become NaT.
df_shocks['Date Effective'] = pd.to_datetime(df_shocks['Date Effective'], errors='coerce')

# Per estimator: the latest ANNDATS value and the number of non-null ANNDATS rows.
estimator_stats = (
    df_forecasts.groupby('ESTIMATOR')['ANNDATS']
    .agg(['max', 'count'])
    .reset_index()
    .rename(columns={'max': 'Last_Seen_Date', 'count': 'Total_Estimates'})
)

# Search window around each shock's effective date, in days (both ends inclusive).
matches = []
days_before = 90
days_after = 14
lookback = timedelta(days=days_before)
lookahead = timedelta(days=days_after)

for _, shock in df_shocks.iterrows():
    shock_date = shock['Date Effective']
    broker_name = shock['Target Full Name']

    # A shock without a usable effective date cannot anchor a window.
    if pd.isna(shock_date):
        continue

    window_start = shock_date - lookback
    window_end = shock_date + lookahead

    # Estimators whose final forecast date falls inside the window.
    in_window = estimator_stats['Last_Seen_Date'].between(window_start, window_end)
    candidates = estimator_stats[in_window]
    if candidates.empty:
        continue

    # NOTE(review): 'Shock_Date' here carries the 'Date Effective' value, while
    # df_shocks['Shock_Date'] was renamed from 'Date Announced' -- confirm that
    # downstream consumers expect the effective date under this name.
    matches.append(
        candidates.assign(
            Days_Diff=(candidates['Last_Seen_Date'] - shock_date).dt.days,
            Suspected_Broker=broker_name,
            Shock_Date=shock_date,
        )
    )

In [68]:
# Fail with an actionable message rather than pandas' generic
# "No objects to concatenate" when no shock window produced a candidate.
if not matches:
    raise ValueError(
        'No estimator stopped forecasting inside any shock window; '
        'check the I/B/E/S input and the days_before/days_after settings.'
    )

all_matches = pd.concat(matches)
all_matches['Abs_Diff'] = all_matches['Days_Diff'].abs()

# Single explicit ordering: closest last-seen date first, then the most active
# estimator, then broker name as the final tie-break. This replaces two
# back-to-back sorts where the first one was immediately overridden and only
# acted as an implicit tie-break.
all_matches = all_matches.sort_values(
    by=['Abs_Diff', 'Total_Estimates', 'Suspected_Broker'],
    ascending=[True, False, True],
)

# An estimator caught by several shock windows is kept only for the shock
# whose effective date is nearest to its last forecast.
unique_ids = all_matches.drop_duplicates(subset=['ESTIMATOR'], keep='first')

# For each shocked broker pick the attributed estimator with the most
# forecasts; equal counts are resolved in favour of the smaller date gap so
# the choice does not depend on incidental row order.
all_targets = df_shocks['Target Full Name'].unique()
final_list = []

for target in all_targets:
    candidates = unique_ids[unique_ids['Suspected_Broker'] == target]
    if candidates.empty:
        # No estimator was attributed to this broker; it is left out of the result.
        continue

    best_match = candidates.sort_values(
        by=['Total_Estimates', 'Abs_Diff'], ascending=[False, True]
    ).iloc[0]
    final_list.append({
        'Broker_Name': target,
        'Identified_ID': int(best_match['ESTIMATOR']),
        'Days_Difference': int(best_match['Days_Diff']),
        'Total_Estimates': int(best_match['Total_Estimates']),
        'Last_Date_Seen': best_match['Last_Seen_Date'],
        'Shock_Date': best_match['Shock_Date']
    })

In [69]:
# Build the broker -> estimator id table, order it alphabetically by broker
# name, and write it to disk without the row index.
df_final = pd.DataFrame(final_list).sort_values(by='Broker_Name')
df_final.to_csv(path_or_buf='data/final_broker_decoder_ring.csv', index=False)

In [71]:
# Preview the first rows of the mapping: broker, matched estimator id, and the
# day gap between its last forecast and the shock date.
print(df_final.head()[['Broker_Name', 'Identified_ID', 'Days_Difference']])

                                      Broker_Name  Identified_ID  \
0                                       Cowen Inc            183   
13         Fox-Pitt Kelton Cochran Caronia Waller           2299   
7   International Strategy & Investment Group LLC           3102   
1                                   JMP Group Inc             22   
10                                        KBW Inc           2191   

    Days_Difference  
0               -61  
13              -45  
7                 0  
1               -80  
10              -60  
