# Notebook 02: Instrument Construction

**Objective:**
1. **Identify Treated Stocks:** Find all stocks covered by the exiting brokers.
2. **Select Control Stocks:** Match treated stocks with controls based on pre-shock coverage.
3. **Construct Panel:** Build the monthly dataset with ghost analyst censoring

In [8]:
import numpy as np
import pandas as pd

Helper Function

This function calculates the rolling coverage but **explicitly removes** the specific broker ID if the date is on or after the shock. This prevents the lag where a dead broker still looks active because their last report was < 12 months ago.

In [9]:
# Lower bound on analyst coverage: a stock needs at least this many analysts.
MIN_COVERAGE = 3
# Upper bound on analyst coverage: stocks above this are discarded (mega-caps).
MAX_COVERAGE = 15
# An analyst must have spoken within this many days to count as recent.
# NOTE(review): not referenced in the visible cells - confirm where it is applied.
RECENCY_DAYS = 90

In [10]:
def get_rolling_coverage_censored(df_forecast, stock_list, current_date, shock_date, dead_broker_id):
    """
    Count distinct brokers covering each stock over a trailing 365-day window.

    The window is [current_date - 365 days, current_date], inclusive on both ends.
    IF current_date >= shock_date, forecasts issued by dead_broker_id are excluded
    before counting, so the exiting broker never contributes to post-shock coverage
    even when its last forecast is still inside the window.

    Args:
        df_forecast: DataFrame with 'CUSIP', 'ANNDATS' (datetime) and 'ESTIMATOR' columns.
        stock_list: sequence of CUSIPs to report on; order (and duplicates) are preserved.
        current_date: end of the rolling window.
        shock_date: date from which dead_broker_id is censored.
        dead_broker_id: broker identifier to censor; pass a value that matches no
            broker to disable censoring.

    Returns:
        Integer Series indexed by stock_list; stocks with no forecasts in the
        window get 0. Rows with a missing ESTIMATOR are not counted as a broker.
    """
    # define 12-month rolling window
    window_start = current_date - pd.Timedelta(days=365)

    # activity in this window for these stocks
    keep = (
        df_forecast['CUSIP'].isin(stock_list)
        & (df_forecast['ANNDATS'] >= window_start)
        & (df_forecast['ANNDATS'] <= current_date)
    )

    # post-shock: drop the dead broker's rows up front instead of removing it
    # from each stock's broker set afterwards
    if current_date >= shock_date:
        keep = keep & (df_forecast['ESTIMATOR'] != dead_broker_id)

    per_stock = df_forecast.loc[keep].groupby('CUSIP')['ESTIMATOR'].nunique()

    # reindex restores the caller's ordering and fills uncovered stocks with 0;
    # the groupby index is unique, so duplicate labels in stock_list are safe
    aligned = per_stock.reindex(stock_list, fill_value=0)

    return pd.Series(aligned.to_numpy(), index=stock_list)

Load Data

In [11]:
# Input locations: broker shock table and raw forecast file.
file_path1 = 'data/final_broker_decoder_ring.csv'
file_path2 = 'data/raw_ibes_forecasts.csv'

df_shocks = pd.read_csv(file_path1)
# CUSIP is forced to str so identifiers are kept as text rather than parsed as numbers.
df_forecast = pd.read_csv(file_path2, dtype={'CUSIP': str})

In [12]:
# Parse the date columns so they can be compared against Timestamps.
df_shocks['Shock_Date'] = pd.to_datetime(df_shocks['Shock_Date'])
df_forecast['ANNDATS'] = pd.to_datetime(df_forecast['ANNDATS'])

# Drop forecast rows that lack a stock identifier or a broker identifier.
key_cols = ['CUSIP', 'ESTIMATOR']
df_forecast = df_forecast.dropna(subset=key_cols)

# Quick look at the shock table.
preview_cols = ['Broker_Name', 'Identified_ID', 'Shock_Date']
print(df_shocks[preview_cols].head())

                                     Broker_Name  Identified_ID Shock_Date
0                                      Cowen Inc            183 2023-03-01
1         Fox-Pitt Kelton Cochran Caronia Waller           2299 2009-10-02
2  International Strategy & Investment Group LLC           3102 2014-10-31
3                                  JMP Group Inc             22 2021-11-15
4                                        KBW Inc           2191 2013-02-15


Build the panel (Treatment & Control Selection)

We iterate through each shock, find the treated universe, select controls, and calculate coverage for the event window.

In [13]:
# Fixed seed so the random control draw (and therefore the saved panel) is
# reproducible from run to run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

panel_data = []
total_shocks = len(df_shocks)

for idx, row in df_shocks.iterrows():
    broker_id = row['Identified_ID']
    shock_date = row['Shock_Date']
    broker_name = row['Broker_Name']

    # define pre-period (1 year before shock) to identify coverage
    pre_start = shock_date - pd.Timedelta(days=365)
    in_pre_period = (
        (df_forecast['ANNDATS'] >= pre_start) &
        (df_forecast['ANNDATS'] < shock_date)
    )
    pre_period = df_forecast[in_pre_period]

    # Identify treated stocks
    # (stocks covered by this specific broker in the year before the shock)
    treated_stocks_raw = pre_period.loc[pre_period['ESTIMATOR'] == broker_id, 'CUSIP'].unique()

    # keep only stocks whose coverage lies in [MIN_COVERAGE, MAX_COVERAGE];
    # -999 is a sentinel broker id, so nothing is censored in this baseline count
    raw_coverage = get_rolling_coverage_censored(df_forecast, treated_stocks_raw, shock_date, shock_date, -999)
    treated_stocks = raw_coverage[
        (raw_coverage >= MIN_COVERAGE) &
        (raw_coverage <= MAX_COVERAGE)
    ].index.values

    if len(treated_stocks) == 0:
        continue

    # Select control stocks
    # we want stocks not covered by this broker, but with similar total coverage
    # calculate pre-shock coverage for treated stocks (target baseline)
    target_median = raw_coverage.loc[treated_stocks].median()

    # define matching pool: all active stocks minus EVERY stock the broker covered.
    # Subtracting only the coverage-filtered treated set would leave stocks that
    # lost this broker (but failed the coverage filter) eligible as controls.
    all_active = pre_period['CUSIP'].unique()
    potential_controls = np.setdiff1d(all_active, treated_stocks_raw)

    # pre-shock distinct-broker count for every candidate control
    control_counts = (
        pre_period[pre_period['CUSIP'].isin(potential_controls)]
        .groupby('CUSIP')['ESTIMATOR']
        .nunique()
    )

    # Apply Tolerance (+/- 2 analysts), clipped to the global coverage band
    valid_controls = control_counts[
        (control_counts >= max(MIN_COVERAGE, target_median - 2)) &
        (control_counts <= min(MAX_COVERAGE, target_median + 2))
    ].index.values

    # final selection
    n_treated = len(treated_stocks)
    # NOTE(review): match_type is not stored in the panel or printed - confirm
    # whether it should be persisted.
    if len(valid_controls) >= n_treated:
        selected_controls = rng.choice(valid_controls, size=n_treated, replace=False)
        match_type = "Matched"
    else:
        # Not enough coverage-matched candidates: fall back to a random draw from
        # the whole pool, capped at the pool size so sampling without replacement
        # cannot fail.
        n_draw = min(n_treated, len(potential_controls))
        selected_controls = rng.choice(potential_controls, size=n_draw, replace=False)
        match_type = "Random"

    # construct monthly panel (-6 to +6)
    current_universe = np.concatenate([treated_stocks, selected_controls])

    print(f"[{idx+1}/{total_shocks}] {broker_name}: {len(treated_stocks)} Treated, {len(selected_controls)} Controls. Median Cov: {target_median}")

    for m in range(-6, 7):
        # calculate calendar date with true month offsets; fixed 30-day steps
        # drift and can map two event months onto the same calendar month
        calc_date = shock_date + pd.DateOffset(months=m)

        # calculate censored coverage
        # post-shock, dead broker is removed
        counts = get_rolling_coverage_censored(
            df_forecast,
            current_universe,
            calc_date,
            shock_date,
            dead_broker_id=broker_id
        )

        # create df for this month
        df_m = pd.DataFrame({'CUSIP': current_universe, 'Coverage': counts.values})
        df_m['Event_Month'] = m
        df_m['Event_ID'] = broker_id
        df_m['Broker_Name'] = broker_name
        df_m['Treated'] = df_m['CUSIP'].isin(treated_stocks).astype(int)
        df_m['Post'] = int(m >= 0)
        df_m['Treated_Post'] = df_m['Treated'] * df_m['Post']

        # save calendar month
        df_m['Month_ID'] = calc_date.to_period('M')

        panel_data.append(df_m)


[1/16] Cowen Inc: 983 Treated, 983 Controls. Median Cov: 9.0
[2/16] Fox-Pitt Kelton Cochran Caronia Waller: 21 Treated, 21 Controls. Median Cov: 9.0
[3/16] International Strategy & Investment Group LLC: 121 Treated, 121 Controls. Median Cov: 9.0
[4/16] JMP Group Inc: 11 Treated, 11 Controls. Median Cov: 10.0
[5/16] KBW Inc: 43 Treated, 43 Controls. Median Cov: 11.0
[6/16] Knight Capital Group Inc: 109 Treated, 109 Controls. Median Cov: 9.0
[8/16] Leerink Holdings LLC: 1 Treated, 1 Controls. Median Cov: 10.0
[9/16] Merrill Lynch & Co Inc: 487 Treated, 487 Controls. Median Cov: 10.0
[10/16] Morgan Keegan & Co Inc: 207 Treated, 207 Controls. Median Cov: 9.0
[12/16] Sandler O'Neill Partners LP: 237 Treated, 237 Controls. Median Cov: 7.0
[13/16] Simmons & Co International: 36 Treated, 36 Controls. Median Cov: 11.0
[14/16] Sterne Agee Group Inc: 16 Treated, 16 Controls. Median Cov: 7.5
[15/16] Thomas Weisel Partners Group Inc: 244 Treated, 244 Controls. Median Cov: 9.0
[16/16] Wachovia Secur

Save panel data

In [15]:
# Stack the per-month frames into one panel and write it out without the index.
output_path = 'data/instrument_panel.csv'
df_panel = pd.concat(panel_data)
df_panel.to_csv(output_path, index=False)

# Preview the first rows of the saved panel.
print(df_panel.head())

      CUSIP  Coverage  Event_Month  Event_ID Broker_Name  Treated  Post  \
0  90400D10        15           -6       183   Cowen Inc        1     0   
1  71742Q10         4           -6       183   Cowen Inc        1     0   
2  78573M10         7           -6       183   Cowen Inc        1     0   
3  03990B10        13           -6       183   Cowen Inc        1     0   
4  G0751N10         9           -6       183   Cowen Inc        1     0   

   Treated_Post Month_ID  
0             0  2022-09  
1             0  2022-09  
2             0  2022-09  
3             0  2022-09  
4             0  2022-09  
