# Notebook 05: Data merging and control variable construction

In [1]:
import pandas as pd
import numpy as np

Load the monthly stock outcomes and the Compustat annual fundamentals.

In [2]:
# Monthly stock outcomes. Month_ID is converted to monthly periods while
# loading, so later cells can read the calendar year from it with `.dt.year`.
file_path = 'data/monthly_outcomes.csv'
df_stock = pd.read_csv(file_path).assign(
    Month_ID=lambda frame: pd.PeriodIndex(frame['Month_ID'], freq='M')
)

In [7]:
file_path = 'data/compustat_annual.csv'

# Compustat items needed by the cells below, written in lower case.
cols_comp = [
    # identifiers and dates
    'gvkey', 'tic', 'cusip', 'fyear', 'datadate',
    # accounting and market items
    'at',      # Total Assets
    'ni',      # Net Income
    'dltt',    # Long Term Debt
    'dlc',     # Debt in Current Liab
    'prcc_f',  # Price Close (Fiscal)
    'csho',    # Shares Outstanding
    'ceq',     # Common Equity
    'xrd',     # R&D Expense
    'sich'     # Historical SIC Code (Industry)
]

# Header names are matched against cols_comp case-insensitively; the columns
# that survive are then renamed to lower case so later cells can rely on that.
df_fund = (
    pd.read_csv(file_path, usecols=lambda name: name.lower() in cols_comp)
    .rename(columns=str.lower)
)

Construct control variables

In [8]:
# Total assets is the denominator for most controls. Zero is mapped to NaN once
# so that log() and every ratio below return NaN, not -inf/+inf, for
# zero-asset rows (previously only Size was protected).
assets = df_fund['at'].replace(0, np.nan)

# firm size = log(Total Assets)
df_fund['Size'] = np.log(assets)

# return on assets = Net Income / Total Assets
df_fund['ROA'] = df_fund['ni'] / assets

# leverage = (Long Term Debt + Current Debt) / Total Assets
# A missing debt item is treated as zero debt.
df_fund['Total_Debt'] = df_fund['dltt'].fillna(0) + df_fund['dlc'].fillna(0)
df_fund['Leverage'] = df_fund['Total_Debt'] / assets

# market-to-book = (Price * Shares) / Common Equity
# Zero common equity is mapped to NaN to avoid infinite ratios.
# NOTE(review): negative common equity still yields a negative MTB; decide
# downstream whether those rows should be kept.
df_fund['Mkt_Cap'] = df_fund['prcc_f'] * df_fund['csho']
df_fund['MTB'] = df_fund['Mkt_Cap'] / df_fund['ceq'].replace(0, np.nan)

# r&d intensity = R&D Expense / Total Assets
# Missing R&D expense is treated as zero.
df_fund['RnD_Exp'] = df_fund['xrd'].fillna(0)
df_fund['Opaqueness'] = df_fund['RnD_Exp'] / assets

# industry fixed effects
# we use the first 2 digits of the 4-digit SIC code (major group).
# The code is handled as a number and integer-divided by 100, then zero-padded:
# slicing the string form of a float column turns 100.0 into '10' (should be
# '01') and turns a missing code into the literal string 'na'.
# Missing codes now stay NaN.
sic_code = pd.to_numeric(df_fund['sich'], errors='coerce')
df_fund['Industry_SIC2'] = (sic_code // 100).map(
    lambda major: f"{int(major):02d}", na_action='ignore'
)

Lagging and merging strategy

Fundamentals are lagged by one year before merging: stock returns in calendar year **2015** are matched to, and should be predicted by, financial data from fiscal year **2014**.

In [9]:
# One-year lag: fundamentals for fiscal year t are keyed to calendar year t + 1,
# while each stock-month row is keyed by the calendar year of its Month_ID.
df_fund['Join_Year'] = df_fund['fyear'].add(1)
df_stock['Join_Year'] = df_stock['Month_ID'].dt.year

# Compustat CUSIPs have 9 characters, CRSP CUSIPs have 8, so only the first 8
# characters of the Compustat identifier are kept for the join.
df_fund['CUSIP_8'] = df_fund['cusip'].astype(str).str.slice(0, 8)

# Resolve duplicate (CUSIP_8, Join_Year) pairs in Compustat: after sorting by
# datadate within each pair, keep the last row (the greatest datadate).
df_fund = df_fund.sort_values(by=['CUSIP_8', 'Join_Year', 'datadate'])
df_fund = df_fund.drop_duplicates(subset=['CUSIP_8', 'Join_Year'], keep='last')

# Join keys plus the constructed controls; everything else is left behind.
controls_to_keep = [
    'CUSIP_8', 'Join_Year',
    'Size', 'ROA', 'Leverage', 'MTB', 'Opaqueness',
    'Industry_SIC2',
]
df_fund_clean = df_fund.loc[:, controls_to_keep].copy()

In [10]:
# Attach the lagged controls to every stock-month row. The left join keeps all
# rows of df_stock; rows without matching fundamentals get NaN controls.
# validate='many_to_one' makes pandas raise a MergeError if the fundamentals
# side ever has more than one row per (CUSIP_8, Join_Year), instead of
# silently duplicating stock-month observations.
df_merge = pd.merge(
    df_stock,
    df_fund_clean,
    left_on=['CUSIP', 'Join_Year'],
    right_on=['CUSIP_8', 'Join_Year'],
    how='left',
    validate='many_to_one',
)

In [11]:
# Merge coverage: a row counts as matched when Size is present after the join.
total_obs = len(df_merge)
matched_obs = int(df_merge['Size'].notna().sum())
print(
    f"Total Observations: {total_obs}",
    f"Observations with Control Vars: {matched_obs}",
    f"Match Rate: {matched_obs / total_obs}",
    sep='\n',
)

Total Observations: 590284
Observations with Control Vars: 517063
Match Rate: 0.8759563193310339
