In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.weightstats import DescrStatsW
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [12]:
def classify_into_quantitle(data, time_col, stock_col, var_col, quintile_col='quantitle'):
    """
    Classifies stocks into quantitle based on a variable from the previous month.

    Parameters:
    - data (pd.DataFrame): Panel dataset with time, stock, and variable columns.
    - time_col (str): Column name for time (monthly).
    - stock_col (str): Column name for stock identifiers.
    - var_col (str): Column name for the variable used to classify stocks.
    - quintile_col (str): Name for the new column storing quintile classifications.

    Returns:
    - pd.DataFrame: Original data with an added column for quintile classifications.
    """
    # Sort data by time and stock for consistency
    data = data.sort_values(by=[time_col, stock_col])
    
    # Create a column for the previous month's variable
    data['prev_' + var_col] = data.groupby(stock_col)[var_col].shift(1)
    
    # Define a function to calculate quantitle for each month
    def assign_quantitle(df):
        # Check if there are enough unique values to create quantitle
        if df['prev_' + var_col].nunique() >= 5:
            df[quintile_col] = pd.qcut(df['prev_' + var_col], q=5, labels=range(1, 6))
        else:
            # Assign NaN or a single bin if not enough unique values
            df[quintile_col] = pd.cut(df['prev_' + var_col], bins=len(df['prev_' + var_col].unique()), labels=range(1, len(df['prev_' + var_col].unique()) + 1))
        return df
    
    # Apply quintile classification for each time period
    data = data.groupby(time_col).apply(assign_quantitle)
    
    # Clean up intermediate column
    data = data.drop(columns=['prev_' + var_col])
    
    data = data.dropna()
    for i in range(2, 6):
        data[f'{var_col}_Q{i}'] = np.where(data[quintile_col] == i, 1, 0)
        
    data = data.drop(columns=[quintile_col])
    data = data.reset_index(drop=True)
    return data

def fama_macbeth_regression(data, dep_var, indep_vars, time_col):
    """
    Perform Fama-MacBeth regression.

    Parameters:
    - data (pd.DataFrame): Panel data with columns for dependent and independent variables, and time periods.
    - dep_var (str): The dependent variable.
    - indep_vars (list): List of independent variables.
    - time_col (str): Column name representing time periods.

    Returns:
    - pd.DataFrame: Average coefficients and standard errors.
    """
    results = []
    obs = []
    r_squared = []

    for time_period in data[time_col].unique():
        cross_section = data[data[time_col] == time_period]
        obs.append(len(cross_section))
        X = sm.add_constant(cross_section[indep_vars])  # Add intercept
        y = cross_section[dep_var]
        model = sm.OLS(y, X).fit()
        results.append(model.params)
        adjusted_r_squared = model.rsquared_adj
        r_squared.append(adjusted_r_squared)

    # Combine results into a DataFrame
    results_df = pd.DataFrame(results)
    
    # Compute average coefficients and standard errors
    avg_coefficients = results_df.mean().round(4)
    std_errors = results_df.std() / (len(results_df) ** 0.5)  # Standard errors for the averages
    t_stats = avg_coefficients / std_errors
    t_stats = round(t_stats, 2)
    
    obs_average = int(np.mean(obs))
    adjusted_r_squared_avg = np.mean(r_squared).round(3)
    
    summary_df = pd.DataFrame({
        'Coefficient': avg_coefficients,
        't-statistic': t_stats
    })
    
    filtered_data = summary_df[~summary_df.index.str.startswith('ind')]
    
    df = pd.DataFrame({'Coefficient': [obs_average, adjusted_r_squared_avg, 'Yes']}, 
                      index=['Number of Observations', 'Adjusted R-Squared', 'Industry control']
)
    final_df = pd.concat([filtered_data, df])
    
    return final_df

In [64]:
# read dataset
df = pd.read_csv('../Data/mispricing_signal.csv')
df['Date'] = pd.to_datetime(df['date'])
df['Date'] = df['Date'].dt.strftime('%Y-%m')
df.drop(columns= ['Unnamed: 0', 'date'], inplace= True)

df.rename(columns= {'Mispricing_Signal': 'Mispricing Signal-LR'}, inplace=True)
# mispricing_signal_randome_forest = pd.read_csv('../Data/output_mispricing_signal.csv')
# mispricing_signal_randome_forest['Date'] = pd.to_datetime(mispricing_signal_randome_forest['date'])
# mispricing_signal_randome_forest['Date'] = mispricing_signal_randome_forest['Date'].dt.strftime('%Y-%m')
# mispricing_signal_randome_forest.drop(columns=['Unnamed: 0', 'date'], inplace= True)

# # merge two tables by Permno and Date
# df_merged = pd.merge(df, mispricing_signal_randome_forest, on=['PERMNO', 'Date'], how = 'inner')
# df_merged.rename(columns={'Mj,t': 'Mispricing Signal-RF',
#                           'Mispricing_Signal': 'Mispricing Signal-LR'}, inplace=True)

df_merged = df.rename(columns={'beta': 'Beta', 
                        'mkt_cap': 'Market capitalization', 
                        'Book_to_Market': 'Book/market',
                        'Gross_Profitability': 'Gross profitability',
                        'Earnings_Yield': 'Earnings yield',
                        'Short_term_reversal': "Short-term reversal",
                        "Long_term_reversal": "Long-term reversal"})
df_merged.sort_values(by = ['PERMNO', 'Date'], ascending=True, inplace=True)
df_merged['Ret_y'] = df_merged.groupby('PERMNO')['Return'].shift(-1)
df_merged.head()

Unnamed: 0,PERMNO,Return,Share_price,Number_of_shares_outstanding,Beta,industry,Market capitalization,Industry_RET,Short-term reversal,Momentum,Long-term reversal,Mkt_RF,SMB,HML,RMW,CMA,RF,Mom,LT_Rev,ST_Rev,ACOQH,ATQH,SEQQH,LTQH,NOPIQH,SALEQH,TXTQH,APQH,DVPQH,CHEQH,NIQH,PSTKQH,Market_Capitalization,Accruals,Gross profitability,Book/market,Asset_Growth,Investment_Ratio,Leverage,Earnings yield,Dividend_Price,Cash_Flow_Price,Illiquidity,ROE,ROA,Current_Ratio,Quick_Ratio,Net_Profit_Margin,Gross_Margin,ICAPTQH,PSTKRQH,TEQQH,PPENTQH,CEQQH,AOQH,DLTTQH,LOQH,LCOQH,XIDOQH,IBQH,IBADJQH,IBCOMQH,PIQH,DOQH,DVQH,EPS,EPS_lag,EPS_std,SUE,Share_Issuance,Scaled_NOA,Z_Score,Industry_Adjusted_Return,Mispricing Signal-LR,Date,Ret_y
0,10028,0.0,1.17,4914.0,1.048013,Rtail,5749.38,-9.95,-0.082353,-0.302684,1.757715,-10.35,3.08,1.45,3.28,-2.2,0.14,9.13,-0.68,-1.77,7.284,10.395,4.256,6.139,0.106,4.752,-0.015,0.484,0.042908,0.099,-0.03,11.778,5749.38,-0.000206,0.110149,0.00074,-0.010628,0.002982,1.442434,-0.025641,0.145299,0.090598,0.002891,-0.007049,-0.002886,1.437537,0.167949,-0.006313,0.240951,10.395,11.778,4.256,3.962,4.256,-1.105,1.072,38.432,5.067,0.106,-0.03,-0.03,-0.03,-0.045,1.062,0.17,-6e-06,1.569355e-05,1.3e-05,-1.632882,,0.999794,562.662866,0.063854,-49.669632,2002-09,0.042735
1,10028,0.042735,1.22,4914.0,0.987147,Rtail,5995.08,5.82,0.0,-0.352428,1.692848,7.84,-4.31,-3.94,-3.39,0.77,0.14,-5.58,-0.2,7.39,7.284,10.395,4.256,6.139,0.106,4.752,-0.015,0.484,0.042908,0.099,-0.03,11.778,5749.38,-0.000206,0.110149,0.00074,-0.010628,0.002982,1.442434,-0.025641,0.145299,0.090598,0.002891,-0.007049,-0.002886,1.437537,0.167949,-0.006313,0.240951,10.395,11.778,4.256,3.962,4.256,-1.105,1.072,38.432,5.067,0.106,-0.03,-0.03,-0.03,-0.045,1.062,0.17,-6e-06,1.569355e-05,1.3e-05,-1.632882,0.0,0.999794,562.662866,0.003555,-63.73444,2002-10,-0.032787
2,10028,-0.032787,1.18,4914.0,1.240796,Rtail,5798.52,2.97,0.042735,-0.379394,1.469814,5.96,2.93,-1.26,-9.22,5.12,0.12,-16.33,4.22,0.18,7.284,10.395,4.256,6.139,0.106,4.752,-0.015,0.484,0.042908,0.099,-0.03,11.778,5749.38,-0.000206,0.110149,0.00074,-0.010628,0.002982,1.442434,-0.025641,0.145299,0.090598,0.002891,-0.007049,-0.002886,1.437537,0.167949,-0.006313,0.240951,10.395,11.778,4.256,3.962,4.256,-1.105,1.072,38.432,5.067,0.106,-0.03,-0.03,-0.03,-0.045,1.062,0.17,-6e-06,1.569355e-05,1.3e-05,-1.632882,0.0,0.999794,562.662866,-0.095412,-63.900839,2002-11,-0.097458
3,10028,-0.097458,-1.065,4913.0,0.621448,Rtail,5232.345,-7.83,-0.032787,-0.238191,1.496346,-5.76,0.61,2.14,6.3,-1.68,0.11,9.64,2.05,10.74,7.78,10.545,4.752,5.793,0.469,7.316,0.289,0.591,0.04039,0.498,0.546,11.778,5301.127,-0.003789,0.187956,0.000896,0.004764,0.002371,1.219066,0.506024,0.157553,0.434662,0.001095,0.114899,0.051778,2.857143,0.530297,0.074631,0.270913,10.545,11.778,4.752,4.209,4.752,1.486,3.07,38.432,2.723,0.469,0.546,0.546,0.546,0.835,1.062,0.17,0.000111,-6.557377e-07,4.1e-05,2.718924,-0.000204,0.996211,-540.442598,-0.0447,-78.980116,2002-12,-0.014085
4,10028,-0.014085,1.05,4913.0,0.622134,Rtail,5158.65,-5.15,-0.097458,-0.921464,1.889689,-2.57,0.69,-0.81,-0.97,0.7,0.1,1.55,0.31,0.28,7.78,10.545,4.752,5.793,0.469,7.316,0.289,0.591,0.04039,0.498,0.546,11.778,5301.127,-0.003789,0.187956,0.000896,0.004764,0.002371,1.219066,0.506024,0.157553,0.434662,0.001095,0.114899,0.051778,2.857143,0.530297,0.074631,0.270913,10.545,11.778,4.752,4.209,4.752,1.486,3.07,38.432,2.723,0.469,0.546,0.546,0.546,0.835,1.062,0.17,0.000111,-6.557377e-07,4.1e-05,2.718924,0.0,0.996211,-540.442598,0.033383,-57.079281,2003-01,0.095238


In [65]:
# var_list = list(df_merged.columns)
# print(var_list)

# get dummy variables for industries
dummy_variables = pd.get_dummies(df_merged['industry'], prefix='industry').astype(int)
industry_dummies_name = dummy_variables.columns
print(industry_dummies_name)
df = pd.concat([df_merged, dummy_variables], axis=1)

# get quantitles for each variables
# Turn variables into quantile
var_list = [
    "Mispricing Signal-LR",
    # "Mispricing Signal-RF",
    "Beta",
    "Market capitalization",
    "Book/market",
    "Short-term reversal",
    "Momentum",
    "Long-term reversal",
    "Accruals",
    "SUE",
    "Gross profitability",
    "Earnings yield"
]

for var in var_list:
    df = classify_into_quantitle(df, 'Date', 'PERMNO', var, quintile_col='quantitle')

# separate data into before 2013 and after 2013
df_before_2013 = df[pd.to_datetime(df['Date']).dt.year <= 2012]
df_after_2013 = df[pd.to_datetime(df['Date']).dt.year >= 2013]

Index(['industry_Agric', 'industry_Apprl', 'industry_Chair', 'industry_Chems',
       'industry_Cnstr', 'industry_Elctr', 'industry_Garbg', 'industry_Glass',
       'industry_Instr', 'industry_Lethr', 'industry_Machn', 'industry_Manuf',
       'industry_Metal', 'industry_Mines', 'industry_Money', 'industry_MtlPr',
       'industry_Other', 'industry_Paper', 'industry_Phone', 'industry_Print',
       'industry_Ptrlm', 'industry_Rtail', 'industry_Rubbr', 'industry_Smoke',
       'industry_Stone', 'industry_Trans', 'industry_Txtls', 'industry_Utils',
       'industry_Whlsl'],
      dtype='object')


In [66]:
quantile_use = 'Q5'
mispricing_signal_name = 'Mispricing Signal-LR'

dict_specification = {'Specification 1': [mispricing_signal_name],
                      'Specification 2': ['Beta', 'Market capitalization', "Book/market", "Short-term reversal", 'Momentum', 'Long-term reversal'],
                      'Specification 3': [mispricing_signal_name, 'Beta', 'Market capitalization', "Book/market", "Short-term reversal", 'Momentum', 
                                          'Long-term reversal'],
                      'Specification 4': ['Beta', 'Market capitalization', "Book/market", "Short-term reversal", 'Momentum', 'Long-term reversal', 
                                          "Accruals", "SUE", "Gross profitability", "Earnings yield"],
                      'Specification 5': [mispricing_signal_name, 'Beta', 'Market capitalization', "Book/market", "Short-term reversal", 'Momentum', 'Long-term reversal', 
                                          "Accruals", "SUE", "Gross profitability", "Earnings yield"]
}

result_before_2013_LR = pd.DataFrame()
result_after_2013_LR = pd.DataFrame()

for key, val in dict_specification.items():
    indep_vars = val
    indep_vars_Quantile = [f'{var}_{quantile_use}' for var in indep_vars]
    indep_vars_Quantile.extend(industry_dummies_name)

    result1 = fama_macbeth_regression(df_before_2013, 'Ret_y', indep_vars_Quantile, 'Date')
    result1.columns = [f'{var}_{key}' for var in result1.columns]
    result_before_2013_LR = pd.merge(result_before_2013_LR, result1, how = 'outer', left_index= True, right_index= True)
    
    result2 = fama_macbeth_regression(df_after_2013, 'Ret_y', indep_vars_Quantile, 'Date')
    result2.columns = [f'{var}_{key}' for var in result2.columns]
    result_after_2013_LR = pd.merge(result_after_2013_LR, result2, how = 'outer', left_index= True, right_index= True)

# order the regression result in a better way
index_list = [mispricing_signal_name]
index_list.extend(["Beta",
    "Market capitalization",
    "Book/market",
    "Short-term reversal",
    "Momentum",
    "Long-term reversal",
    "Accruals",
    "SUE",
    "Gross profitability",
    "Earnings yield"
])
new_index = [f'{var}_{quantile_use}' for var in index_list]
new_index.extend(['const', 'Number of Observations', 'Adjusted R-Squared', 'Industry control'])
result_before_2013_LR = result_before_2013_LR.reindex(new_index)
result_after_2013_LR = result_after_2013_LR.reindex(new_index)

display(result_before_2013_LR)
display(result_after_2013_LR)

Unnamed: 0,Coefficient_Specification 1,t-statistic_Specification 1,Coefficient_Specification 2,t-statistic_Specification 2,Coefficient_Specification 3,t-statistic_Specification 3,Coefficient_Specification 4,t-statistic_Specification 4,Coefficient_Specification 5,t-statistic_Specification 5
Mispricing Signal-LR_Q5,0.0022,0.98,,,0.0005,0.24,,,0.0002,0.1
Beta_Q5,,,0.005,1.67,0.0047,1.59,0.0052,1.74,0.0049,1.66
Market capitalization_Q5,,,-0.005,-3.24,-0.0051,-3.25,-0.0071,-4.17,-0.007,-4.05
Book/market_Q5,,,0.0069,3.32,0.007,3.47,0.0072,3.48,0.0074,3.67
Short-term reversal_Q5,,,0.0018,1.13,0.002,1.27,0.0018,1.14,0.002,1.28
Momentum_Q5,,,-0.0021,-1.05,-0.0021,-1.06,-0.002,-0.99,-0.002,-1.0
Long-term reversal_Q5,,,-0.002,-1.42,-0.002,-1.44,-0.0021,-1.51,-0.0021,-1.53
Accruals_Q5,,,,,,,-0.0002,-0.21,-0.0002,-0.21
SUE_Q5,,,,,,,0.0061,4.48,0.006,4.41
Gross profitability_Q5,,,,,,,0.0042,2.98,0.0043,3.09


Unnamed: 0,Coefficient_Specification 1,t-statistic_Specification 1,Coefficient_Specification 2,t-statistic_Specification 2,Coefficient_Specification 3,t-statistic_Specification 3,Coefficient_Specification 4,t-statistic_Specification 4,Coefficient_Specification 5,t-statistic_Specification 5
Mispricing Signal-LR_Q5,-0.0006,-0.25,,,-0.0009,-0.38,,,-0.0011,-0.47
Beta_Q5,,,0.002,0.9,0.0021,0.93,0.0022,0.98,0.0023,1.01
Market capitalization_Q5,,,0.001,0.51,0.0009,0.46,-0.0006,-0.3,-0.0007,-0.34
Book/market_Q5,,,0.0043,2.55,0.0045,2.8,0.0043,2.56,0.0045,2.84
Short-term reversal_Q5,,,-0.0,-0.0,0.0003,0.21,0.0,0.0,0.0003,0.21
Momentum_Q5,,,0.0002,0.09,0.0001,0.05,-0.0,-0.0,-0.0001,-0.05
Long-term reversal_Q5,,,-0.0015,-0.97,-0.0015,-0.97,-0.0014,-0.92,-0.0014,-0.92
Accruals_Q5,,,,,,,-0.0015,-1.39,-0.0015,-1.4
SUE_Q5,,,,,,,0.0043,4.06,0.0044,4.15
Gross profitability_Q5,,,,,,,0.0036,2.91,0.0036,2.99


In [67]:
# Define the function to add significance stars based on t-statistic
def significance_marker(t_value):
    if pd.isna(t_value):
        return ""
    elif abs(t_value) >= 2.58:
        return "***"
    elif abs(t_value) >= 1.96:
        return "**"
    elif abs(t_value) >= 1.65:
        return "*"
    else:
        return ""

# Apply the significance markers to the coefficients
for i in range(1, 6):  # Assuming you have up to 5 specifications
    coeff_col = f'Coefficient_Specification {i}'
    t_col = f't-statistic_Specification {i}'
    if coeff_col in result_before_2013_LR.columns and t_col in result_before_2013_LR.columns:
        result_before_2013_LR[coeff_col] = result_before_2013_LR.apply(lambda x: f"{x[coeff_col]}{significance_marker(x[t_col])}" if pd.notna(x[coeff_col]) else np.nan, axis=1)
    
    if coeff_col in result_after_2013_LR.columns and t_col in result_after_2013_LR.columns:
        result_after_2013_LR[coeff_col] = result_after_2013_LR.apply(lambda x: f"{x[coeff_col]}{significance_marker(x[t_col])}" if pd.notna(x[coeff_col]) else np.nan, axis=1)

# Display the updated DataFrame
display(result_before_2013_LR)
display(result_before_2013_LR)

Unnamed: 0,Coefficient_Specification 1,t-statistic_Specification 1,Coefficient_Specification 2,t-statistic_Specification 2,Coefficient_Specification 3,t-statistic_Specification 3,Coefficient_Specification 4,t-statistic_Specification 4,Coefficient_Specification 5,t-statistic_Specification 5
Mispricing Signal-LR_Q5,0.0022,0.98,,,0.0005,0.24,,,0.0002,0.1
Beta_Q5,,,0.005*,1.67,0.0047,1.59,0.0052*,1.74,0.0049*,1.66
Market capitalization_Q5,,,-0.005***,-3.24,-0.0051***,-3.25,-0.0071***,-4.17,-0.007***,-4.05
Book/market_Q5,,,0.0069***,3.32,0.007***,3.47,0.0072***,3.48,0.0074***,3.67
Short-term reversal_Q5,,,0.0018,1.13,0.002,1.27,0.0018,1.14,0.002,1.28
Momentum_Q5,,,-0.0021,-1.05,-0.0021,-1.06,-0.002,-0.99,-0.002,-1.0
Long-term reversal_Q5,,,-0.002,-1.42,-0.002,-1.44,-0.0021,-1.51,-0.0021,-1.53
Accruals_Q5,,,,,,,-0.0002,-0.21,-0.0002,-0.21
SUE_Q5,,,,,,,0.0061***,4.48,0.006***,4.41
Gross profitability_Q5,,,,,,,0.0042***,2.98,0.0043***,3.09


Unnamed: 0,Coefficient_Specification 1,t-statistic_Specification 1,Coefficient_Specification 2,t-statistic_Specification 2,Coefficient_Specification 3,t-statistic_Specification 3,Coefficient_Specification 4,t-statistic_Specification 4,Coefficient_Specification 5,t-statistic_Specification 5
Mispricing Signal-LR_Q5,0.0022,0.98,,,0.0005,0.24,,,0.0002,0.1
Beta_Q5,,,0.005*,1.67,0.0047,1.59,0.0052*,1.74,0.0049*,1.66
Market capitalization_Q5,,,-0.005***,-3.24,-0.0051***,-3.25,-0.0071***,-4.17,-0.007***,-4.05
Book/market_Q5,,,0.0069***,3.32,0.007***,3.47,0.0072***,3.48,0.0074***,3.67
Short-term reversal_Q5,,,0.0018,1.13,0.002,1.27,0.0018,1.14,0.002,1.28
Momentum_Q5,,,-0.0021,-1.05,-0.0021,-1.06,-0.002,-0.99,-0.002,-1.0
Long-term reversal_Q5,,,-0.002,-1.42,-0.002,-1.44,-0.0021,-1.51,-0.0021,-1.53
Accruals_Q5,,,,,,,-0.0002,-0.21,-0.0002,-0.21
SUE_Q5,,,,,,,0.0061***,4.48,0.006***,4.41
Gross profitability_Q5,,,,,,,0.0042***,2.98,0.0043***,3.09


In [63]:
result_before_2013_LR.to_csv('../Result/Table3_Panel A.csv')
result_after_2013_LR.to_csv('../Result/Table3_Panel B.csv')