# Chapter 6. Fama and MacBeth Regression Analysis

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
# Load the per-stock indicator data; the first CSV column becomes the index
df = pd.read_csv('alldata_mktcap.csv', index_col=0)
df.head()

Unnamed: 0_level_0,year,beta,rt+1,bm,size,mktcap
permno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10001,1988,0.267605,60.422343,1.145192,1.850382,6.36225
10002,1988,0.02397,-35.235672,,2.286519,9.840625
10003,1988,0.213007,-61.669376,,3.730165,41.686
10005,1988,0.619461,-41.703333,1.632601,-0.241753,0.78525
10008,1988,0.869109,,,,


主要涉及：
- 缩尾: `pd.Series.clip()`
- 回归：`sm.OLS()`

## 6.1 Periodic FM Regression Results

>All independent variables are winsorized at the 0.5% level on an annual basis prior to running the regressions.

In [3]:
def FM_regression1(independent, level=0.005):
    '''
    First stage of the Fama-MacBeth procedure: one cross-sectional OLS
    regression of `rt+1` on the chosen regressors for every year, with the
    results laid out like Table 6.1 of the textbook.

    Reads the module-level DataFrame `df`.

    Parameters
    ----------
    independent: names of the regressor columns  (list)
    level: winsorization level expressed as a fraction, not a percentage
        (float, default is 0.005)

    Returns
    -------
    One row per year with columns coef0..coefK (in the order of the design
    matrix built by sm.add_constant), R_square, adj_R and n  (pd.DataFrame)
    '''
    # The upper bound of range() is exclusive, so the final year is skipped
    # (original note: the 2012 data cannot be used).
    years = range(df['year'].min(), df['year'].max())
    # Keep only the columns this regression needs.
    panel = df[['year', 'rt+1'] + independent].copy()
    # np.percentile expects percentages, so convert the fraction once.
    lower_pct = level * 100
    upper_pct = (1 - level) * 100

    params_by_year = []
    r2_by_year = []
    adj_r2_by_year = []
    n_by_year = []
    for year in years:
        # Complete cases of this year's cross-section.
        cross = panel[panel['year'] == year].copy().dropna()
        n_by_year.append(len(cross))  # sample size after dropping NaNs
        cross = cross.drop(columns='year')
        # Winsorize every regressor at this year's percentiles.
        cross[independent] = cross[independent].apply(
            lambda col: col.clip(
                np.percentile(col, lower_pct),
                np.percentile(col, upper_pct)
            )
        )
        response = cross['rt+1'].values
        design = sm.add_constant(cross[independent]).values
        fit = sm.OLS(response, design).fit()
        params_by_year.append(fit.params)
        r2_by_year.append(fit.rsquared)
        adj_r2_by_year.append(fit.rsquared_adj)

    # Arrange as in the textbook's Table 6.1.
    result = pd.DataFrame(
        params_by_year,
        index=years,
        columns=[f'coef{k}' for k in range(len(independent) + 1)]
    )
    result['R_square'] = r2_by_year
    result['adj_R'] = adj_r2_by_year
    result['n'] = n_by_year

    return result

### 6.1.1 Panel A  
$$r_{i,t+1}=\delta _{0,t}+\delta _{1,t}\beta _{i,t}+\epsilon _{i,t}$$

In [4]:
# Panel A: yearly cross-sectional regressions of rt+1 on beta only
panelA_result = FM_regression1(['beta'], level=0.005)

In [5]:
# Display Panel A with coefficients rounded to 2 decimals and the
# R-squared columns to 3; the sample-size column n is left untouched
panelA_result.round({
    'coef0': 2,
    'coef1': 2,
    'R_square': 3,
    'adj_R': 3
})

Unnamed: 0,coef0,coef1,R_square,adj_R,n
1988,3.08,4.05,0.001,0.001,5192
1989,-29.71,0.83,0.0,-0.0,5063
1990,44.85,10.17,0.002,0.001,5029
1991,38.82,-19.32,0.009,0.009,4835
1992,29.72,-10.49,0.011,0.011,5099
1993,-5.63,-0.83,0.0,-0.0,5296
1994,27.42,2.23,0.0,0.0,5685
1995,19.81,-7.12,0.006,0.006,5845
1996,34.24,-20.21,0.031,0.031,5991
1997,-10.2,5.1,0.001,0.001,6006


### 6.1.2 Panel B
$$r_{i,t+1}=\delta _{0,t}+\delta _{1,t}Size _{i,t}+\epsilon _{i,t}$$


In [6]:
# Panel B: yearly cross-sectional regressions of rt+1 on size only,
# displayed with coefficients at 2 decimals and R-squared columns at 3
panelB_result = FM_regression1(['size'], level=0.005)
panelB_result.round({
    'coef0': 2,
    'coef1': 2,
    'R_square': 3,
    'adj_R': 3
})

Unnamed: 0,coef0,coef1,R_square,adj_R,n
1988,-4.83,2.52,0.007,0.007,5516
1989,-34.91,1.42,0.004,0.004,5369
1990,78.41,-7.5,0.013,0.013,5308
1991,61.75,-8.71,0.026,0.026,5272
1992,48.38,-6.03,0.025,0.024,5586
1993,-3.89,-0.71,0.001,0.001,6030
1994,37.8,-1.86,0.002,0.001,6232
1995,21.59,-1.68,0.002,0.002,6478
1996,19.05,-0.18,0.0,-0.0,6793
1997,-18.34,2.15,0.004,0.003,6533


### 6.1.3 Panel C
$$r_{i,t+1}=\delta _{0,t}+\delta _{1,t}BM _{i,t}+\epsilon _{i,t}$$

In [12]:
# Panel C: yearly cross-sectional regressions of rt+1 on bm only,
# displayed with coefficients at 2 decimals and R-squared columns at 3
panelC_result = FM_regression1(['bm'], level=0.005)
panelC_result.round({
    'coef0': 2,
    'coef1': 2,
    'R_square': 3,
    'adj_R': 3
})

Unnamed: 0,coef0,coef1,R_square,adj_R,n
1988,3.82,2.2,0.001,0.0,4332
1989,-27.37,0.42,0.0,-0.0,4254
1990,58.67,-4.21,0.002,0.001,4264
1991,9.74,13.74,0.014,0.014,4232
1992,15.76,8.39,0.006,0.006,4545
1993,-15.9,13.75,0.019,0.019,4985
1994,26.95,1.59,0.0,-0.0,5306
1995,4.01,15.42,0.013,0.013,5518
1996,4.03,14.7,0.013,0.013,5825
1997,-10.84,7.29,0.002,0.002,5653


### 6.1.4 Panel D
$$r_{i,t+1}=\delta _{0,t}+\delta _{1,t}\beta _{i,t}+\delta _{2,t}Size_{i,t}+\delta _{3,t}BM_{i,t}+\epsilon _{i,t}$$

In [11]:
# Panel D: yearly cross-sectional regressions of rt+1 on beta, size and bm
# together, displayed with coefficients at 2 decimals and R-squared at 3
panelD_result = FM_regression1(['beta', 'size', 'bm'], level=0.005)
panelD_result.round({
    'coef0': 2,
    'coef1': 2,
    'coef2': 2,
    'coef3': 2,
    'R_square': 3,
    'adj_R': 3
})

Unnamed: 0,coef0,coef1,coef2,coef3,R_square,adj_R,n
1988,-7.37,-1.39,2.91,3.24,0.009,0.008,4147
1989,-32.32,-2.58,1.51,1.04,0.003,0.003,4062
1990,96.33,28.62,-13.9,-7.84,0.033,0.033,4085
1991,46.52,-5.88,-6.92,9.81,0.032,0.031,3941
1992,45.03,-5.24,-5.05,3.92,0.027,0.027,4247
1993,-15.89,2.11,-0.16,13.98,0.02,0.019,4526
1994,36.98,5.67,-3.03,0.28,0.004,0.004,4953
1995,16.5,-5.17,-1.11,11.24,0.015,0.015,5094
1996,7.04,-13.28,2.09,11.59,0.026,0.026,5243
1997,-26.41,3.17,2.33,10.79,0.008,0.008,5274


## 6.2 Summarized FM Regression Results
>Standard errors, t-statistics, and p-values are calculated using the Newey and West (1987) adjustment with six lags.

In [21]:
def NWtest_1sample(a, lags=6):
    '''
    Newey-West test of the mean of a single series.

    The series is regressed on a constant only, so the one estimated
    coefficient is the sample mean; its covariance is estimated with
    statsmodels' HAC option.

    Parameters
    ----------
    a: series to test  (array-like)
    lags: maximum lag, passed as `maxlags` to the HAC covariance  (int)

    Returns
    -------
    mean of the series  (float)
    Newey-West adjusted standard error  (float)
    t-statistic based on that standard error  (float)
    p-value  (float)
    '''
    values = np.asarray(a)
    # Regress on a constant only.
    model = sm.OLS(values, np.ones(len(values))).fit(
        cov_type='HAC', cov_kwds={'maxlags': lags}
    )

    # cov_params() is a 1x1 matrix and tvalues/pvalues are 1-element arrays.
    # Pick the single element explicitly: calling float() on an array with
    # ndim > 0 is deprecated since NumPy 1.25.
    nw_se = float(np.sqrt(model.cov_params()[0, 0]))
    t_stat = float(model.tvalues[0])
    p_value = float(model.pvalues[0])

    return values.mean(), nw_se, t_stat, p_value

def test_table61(data, name, **kwds):
    '''
    Condense one Table 6.1 panel into the values of one Table 6.2 column
    and save them as a CSV file.

    Parameters
    ----------
    data: a panel shaped like one Table 6.1 result; the last three columns
        are averaged and every earlier column is passed to NWtest_1sample
        (pd.DataFrame)
    name: output file name, including the extension  (str)
    **kwds: forwarded through DataFrame.apply to NWtest_1sample
    '''
    coef_part = data.iloc[:, :-3]
    summary_part = data.iloc[:, -3:]

    # Newey-West statistics of each coefficient series, flattened to 1-D.
    # NOTE(review): the element order follows how pandas wraps the tuples
    # returned by NWtest_1sample -- confirm against the pandas version in use.
    nw_stats = coef_part.apply(NWtest_1sample, **kwds)
    flat_stats = np.array([list(row) for row in nw_stats.values]).ravel()

    # The last three columns only need their column means.
    averages = summary_part.mean().values

    # Stack both parts into a single-column frame and write it out.
    combined = [*flat_stats, *averages]
    pd.DataFrame(combined).to_csv(name)

In [20]:
# Run the summary test on all four panels; each call writes 'panel<X>test.csv'
[test_table61(i, 'panel' + j + 'test.csv') for i, j in zip([panelA_result, panelB_result, panelC_result, panelD_result], ['A', 'B', 'C', 'D'])]

[None, None, None, None]

Table 6.2结果如下：

![Table6_2.png](./Table6_2.png)

## 6.3 FM Regression Results

Table 6.3如下：

![Table6_3.png](./Table6_3.png)

### 思考
尽管与课本结果不太一样，但是整体系数及显著性反映的规律基本一样，可能问题主要还是出在：
- 样本股票池。
- 计算年化r时，可能与数据库计算年收益率的规则不一样。