In [39]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [40]:
def wls_regression(data):
    
    y_name = 'ret_excess_lead'
    X_names = ['beta','log_mktcap','bm']
    weights_name = 'log_mktcap'
    
   
    formula = f"{y_name} ~ {' + '.join(X_names)}"
    
    
    model = smf.wls(formula, data=data, weights=data[weights_name]).fit()
    
    
    return model.summary()  

In [41]:
def ols_regression_quantile(data, formula, weight_col, quantile):
    quantile_value = data[weight_col].quantile(quantile)
    filtered_data = data[data[weight_col] <= quantile_value]
    model = smf.ols(formula, data=filtered_data).fit()
    return model.params

In [44]:
beta=pd.read_csv(r"C:\Users\29075\Desktop\金融科技研讨班\beta.csv")[['date', 'permno', 'beta_monthly']]
crsp_monthly=pd.read_csv(r"C:\Users\29075\Desktop\金融科技研讨班\crsp_monthly.csv")[['permno','gvkey', 'date', 'ret_excess', 'mktcap']]
compustat=pd.read_csv(r"C:\Users\29075\Desktop\金融科技研讨班\compustat.csv")[['datadate', 'gvkey', 'be']]
print(beta.head())
print(crsp_monthly.head())
print(compustat.head())

                  date   permno  beta_monthly
0  1986-03-01 00:00:00  10000.0           NaN
1  1986-04-01 00:00:00  10000.0           NaN
2  1986-05-01 00:00:00  10000.0           NaN
3  1986-06-01 00:00:00  10000.0           NaN
4  1986-07-01 00:00:00  10000.0           NaN
   permno    gvkey                 date  ret_excess     mktcap
0   10028  12096.0  1993-03-01 00:00:00   -0.102500   6.329250
1   10028  12096.0  1993-04-01 00:00:00    0.386489   8.790625
2   10028  12096.0  1993-05-01 00:00:00    0.197800  10.548750
3   10028  12096.0  1993-06-01 00:00:00   -0.135833   9.044750
4   10028  12096.0  1993-07-01 00:00:00    0.189908  10.784125
              datadate  gvkey       be
0  1960-01-31 00:00:00   6260   57.680
1  1960-01-31 00:00:00   5270      NaN
2  1960-01-31 00:00:00  11156      NaN
3  1960-01-31 00:00:00   5410   14.880
4  1960-01-31 00:00:00   4611  240.827


In [49]:
#把日数据转化为月数据，然后由M月变成M月1号便于和crsp_montyly合并,继续和beta合并
beta['date']=pd.to_datetime(beta['date'])
crsp_monthly['date']=pd.to_datetime(crsp_monthly['date'])
compustat['date']=pd.to_datetime(compustat['datadate']).dt.to_period("M").dt.to_timestamp()
characteristics=compustat.merge(crsp_monthly,how="left",on=['gvkey','date']).merge(beta,how="left",on=['permno','date'])
print(characteristics.head())

              datadate  gvkey       be       date  permno  ret_excess  mktcap  \
0  1960-01-31 00:00:00   6260   57.680 1960-01-01     NaN         NaN     NaN   
1  1960-01-31 00:00:00   5270      NaN 1960-01-01     NaN         NaN     NaN   
2  1960-01-31 00:00:00  11156      NaN 1960-01-01     NaN         NaN     NaN   
3  1960-01-31 00:00:00   5410   14.880 1960-01-01     NaN         NaN     NaN   
4  1960-01-31 00:00:00   4611  240.827 1960-01-01     NaN         NaN     NaN   

   beta_monthly  
0           NaN  
1           NaN  
2           NaN  
3           NaN  
4           NaN  


In [50]:
#计算bm/log_mktcap/beta 三个指标，同时设置sorting_date
characteristics['bm']=characteristics['be']/characteristics['mktcap']
characteristics['log_mktcap']=np.log(characteristics['mktcap'])
characteristics['sorting_date']=characteristics['date'].apply(lambda x:x+pd.DateOffset(months=6))
characteristics=characteristics[["gvkey", "bm", "log_mktcap", "beta_monthly", "sorting_date"]].rename(columns={"beta_monthly": "beta"})
print(characteristics.tail())

         gvkey        bm  log_mktcap      beta sorting_date
549421   64766  0.203789    8.280595  1.218872   2023-06-01
549422   36776       NaN         NaN       NaN   2023-06-01
549423   38573       NaN         NaN       NaN   2023-06-01
549424   29914  1.321067    3.796824  0.743790   2023-06-01
549425  353945       NaN         NaN       NaN   2023-06-01


In [52]:
#再把ret_excess加入实现T和T+1数据通过sorting_date与date进行了缝合
#对于缺失的数据，采用了前向填充
data_fama_macbeth = crsp_monthly.merge(characteristics, how="left",left_on=["gvkey", "date"], right_on=["gvkey", "sorting_date"]).sort_values(["date", "permno"])
data_fama_macbeth =data_fama_macbeth.groupby("permno").apply(lambda x: x.assign(beta=x["beta"].fillna(method="ffill"),bm=x["bm"].fillna(method="ffill"),log_mktcap=x["log_mktcap"].fillna(method="ffill"))).reset_index(drop=True)
print(data_fama_macbeth.tail())

  data_fama_macbeth =data_fama_macbeth.groupby("permno").apply(lambda x: x.assign(beta=x["beta"].fillna(method="ffill"),bm=x["bm"].fillna(method="ffill"),log_mktcap=x["log_mktcap"].fillna(method="ffill"))).reset_index(drop=True)


         permno     gvkey       date  ret_excess         mktcap        bm  \
3326348   93436  184996.0 2022-08-01   -0.074389  863615.620800  0.027663   
3326349   93436  184996.0 2022-09-01   -0.039489  837659.500000  0.027663   
3326350   93436  184996.0 2022-10-01   -0.144468  718514.868879  0.027663   
3326351   93436  184996.0 2022-11-01   -0.147226  614814.304763  0.027663   
3326352   93436  184996.0 2022-12-01   -0.370634  388971.892324  0.027663   

         log_mktcap      beta sorting_date  
3326348   13.903696  1.993018          NaT  
3326349   13.903696  1.993018          NaT  
3326350   13.903696  1.993018          NaT  
3326351   13.903696  1.993018          NaT  
3326352   13.903696  1.993018          NaT  


In [53]:
##将date减去一个月，使得ret_excess与滞后一期的因子合并
data_fama_macbeth_lagged =data_fama_macbeth[['permno','date','ret_excess']].rename(columns={'ret_excess':'ret_excess_lead'})
data_fama_macbeth_lagged['date']=data_fama_macbeth_lagged['date']-pd.DateOffset(months=1)
data_fama_macbeth=data_fama_macbeth.merge(data_fama_macbeth_lagged,how='left',on=['permno','date'])[["permno", "date", "ret_excess_lead", "beta", "log_mktcap", "bm"]].dropna()
print(data_fama_macbeth.tail())

         permno       date  ret_excess_lead      beta  log_mktcap        bm
3326347   93436 2022-07-01        -0.074389  1.993018   13.903696  0.027663
3326348   93436 2022-08-01        -0.039489  1.993018   13.903696  0.027663
3326349   93436 2022-09-01        -0.144468  1.993018   13.903696  0.027663
3326350   93436 2022-10-01        -0.147226  1.993018   13.903696  0.027663
3326351   93436 2022-11-01        -0.370634  1.993018   13.903696  0.027663


In [54]:
#按照样例的思路
formula="ret_excess_lead ~ beta + log_mktcap + bm"
risk_premiums1= data_fama_macbeth.groupby("date").apply(lambda x: smf.ols(formula,x).fit().params).reset_index()
price_of_risk1= (risk_premiums1
  .melt(id_vars="date", var_name="factor", value_name="estimate")
  .groupby("factor")["estimate"]
  .apply(lambda x: pd.Series({
      "risk_premium": 100*x.mean(),
      "t_statistic": x.mean()/x.std()*np.sqrt(len(x))
    })
  )
  .reset_index()
  .pivot(index="factor", columns="level_1", values="estimate")
  .reset_index()
)

In [55]:
print(price_of_risk1)

level_1      factor  risk_premium  t_statistic
0         Intercept      1.215502     4.771331
1              beta      0.005146     0.049929
2                bm      0.150568     3.222465
3        log_mktcap     -0.104206    -2.941909


In [25]:
def ols_regression_quantile(data, formula, weight_col, quantile):
    quantile_value = data[weight_col].quantile(quantile)
    filtered_data = data[data[weight_col] <= quantile_value]
    model = smf.ols(formula, data=filtered_data).fit()
    return model.params


formula="ret_excess_lead ~ beta + log_mktcap + bm"

# 对每个日期的数据子集计算80%分位数，并选择低于或等于该分位数的行进行OLS回归
results1 = (data_fama_macbeth.groupby("date")
            .apply(lambda x: ols_regression_quantile(x, formula, 'log_mktcap', 0.8))
            .reset_index())

price_of_risk3 = (results1
  .melt(id_vars="date", var_name="factor", value_name="estimate")
  .groupby("factor")["estimate"]
  .apply(lambda x: pd.Series({
      "risk_premium": 100*x.mean(),
      "t_statistic": x.mean()/x.std()*np.sqrt(len(x))
    })
  )
  .reset_index()
  .pivot(index="factor", columns="level_1", values="estimate")
  .reset_index()
)

In [56]:
print(price_of_risk3)

level_1      factor  risk_premium  t_statistic
0         Intercept      1.305305     4.501963
1              beta      0.021327     0.210112
2                bm      0.153046     3.272670
3        log_mktcap     -0.138049    -2.842572


In [57]:
risk_premiums = data_fama_macbeth.groupby('date').apply(lambda x:sm.WLS.from_formula(formula,x,weights=x['log_mktcap']).fit().params).reset_index(drop=True)

  return np.sqrt(self.weights)[:, None] * x
  return x * np.sqrt(self.weights)


LinAlgError: SVD did not converge