Week Six Replicating Fama and French Factors

一、导入库包

In [1]:
import pandas as pd
import numpy as np
import sqlite3

import statsmodels.formula.api as smf
from regtabletotext import prettify_result

二、准备数据

In [2]:
tidy_finance = sqlite3.connect(
  database="data/tidy_finance_python.sqlite"
)

In [3]:
crsp_monthly = (pd.read_sql_query(
    sql=("SELECT permno, gvkey, month, ret_excess, mktcap, "
         "mktcap_lag, exchange FROM crsp_monthly_new"),
    con=tidy_finance,
    parse_dates={"month"})
)

compustat = (pd.read_sql_query(
    sql="SELECT gvkey, datadate, be, op, inv FROM compustat_new",
    con=tidy_finance,
    parse_dates={"datadate"})
)

factors_ff3_monthly = (pd.read_sql_query(
    sql="SELECT month, smb, hml FROM factors_ff3_monthly",
    con=tidy_finance,
    parse_dates={"month"})
)

factors_ff5_monthly = (pd.read_sql_query(
    sql=("SELECT month, smb, hml, rmw, cma "
         "FROM factors_ff5_monthly"),
    con=tidy_finance,
    parse_dates={"month"})
)

In [4]:
crsp_monthly

Unnamed: 0,permno,gvkey,month,ret_excess,mktcap,mktcap_lag,exchange
0,10028.0,012096,1993-03-01,-0.102500,6.329250,7.032500,AMEX
1,10028.0,012096,1993-04-01,0.386489,8.790625,6.329250,AMEX
2,10028.0,012096,1993-05-01,0.197800,10.548750,8.790625,AMEX
3,10028.0,012096,1993-06-01,-0.135833,9.044750,10.548750,AMEX
4,10028.0,012096,1993-07-01,0.189908,10.784125,9.044750,AMEX
...,...,...,...,...,...,...,...
3326348,10042.0,012139,2005-02-01,-0.215192,23.583960,29.989479,AMEX
3326349,10042.0,012139,2005-03-01,-0.113211,21.029761,23.583960,AMEX
3326350,10042.0,012139,2005-04-01,-0.071544,19.569360,21.029761,AMEX
3326351,10042.0,012139,2005-05-01,0.102078,25.538140,19.569360,AMEX


In [5]:
compustat

Unnamed: 0,gvkey,datadate,be,op,inv
0,011358,1960-01-31,8.658,0.092400,
1,002796,1960-01-31,,,
2,007068,1960-01-31,,,
3,006303,1960-01-31,,,
4,005410,1960-01-31,14.880,0.174731,
...,...,...,...,...,...
549421,104833,2022-12-31,23179.951,0.263820,0.008102
549422,183907,2022-12-31,,,
549423,137611,2022-12-31,1357.256,0.022417,-0.119339
549424,037527,2022-12-31,1266.455,0.192951,0.168621


In [6]:
factors_ff3_monthly

Unnamed: 0,month,smb,hml
0,1960-01-01,0.0209,0.0278
1,1960-02-01,0.0051,-0.0193
2,1960-03-01,-0.0049,-0.0294
3,1960-04-01,0.0032,-0.0228
4,1960-05-01,0.0121,-0.0370
...,...,...,...
751,2022-08-01,0.0140,0.0029
752,2022-09-01,-0.0081,0.0005
753,2022-10-01,0.0006,0.0801
754,2022-11-01,-0.0352,0.0138


In [7]:
factors_ff5_monthly

Unnamed: 0,month,smb,hml,rmw,cma
0,1963-07-01,-0.0041,-0.0097,0.0068,-0.0118
1,1963-08-01,-0.0080,0.0180,0.0036,-0.0035
2,1963-09-01,-0.0052,0.0013,-0.0071,0.0029
3,1963-10-01,-0.0139,-0.0010,0.0280,-0.0201
4,1963-11-01,-0.0088,0.0175,-0.0051,0.0224
...,...,...,...,...,...
709,2022-08-01,0.0152,0.0029,-0.0475,0.0129
710,2022-09-01,-0.0105,0.0005,-0.0151,-0.0080
711,2022-10-01,0.0189,0.0801,0.0334,0.0664
712,2022-11-01,-0.0274,0.0138,0.0638,0.0318


三、合并表格

Fama和French采用了特定的方法形成公司规模和账面市值比（book-to-market ratio）的投资组合。以下是关键步骤：

1. 公司规模投资组合形成：
   - 投资组合是在年度\(t\)的六月份形成的。
   - 七月份的回报被视为相应投资组合的第一个月度回报。
   - 公司规模是通过在年度\(t\)的六月份记录的市值确定的。
   - 市值保持不变直到年度\(t+1\)的六月份。

2. 账面市值比计算：
   - Fama和French使用年度\(t-1\)末的市值和年度\(t-1\)报告的账面市值。
   - 账面市值的datadate在最后一年内，允许长达18个月的时间差。
   - 市值和账面市值可能不一定反映相同的时间点。
   - 其他排序变量类似地来自年度\(t-1\)。

3. 使用sorting_date进行实施：
   - 使用临时的`sorting_date`列来处理时间滞后。
   - 目标是确保每年每支股票只有一次观测。

In [8]:
size = (crsp_monthly[crsp_monthly["month"].dt.month == 6]
    .assign(sorting_date = lambda x: x["month"] + pd.DateOffset(months=1))
    .rename(columns={"mktcap" : "size"})
    .get(["permno","exchange","sorting_date","size"])
)
size

Unnamed: 0,permno,exchange,sorting_date,size
3,10028.0,AMEX,1993-07-01,9.044750
15,10028.0,AMEX,1994-07-01,13.209750
27,10028.0,AMEX,1995-07-01,9.192187
39,10028.0,AMEX,1996-07-01,8.367688
51,10028.0,AMEX,1997-07-01,7.735000
...,...,...,...,...
3326304,10042.0,AMEX,2001-07-01,14.687200
3326316,10042.0,AMEX,2002-07-01,39.130002
3326328,10042.0,AMEX,2003-07-01,28.366201
3326340,10042.0,AMEX,2004-07-01,108.659373


In [9]:
market_equity = (crsp_monthly[crsp_monthly["month"].dt.month == 12]
    .assign(sorting_date = lambda x: pd.to_datetime((x["month"] + pd.DateOffset(years=1)).dt.year.astype(str) + "-" +  "07-01"))
    .rename(columns={"mktcap" : "me"})
    .get(["permno","gvkey","sorting_date","me"])
)
market_equity

Unnamed: 0,permno,gvkey,sorting_date,me
9,10028.0,012096,1994-07-01,13.567125
21,10028.0,012096,1995-07-01,13.126500
33,10028.0,012096,1996-07-01,7.287500
45,10028.0,012096,1997-07-01,6.158250
57,10028.0,012096,1998-07-01,11.929500
...,...,...,...,...
3326298,10042.0,012139,2001-07-01,11.624000
3326310,10042.0,012139,2002-07-01,14.834060
3326322,10042.0,012139,2003-07-01,22.565760
3326334,10042.0,012139,2004-07-01,96.315269


In [10]:
book_to_market =(compustat
    .assign(sorting_date = lambda x: pd.to_datetime((x["datadate"].dt.year+1).astype(str) + "-" +  "07-01"))
    .get(["gvkey","sorting_date","be"])
    .merge(market_equity,how="inner",on=["gvkey","sorting_date"])
    .assign(bm= lambda x:x["be"]/x["me"])
    .get(["permno","sorting_date","me","bm"])
)
book_to_market

Unnamed: 0,permno,sorting_date,me,bm
0,18884.0,1961-07-01,110.418750,
1,19035.0,1961-07-01,8.881500,1.675393
2,19107.0,1961-07-01,24.792500,0.826379
3,10559.0,1961-07-01,125.102000,
4,17304.0,1961-07-01,116.592000,1.393835
...,...,...,...,...
253555,15067.0,2023-07-01,2534.015422,0.407089
253556,18215.0,2023-07-01,86.882750,0.141754
253557,18719.0,2023-07-01,431.904725,1.240857
253558,20280.0,2023-07-01,380.655000,0.473760


In [11]:
sorting_variables = (size
    .merge(book_to_market,how="inner",on=["permno","sorting_date"])
    .dropna()
    .drop_duplicates(subset=["permno","sorting_date"])
)
sorting_variables

Unnamed: 0,permno,exchange,sorting_date,size,me,bm
0,10028.0,AMEX,1993-07-01,9.044750,7.735750,0.104967
1,10028.0,AMEX,1994-07-01,13.209750,13.567125,0.119922
2,10028.0,AMEX,1995-07-01,9.192187,13.126500,0.133699
3,10028.0,AMEX,1996-07-01,8.367688,7.287500,0.243431
4,10028.0,AMEX,1997-07-01,7.735000,6.158250,0.345228
...,...,...,...,...,...,...
242271,10042.0,AMEX,2001-07-01,14.687200,11.624000,2.952168
242272,10042.0,AMEX,2002-07-01,39.130002,14.834060,2.249553
242273,10042.0,AMEX,2003-07-01,28.366201,22.565760,1.174124
242274,10042.0,AMEX,2004-07-01,108.659373,96.315269,0.235622


四、投资组合排序

In [12]:
def assign_portfolio(data,sorting_variable,percentiles):
    breakpoints = np.quantile(data.query('exchange == "NYSE"')[sorting_variable].dropna(),percentiles,method="linear")
    assigned_portfolios = np.digitize(data[sorting_variable].dropna(),breakpoints)
    assigned_portfolios[assigned_portfolios == len(percentiles)] = len(percentiles)-1
    assigned_portfolios[assigned_portfolios == 0] = 1
    return assigned_portfolios

In [13]:
portfolios = (sorting_variables
    .groupby(["sorting_date"],group_keys=False)
    .apply(lambda x: x.assign(
        portfolio_size = assign_portfolio(data=x,sorting_variable="size",percentiles=[0,0.5,1]),
        portfolio_bm = assign_portfolio(data=x,sorting_variable="bm",percentiles=[0,0.3,0.7,1])       
        )
    )
    .get(["permno","sorting_date","portfolio_size","portfolio_bm"])
)
portfolios

Unnamed: 0,permno,sorting_date,portfolio_size,portfolio_bm
0,10028.0,1993-07-01,1,1
1,10028.0,1994-07-01,1,1
2,10028.0,1995-07-01,1,1
3,10028.0,1996-07-01,1,1
4,10028.0,1997-07-01,1,1
...,...,...,...,...
242271,10042.0,2001-07-01,1,3
242272,10042.0,2002-07-01,1,3
242273,10042.0,2003-07-01,1,3
242274,10042.0,2004-07-01,1,1


In [14]:
portfolios=(crsp_monthly
    .assign(sorting_date = lambda x: np.where(x["month"].dt.month <= 6,pd.to_datetime((x["month"].dt.year-1).astype(str) + "-" +  "07-01"),
                                              pd.to_datetime((x["month"].dt.year).astype(str) + "-" +  "07-01")))
    .merge(portfolios,how="inner",on=["permno","sorting_date"])
)
portfolios

Unnamed: 0,permno,gvkey,month,ret_excess,mktcap,mktcap_lag,exchange,sorting_date,portfolio_size,portfolio_bm
0,10028.0,012096,1993-03-01,-0.102500,6.329250,7.032500,AMEX,1992-07-01,1,1
1,10028.0,012096,1993-04-01,0.386489,8.790625,6.329250,AMEX,1992-07-01,1,1
2,10028.0,012096,1993-05-01,0.197800,10.548750,8.790625,AMEX,1992-07-01,1,1
3,10028.0,012096,1993-06-01,-0.135833,9.044750,10.548750,AMEX,1992-07-01,1,1
4,10028.0,012096,1992-07-01,0.105596,8.976000,8.096000,AMEX,1992-07-01,1,1
...,...,...,...,...,...,...,...,...,...,...
2628353,10042.0,012139,2005-02-01,-0.215192,23.583960,29.989479,AMEX,2004-07-01,1,1
2628354,10042.0,012139,2005-03-01,-0.113211,21.029761,23.583960,AMEX,2004-07-01,1,1
2628355,10042.0,012139,2005-04-01,-0.071544,19.569360,21.029761,AMEX,2004-07-01,1,1
2628356,10042.0,012139,2005-05-01,0.102078,25.538140,19.569360,AMEX,2004-07-01,1,1


五、复制Fama and French Three Factor Model（三因子模型）

（一）复制因子

In [15]:
factors_replicated =(portfolios
    .groupby(["portfolio_size","portfolio_bm","month"])
    .apply(lambda x:np.average(x["ret_excess"],weights=x["mktcap_lag"]))
    .reset_index(name="ret")
    .groupby("month",group_keys=False)
    .apply(lambda x:pd.Series(
        {"smb_replicated" : ((x["ret"][x["portfolio_size"] == 1]).mean() - (x["ret"][x["portfolio_size"] == 2]).mean()),
        "hml_replicated" : ((x["ret"][x["portfolio_bm"] == 3]).mean() - (x["ret"][x["portfolio_bm"] == 1]).mean())}))
)
factors_replicated

Unnamed: 0_level_0,smb_replicated,hml_replicated
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1961-07-01,-0.017210,-0.003597
1961-08-01,0.000980,-0.022819
1961-09-01,-0.004308,-0.020638
1961-10-01,-0.012085,0.004427
1961-11-01,-0.000050,0.004789
...,...,...
2022-08-01,0.013347,0.008983
2022-09-01,-0.007915,-0.004019
2022-10-01,-0.000423,0.078149
2022-11-01,-0.039395,0.015106


In [16]:
factors_replicated =(portfolios
    .groupby(["portfolio_size","portfolio_bm","month"])
    .apply(lambda x:np.average(x["ret_excess"],weights=x["mktcap_lag"]))
    .reset_index(name="ret")
    .groupby("month",group_keys=False)
    .apply(lambda x:x.assign(
        smb_replicated = np.nanmean(x.loc[x["portfolio_size"] == 1,"ret"]) - np.nanmean(x.loc[x["portfolio_size"] == 2,"ret"]),
        hml_replicated = np.nanmean(x.loc[x["portfolio_bm"] == 3,"ret"]) - np.nanmean(x.loc[x["portfolio_bm"] == 1,"ret"])).tail(1))
)
factors_replicated

Unnamed: 0,portfolio_size,portfolio_bm,month,ret,smb_replicated,hml_replicated
3690,2,3,1961-07-01,0.017020,-0.017210,-0.003597
3691,2,3,1961-08-01,0.008214,0.000980,-0.022819
3692,2,3,1961-09-01,-0.043239,-0.004308,-0.020638
3693,2,3,1961-10-01,0.037808,-0.012085,0.004427
3694,2,3,1961-11-01,0.055716,-0.000050,0.004789
...,...,...,...,...,...,...
4423,2,3,2022-08-01,-0.013739,0.013347,0.008983
4424,2,3,2022-09-01,-0.091959,-0.007915,-0.004019
4425,2,3,2022-10-01,0.147404,-0.000423,0.078149
4426,2,3,2022-11-01,0.045013,-0.039395,0.015106


（二）检验复制因子的准确性

In [17]:
test =(factors_ff3_monthly
    .merge(factors_replicated,how="inner",on=["month"])
    .assign(smb_replicated =lambda x: x["smb_replicated"].round(4),
            hml_replicated =lambda x: x["hml_replicated"].round(4))
)
test

Unnamed: 0,month,smb,hml,portfolio_size,portfolio_bm,ret,smb_replicated,hml_replicated
0,1961-07-01,-0.0190,-0.0009,2,3,0.017020,-0.0172,-0.0036
1,1961-08-01,-0.0175,-0.0028,2,3,0.008214,0.0010,-0.0228
2,1961-09-01,-0.0107,-0.0061,2,3,-0.043239,-0.0043,-0.0206
3,1961-10-01,-0.0165,0.0015,2,3,0.037808,-0.0121,0.0044
4,1961-11-01,0.0126,-0.0123,2,3,0.055716,-0.0000,0.0048
...,...,...,...,...,...,...,...,...
733,2022-08-01,0.0140,0.0029,2,3,-0.013739,0.0133,0.0090
734,2022-09-01,-0.0081,0.0005,2,3,-0.091959,-0.0079,-0.0040
735,2022-10-01,0.0006,0.0801,2,3,0.147404,-0.0004,0.0781
736,2022-11-01,-0.0352,0.0138,2,3,0.045013,-0.0394,0.0151


In [18]:
model_smb = (smf.ols(
    formula="smb ~ smb_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_smb)

OLS Model:
smb ~ smb_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept         -0.000       0.000     -0.996     0.32
smb_replicated     0.993       0.004    229.091     0.00

Summary statistics:
- Number of observations: 738
- R-squared: 0.986, Adjusted R-squared: 0.986
- F-statistic: 52,482.523 on 1 and 736 DF, p-value: 0.000



In [19]:
model_hml = (smf.ols(
    formula="hml ~ hml_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_hml)

OLS Model:
hml ~ hml_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          0.000       0.000      1.330    0.184
hml_replicated     0.963       0.007    132.345    0.000

Summary statistics:
- Number of observations: 738
- R-squared: 0.960, Adjusted R-squared: 0.960
- F-statistic: 17,515.273 on 1 and 736 DF, p-value: 0.000



六、复制Fama and French Five Factor Model（五因子模型）

（一）复制因子

In [20]:
other_sorting_variables = (compustat
    .assign(sorting_date = lambda x: pd.to_datetime((x["datadate"].dt.year+1).astype(str) + "-" +  "07-01"))
    .get(["gvkey","sorting_date","be","op","inv"])
    .merge(market_equity,how="inner",on=["gvkey", "sorting_date"])
    .assign(bm = lambda x: x["be"]/x["me"])
    .get(["permno","sorting_date","me","be","bm","op","inv"])
)
other_sorting_variables

Unnamed: 0,permno,sorting_date,me,be,bm,op,inv
0,18884.0,1961-07-01,110.418750,,,,
1,19035.0,1961-07-01,8.881500,14.880,1.675393,0.174731,
2,19107.0,1961-07-01,24.792500,20.488,0.826379,0.243557,
3,10559.0,1961-07-01,125.102000,,,,
4,17304.0,1961-07-01,116.592000,162.510,1.393835,0.208541,
...,...,...,...,...,...,...,...
253555,15067.0,2023-07-01,2534.015422,1031.569,0.407089,-0.021448,0.434687
253556,18215.0,2023-07-01,86.882750,12.316,0.141754,-1.973530,-0.397402
253557,18719.0,2023-07-01,431.904725,535.932,1.240857,-0.304819,-0.456756
253558,20280.0,2023-07-01,380.655000,180.339,0.473760,-0.593000,-0.112750


In [21]:
sorting_variables = (size
    .merge(other_sorting_variables,how="inner",on=["permno", "sorting_date"])
    .dropna()
    .drop_duplicates(subset=["permno","sorting_date"])
)
sorting_variables

Unnamed: 0,permno,exchange,sorting_date,size,me,be,bm,op,inv
0,10028.0,AMEX,1993-07-01,9.044750,7.735750,0.812,0.104967,-0.125616,-0.227702
1,10028.0,AMEX,1994-07-01,13.209750,13.567125,1.627,0.119922,0.179471,0.667376
2,10028.0,AMEX,1995-07-01,9.192187,13.126500,1.755,0.133699,0.096296,0.135334
3,10028.0,AMEX,1996-07-01,8.367688,7.287500,1.774,0.243431,0.015220,0.103739
4,10028.0,AMEX,1997-07-01,7.735000,6.158250,2.126,0.345228,-0.070555,0.349720
...,...,...,...,...,...,...,...,...,...
242271,10042.0,AMEX,2001-07-01,14.687200,11.624000,34.316,2.952168,0.247232,-0.276691
242272,10042.0,AMEX,2002-07-01,39.130002,14.834060,33.370,2.249553,0.084657,-0.059062
242273,10042.0,AMEX,2003-07-01,28.366201,22.565760,26.495,1.174124,-0.011096,-0.236247
242274,10042.0,AMEX,2004-07-01,108.659373,96.315269,22.694,0.235622,-0.305852,-0.100281


In [22]:
portfolios = (sorting_variables
    .groupby("sorting_date",group_keys=False)
    .apply(lambda x: x.assign(portfolio_size =  assign_portfolio(data=x,sorting_variable="size",percentiles=[0,0.5,1])))
    .groupby(["sorting_date","portfolio_size"],group_keys=False)
    .apply(lambda x: x.assign(**{f"portfolio_{col}" : assign_portfolio(data=x,sorting_variable=col,percentiles=[0,0.3,0.7,1]) for col in ["bm","op","inv"]}))
    .get(["permno","sorting_date","portfolio_size","portfolio_bm","portfolio_op","portfolio_inv"])
)
portfolios

Unnamed: 0,permno,sorting_date,portfolio_size,portfolio_bm,portfolio_op,portfolio_inv
0,10028.0,1993-07-01,1,1,1,1
1,10028.0,1994-07-01,1,1,2,3
2,10028.0,1995-07-01,1,1,1,2
3,10028.0,1996-07-01,1,1,1,2
4,10028.0,1997-07-01,1,1,1,3
...,...,...,...,...,...,...
242271,10042.0,2001-07-01,1,3,2,1
242272,10042.0,2002-07-01,1,3,1,1
242273,10042.0,2003-07-01,1,2,1,1
242274,10042.0,2004-07-01,1,1,1,1


In [23]:
portfolios = (crsp_monthly
    .assign(sorting_date = lambda x: np.where(x["month"].dt.month <= 6,pd.to_datetime((x["month"].dt.year-1).astype(str) + "-07-01"),
                                              pd.to_datetime((x["month"].dt.year).astype(str) + "-07-01")))
    .merge(portfolios,how="inner",on=["permno","sorting_date"])
)
portfolios

Unnamed: 0,permno,gvkey,month,ret_excess,mktcap,mktcap_lag,exchange,sorting_date,portfolio_size,portfolio_bm,portfolio_op,portfolio_inv
0,10028.0,012096,1993-03-01,-0.102500,6.329250,7.032500,AMEX,1992-07-01,1,1,1,1
1,10028.0,012096,1993-04-01,0.386489,8.790625,6.329250,AMEX,1992-07-01,1,1,1,1
2,10028.0,012096,1993-05-01,0.197800,10.548750,8.790625,AMEX,1992-07-01,1,1,1,1
3,10028.0,012096,1993-06-01,-0.135833,9.044750,10.548750,AMEX,1992-07-01,1,1,1,1
4,10028.0,012096,1992-07-01,0.105596,8.976000,8.096000,AMEX,1992-07-01,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2555243,10042.0,012139,2005-02-01,-0.215192,23.583960,29.989479,AMEX,2004-07-01,1,1,1,1
2555244,10042.0,012139,2005-03-01,-0.113211,21.029761,23.583960,AMEX,2004-07-01,1,1,1,1
2555245,10042.0,012139,2005-04-01,-0.071544,19.569360,21.029761,AMEX,2004-07-01,1,1,1,1
2555246,10042.0,012139,2005-05-01,0.102078,25.538140,19.569360,AMEX,2004-07-01,1,1,1,1


In [24]:
portfolios_value = (portfolios
    .groupby(["portfolio_size","portfolio_bm","month"])
    .apply(lambda x: np.average(x["ret_excess"],weights = x["mktcap_lag"]))
    .reset_index(name ="ret")
)

factors_value = (portfolios_value
    .groupby("month",group_keys = False)
    .apply(lambda x: pd.Series({"hml_replicated" : np.nanmean(x[x["portfolio_bm"] == 3]["ret"]) - np.nanmean(x[x["portfolio_bm"] == 1]["ret"])}))
)
factors_value

Unnamed: 0_level_0,hml_replicated
month,Unnamed: 1_level_1
1962-07-01,-0.024053
1962-08-01,-0.006577
1962-09-01,0.004991
1962-10-01,-0.002795
1962-11-01,0.009973
...,...
2022-08-01,0.013136
2022-09-01,-0.000538
2022-10-01,0.072222
2022-11-01,0.011634


In [25]:
portfolios_profitability = (portfolios
    .groupby(["portfolio_size","portfolio_op","month"])
    .apply(lambda x: np.average(x["ret_excess"],weights = x["mktcap_lag"]))
    .reset_index(name ="ret")
)

factors_profitability = (portfolios_profitability
    .groupby("month",group_keys = False)
    .apply(lambda x: pd.Series({"rmw_replicated" : np.nanmean(x[x["portfolio_op"] == 3]["ret"]) - np.nanmean(x[x["portfolio_op"] == 1]["ret"])}))
)
factors_profitability

Unnamed: 0_level_0,rmw_replicated
month,Unnamed: 1_level_1
1962-07-01,0.020984
1962-08-01,0.008683
1962-09-01,0.000504
1962-10-01,0.012590
1962-11-01,-0.007101
...,...
2022-08-01,-0.049411
2022-09-01,-0.009860
2022-10-01,0.042216
2022-11-01,0.061493


In [26]:
portfolios_investment = (portfolios
    .groupby(["portfolio_size","portfolio_inv","month"])
    .apply(lambda x: np.average(x["ret_excess"],weights = x["mktcap_lag"]))
    .reset_index(name ="ret")
)

factors_investment = (portfolios_investment
    .groupby("month",group_keys = False)
    .apply(lambda x: pd.Series({"cma_replicated" : np.nanmean(x[x["portfolio_inv"] == 1]["ret"]) - np.nanmean(x[x["portfolio_inv"] == 3]["ret"])}))
)
factors_investment

Unnamed: 0_level_0,cma_replicated
month,Unnamed: 1_level_1
1962-07-01,-0.029187
1962-08-01,0.006461
1962-09-01,0.002035
1962-10-01,0.007685
1962-11-01,-0.002213
...,...
2022-08-01,0.017325
2022-09-01,-0.005703
2022-10-01,0.069982
2022-11-01,0.032976


In [27]:
factors_size = (pd.concat([portfolios_value, portfolios_profitability, portfolios_investment], ignore_index=True)
                .groupby("month",group_keys=False)
                .apply(lambda x: pd.Series({"smb_replicated": np.nanmean(x[x["portfolio_size"] == 1]["ret"]) - np.nanmean(x[x["portfolio_size"] == 2]["ret"])}))
)
factors_size

Unnamed: 0_level_0,smb_replicated
month,Unnamed: 1_level_1
1962-07-01,-0.004281
1962-08-01,0.003765
1962-09-01,-0.012423
1962-10-01,-0.026269
1962-11-01,0.020659
...,...
2022-08-01,0.016857
2022-09-01,-0.010323
2022-10-01,0.018472
2022-11-01,-0.032135


In [28]:
factors_replicated = (factors_size
    .merge(factors_value,how="outer",on=["month"])
    .merge(factors_profitability,how="outer",on=["month"])
    .merge(factors_investment,how="outer",on=["month"])
)
factors_replicated

Unnamed: 0_level_0,smb_replicated,hml_replicated,rmw_replicated,cma_replicated
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1962-07-01,-0.004281,-0.024053,0.020984,-0.029187
1962-08-01,0.003765,-0.006577,0.008683,0.006461
1962-09-01,-0.012423,0.004991,0.000504,0.002035
1962-10-01,-0.026269,-0.002795,0.012590,0.007685
1962-11-01,0.020659,0.009973,-0.007101,-0.002213
...,...,...,...,...
2022-08-01,0.016857,0.013136,-0.049411,0.017325
2022-09-01,-0.010323,-0.000538,-0.009860,-0.005703
2022-10-01,0.018472,0.072222,0.042216,0.069982
2022-11-01,-0.032135,0.011634,0.061493,0.032976


(二)检验复制因子的准确性

In [29]:
test = (factors_ff5_monthly
    .merge(factors_replicated,how="inner",on=["month"])
    .assign(smb_replicated =lambda x: x["smb_replicated"].round(4),
            hml_replicated =lambda x: x["hml_replicated"].round(4),
            rmw_replicated =lambda x: x["rmw_replicated"].round(4),
            cma_replicated =lambda x: x["cma_replicated"].round(4))
)
test

Unnamed: 0,month,smb,hml,rmw,cma,smb_replicated,hml_replicated,rmw_replicated,cma_replicated
0,1963-07-01,-0.0041,-0.0097,0.0068,-0.0118,-0.0146,-0.0006,0.0006,-0.0096
1,1963-08-01,-0.0080,0.0180,0.0036,-0.0035,-0.0030,0.0046,0.0022,0.0039
2,1963-09-01,-0.0052,0.0013,-0.0071,0.0029,-0.0078,0.0051,-0.0095,-0.0019
3,1963-10-01,-0.0139,-0.0010,0.0280,-0.0201,-0.0079,-0.0151,0.0291,-0.0215
4,1963-11-01,-0.0088,0.0175,-0.0051,0.0224,-0.0069,0.0084,-0.0057,0.0129
...,...,...,...,...,...,...,...,...,...
709,2022-08-01,0.0152,0.0029,-0.0475,0.0129,0.0169,0.0131,-0.0494,0.0173
710,2022-09-01,-0.0105,0.0005,-0.0151,-0.0080,-0.0103,-0.0005,-0.0099,-0.0057
711,2022-10-01,0.0189,0.0801,0.0334,0.0664,0.0185,0.0722,0.0422,0.0700
712,2022-11-01,-0.0274,0.0138,0.0638,0.0318,-0.0321,0.0116,0.0615,0.0330


In [30]:
model_smb = (smf.ols(
    formula="smb ~ smb_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_smb)

OLS Model:
smb ~ smb_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          -0.00       0.000     -1.495    0.135
smb_replicated      0.97       0.004    221.609    0.000

Summary statistics:
- Number of observations: 714
- R-squared: 0.986, Adjusted R-squared: 0.986
- F-statistic: 49,110.478 on 1 and 712 DF, p-value: 0.000



In [31]:
model_hml = (smf.ols(
    formula="hml ~ hml_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_hml)

OLS Model:
hml ~ hml_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          0.000        0.00      1.591    0.112
hml_replicated     0.992        0.01     96.544    0.000

Summary statistics:
- Number of observations: 714
- R-squared: 0.929, Adjusted R-squared: 0.929
- F-statistic: 9,320.798 on 1 and 712 DF, p-value: 0.000



In [32]:
model_rmw = (smf.ols(
    formula="rmw ~ rmw_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_rmw)

OLS Model:
rmw ~ rmw_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          0.000       0.000      0.277    0.782
rmw_replicated     0.955       0.009    107.519    0.000

Summary statistics:
- Number of observations: 714
- R-squared: 0.942, Adjusted R-squared: 0.942
- F-statistic: 11,560.251 on 1 and 712 DF, p-value: 0.000



In [33]:
model_cma = (smf.ols(
    formula="cma ~ cma_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_cma)

OLS Model:
cma ~ cma_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          0.001       0.000      3.928      0.0
cma_replicated     0.965       0.008    117.780      0.0

Summary statistics:
- Number of observations: 714
- R-squared: 0.951, Adjusted R-squared: 0.951
- F-statistic: 13,872.118 on 1 and 712 DF, p-value: 0.000



# Exercises

1.Fama and French (1993) claim that their sample excludes firms until they have appeared in Compustat for two years. Implement this additional filter and compare the improvements of your replication effort.

In [34]:
compustat_filter = (compustat
    .assign(year = lambda x:x["datadate"].dt.year)
    .drop_duplicates(subset=["gvkey","year"])
    .groupby("gvkey")
    .apply(lambda x: x["year"].count())
    .reset_index(name="num")
    .query("num >= 2")
    .drop(columns=["num"])
    .merge(compustat,how="inner",on=["gvkey"])
)
compustat_filter


Unnamed: 0,gvkey,datadate,be,op,inv
0,001000,1961-12-31,,,
1,001000,1962-12-31,0.552,2.880435,
2,001000,1963-12-31,0.561,0.046346,
3,001000,1964-12-31,0.627,0.149920,
4,001000,1965-12-31,0.491,-0.452138,0.631356
...,...,...,...,...,...
548097,351590,2022-12-31,21499.887,0.255406,0.096925
548098,353444,2021-12-31,40233.020,0.078487,
548099,353444,2022-12-31,24016.067,0.122968,-0.100782
548100,353945,2021-12-31,305.149,0.672177,


(一)三因子模型

In [35]:
book_to_market =(compustat_filter
    .assign(sorting_date = lambda x: pd.to_datetime((x["datadate"].dt.year+1).astype(str) + "-" +  "07-01"))
    .get(["gvkey","sorting_date","be"])
    .merge(market_equity,how="inner",on=["gvkey","sorting_date"])
    .assign(bm= lambda x:x["be"]/x["me"])
    .get(["permno","sorting_date","me","bm"])
)

sorting_variables = (size
    .merge(book_to_market,how="inner",on=["permno","sorting_date"])
    .dropna()
    .drop_duplicates(subset=["permno","sorting_date"])
)

portfolios = (sorting_variables
    .groupby(["sorting_date"],group_keys=False)
    .apply(lambda x: x.assign(
        portfolio_size = assign_portfolio(data=x,sorting_variable="size",percentiles=[0,0.5,1]),
        portfolio_bm = assign_portfolio(data=x,sorting_variable="bm",percentiles=[0,0.3,0.7,1])       
        )
    )
    .get(["permno","sorting_date","portfolio_size","portfolio_bm"])
)

portfolios=(crsp_monthly
    .assign(sorting_date = lambda x: np.where(x["month"].dt.month <= 6,pd.to_datetime((x["month"].dt.year-1).astype(str) + "-" +  "07-01"),
                                              pd.to_datetime((x["month"].dt.year).astype(str) + "-" +  "07-01")))
    .merge(portfolios,how="inner",on=["permno","sorting_date"])
)

factors_replicated =(portfolios
    .groupby(["portfolio_size","portfolio_bm","month"])
    .apply(lambda x:np.average(x["ret_excess"],weights=x["mktcap_lag"]))
    .reset_index(name="ret")
    .groupby("month",group_keys=False)
    .apply(lambda x:pd.Series(
        {"smb_replicated" : ((x["ret"][x["portfolio_size"] == 1]).mean() - (x["ret"][x["portfolio_size"] == 2]).mean()),
        "hml_replicated" : ((x["ret"][x["portfolio_bm"] == 3]).mean() - (x["ret"][x["portfolio_bm"] == 1]).mean())}))
)

test =(factors_ff3_monthly
    .merge(factors_replicated,how="inner",on=["month"])
    .assign(smb_replicated =lambda x: x["smb_replicated"].round(4),
            hml_replicated =lambda x: x["hml_replicated"].round(4))
)

In [36]:
model_smb = (smf.ols(
    formula="smb ~ smb_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_smb)

OLS Model:
smb ~ smb_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept         -0.000       0.000     -1.122    0.262
smb_replicated     0.993       0.004    228.622    0.000

Summary statistics:
- Number of observations: 738
- R-squared: 0.986, Adjusted R-squared: 0.986
- F-statistic: 52,268.006 on 1 and 736 DF, p-value: 0.000



In [37]:
model_hml = (smf.ols(
    formula="hml ~ hml_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_hml)

OLS Model:
hml ~ hml_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          0.000       0.000      1.281      0.2
hml_replicated     0.962       0.007    132.875      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.960, Adjusted R-squared: 0.960
- F-statistic: 17,655.895 on 1 and 736 DF, p-value: 0.000



（二）五因子模型

In [38]:
other_sorting_variables = (compustat_filter
    .assign(sorting_date = lambda x: pd.to_datetime((x["datadate"].dt.year+1).astype(str) + "-" +  "07-01"))
    .get(["gvkey","sorting_date","be","op","inv"])
    .merge(market_equity,how="inner",on=["gvkey", "sorting_date"])
    .assign(bm = lambda x: x["be"]/x["me"])
    .get(["permno","sorting_date","me","be","bm","op","inv"])
)

sorting_variables = (size
    .merge(other_sorting_variables,how="inner",on=["permno", "sorting_date"])
    .dropna()
    .drop_duplicates(subset=["permno","sorting_date"])
)

portfolios = (sorting_variables
    .groupby("sorting_date",group_keys=False)
    .apply(lambda x: x.assign(portfolio_size =  assign_portfolio(data=x,sorting_variable="size",percentiles=[0,0.5,1])))
    .groupby(["sorting_date","portfolio_size"],group_keys=False)
    .apply(lambda x: x.assign(**{f"portfolio_{col}" : assign_portfolio(data=x,sorting_variable=col,percentiles=[0,0.3,0.7,1]) for col in ["bm","op","inv"]}))
    .get(["permno","sorting_date","portfolio_size","portfolio_bm","portfolio_op","portfolio_inv"])
)

portfolios = (crsp_monthly
    .assign(sorting_date = lambda x: np.where(x["month"].dt.month <= 6,pd.to_datetime((x["month"].dt.year-1).astype(str) + "-07-01"),
                                              pd.to_datetime((x["month"].dt.year).astype(str) + "-07-01")))
    .merge(portfolios,how="inner",on=["permno","sorting_date"])
)

portfolios_value = (portfolios
    .groupby(["portfolio_size","portfolio_bm","month"])
    .apply(lambda x: np.average(x["ret_excess"],weights = x["mktcap_lag"]))
    .reset_index(name ="ret")
)

factors_value = (portfolios_value
    .groupby("month",group_keys = False)
    .apply(lambda x: pd.Series({"hml_replicated" : np.nanmean(x[x["portfolio_bm"] == 3]["ret"]) - np.nanmean(x[x["portfolio_bm"] == 1]["ret"])}))
)

portfolios_profitability = (portfolios
    .groupby(["portfolio_size","portfolio_op","month"])
    .apply(lambda x: np.average(x["ret_excess"],weights = x["mktcap_lag"]))
    .reset_index(name ="ret")
)

factors_profitability = (portfolios_profitability
    .groupby("month",group_keys = False)
    .apply(lambda x: pd.Series({"rmw_replicated" : np.nanmean(x[x["portfolio_op"] == 3]["ret"]) - np.nanmean(x[x["portfolio_op"] == 1]["ret"])}))
)

portfolios_investment = (portfolios
    .groupby(["portfolio_size","portfolio_inv","month"])
    .apply(lambda x: np.average(x["ret_excess"],weights = x["mktcap_lag"]))
    .reset_index(name ="ret")
)

factors_investment = (portfolios_investment
    .groupby("month",group_keys = False)
    .apply(lambda x: pd.Series({"cma_replicated" : np.nanmean(x[x["portfolio_inv"] == 1]["ret"]) - np.nanmean(x[x["portfolio_inv"] == 3]["ret"])}))
)

factors_size = (pd.concat([portfolios_value, portfolios_profitability, portfolios_investment], ignore_index=True)
                .groupby("month",group_keys=False)
                .apply(lambda x: pd.Series({"smb_replicated": np.nanmean(x[x["portfolio_size"] == 1]["ret"]) - np.nanmean(x[x["portfolio_size"] == 2]["ret"])}))
)

factors_replicated = (factors_size
    .merge(factors_value,how="outer",on=["month"])
    .merge(factors_profitability,how="outer",on=["month"])
    .merge(factors_investment,how="outer",on=["month"])
)

test = (factors_ff5_monthly
    .merge(factors_replicated,how="inner",on=["month"])
    .assign(smb_replicated =lambda x: x["smb_replicated"].round(4),
            hml_replicated =lambda x: x["hml_replicated"].round(4),
            rmw_replicated =lambda x: x["rmw_replicated"].round(4),
            cma_replicated =lambda x: x["cma_replicated"].round(4))
)

In [39]:
model_smb = (smf.ols(
    formula="smb ~ smb_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_smb)

OLS Model:
smb ~ smb_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          -0.00       0.000     -1.495    0.135
smb_replicated      0.97       0.004    221.609    0.000

Summary statistics:
- Number of observations: 714
- R-squared: 0.986, Adjusted R-squared: 0.986
- F-statistic: 49,110.478 on 1 and 712 DF, p-value: 0.000



In [40]:
model_hml = (smf.ols(
    formula="hml ~ hml_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_hml)

OLS Model:
hml ~ hml_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          0.000        0.00      1.591    0.112
hml_replicated     0.992        0.01     96.544    0.000

Summary statistics:
- Number of observations: 714
- R-squared: 0.929, Adjusted R-squared: 0.929
- F-statistic: 9,320.798 on 1 and 712 DF, p-value: 0.000



In [41]:
model_rmw = (smf.ols(
    formula="rmw ~ rmw_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_rmw)

OLS Model:
rmw ~ rmw_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          0.000       0.000      0.277    0.782
rmw_replicated     0.955       0.009    107.519    0.000

Summary statistics:
- Number of observations: 714
- R-squared: 0.942, Adjusted R-squared: 0.942
- F-statistic: 11,560.251 on 1 and 712 DF, p-value: 0.000



In [42]:
model_cma = (smf.ols(
    formula="cma ~ cma_replicated", 
    data=test
  )
  .fit()
)
prettify_result(model_cma)

OLS Model:
cma ~ cma_replicated

Coefficients:
                Estimate  Std. Error  Statistic  p-Value
Intercept          0.001       0.000      3.928      0.0
cma_replicated     0.965       0.008    117.780      0.0

Summary statistics:
- Number of observations: 714
- R-squared: 0.951, Adjusted R-squared: 0.951
- F-statistic: 13,872.118 on 1 and 712 DF, p-value: 0.000



2.On his homepage, Kenneth French provides instructions on how to construct the most common variables used for portfolio sorts. Try to replicate the univariate portfolio sort return time series for E/P (earnings / price) provided on his homepage and evaluate your replication effort using regressions.

(Earnings/Price. Earnings is total earnings before extraordinary items, from Compustat. The earnings/price ratio used to form portfolios in June of year t is earnings for the fiscal year ending in calendar year t-1, divided by market equity at the end of December of t-1.)

In [43]:
from datetime import datetime

ep_portfolios_return = pd.read_csv('data/csv_exports/ep_portfolios_return.csv')
ep_portfolios_return = (ep_portfolios_return
    .assign(month = lambda x: x["month"].astype(str).apply(lambda y: datetime.strptime(y,"%Y%m")))
)
ep_portfolios_return

Unnamed: 0,month,<= 0,Lo 30,Med 40,Hi 30,Lo 20,Qnt 2,Qnt 3,Qnt 4,Hi 20,Lo 10,2-Dec,3-Dec,4-Dec,5-Dec,6-Dec,7-Dec,8-Dec,9-Dec,Hi 10
0,1951-07-01,3.01,6.39,8.56,5.97,5.77,7.99,9.58,4.94,9.12,5.21,7.34,8.96,7.44,9.41,9.67,7.09,4.50,9.84,8.16
1,1951-08-01,14.65,3.88,5.72,6.65,2.92,6.56,5.67,6.63,6.02,2.55,3.95,7.73,5.90,4.83,6.09,5.05,6.95,5.74,6.41
2,1951-09-01,-2.90,-0.23,-0.15,1.65,-0.50,1.36,-1.78,1.76,1.65,-1.22,1.47,0.80,1.67,-0.01,-2.65,2.26,1.66,2.18,0.93
3,1951-10-01,-3.62,-4.12,-1.12,-2.26,-4.56,-3.22,1.12,-2.59,-1.99,-4.93,-3.60,-2.43,-3.66,-1.66,2.52,-3.56,-2.39,-3.00,-0.59
4,1951-11-01,8.33,0.21,0.69,1.30,0.15,0.18,0.98,2.07,-0.30,-0.21,1.10,0.40,0.06,1.29,0.83,2.01,2.08,0.06,-0.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,2023-06-01,7.07,7.94,5.27,7.32,8.00,7.20,4.29,6.56,8.19,10.60,6.58,7.82,6.38,2.44,7.57,6.79,6.42,7.50,9.76
864,2023-07-01,4.80,2.08,4.63,5.99,2.69,1.70,5.88,6.86,6.97,1.51,3.37,1.36,2.14,6.65,4.19,8.12,5.31,6.49,7.67
865,2023-08-01,-2.84,-0.95,-2.57,-2.54,-0.19,-2.07,-1.42,-4.12,-2.30,0.40,-0.52,-1.87,-2.32,-0.10,-4.40,-5.24,-2.71,-4.00,0.18
866,2023-09-01,-6.02,-4.91,-4.98,-1.93,-5.90,-4.97,-4.19,-2.43,-1.97,-4.64,-6.62,-3.69,-6.57,-4.37,-3.75,-2.86,-1.89,-3.19,-0.26


In [44]:
compustat = (pd.read_sql_query(
    sql="SELECT * FROM compustat_new",
    con=tidy_finance,
    parse_dates={"datadate"})
)

earnings_prices = (compustat
    .assign(earn = lambda x: (compustat['sale'] - compustat['cogs'].fillna(0) - compustat['xsga'].fillna(0) - compustat['xint'].fillna(0)),
            sorting_date = lambda x: pd.to_datetime((x["datadate"].dt.year+1).astype(str) + "-" +  "07-01"))
    .get(["gvkey","sorting_date","earn"])
    .merge(market_equity,how="inner",on=["gvkey","sorting_date"])
    .assign(ep= lambda x:x["earn"]/x["me"])
    .get(["permno","sorting_date","me","ep"])
)
earnings_prices

Unnamed: 0,permno,sorting_date,me,ep
0,18884.0,1961-07-01,110.418750,0.186291
1,19035.0,1961-07-01,8.881500,0.292743
2,19107.0,1961-07-01,24.792500,0.201271
3,10559.0,1961-07-01,125.102000,0.177135
4,17304.0,1961-07-01,116.592000,0.290672
...,...,...,...,...
253555,15067.0,2023-07-01,2534.015422,-0.008731
253556,18215.0,2023-07-01,86.882750,-0.279756
253557,18719.0,2023-07-01,431.904725,-0.378236
253558,20280.0,2023-07-01,380.655000,-0.280939


In [45]:
sorting_variables = (size
    .merge(earnings_prices,how="inner",on=["permno","sorting_date"])
    .dropna()
    .drop_duplicates(subset=["permno","sorting_date"])
)
sorting_variables

Unnamed: 0,permno,exchange,sorting_date,size,me,ep
0,10028.0,AMEX,1993-07-01,9.044750,7.735750,-0.013186
1,10028.0,AMEX,1994-07-01,13.209750,13.567125,0.021523
2,10028.0,AMEX,1995-07-01,9.192187,13.126500,0.012875
3,10028.0,AMEX,1996-07-01,8.367688,7.287500,0.003705
4,10028.0,AMEX,1997-07-01,7.735000,6.158250,-0.024358
...,...,...,...,...,...,...
242271,10042.0,AMEX,2001-07-01,14.687200,11.624000,0.729869
242272,10042.0,AMEX,2002-07-01,39.130002,14.834060,0.190440
242273,10042.0,AMEX,2003-07-01,28.366201,22.565760,-0.013029
242274,10042.0,AMEX,2004-07-01,108.659373,96.315269,-0.072065


In [46]:
portfolios_minus = (crsp_monthly
    .assign(sorting_date = lambda x: np.where(x["month"].dt.month <= 6,pd.to_datetime((x["month"].dt.year-1).astype(str) + "-" +  "07-01"),
                                              pd.to_datetime((x["month"].dt.year).astype(str) + "-" +  "07-01")))
    .merge(sorting_variables,how="inner",on=["permno","sorting_date"])
    .query("ep < 0")
    .groupby("month")
    .apply(lambda x:(np.average(x["ret_excess"],weights=x["mktcap_lag"]))*100)
    .reset_index(name="ret")  
)

test_portfolios_minus = (ep_portfolios_return
    .get(["month","<= 0"])
    .merge(portfolios_minus,how="inner",on=["month"])
    .rename(columns={"<= 0" : "minus","ret" : "minus_replicated"})
)
test_portfolios_minus


Unnamed: 0,month,minus,minus_replicated
0,1961-07-01,4.37,5.485720
1,1961-08-01,-4.42,-5.723373
2,1961-09-01,-8.89,-9.501958
3,1961-10-01,1.06,1.492499
4,1961-11-01,1.94,0.668542
...,...,...,...
733,2022-08-01,1.76,2.137687
734,2022-09-01,-9.85,-10.537406
735,2022-10-01,6.20,3.353137
736,2022-11-01,1.43,-1.542637


In [47]:
model_portfolios_minus = (smf.ols(
    formula="minus ~ minus_replicated", 
    data=test_portfolios_minus
  )
  .fit()
)
prettify_result(model_portfolios_minus)

OLS Model:
minus ~ minus_replicated

Coefficients:
                  Estimate  Std. Error  Statistic  p-Value
Intercept            0.812       0.104      7.824      0.0
minus_replicated     0.840       0.013     65.319      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.853, Adjusted R-squared: 0.853
- F-statistic: 4,266.617 on 1 and 736 DF, p-value: 0.000



In [48]:
portfolios = (sorting_variables
    .groupby(["sorting_date"],group_keys=False)
    .apply(lambda x: x.assign(portfolio_ep_three = assign_portfolio(data=x,sorting_variable="ep",percentiles=[0,0.3,0.7,1]),
                              portfolio_ep_five = assign_portfolio(data=x,sorting_variable="ep",percentiles=np.linspace(0, 1, 6)),
                              portfolio_ep_ten = assign_portfolio(data=x,sorting_variable="ep",percentiles=np.linspace(0, 1, 11))))
    .get(["permno","sorting_date","portfolio_ep_three","portfolio_ep_five","portfolio_ep_ten"])
)
portfolios

Unnamed: 0,permno,sorting_date,portfolio_ep_three,portfolio_ep_five,portfolio_ep_ten
0,10028.0,1993-07-01,1,1,1
1,10028.0,1994-07-01,1,1,1
2,10028.0,1995-07-01,1,1,1
3,10028.0,1996-07-01,1,1,1
4,10028.0,1997-07-01,1,1,1
...,...,...,...,...,...
242271,10042.0,2001-07-01,3,5,10
242272,10042.0,2002-07-01,3,4,8
242273,10042.0,2003-07-01,1,1,1
242274,10042.0,2004-07-01,1,1,1


In [49]:
portfolios=(crsp_monthly
    .assign(sorting_date = lambda x: np.where(x["month"].dt.month <= 6,pd.to_datetime((x["month"].dt.year-1).astype(str) + "-" +  "07-01"),
                                              pd.to_datetime((x["month"].dt.year).astype(str) + "-" +  "07-01")))
    .merge(portfolios,how="inner",on=["permno","sorting_date"])
)
portfolios

Unnamed: 0,permno,gvkey,month,ret_excess,mktcap,mktcap_lag,exchange,sorting_date,portfolio_ep_three,portfolio_ep_five,portfolio_ep_ten
0,10028.0,012096,1993-03-01,-0.102500,6.329250,7.032500,AMEX,1992-07-01,1,1,2
1,10028.0,012096,1993-04-01,0.386489,8.790625,6.329250,AMEX,1992-07-01,1,1,2
2,10028.0,012096,1993-05-01,0.197800,10.548750,8.790625,AMEX,1992-07-01,1,1,2
3,10028.0,012096,1993-06-01,-0.135833,9.044750,10.548750,AMEX,1992-07-01,1,1,2
4,10028.0,012096,1992-07-01,0.105596,8.976000,8.096000,AMEX,1992-07-01,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
2707144,10042.0,012139,2005-02-01,-0.215192,23.583960,29.989479,AMEX,2004-07-01,1,1,1
2707145,10042.0,012139,2005-03-01,-0.113211,21.029761,23.583960,AMEX,2004-07-01,1,1,1
2707146,10042.0,012139,2005-04-01,-0.071544,19.569360,21.029761,AMEX,2004-07-01,1,1,1
2707147,10042.0,012139,2005-05-01,0.102078,25.538140,19.569360,AMEX,2004-07-01,1,1,1


In [50]:
portfolio_ep_three =(portfolios
    .groupby(["portfolio_ep_three","month"])
    .apply(lambda x:(np.average(x["ret_excess"],weights=x["mktcap_lag"]))*100)
    .reset_index(name="ret")
    .pivot_table(index='month', columns='portfolio_ep_three', values='ret')
    .reset_index()
)
portfolio_ep_three

portfolio_ep_three,month,1,2,3
0,1961-07-01,1.820810,3.891827,4.427611
1,1961-08-01,3.585676,1.400427,1.337895
2,1961-09-01,-2.107969,-2.676887,-0.142716
3,1961-10-01,2.498183,3.150545,2.039614
4,1961-11-01,3.321676,4.944449,5.977436
...,...,...,...,...
733,2022-08-01,-4.986330,-3.404854,-2.392822
734,2022-09-01,-9.901886,-8.864238,-9.437047
735,2022-10-01,2.631081,8.931982,14.667344
736,2022-11-01,3.359694,5.099187,5.398848


In [51]:
test_portfolios_three = (ep_portfolios_return
    .get(["month","Lo 30","Med 40","Hi 30"])
    .rename(columns=lambda x: x.replace(' ', '_'))
    .merge(portfolio_ep_three,how="inner",on=["month"])
    .rename(columns=lambda x: 'portfolio_' + str(x) if str(x).isdigit() else x)
)
test_portfolios_three

Unnamed: 0,month,Lo_30,Med_40,Hi_30,portfolio_1,portfolio_2,portfolio_3
0,1961-07-01,2.73,3.69,3.43,1.820810,3.891827,4.427611
1,1961-08-01,2.79,2.96,1.06,3.585676,1.400427,1.337895
2,1961-09-01,-1.30,-2.75,-0.71,-2.107969,-2.676887,-0.142716
3,1961-10-01,2.25,3.68,2.94,2.498183,3.150545,2.039614
4,1961-11-01,4.24,4.36,6.83,3.321676,4.944449,5.977436
...,...,...,...,...,...,...,...
733,2022-08-01,-4.97,-3.29,-2.42,-4.986330,-3.404854,-2.392822
734,2022-09-01,-9.60,-8.39,-8.90,-9.901886,-8.864238,-9.437047
735,2022-10-01,5.21,9.40,14.47,2.631081,8.931982,14.667344
736,2022-11-01,4.00,6.61,5.73,3.359694,5.099187,5.398848


In [52]:
model_portfolios_three = (smf.ols(
    formula="Lo_30 ~ portfolio_1", 
    data=test_portfolios_three
  )
  .fit()
)
prettify_result(model_portfolios_three)

OLS Model:
Lo_30 ~ portfolio_1

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.416       0.027     15.287      0.0
portfolio_1     0.950       0.005    176.172      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.977, Adjusted R-squared: 0.977
- F-statistic: 31,036.510 on 1 and 736 DF, p-value: 0.000



In [53]:
model_portfolios_three = (smf.ols(
    formula="Med_40 ~ portfolio_2", 
    data=test_portfolios_three
  )
  .fit()
)
prettify_result(model_portfolios_three)

OLS Model:
Med_40 ~ portfolio_2

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.386       0.029     13.120      0.0
portfolio_2     0.966       0.007    143.351      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.965, Adjusted R-squared: 0.965
- F-statistic: 20,549.434 on 1 and 736 DF, p-value: 0.000



In [54]:
model_portfolios_three = (smf.ols(
    formula="Hi_30 ~ portfolio_3", 
    data=test_portfolios_three
  )
  .fit()
)
prettify_result(model_portfolios_three)

OLS Model:
Hi_30 ~ portfolio_3

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.400       0.041      9.643      0.0
portfolio_3     0.975       0.009    112.635      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.945, Adjusted R-squared: 0.945
- F-statistic: 12,686.741 on 1 and 736 DF, p-value: 0.000



In [55]:
portfolio_ep_five =(portfolios
    .groupby(["portfolio_ep_five","month"])
    .apply(lambda x:(np.average(x["ret_excess"],weights=x["mktcap_lag"]))*100)
    .reset_index(name="ret")
    .pivot_table(index='month', columns='portfolio_ep_five', values='ret')
    .reset_index()
)

test_portfolios_five = (ep_portfolios_return
    .get(["month","Lo 20","Qnt 2","Qnt 3","Qnt 4","Hi 20"])
    .rename(columns=lambda x: x.replace(' ', '_'))
    .merge(portfolio_ep_five,how="inner",on=["month"])
    .rename(columns=lambda x: 'portfolio_' + str(x) if str(x).isdigit() else x)
)
test_portfolios_five


Unnamed: 0,month,Lo_20,Qnt_2,Qnt_3,Qnt_4,Hi_20,portfolio_1,portfolio_2,portfolio_3,portfolio_4,portfolio_5
0,1961-07-01,1.51,4.83,3.21,3.70,3.35,1.609198,3.288223,4.412111,2.157260,4.895368
1,1961-08-01,3.69,1.80,2.94,2.00,0.96,3.510890,2.459766,1.598154,1.106798,1.282850
2,1961-09-01,-1.21,-1.79,-1.15,-1.94,-3.42,-1.803861,-4.389419,-0.933440,-2.564272,-0.051680
3,1961-10-01,2.35,2.21,4.79,2.51,4.81,2.570089,3.370393,2.140519,3.916031,1.869106
4,1961-11-01,3.00,5.75,4.10,6.59,4.84,2.879615,4.467820,5.931037,3.563841,6.436270
...,...,...,...,...,...,...,...,...,...,...,...
733,2022-08-01,-5.69,-3.14,-3.70,-2.07,-3.63,-3.349333,-4.958813,-2.642564,-2.955635,-1.760847
734,2022-09-01,-9.47,-9.06,-8.77,-8.51,-8.79,-9.587540,-9.762599,-8.110685,-9.135845,-9.532296
735,2022-10-01,3.52,9.47,7.18,15.41,13.23,0.981562,6.498670,8.370481,12.596605,13.954189
736,2022-11-01,4.44,4.39,7.24,5.51,6.37,-0.986926,4.823030,5.746362,5.760062,5.121205


In [56]:
model_portfolios_five = (smf.ols(
    formula="Lo_20 ~ portfolio_1", 
    data=test_portfolios_five
  )
  .fit()
)
prettify_result(model_portfolios_five)

OLS Model:
Lo_20 ~ portfolio_1

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.394       0.037     10.721      0.0
portfolio_1     0.931       0.007    136.688      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.962, Adjusted R-squared: 0.962
- F-statistic: 18,683.633 on 1 and 736 DF, p-value: 0.000



In [57]:
model_portfolios_five = (smf.ols(
    formula="Qnt_2 ~ portfolio_2", 
    data=test_portfolios_five
  )
  .fit()
)
prettify_result(model_portfolios_five)

OLS Model:
Qnt_2 ~ portfolio_2

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.461       0.040     11.478      0.0
portfolio_2     0.939       0.009    106.474      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.939, Adjusted R-squared: 0.939
- F-statistic: 11,336.795 on 1 and 736 DF, p-value: 0.000



In [58]:
model_portfolios_five = (smf.ols(
    formula="Qnt_3 ~ portfolio_3", 
    data=test_portfolios_five
  )
  .fit()
)
prettify_result(model_portfolios_five)

OLS Model:
Qnt_3 ~ portfolio_3

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.329       0.046      7.185      0.0
portfolio_3     0.971       0.011     91.571      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.919, Adjusted R-squared: 0.919
- F-statistic: 8,385.204 on 1 and 736 DF, p-value: 0.000



In [59]:
model_portfolios_five = (smf.ols(
    formula="Qnt_4 ~ portfolio_4", 
    data=test_portfolios_five
  )
  .fit()
)
prettify_result(model_portfolios_five)

OLS Model:
Qnt_4 ~ portfolio_4

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.480       0.047     10.180      0.0
portfolio_4     0.934       0.010     90.434      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.917, Adjusted R-squared: 0.917
- F-statistic: 8,178.282 on 1 and 736 DF, p-value: 0.000



In [60]:
model_portfolios_five = (smf.ols(
    formula="Hi_20 ~ portfolio_5", 
    data=test_portfolios_five
  )
  .fit()
)
prettify_result(model_portfolios_five)

OLS Model:
Hi_20 ~ portfolio_5

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.376       0.052      7.278      0.0
portfolio_5     0.966       0.010     94.835      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.924, Adjusted R-squared: 0.924
- F-statistic: 8,993.648 on 1 and 736 DF, p-value: 0.000



In [61]:
portfolio_ep_ten =(portfolios
    .groupby(["portfolio_ep_ten","month"])
    .apply(lambda x:(np.average(x["ret_excess"],weights=x["mktcap_lag"]))*100)
    .reset_index(name="ret")
    .pivot_table(index='month', columns='portfolio_ep_ten', values='ret')
    .reset_index()
)

test_portfolios_ten = (ep_portfolios_return
    .get(["month","Lo 10","2-Dec","3-Dec","4-Dec","5-Dec","6-Dec","7-Dec","8-Dec","9-Dec","Hi 10"])
    .rename(columns=lambda x: x.replace(' ', '_'))
    .rename(columns=lambda x: f"Dec_{x.split('-')[0]}" if '-' in x else x)
    .merge(portfolio_ep_ten,how="inner",on=["month"])
    .rename(columns=lambda x: 'portfolio_' + str(x) if str(x).isdigit() else x)
)
test_portfolios_ten

Unnamed: 0,month,Lo_10,Dec_2,Dec_3,Dec_4,Dec_5,Dec_6,Dec_7,Dec_8,Dec_9,...,portfolio_1,portfolio_2,portfolio_3,portfolio_4,portfolio_5,portfolio_6,portfolio_7,portfolio_8,portfolio_9,portfolio_10
0,1961-07-01,0.83,2.17,5.11,4.17,3.82,2.48,3.86,3.49,3.15,...,1.955403,1.250844,2.829181,3.627898,4.370170,4.439036,2.431695,1.829122,4.885309,4.925980
1,1961-08-01,4.49,2.93,1.12,3.41,2.51,3.46,2.66,1.14,-0.14,...,3.871256,3.135230,3.937939,1.375085,2.192951,1.216619,0.653147,1.653071,0.789179,2.788284
2,1961-09-01,-0.60,-1.81,-1.46,-2.57,-2.71,0.70,-4.52,1.44,-4.75,...,-0.533698,-3.139717,-3.536219,-5.033193,-3.370391,0.650266,-4.167057,-0.661461,-0.248365,0.535407
3,1961-10-01,2.03,2.65,2.05,2.59,4.69,4.91,3.30,1.52,6.85,...,2.970055,2.138143,2.154351,4.302284,1.361716,2.626111,4.696182,3.019454,1.609519,2.636254
4,1961-11-01,2.12,3.86,6.65,3.64,3.78,4.48,5.16,8.38,4.05,...,2.058833,3.774660,5.443391,3.735653,4.196809,6.999821,3.730052,3.369553,7.432283,3.511403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733,2022-08-01,-6.26,-5.38,-3.59,-2.56,-4.65,-2.07,-3.48,-1.18,-4.39,...,4.041291,-4.621357,-5.744424,-4.209094,-2.359046,-2.930196,-2.833299,-3.051485,-1.115943,-2.828256
734,2022-09-01,-9.11,-9.66,-9.85,-8.05,-8.50,-9.22,-7.71,-9.00,-6.73,...,-8.243575,-9.837526,-10.051576,-9.490412,-8.306571,-7.911146,-8.882389,-9.335842,-8.663980,-10.974665
735,2022-10-01,4.19,3.16,8.39,10.81,3.46,13.54,14.94,15.70,13.18,...,4.692199,0.283747,3.423397,9.376092,6.019742,10.756553,9.040467,15.420957,13.313871,15.037373
736,2022-11-01,2.63,5.44,3.21,5.82,7.73,6.47,6.15,5.10,5.90,...,-2.356198,-0.715895,5.395938,4.315884,5.760307,5.732832,5.859490,5.685587,5.288097,4.836824


In [62]:
model_portfolios_ten = (smf.ols(
    formula="Lo_10 ~ portfolio_1", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Lo_10 ~ portfolio_1

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.552       0.086      6.445      0.0
portfolio_1     0.820       0.013     61.080      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.835, Adjusted R-squared: 0.835
- F-statistic: 3,730.795 on 1 and 736 DF, p-value: 0.000



In [63]:
model_portfolios_ten = (smf.ols(
    formula="Dec_2 ~ portfolio_2", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Dec_2 ~ portfolio_2

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.344       0.061      5.655      0.0
portfolio_2     0.887       0.012     74.153      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.882, Adjusted R-squared: 0.882
- F-statistic: 5,498.653 on 1 and 736 DF, p-value: 0.000



In [64]:
model_portfolios_ten = (smf.ols(
    formula="Dec_3 ~ portfolio_3", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Dec_3 ~ portfolio_3

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.514       0.055      9.304      0.0
portfolio_3     0.934       0.012     78.812      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.894, Adjusted R-squared: 0.894
- F-statistic: 6,211.281 on 1 and 736 DF, p-value: 0.000



In [65]:
model_portfolios_ten = (smf.ols(
    formula="Dec_4 ~ portfolio_4", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Dec_4 ~ portfolio_4

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.440       0.064      6.910      0.0
portfolio_4     0.886       0.014     64.753      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.851, Adjusted R-squared: 0.850
- F-statistic: 4,192.906 on 1 and 736 DF, p-value: 0.000



In [66]:
model_portfolios_ten = (smf.ols(
    formula="Dec_5 ~ portfolio_5", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Dec_5 ~ portfolio_5

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.401       0.061      6.605      0.0
portfolio_5     0.946       0.014     69.009      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.866, Adjusted R-squared: 0.866
- F-statistic: 4,762.225 on 1 and 736 DF, p-value: 0.000



In [67]:
model_portfolios_ten = (smf.ols(
    formula="Dec_6 ~ portfolio_6", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Dec_6 ~ portfolio_6

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.344       0.073      4.721      0.0
portfolio_6     0.893       0.016     55.353      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.806, Adjusted R-squared: 0.806
- F-statistic: 3,063.938 on 1 and 736 DF, p-value: 0.000



In [68]:
model_portfolios_ten = (smf.ols(
    formula="Dec_7 ~ portfolio_7", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Dec_7 ~ portfolio_7

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.542       0.067      8.146      0.0
portfolio_7     0.871       0.014     61.515      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.837, Adjusted R-squared: 0.837
- F-statistic: 3,784.065 on 1 and 736 DF, p-value: 0.000



In [69]:
model_portfolios_ten = (smf.ols(
    formula="Dec_8 ~ portfolio_8", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Dec_8 ~ portfolio_8

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.440       0.071      6.218      0.0
portfolio_8     0.919       0.015     60.857      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.834, Adjusted R-squared: 0.834
- F-statistic: 3,703.612 on 1 and 736 DF, p-value: 0.000



In [70]:
model_portfolios_ten = (smf.ols(
    formula="Dec_9 ~ portfolio_9", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Dec_9 ~ portfolio_9

Coefficients:
             Estimate  Std. Error  Statistic  p-Value
Intercept       0.397       0.066      5.978      0.0
portfolio_9     0.968       0.014     71.050      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.873, Adjusted R-squared: 0.873
- F-statistic: 5,048.085 on 1 and 736 DF, p-value: 0.000



In [71]:
model_portfolios_ten = (smf.ols(
    formula="Hi_10 ~ portfolio_10", 
    data=test_portfolios_ten
  )
  .fit()
)
prettify_result(model_portfolios_ten)

OLS Model:
Hi_10 ~ portfolio_10

Coefficients:
              Estimate  Std. Error  Statistic  p-Value
Intercept        0.420       0.074      5.645      0.0
portfolio_10     0.882       0.013     67.689      0.0

Summary statistics:
- Number of observations: 738
- R-squared: 0.862, Adjusted R-squared: 0.861
- F-statistic: 4,581.829 on 1 and 736 DF, p-value: 0.000

