# Data Processing

In [1]:
import pandas as pd

---

## data_ml

In [2]:
# Build the data_ml panel from the raw CSV:
#   * parse the 'date' strings (month/day/year) into monthly periods,
#   * replace the 'date' column with the derived 'month_end',
#   * index by (stock_id, month_end) and sort on that index.
data_ml = (
    pd.read_csv('./data/data_ml.csv')
    .assign(
        month_end=lambda frame: (
            pd.to_datetime(frame['date'], format='%m/%d/%Y').dt.to_period('M')
        )
    )
    .drop(columns=['date'])
    .set_index(['stock_id', 'month_end'])
    .sort_index()
)

# Persist the processed frame as a pickle next to the source CSV.
data_ml.to_pickle('./data/data_ml.pkl')

# Preview: ten randomly drawn rows, ordered by index and transposed so each
# sampled (stock_id, month_end) pair becomes a column.
data_ml.sample(10).sort_index().T

stock_id,96,111,134,174,231,645,773,867,1009,1158
month_end,2018-11,2006-07,2013-02,2003-09,2017-08,2016-07,2006-06,2017-11,2007-08,2018-04
Advt_12M_Usd,0.840,0.420,0.890,0.150,0.440,0.570,0.370,0.220,0.130,0.990
Advt_3M_Usd,0.840,0.420,0.900,0.130,0.440,0.580,0.380,0.180,0.140,1.000
Advt_6M_Usd,0.840,0.420,0.890,0.160,0.420,0.560,0.360,0.210,0.140,0.990
Asset_Turnover,0.670,0.910,0.210,0.040,0.550,0.970,0.930,0.960,0.900,0.090
Bb_Yld,0.730,0.660,0.560,0.260,0.280,0.940,0.800,0.600,0.810,0.910
...,...,...,...,...,...,...,...,...,...,...
Vol3Y_Usd,0.360,0.140,0.080,0.080,0.870,0.350,0.260,0.900,0.520,0.260
R1M_Usd,-0.133,-0.029,-0.009,-0.020,0.202,0.014,-0.105,0.007,0.022,0.047
R3M_Usd,0.056,-0.039,0.006,0.126,0.150,0.025,-0.079,-0.084,0.130,0.111
R6M_Usd,0.000,0.012,-0.098,0.127,0.096,0.139,-0.013,0.133,-0.326,0.040


---

## Fama-French Factors

In [3]:
# Base URL of the directory that the factor archives are downloaded from;
# file names are appended directly, so the trailing slash is required.
# HTTPS is used so the downloaded data cannot be altered in transit.
ff_data_lib_url = 'https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/'

### 5 Factors

In [4]:
# Location of the 5-factor (2x3) CSV archive under the data-library base URL.
url = f'{ff_data_lib_url}F-F_Research_Data_5_Factors_2x3_CSV.zip'
# NOTE(review): hard-coded number of data rows to read; presumably the length
# of the monthly section when the file was downloaded -- confirm against the
# current file before relying on it.
nrows = 687

df_ff_factors = (
    pd.read_csv(
        url,
        skiprows=3,        # ignore the first three lines of the file
        index_col=0,       # first column becomes the index
        nrows=nrows,
        na_values=-99.99,  # -99.99 is read as missing
    )
    .rename(columns={'Mkt-RF': 'MKT_RF'})
    .sort_index()
    .rename_axis(index='month_end')
    .div(100)              # presumably percent -> decimal; verify against source
)
# The index values are parsed as YYYYMM and stored as monthly periods.
df_ff_factors.index = pd.to_datetime(df_ff_factors.index, format='%Y%m').to_period('M')
df_ff_factors.to_pickle('./data/ff_5f_m.pkl')

# Preview ten randomly drawn months in chronological order.
df_ff_factors.sample(10).sort_index()

Unnamed: 0_level_0,MKT_RF,SMB,HML,RMW,CMA,RF
month_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1966-12,0.0013,0.0191,-0.0122,0.0068,-0.0013,0.004
1969-09,-0.0298,0.0129,-0.0319,0.0342,-0.0083,0.0062
1970-10,-0.0228,-0.045,0.0027,0.0183,0.0238,0.0046
1971-11,-0.0046,-0.029,-0.017,0.0245,-0.0036,0.0037
1973-12,0.0061,-0.046,0.041,-0.028,0.023,0.0064
1979-04,-0.0006,0.024,0.0105,0.0105,0.0021,0.008
1984-08,0.1028,-0.003,-0.0185,-0.0089,-0.0081,0.0083
1987-07,0.0385,-0.0111,0.0066,-0.005,0.0156,0.0046
2001-12,0.0161,0.0515,0.012,0.0024,-0.0029,0.0015
2004-04,-0.0183,-0.0205,-0.0304,0.0348,-0.028,0.0008


---