In [4]:
import pandas as pd
import pickle as pkl
import pyarrow.feather as feather
import numpy as np
from tqdm import tqdm
from functions import *

####################
#    All Stocks    #
####################
def _load_chars(path):
    """Read one raw characteristics feather file and apply the shared clean-up.

    Rows without a permno are removed, permno/gvkey are cast to int, jdate is
    parsed to datetime, and only the first row per (permno, jdate) is kept.
    """
    with open(path, 'rb') as handle:
        frame = feather.read_feather(handle)
    frame = frame.dropna(subset=['permno'])
    frame = frame.astype({'permno': int, 'gvkey': int})
    frame['jdate'] = pd.to_datetime(frame['jdate'])
    return frame.drop_duplicates(['permno', 'jdate'])


# quarterly and annual characteristics go through the same preparation
chars_q = _load_chars('chars_q_raw.ftr')
chars_a = _load_chars('chars_a_raw.ftr')

# identifier / return columns that are carried through as-is
obs_var_list = [
    'gvkey', 'permno', 'jdate', 'sic', 'ret', 'retx', 'retadj', 'exchcd', 'shrcd',
]

# characteristics that exist at both quarterly and annual frequency
# (order matters: these lists are used for positional column renaming)
accounting_var_list = [
    'datadate',
    'acc', 'bm', 'agr', 'alm', 'ato', 'cash', 'cashdebt', 'cfp', 'chcsho', 'chpm',
    'chtx', 'depr', 'ep', 'gma', 'grltnoa', 'lev', 'lgr', 'ni', 'noa', 'op', 'pctacc', 'pm',
    'rd_sale', 'rdm', 'rna', 'roa', 'roe', 'rsup', 'sgr', 'sp',
]
# same names, prefixed by the frequency they come from
a_var_list = [f'a_{name}' for name in accounting_var_list]
q_var_list = [f'q_{name}' for name in accounting_var_list]

# characteristics only available at annual frequency
a_only_list = ['adm', 'bm_ia', 'herf', 'hire', 'me_ia']

# characteristics only available at quarterly frequency
# ('turn' and 'dolvol' are currently taken from the monthly list below)
q_only_list = ['abr', 'sue', 'cinvest', 'nincr', 'pscore']

# characteristics only available at monthly frequency
m_var_list = [
    'baspread', 'beta', 'ill', 'maxret', 'mom12m', 'mom1m', 'mom36m', 'mom60m', 'mom6m',
    'rvar_capm', 'rvar_ff3', 'rvar_mean', 'seas1a', 'std_dolvol', 'std_turn', 'zerotrade',
    'me', 'dy',
    # need to rerun the accounting step to put these two into chars_a
    'turn', 'dolvol',
]

# Annual side: identifiers, shared accounting variables (renamed with the
# 'a_' prefix), annual-only variables and the monthly variables.
annual_source_cols = obs_var_list + accounting_var_list + a_only_list + m_var_list
df_a = chars_a[annual_source_cols]
df_a.columns = obs_var_list + a_var_list + a_only_list + m_var_list
df_a = df_a.sort_values(obs_var_list)

# Quarterly side: shared accounting variables get the 'q_' prefix.
quarterly_source_cols = obs_var_list + accounting_var_list + q_only_list
df_q = chars_q[quarterly_source_cols]
df_q.columns = obs_var_list + q_var_list + q_only_list
# the information columns already come from the annual frame; keep only the merge keys
df_q = df_q.drop(columns=['sic', 'ret', 'retx', 'retadj', 'exchcd', 'shrcd'])

df = df_a.merge(df_q, how='left', on=['gvkey', 'jdate', 'permno'])

# Collapse each a_<name>/q_<name> pair into a single <name> column.
# 'datadate' (first entry) is skipped here: it is only used to decide which
# of the two sources is the more recent one.
for name in tqdm(accounting_var_list[1:]):
    print('processing %s' % name)
    annual_col = 'a_' + name
    quarterly_col = 'q_' + name

    has_annual = df[annual_col].notna()
    has_quarterly = df[quarterly_col].notna()

    # when both exist: annual if its report date is strictly later, otherwise quarterly
    latest_value = np.where(df['q_datadate'] < df['a_datadate'], df[annual_col], df[quarterly_col])
    # when at most one exists: annual if present, otherwise whatever quarterly holds
    available_value = np.where(has_annual, df[annual_col], df[quarterly_col])

    df[name] = np.where(has_annual & has_quarterly, latest_value, available_value)
    df = df.drop(columns=[annual_col, quarterly_col])

# the per-frequency report dates are no longer needed
df = df.drop(columns=['a_datadate', 'q_datadate'])

# drop optional variables, you can adjust it by your selection
df = df.drop(['ret', 'retx'], axis=1)
df = df.rename(columns={'retadj': 'ret'})  # retadj is return adjusted by dividend

# Pair every row of predictors with the NEXT observation of the same permno.
# The frame is ordered by gvkey first (it was sorted on obs_var_list before the
# left merge), so rows of one permno are not guaranteed to be in jdate order
# when that permno appears under more than one gvkey. Shifting on a
# (permno, jdate)-sorted view and assigning back by index label keeps the
# existing row order while making "next" mean chronologically next.
df = df.reset_index(drop=True)  # unique labels are required for the aligned assignment below
chronological = df.sort_values(['permno', 'jdate'])
next_ret = chronological.groupby(['permno'])['ret'].shift(-1)  # return of period t+1, used as the target
next_date = chronological.groupby(['permno'])['jdate'].shift(-1)  # date is return date, jdate is predictor date
df['ret'] = next_ret
df['date'] = next_date

df = df.drop(['jdate'], axis=1)  # now we only keep the date of return
# the last observation of every permno has no following return
df = df.dropna(subset=['ret']).reset_index(drop=True)
# infinities are treated as missing values
df = df.replace([-np.inf, np.inf], np.nan)

# save raw data
with open('chars60_raw_no_impute.feather', 'wb') as f:
    feather.write_feather(df, f)

# Imputation works on a separate copy so the raw frame `df` stays untouched.
# Other industry classifiers from functions.py (e.g. ffi10) can be used in
# place of ffi49.
df_impute = df.copy()
df_impute = df_impute.astype({'sic': int})
df_impute['date'] = pd.to_datetime(df_impute['date'])

# industry code derived by the ffi49 helper; a missing code is treated as 'other' (49)
industry_code = ffi49(df_impute)
df_impute['ffi49'] = industry_code
df_impute['ffi49'] = df_impute['ffi49'].fillna(49).astype(int)

  3%|▎         | 1/30 [00:00<00:03,  8.33it/s]

processing acc
processing bm


 10%|█         | 3/30 [00:00<00:03,  8.59it/s]

processing agr
processing alm
processing ato
processing cash


 30%|███       | 9/30 [00:00<00:01, 17.47it/s]

processing cashdebt
processing cfp
processing chcsho
processing chpm
processing chtx


 47%|████▋     | 14/30 [00:00<00:00, 18.73it/s]

processing depr
processing ep
processing gma
processing grltnoa


 60%|██████    | 18/30 [00:01<00:00, 18.03it/s]

processing lev
processing lgr
processing ni
processing noa


 67%|██████▋   | 20/30 [00:01<00:00, 15.60it/s]

processing op
processing pctacc
processing pm


 80%|████████  | 24/30 [00:01<00:00, 16.06it/s]

processing rd_sale
processing rdm
processing rna
processing roa


100%|██████████| 30/30 [00:01<00:00, 16.33it/s]


processing roe
processing rsup
processing sgr
processing sp


In [29]:
def fillna_atq(df_q, df_a):
    """Fill missing quarterly values with the annual value of the same row key.

    Only columns that (a) contain at least one NaN in ``df_q`` and (b) also
    exist in ``df_a`` are considered; columns whose name starts with ``mom``
    followed by at least one character are skipped. Rows are matched on
    ``permno`` and ``date`` with a left join, so ``df_q`` keeps its rows.

    Returns the merged quarterly frame with the helper columns removed.
    """
    # candidate columns: incomplete in df_q and present in df_a
    incomplete_q = set(df_q.columns[df_q.isna().any()].tolist())
    shared = list(incomplete_q & set(df_a.columns.values.tolist()))
    # momentum characteristics are identical at both frequencies, leave them alone
    na_columns_list = [name for name in shared if re.match(r'mom.', name) is None]

    # annual values plus the merge keys, with every value column suffixed '_a'
    annual = df_a[na_columns_list].copy()
    annual[['permno', 'date']] = df_a[['permno', 'date']].copy()
    annual = annual.rename(columns={name: '%s_a' % name for name in na_columns_list})
    annual = annual.reset_index(drop=True)

    merged = pd.merge(df_q, annual, how='left', on=['permno', 'date'])
    for name in na_columns_list:
        annual_name = '%s_a' % name
        # take the annual value only where the quarterly one is missing
        merged[name] = np.where(merged[name].isnull(), merged[annual_name], merged[name])
        merged = merged.drop([annual_name], axis=1)
    return merged


def fillna_ind(df, method, ffi):
    """Fill missing values with the per-date, per-industry mean or median.

    Every column of ``df`` that contains at least one NaN (other than the two
    grouping columns) is filled, row by row, with the statistic of that column
    computed over the rows sharing the same ``date`` and the same industry
    code. Groups in which the column is entirely missing yield NaN, so those
    entries stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column and the industry column ``'ffi<ffi>'``.
    method : {'mean', 'median'}
        Statistic used for the fill.
    ffi : int or str
        Suffix of the industry column to group on (e.g. ``49`` -> ``'ffi49'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and row order as ``df`` and a fresh
        0..n-1 index; ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'mean'`` nor ``'median'``.
    """
    if method not in ('mean', 'median'):
        raise ValueError("method must be 'mean' or 'median', got %r" % (method,))

    ind_col = 'ffi%s' % ffi
    group_keys = ['date', ind_col]

    # work on a fresh frame with a default index, like the merge-based version returned
    filled = df.reset_index(drop=True)

    # the grouping columns themselves are never imputed
    na_columns_list = [col for col in filled.columns[filled.isna().any()].tolist()
                       if col not in group_keys]
    if not na_columns_list:
        return filled

    # transform() broadcasts each group's statistic back onto the group's rows,
    # so no intermediate lookup table or merge is needed
    group_stat = filled.groupby(group_keys)[na_columns_list].transform(method)
    for na_column in na_columns_list:
        filled[na_column] = filled[na_column].fillna(group_stat[na_column])
    return filled

In [30]:
# First imputation pass: fill each NaN with the median of its
# (date, ffi49 industry) group. Pass method='mean' to use the group mean instead.
# NOTE: this rebinds df_impute, so re-running the cell operates on the already-filled frame.
df_impute = fillna_ind(df_impute, method='median', ffi=49)

In [32]:
# Second imputation pass via fillna_all (defined outside this notebook;
# presumably fills what the industry-level pass left missing - confirm in functions.py).
df_impute = fillna_all(df_impute, method='median')

# keep return dates from 1972 onwards
keep_impute = df_impute['date'].dt.year >= 1972
df_impute = df_impute.loc[keep_impute].copy()

with open('chars60_raw_imputed.feather', 'wb') as f:
    feather.write_feather(df_impute, f)

# ---- standardize raw (non-imputed) data ----
df_rank = df.copy()
df_rank['lag_me'] = df_rank['me']
# if bm<0 then bm=nan and rank_bm=0
negative_bm = df_rank['bm'] < 0
df_rank['bm'] = np.where(negative_bm, np.nan, df_rank['bm'])
df_rank = standardize(df_rank)
keep_rank = df_rank['date'].dt.year >= 1972
df_rank = df_rank.loc[keep_rank].copy()
df_rank['log_me'] = np.log(df_rank['lag_me'])

with open('chars60_rank_no_impute.feather', 'wb') as f:
    feather.write_feather(df_rank, f)

# ---- standardize imputed data ----
df_rank = df_impute.copy()
df_rank['lag_me'] = df_rank['me']
df_rank = standardize(df_rank)
keep_rank = df_rank['date'].dt.year >= 1972
df_rank = df_rank.loc[keep_rank].copy()
df_rank['log_me'] = np.log(df_rank['lag_me'])

with open('chars60_rank_imputed.feather', 'wb') as f:
    feather.write_feather(df_rank, f)


# ####################
# #      SP1500      #
# ####################
# with open('/home/jianxinma/chars/data/sp1500_impute_benchmark.feather', 'rb') as f:
#     sp1500_index = feather.read_feather(f)

# sp1500_index = sp1500_index[['gvkey', 'date']]

# sp1500_impute = pd.merge(sp1500_index, df_impute, how='left', on=['gvkey', 'date'])

# # for test
# # test = sp1500_rank.groupby(['jdate'])['gvkey'].nunique()

# with open('sp1500_impute_60.feather', 'wb') as f:
#     feather.write_feather(sp1500_impute, f)

# # standardize characteristics
# sp1500_rank = pd.merge(sp1500_index, df_rank, how='left', on=['gvkey', 'date'])

# with open('sp1500_rank_60.feather', 'wb') as f:
#     feather.write_feather(sp1500_rank, f)


  0%|          | 0/60 [00:00<?, ?it/s]

processing mom36m


  2%|▏         | 1/60 [00:00<00:22,  2.62it/s]

processing rvar_mean


  3%|▎         | 2/60 [00:00<00:17,  3.34it/s]

processing abr


  5%|▌         | 3/60 [00:00<00:16,  3.40it/s]

processing op


  7%|▋         | 4/60 [00:01<00:16,  3.50it/s]

processing rd_sale


  8%|▊         | 5/60 [00:01<00:14,  3.79it/s]

processing noa


 10%|█         | 6/60 [00:01<00:14,  3.65it/s]

processing bm


 12%|█▏        | 7/60 [00:01<00:14,  3.61it/s]

processing std_dolvol


 13%|█▎        | 8/60 [00:02<00:15,  3.30it/s]

processing bm_ia


 15%|█▌        | 9/60 [00:02<00:15,  3.33it/s]

processing seas1a


 18%|█▊        | 11/60 [00:03<00:13,  3.75it/s]

processing nincr
processing dolvol


 20%|██        | 12/60 [00:03<00:13,  3.55it/s]

processing acc


 22%|██▏       | 13/60 [00:03<00:12,  3.81it/s]

processing adm


 23%|██▎       | 14/60 [00:03<00:12,  3.65it/s]

processing mom12m


 27%|██▋       | 16/60 [00:04<00:10,  4.02it/s]

processing ato
processing pm


 28%|██▊       | 17/60 [00:04<00:10,  3.96it/s]

processing rsup


 30%|███       | 18/60 [00:04<00:11,  3.76it/s]

processing chtx


 32%|███▏      | 19/60 [00:05<00:10,  3.80it/s]

processing rvar_ff3


 33%|███▎      | 20/60 [00:05<00:10,  3.66it/s]

processing baspread


 35%|███▌      | 21/60 [00:05<00:12,  3.22it/s]

processing std_turn


 37%|███▋      | 22/60 [00:06<00:10,  3.49it/s]

processing agr


 38%|███▊      | 23/60 [00:06<00:10,  3.45it/s]

processing pctacc


 40%|████      | 24/60 [00:06<00:10,  3.49it/s]

processing beta


 42%|████▏     | 25/60 [00:07<00:10,  3.33it/s]

processing rdm


 43%|████▎     | 26/60 [00:07<00:10,  3.31it/s]

processing roe


 45%|████▌     | 27/60 [00:07<00:09,  3.43it/s]

processing ill


 48%|████▊     | 29/60 [00:08<00:07,  3.88it/s]

processing sue
processing sgr


 50%|█████     | 30/60 [00:08<00:08,  3.73it/s]

processing chpm


 52%|█████▏    | 31/60 [00:08<00:07,  3.74it/s]

processing lgr


 53%|█████▎    | 32/60 [00:08<00:07,  3.66it/s]

processing alm


 55%|█████▌    | 33/60 [00:09<00:07,  3.70it/s]

processing herf


 57%|█████▋    | 34/60 [00:09<00:06,  3.78it/s]

processing depr


 58%|█████▊    | 35/60 [00:09<00:06,  3.97it/s]

processing mom1m


 60%|██████    | 36/60 [00:10<00:06,  3.43it/s]

processing ep


 62%|██████▏   | 37/60 [00:10<00:06,  3.57it/s]

processing pscore


 63%|██████▎   | 38/60 [00:10<00:06,  3.64it/s]

processing cash


 65%|██████▌   | 39/60 [00:10<00:05,  3.55it/s]

processing mom60m


 67%|██████▋   | 40/60 [00:11<00:05,  3.79it/s]

processing rna


 68%|██████▊   | 41/60 [00:11<00:05,  3.70it/s]

processing rvar_capm


 70%|███████   | 42/60 [00:11<00:05,  3.52it/s]

processing gma


 72%|███████▏  | 43/60 [00:11<00:04,  3.57it/s]

processing chcsho


 73%|███████▎  | 44/60 [00:12<00:04,  3.57it/s]

processing mom6m


 75%|███████▌  | 45/60 [00:12<00:04,  3.30it/s]

processing zerotrade


 77%|███████▋  | 46/60 [00:12<00:03,  3.60it/s]

processing turn


 78%|███████▊  | 47/60 [00:13<00:03,  3.49it/s]

processing sp


 80%|████████  | 48/60 [00:13<00:03,  3.60it/s]

processing cinvest


 82%|████████▏ | 49/60 [00:13<00:03,  3.52it/s]

processing dy


 83%|████████▎ | 50/60 [00:14<00:02,  3.48it/s]

processing maxret


 85%|████████▌ | 51/60 [00:14<00:02,  3.73it/s]

processing lev


 87%|████████▋ | 52/60 [00:14<00:02,  3.52it/s]

processing me_ia


 88%|████████▊ | 53/60 [00:14<00:02,  3.37it/s]

processing cfp


 90%|█████████ | 54/60 [00:15<00:01,  3.36it/s]

processing hire


 92%|█████████▏| 55/60 [00:15<00:01,  3.55it/s]

processing cashdebt


 93%|█████████▎| 56/60 [00:15<00:01,  3.65it/s]

processing me


 95%|█████████▌| 57/60 [00:15<00:00,  3.47it/s]

processing ni


 97%|█████████▋| 58/60 [00:16<00:00,  3.78it/s]

processing roa


 98%|█████████▊| 59/60 [00:16<00:00,  3.53it/s]

processing grltnoa


100%|██████████| 60/60 [00:16<00:00,  3.57it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

processing mom36m


  2%|▏         | 1/60 [00:00<00:14,  4.01it/s]

processing rvar_mean


  3%|▎         | 2/60 [00:00<00:13,  4.25it/s]

processing abr


  5%|▌         | 3/60 [00:00<00:13,  4.18it/s]

processing op


  7%|▋         | 4/60 [00:01<00:14,  3.82it/s]

processing rd_sale


  8%|▊         | 5/60 [00:01<00:14,  3.75it/s]

processing noa


 10%|█         | 6/60 [00:01<00:14,  3.82it/s]

processing bm


 12%|█▏        | 7/60 [00:01<00:15,  3.44it/s]

processing std_dolvol


 13%|█▎        | 8/60 [00:02<00:15,  3.33it/s]

processing bm_ia


 15%|█▌        | 9/60 [00:02<00:15,  3.26it/s]

processing seas1a


 17%|█▋        | 10/60 [00:02<00:14,  3.35it/s]

processing nincr


 18%|█▊        | 11/60 [00:03<00:13,  3.61it/s]

processing dolvol


 20%|██        | 12/60 [00:03<00:14,  3.41it/s]

processing acc


 22%|██▏       | 13/60 [00:03<00:12,  3.69it/s]

processing adm


 23%|██▎       | 14/60 [00:03<00:12,  3.63it/s]

processing mom12m


 27%|██▋       | 16/60 [00:04<00:11,  3.81it/s]

processing ato
processing pm


 28%|██▊       | 17/60 [00:04<00:11,  3.72it/s]

processing rsup


 30%|███       | 18/60 [00:04<00:11,  3.59it/s]

processing chtx


 32%|███▏      | 19/60 [00:05<00:11,  3.65it/s]

processing rvar_ff3


 33%|███▎      | 20/60 [00:05<00:11,  3.53it/s]

processing baspread


 35%|███▌      | 21/60 [00:05<00:12,  3.21it/s]

processing std_turn


 37%|███▋      | 22/60 [00:06<00:10,  3.52it/s]

processing agr


 38%|███▊      | 23/60 [00:06<00:10,  3.61it/s]

processing pctacc


 40%|████      | 24/60 [00:06<00:10,  3.47it/s]

processing beta


 43%|████▎     | 26/60 [00:07<00:09,  3.71it/s]

processing rdm


 45%|████▌     | 27/60 [00:07<00:08,  4.02it/s]

processing roe
processing ill


 48%|████▊     | 29/60 [00:07<00:06,  4.63it/s]

processing sue
processing sgr


 50%|█████     | 30/60 [00:08<00:07,  4.28it/s]

processing chpm


 52%|█████▏    | 31/60 [00:08<00:07,  4.07it/s]

processing lgr


 53%|█████▎    | 32/60 [00:08<00:07,  3.91it/s]

processing alm


 55%|█████▌    | 33/60 [00:08<00:07,  3.86it/s]

processing herf


 58%|█████▊    | 35/60 [00:09<00:06,  4.09it/s]

processing depr
processing mom1m


 60%|██████    | 36/60 [00:09<00:06,  3.82it/s]

processing ep


 62%|██████▏   | 37/60 [00:09<00:05,  3.91it/s]

processing pscore


 63%|██████▎   | 38/60 [00:10<00:05,  3.82it/s]

processing cash


 65%|██████▌   | 39/60 [00:10<00:05,  3.75it/s]

processing mom60m


 67%|██████▋   | 40/60 [00:10<00:05,  3.70it/s]

processing rna


 68%|██████▊   | 41/60 [00:11<00:04,  3.95it/s]

processing rvar_capm


 70%|███████   | 42/60 [00:11<00:05,  3.56it/s]

processing gma


 72%|███████▏  | 43/60 [00:11<00:04,  3.65it/s]

processing chcsho


 73%|███████▎  | 44/60 [00:11<00:04,  3.71it/s]

processing mom6m


 75%|███████▌  | 45/60 [00:12<00:04,  3.57it/s]

processing zerotrade


 77%|███████▋  | 46/60 [00:12<00:03,  3.72it/s]

processing turn


 78%|███████▊  | 47/60 [00:12<00:03,  3.35it/s]

processing sp


 80%|████████  | 48/60 [00:13<00:03,  3.60it/s]

processing cinvest


 83%|████████▎ | 50/60 [00:13<00:02,  4.04it/s]

processing dy
processing maxret


 85%|████████▌ | 51/60 [00:13<00:02,  4.02it/s]

processing lev


 87%|████████▋ | 52/60 [00:14<00:02,  3.78it/s]

processing me_ia


 88%|████████▊ | 53/60 [00:14<00:01,  3.66it/s]

processing cfp


 90%|█████████ | 54/60 [00:14<00:01,  3.86it/s]

processing hire


 92%|█████████▏| 55/60 [00:14<00:01,  3.64it/s]

processing cashdebt


 93%|█████████▎| 56/60 [00:15<00:01,  3.71it/s]

processing me


 95%|█████████▌| 57/60 [00:15<00:00,  3.53it/s]

processing ni


 97%|█████████▋| 58/60 [00:15<00:00,  3.70it/s]

processing roa


 98%|█████████▊| 59/60 [00:15<00:00,  3.51it/s]

processing grltnoa


100%|██████████| 60/60 [00:16<00:00,  3.69it/s]
