In [1]:
# Fama & French 3 factors residual variance
# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling
# To get a faster speed, we split the big dataframe into small ones
# Then using different process to calculate the variance
# We use 20 process to calculate variance, you can change the number of process according to your CPU situation
# You can use the following code to check your CPU situation
# import multiprocessing
# multiprocessing.cpu_count()

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
import datetime
import pickle as pkl
import pyarrow.feather as feather
import multiprocessing as mp
import os

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [2]:
crsp = pd.read_feather("raw_data/ill_crsp.ftr")

In [3]:
# sort variables by permno and date
crsp = crsp.sort_values(by=['permno', 'date'])

# change variable format to int
crsp['permno'] = crsp['permno'].astype(int)

# Line up date to be end of month
crsp['date'] = pd.to_datetime(crsp['date'])

# add delisting return
dlret = pd.read_feather("raw_data/dlret.ftr")

dlret.permno = dlret.permno.astype(int)
dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt'])
dlret['date'] = dlret['dlstdt']

# merge delisting return to crsp return
crsp = pd.merge(crsp, dlret, how='left', on=['permno', 'date'])
crsp['dlret'] = crsp['dlret'].fillna(0)
crsp['ret'] = crsp['ret'].fillna(0)
crsp['retadj'] = (1 + crsp['ret']) * (1 + crsp['dlret']) - 1

# find the closest trading day to the end of the month
crsp['monthend'] = crsp['date'] + MonthEnd(0)
crsp['date_diff'] = crsp['monthend'] - crsp['date']
date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min()
date_temp = pd.DataFrame(date_temp)  # convert Series to DataFrame
date_temp.reset_index(inplace=True)
date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True)
crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend'])
crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan)

# label every date of month end
crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount()

# label numbers of months for a firm
month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1)
month_num = month_num.astype(int)
month_num = month_num.reset_index(drop=True)

# mark the number of each month to each day of this month
crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill')

# crate a firm list
df_firm = crsp.drop_duplicates(['permno'])
df_firm = df_firm[['permno']]
df_firm['permno'] = df_firm['permno'].astype(int)
df_firm = df_firm.reset_index(drop=True)
df_firm = df_firm.reset_index()
df_firm = df_firm.rename(columns={'index': 'count'})
df_firm['month_num'] = month_num

######################
# Calculate residual #
######################


def get_baspread(df, firm_list):
    """

    :param df: stock dataframe
    :param firm_list: list of firms matching stock dataframe
    :return: dataframe with variance of residual
    """
    for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)):
        prog = prog + 1
        print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100))
        for i in range(count + 1):
            # if you want to change the rolling window, please change here: i - 2 means 3 months is a window.
            temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)]
            if temp['permno'].count() < 21:
                pass
            else:
                index = temp.tail(1).index
                X = pd.DataFrame()
                X[['vol', 'prc', 'retadj']] = temp[['vol', 'prc', 'retadj']]
                ill = (abs(X['retadj']) / abs(X['prc'])*X['vol']).mean()
                df.loc[index, 'ill'] = ill
    return df


def sub_df(start, end, step):
    """
    :param start: the quantile to start cutting, usually it should be 0
    :param end: the quantile to end cutting, usually it should be 1
    :param step: quantile step
    :return: a dictionary including all the 'firm_list' dataframe and 'newly grouped stock data' dataframe
    """
    # 按照原来的逻辑先分10组
    temp = {}
    for i, h in zip(np.arange(start, end, step), range(int((end - start) / step))):
        print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2))
        if i == 0:  # to get the left point
            temp['firm' + str(h)] = df_firm[df_firm['month_count'] <= df_firm['month_count'].quantile(i + step)]
            temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left',
                                             on='permno').dropna(subset=['month_count'])
        else:
            temp['firm' + str(h)] = df_firm[(df_firm['month_count'].quantile(i) < df_firm['month_count']) & (
                    df_firm['month_count'] <= df_firm['month_count'].quantile(i + step))]
            temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left',
                                             on='permno').dropna(subset=['month_count'])

    # 初始化新的10组df
    new_groups = {f'new_group_{i}': pd.DataFrame() for i in range(10)}

    # 将原来的10组数据均匀地分配到新的10组中，可以确保每个df的计算进度相近
    for key in temp:
        if key.startswith('crsp'):
            group_data = temp[key]
            for i, row in group_data.iterrows():
                new_group_index = len(new_groups[f'new_group_{i % 10}'])
                new_groups[f'new_group_{i % 10}'] = new_groups[f'new_group_{i % 10}'].append(row, ignore_index=True)

    return new_groups


def process(start, end, step):
    """

    :param start: the quantile to start cutting, usually it should be 0
    :param end: the quantile to end cutting, usually it should be 1
    :param step: quantile step
    :return: a dataframe with calculated variance of residual
    """
    df = sub_df(start, end, step)
    pool = mp.Pool()
    p_dict = {}
    for i in range(int((end-start)/step)):
        p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],))
    pool.close()
    pool.join()
    result = pd.DataFrame()
    print('processing pd.concat')
    for h in range(int((end-start)/step)):
        result = pd.concat([result, p_dict['p%s' % h].get()])
    return result

  crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill')
  crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill')


In [5]:
# calculate variance of residual through rolling window
# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub
# dataframes here, so the function will use 20 cores to calculate variance of residual.
crsp = process(0, 1, 0.05)

processing splitting dataframe: 0.0 to 0.05
processing splitting dataframe: 0.05 to 0.1
processing splitting dataframe: 0.1 to 0.15
processing splitting dataframe: 0.15 to 0.2
processing splitting dataframe: 0.2 to 0.25
processing splitting dataframe: 0.25 to 0.3
processing splitting dataframe: 0.3 to 0.35
processing splitting dataframe: 0.35 to 0.4
processing splitting dataframe: 0.4 to 0.45
processing splitting dataframe: 0.45 to 0.5
processing splitting dataframe: 0.5 to 0.55
processing splitting dataframe: 0.55 to 0.6
processing splitting dataframe: 0.6 to 0.65
processing splitting dataframe: 0.65 to 0.7
processing splitting dataframe: 0.7 to 0.75
processing splitting dataframe: 0.75 to 0.8
processing splitting dataframe: 0.8 to 0.85
processing splitting dataframe: 0.85 to 0.9
processing splitting dataframe: 0.9 to 0.95
processing splitting dataframe: 0.95 to 1.0
processing permno 10001 / finished 0.13%
processing permno 10025 / finished 0.27%
processing permno 10026 / finished 0.4

In [7]:
# process dataframe
crsp = crsp.dropna(subset=['ill'])  # drop NA due to rolling
crsp = crsp.reset_index(drop=True)
crsp = crsp[['permno', 'date', 'ill']]

In [9]:
crsp.to_feather("processed_data/ill.ftr")