In [1]:
# Virtual env libraries
import sys, os
# Site-packages folders given relative to the current working directory; each one is
# turned into an absolute path before being appended to the import search path.
for _relative_site_packages in (
    '../../../virtualenv/lib/python3.8/site-packages',
    '../../../virtualenv/lib/python3.9/site-packages',
    '../../../.local/lib/python3.9/site-packages',
):
    sys.path.append(os.path.abspath(_relative_site_packages))

# This location is already absolute, so it is appended unchanged (and last).
sys.path.append('/usr/local/sas/grid/python3-3.9.1/lib/python3.9/site-packages')

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from tqdm import tqdm
from multiprocessing import Pool
import psycopg2 
import matplotlib.pyplot as plt
import saspy
import seaborn as sns
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from scipy import stats
import glob

# Set up WRDS sessions

In [3]:
# sas = saspy.SASsession(**{'cfgname': 'default', 'encoding': 'utf_8'})
# sql = wrds.Connection()

# Get Dates List

In [4]:
## Dates for each library
# Only the first column of each CSV is used; judging by the file and variable names
# it holds the dates covered by the corresponding TAQ library.
taqm_dates = list(pd.read_csv('../../data/taq/taqm_dates_list.csv').iloc[:,0])
taqmsec_dates = list(pd.read_csv('../../data/taq/taqmsec_dates_list.csv').iloc[:,0])

## Merge the two date lists
# A day listed in both files is kept only once (first-seen order is preserved), so
# code that iterates over the merged list does not process the same day twice.
taq_all_dates = list(dict.fromkeys(taqm_dates + taqmsec_dates))

# Main

In [58]:
def gen_sas_script(date, output_filename):
    """Build the text of a SAS program that exports one day of crsp.dsf to a file.

    The generated program:
      1. reads a fixed set of columns from crsp.dsf, keeping rows whose `date`
         equals the given day;
      2. defines and calls a `%check` macro that deletes `output_filename` when it
         already exists;
      3. writes the result to `output_filename` with PROC EXPORT as comma-delimited text.

    Parameters
    ----------
    date : str
        Day to extract. It is inserted verbatim between quotes followed by `d`
        (a SAS date literal), so it must already be in a form SAS accepts there,
        e.g. '08Feb2000'.
    output_filename : str
        Path of the file to write. It is inserted verbatim (inside single quotes)
        in both the `%check` call and the `outfile=` option.

    Returns
    -------
    str
        The SAS program source.
    """
    # Plain template filled in with str.format; it contains no braces other than
    # the two placeholders.
    template = '''

        %include '/wrds/lib/utility/wrdslib.sas' ;
        options sasautos=('/wrds/wrdsmacros/', SASAUTOS) MAUTOSOURCE;

        data main;
            set crsp.dsf(        
                keep = permno permco date cusip shrout cfacpr cfacshr prc openprc ret retx
            );
            where (date = '{date}'d);
        run;

        %macro check(file);
        %if %sysfunc(fileexist(&file)) ge 1 %then %do;
           %let rc=%sysfunc(filename(temp,&file));
           %let rc=%sysfunc(fdelete(&temp));
        %end; 
        %else %put The file &file does not exist;
        %mend check; 

        %check('{output_filename}');

        proc export data=main
            outfile='{output_filename}'
            dbms=dlm;  
            delimiter=',';
        run;

        quit;
        '''
    return template.format(date=date, output_filename=output_filename)

## Params

In [59]:
# Folder params
# Both values end with '/' because later cells build file paths by plain string
# concatenation with these prefixes.
output_folder, scripts_folder = (
    '/scratch/duke/sa400/HFZoo/data_crsp_daily/',  # where the SAS jobs write the daily CSV files
    '/scratch/duke/sa400/HFZoo/scripts/',          # where the generated .sas / .sh files are stored
)

## Create Scripts for Batch Job

In [60]:
# Clean up scripts folder
# Create the folder on first use so later cells can write into it, then delete only
# regular files: os.remove() raises on a directory, which would otherwise abort the
# cleanup part-way through and leave stale scripts behind.
os.makedirs(scripts_folder, exist_ok=True)
files = [path for path in glob.glob(f'{scripts_folder}*') if os.path.isfile(path)]
for f in tqdm(files):
    os.remove(f)

In [61]:
# One batch script is produced per calendar year, 1993 through 2020 inclusive
arg_dates = range(1993,2021)

# Make sure the target folder exists before any file is opened for writing
os.makedirs(scripts_folder, exist_ok=True)

for arg_date in tqdm(arg_dates):

    arg_date = str(arg_date)

    # Dates whose string form starts with this year. `taq_all_dates` merges two date
    # lists and may therefore name the same day more than once; repeats are dropped
    # (first-seen order kept) so no SAS job is written or scheduled twice.
    k = len(arg_date)
    taq_filtered_dates = list(dict.fromkeys(
        str(x) for x in taq_all_dates if str(x)[:k] == arg_date
    ))

    ## Create batch script for running SAS programs
    # `with` guarantees the .sh file is closed even if a date below fails to parse
    with open(f'{scripts_folder}batch_script_crsp_{arg_date}.sh', 'w') as sh_file:

        # Header: the shebang only takes effect as the very first bytes of the file,
        # and the `#$` scheduler directive has to begin its line, so the header is
        # written without a leading newline or indentation.
        sh_file.write('#!/bin/bash\n#$ -cwd\n\n')

        # Create SAS scripts for each day
        for date in taq_filtered_dates:

            # Defensive bound on the numeric date: anything outside 1993-2020 is skipped
            if (int(date) > 20210000) or (int(date) < 19930000):
                continue

            date_str = date
            # Parse once, with an explicit format, and reuse for both renderings
            parsed_date = pd.to_datetime(date, format='%Y%m%d')
            date_strarg = parsed_date.strftime('%d%b%Y')  # e.g. 08Feb2000, used in the SAS date literal
            output_filename = output_folder + parsed_date.strftime('%Y%m%d') + '.csv'
            sas_script = gen_sas_script(date_strarg, output_filename)

            # Write SAS script to file
            with open(scripts_folder + f'crsp_{date_str}.sas', 'w') as sas_script_file:
                sas_script_file.write(sas_script)

            # Add SAS script reference to .sh file (relative name: the batch job is
            # expected to run from the scripts folder)
            sh_file.write(f'sas crsp_{date_str}.sas \n')

100%|██████████| 28/28 [00:11<00:00,  2.51it/s]


In [55]:
## Get list of .sh file commands
# Batch scripts are the regular files directly inside the scripts folder whose name
# ends in ".sh". Names that merely contain ".sh" followed by more text are skipped.
# Listing the one folder avoids walking the whole directory tree.
sh_files = sorted(
    name for name in os.listdir(scripts_folder)
    if name.endswith('.sh') and os.path.isfile(os.path.join(scripts_folder, name))
)
for f in sh_files:
    print('qsub -cwd', f)
print('\n')

qsub -cwd batch_script_crsp_1993.sh
qsub -cwd batch_script_crsp_1994.sh
qsub -cwd batch_script_crsp_1995.sh
qsub -cwd batch_script_crsp_1996.sh
qsub -cwd batch_script_crsp_1997.sh
qsub -cwd batch_script_crsp_1998.sh
qsub -cwd batch_script_crsp_1999.sh
qsub -cwd batch_script_crsp_2000.sh
qsub -cwd batch_script_crsp_2001.sh
qsub -cwd batch_script_crsp_2002.sh
qsub -cwd batch_script_crsp_2003.sh
qsub -cwd batch_script_crsp_2004.sh
qsub -cwd batch_script_crsp_2005.sh
qsub -cwd batch_script_crsp_2006.sh
qsub -cwd batch_script_crsp_2007.sh
qsub -cwd batch_script_crsp_2008.sh
qsub -cwd batch_script_crsp_2009.sh
qsub -cwd batch_script_crsp_2010.sh
qsub -cwd batch_script_crsp_2011.sh
qsub -cwd batch_script_crsp_2012.sh
qsub -cwd batch_script_crsp_2013.sh
qsub -cwd batch_script_crsp_2014.sh
qsub -cwd batch_script_crsp_2015.sh
qsub -cwd batch_script_crsp_2016.sh
qsub -cwd batch_script_crsp_2017.sh
qsub -cwd batch_script_crsp_2018.sh
qsub -cwd batch_script_crsp_2019.sh
qsub -cwd batch_script_crsp_

# Check output

In [62]:
# Spot-check one exported day: load its CSV and show the rows ordered by PERMCO
check_csv_path = output_folder + '20000208.csv'
pd.read_csv(check_csv_path).sort_values(by='PERMCO')

Unnamed: 0,CUSIP,PERMNO,PERMCO,DATE,PRC,RET,SHROUT,CFACPR,CFACSHR,OPENPRC,RETX
604,04820910,15580,5,20000208,2.5625,-0.068182,21027,1.000000,1.000000,2.75000,-0.068182
576,03783310,14593,7,20000208,114.8750,0.007123,161159,112.000000,112.000000,114.00000,0.007123
1976,03216510,62770,25,20000208,17.1250,0.014815,391089,1.000000,1.000000,16.87500,0.014815
1850,03522910,59184,29,20000208,66.5625,-0.009302,465290,2.000000,2.000000,67.25000,-0.009302
1854,60871R20,59248,33,20000208,46.8125,-0.049492,35600,2.000000,2.000000,49.25000,-0.049492
...,...,...,...,...,...,...,...,...,...,...,...
542,45812Y10,12928,53854,20000208,2.3750,0.000000,16888,1.000000,1.000000,2.50000,0.000000
3886,L8873E10,79191,56242,20000208,11.7500,0.005348,6008,1.000000,1.000000,12.00000,0.005348
3658,L8874210,78671,56245,20000208,16.7500,0.000000,29359,1.000000,1.000000,16.75000,0.000000
640,92924840,17337,56283,20000208,6.8125,-0.026786,14391,0.333333,0.333333,7.00000,-0.026786


In [None]:
# This cell needs a live WRDS connection. The session-setup cell near the top has its
# `wrds.Connection()` call commented out, so on a fresh kernel `sql` is undefined;
# open the connection here when it is missing instead of failing with NameError.
if 'sql' not in globals():
    sql = wrds.Connection()

# Pull the crsp.dsf rows for a single company (permco = 7) and discard the columns
# that are not used below
out = sql.raw_sql('''
select *
from crsp.dsf
where (permco = 7)
limit 100000
''').drop(['hexcd', 'hsiccd', 'vol', 'issuno', 'bidlo', 'askhi'], axis = 1)

# Keep only the rows that have an open price
out = out.dropna(subset=['openprc']).copy()

# Infer close-to-open adjusted overnight returns
# Intraday (open-to-close) return from the two prices of the same day.
# NOTE(review): CRSP documents `prc` as negative when it is a bid/ask average rather
# than a trade price - confirm whether abs() is needed before differencing.
out['ret_open_close_intraday'] = (out['prc']-out['openprc'])/out['openprc']
# The daily return compounds the overnight and intraday legs:
# (1 + ret) = (1 + overnight) * (1 + intraday), solved here for the overnight leg.
out['ret_close_open_adj'] = (1+out['ret'])/(1+out['ret_open_close_intraday']) - 1