# Finance V: Research Topics in Finance, Risk- and Resource management 
## Replication of paper: Lowry, Michaely & Volkova (2017)

<blockquote>
    Author: Stefan Reimer <br>
    Date: 2019-12-28 <br>
    python version: 3.7 <br>
</blockquote>

In [156]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime

# import defined functions
from src.functions.functions import *

# display settings for rendered DataFrames: no limit on columns, at most 100 rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### functions

In [157]:
def create_date_difference(df, date1, date2, new_col_name='dif', info_print = True):
  """
  Add a column with the number of days between two date columns (date1 - date2).

  The column is written into ``df`` itself and the same frame is returned.
  Rows where either date is missing get a missing difference and are not
  counted in any of the printed ranges.

  @param df: dataframe containing the two date columns (datetime-like)
  @param date1: name of the date column to subtract from
  @param date2: name of the date column which is subtracted from date1
  @param new_col_name: name of the column to create (an existing column of that name is replaced)
  @param info_print: if True, print how many rows fall into each range of the difference
  @return: df with the added column holding ``date1 - date2`` in days
  """
  # Plain column assignment replaces the column as a whole, so the function can
  # be called repeatedly with the same new_col_name, whatever dtype the column
  # had before (``df.loc[:, col] = ...`` may write into the existing column instead).
  df[new_col_name] = (df[date1] - df[date2]).dt.days
  if info_print:
    dif = df[new_col_name]
    difminus60 = (dif <= -60.0).sum()
    difminus30 = ((dif > -60.0) & (dif <= -30.0)).sum()
    # upper bound is inclusive like the two ranges before, so that a
    # difference of exactly -10 days is counted instead of falling between ranges
    difminus10 = ((dif > -30.0) & (dif <= -10.0)).sum()
    difminus1 = ((dif > -10.0) & (dif < -1.0)).sum()
    difunder7 = ((dif >= -1.0) & (dif <= 7.0)).sum()
    difover7 = ((dif > 7.0) & (dif < 15.0)).sum()
    difover15 = (dif >= 15.0).sum()
    print(f'{new_col_name}:  dif<=-60: {difminus60}, -60 < dif <= -30: {difminus30}, -30 < dif <= -10: {difminus10} \n , -10 < dif < -1: {difminus1}, -1 <= dif <= 7: {difunder7}, 7 < dif < 15: {difover7}, 15 <= dif: {difover15} \n')
  return df

### load cleaned sdc & crsp data

In [158]:
#%%
# folders holding the raw inputs and the cleaned intermediate files
initial_folder_path = "data/initial_data/"
cleaned_folder_path = "data/cleaned_data/"

# pickled inputs of this notebook: the cleaned SDC sample and the CRSP extract
sdc_path = cleaned_folder_path + 'sdc_data_cleaned.pkl'
crsp_path = cleaned_folder_path + 'crsp_only_start.pkl'

# NOTE: unpickling can execute arbitrary code -- only load files you created yourself
crsp = pd.read_pickle(crsp_path)
# reset_index() moves the stored index of the SDC frame into a regular column
sdc = pd.read_pickle(sdc_path).reset_index()

In [159]:
# spot check: show all CRSP rows stored for one PERMNO
crsp.loc[crsp['PERMNO'] == 59248]

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,NCUSIP,CUSIP,PRC,RET,SHROUT,NCUSIP6
34045372,59248.0,1975-06-12,11.0,3.0,21701610,60871R20,-30.75,C,34159.0,217016
34052862,59248.0,2005-02-09,11.0,1.0,60871R20,60871R20,73.5,-0.030087,36166.0,60871R


### match sdc & crsp data

In [160]:
# report the number of CRSP rows and how many distinct PERMNO values they contain
unique_PERMNO = np.unique(crsp['PERMNO']).size
print(f'the dataset contains {len(crsp)} samples.'
      f' {unique_PERMNO} PERMNO values are unique.')

the dataset contains 43367 samples. 31560 PERMNO values are unique.


In [161]:
# attach Permno_ncusip and First_CRSP_date_ncusip by matching the 8-digit
# SDC CUSIP (CUSIP8) against the CRSP NCUSIP
sdc = (
    sdc
    .merge(crsp[['NCUSIP', 'PERMNO', 'date']],
           how='left', left_on='CUSIP8', right_on='NCUSIP')
    .rename(columns={'date': 'First_CRSP_date_ncusip', 'PERMNO': 'Permno_ncusip'})
)

# attach Permno_ncusip6 and First_CRSP_date_ncusip6 by matching the 6-digit
# SDC CUSIP (CUSIP6) against the CRSP NCUSIP6
sdc = (
    sdc
    .merge(crsp[['NCUSIP6', 'PERMNO', 'date']],
           how='left', left_on='CUSIP6', right_on='NCUSIP6')
    .rename(columns={'date': 'First_CRSP_date_ncusip6', 'PERMNO': 'Permno_ncusip6'})
)

In [162]:
# -999 is the "no match yet" marker: return_condition_check only selects rows
# that still carry it, and rows that keep it are dropped after the matching steps
sdc['Permno'] = -999


In [163]:
def return_condition_check(df, col):
  """
  Build the row mask used to accept a CRSP match.

  @param df: dataframe holding the column `col` and the 'Permno' column
  @param col: name of the column with the date difference in days
  @return: boolean Series, True where `col` is present and lies in [-1, 7]
           and 'Permno' still holds the placeholder -999
  """
  inside_window = df[col].between(-1, 7)
  still_unmatched = df['Permno'].eq(-999)
  return inside_window & still_unmatched & df[col].notna()

In [164]:
# step 1: accept the 8-digit CUSIP match where the first CRSP date lies
# between 1 day before and 7 days after the issue date
sdc = create_date_difference(sdc, 'First_CRSP_date_ncusip', 'IssueDate', 'dif')
condition = return_condition_check(sdc, 'dif')
sdc.loc[condition, 'Permno'] = sdc.loc[condition, 'Permno_ncusip']
# number of rows that are still unmatched
print(sdc['Permno'].eq(-999).sum())

dif:  dif<=-60: 121, -60 > dif <= -30: 5, -30 <= dif < -10: 3 
 , -10 < dif < -1: 13, -1 <= dif <= 7: 10105, 7 < dif < 15: 57, 15 <= dif: 260 

971


In [165]:
# step 2: for rows that are still unmatched, accept the 6-digit CUSIP match
# under the same date window (-1 to 7 days around the issue date)
sdc = create_date_difference(sdc, 'First_CRSP_date_ncusip6', 'IssueDate', 'dif')
condition = return_condition_check(sdc, 'dif')
sdc.loc[condition, 'Permno'] = sdc.loc[condition, 'Permno_ncusip6']
# number of rows that are still unmatched
print(sdc['Permno'].eq(-999).sum())

dif:  dif<=-60: 152, -60 > dif <= -30: 5, -30 <= dif < -10: 5 
 , -10 < dif < -1: 12, -1 <= dif <= 7: 9203, 7 < dif < 15: 58, 15 <= dif: 1200 

762


In [166]:
# remove rows without a CRSP match, then keep only the first row per Permno;
# the row count is printed before and after each step
print(len(sdc))
sdc = sdc[sdc['Permno'].ne(-999)]
print(len(sdc))
sdc = sdc.drop_duplicates(subset=['Permno'], keep='first')
print(len(sdc))

11076
10314
8371


In [167]:
# keep the earliest CRSP record of every PERMNO and rename its date and price columns
crsp = (
    crsp
    .sort_values('date')
    .drop_duplicates(subset=['PERMNO'], keep='first')
    .rename(columns={'date': 'First_CRSP_date', 'PRC': 'Close_price1'})
)

# attach that first CRSP date, price and return to the matched SDC rows
sdc = sdc.merge(crsp[['First_CRSP_date', 'PERMNO', 'Close_price1', 'RET']],
                how='left', left_on='Permno', right_on='PERMNO')

In [168]:
# keep only rows with a non-missing NCUSIP
# NOTE(review): NCUSIP stems from the 8-digit CUSIP merge, so this presumably also
# removes rows that were matched through the 6-digit CUSIP only -- confirm this is intended
sdc = sdc[sdc['NCUSIP'].notna()]

# two sample cut-offs; both comparisons are strict, i.e. the cut-off day itself is excluded
sdc_2016 = sdc[sdc['IssueDate'] < '2016-12-31']
sdc = sdc[sdc['IssueDate'] < '2015-12-31']

In [169]:
# number of rows in the sample with issue dates before 2015-12-31
sdc.shape[0]

8209

In [170]:
# number of rows in the sample with issue dates before 2016-12-31
sdc_2016.shape[0]

8282

### save prepared data

In [171]:
# store the matched sample (issue dates before 2015-12-31) in the cleaned-data folder;
# sdc_2016 is not written to disk here
sdc.to_pickle(cleaned_folder_path + 'data_prepared.pkl')

### <<< comment >>>###

Lowry et al. kept 8,543 observations.

Here we kept 8,209 observations.

### removing extra variables
ipo[,`:=`(First_CRSP_date_ncusip = NULL,First_CRSP_date_ncusip6 = NULL, Permno_ncusip = NULL, Permno_ncusip6 = NULL, dif = NULL)]
ipo[, `:=`(REIT = NULL, Unit = NULL, Depositary = NULL, CEF = NULL, CUSIP = NULL, CUSIP9 = NULL)]
ipo[, `:=`(IPO_Flag = NULL, Original_IPO_Flag = NULL)]

### Dropping wrong share classes and shares traded on other exchanges
### Loading CRSP Stock Header Information file
crsp.info <- fread(crsp.info.datafile, select = c("PERMNO", "HSHRCD", "HEXCD"))
m <- match(ipo$Permno, crsp.info$PERMNO)
ipo[, `:=` (exch = crsp.info$HEXCD[m], shrcd = crsp.info$HSHRCD[m])]

### removing extra variables
for drop_column in ['REIT', 'Unit', 'CEF', 'CUSIP6', 'CUSIP8', 'CUSIP9', 'IPO', 'OrigIPO']:
    # Removes the listed columns one at a time.
    # NOTE(review): `sdc_data` is not defined in any of the cells shown here (the
    # frame used throughout is called `sdc`) -- confirm the intended target before running.
    # Columns named in the R reference code but not dropped by this loop:
    # First_CRSP_date_ncusip, First_CRSP_date_ncusip6, Permno_ncusip, Permno_ncusip6, dif, Depositary
    sdc_data = sdc_data.drop(columns=drop_column)