# Finance V: Research Topics in Finance, Risk- and Resource management 
## Replication of paper: Lowry, Michaely & Volkova (2017)

<blockquote>
    Author: Stefan Reimer <br>
    Date: 2019-12-28 <br>
    python version: 3.7 <br>
</blockquote>

### import packages and set general options

In [55]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import collections

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# import defined functions
from src.functions.functions import (data_import_chunkwise, min_max_print, convert_NAs, 
                                     get_duplicates, find_char_in_colnames, convert_date, 
                                     convert_price)

# set the settings for displayed dataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### load sdc data

In [56]:
#%%
# define path to files & load data chunkwise

initialFolderPath = "data/initial_data/"
cleanedFolderPath = "data/cleaned_data/"

filePath = initialFolderPath + "000_sdc_full.csv"

sdc_data = data_import_chunkwise(filePath=filePath)

INFO:root:loading started...
INFO:root:loading finished.


The loaded data frame has 43709 rows and 94 columns.


### data exploration

In [57]:
# show first 5 rows
#print(sdc_data.head(5))

# show last 5 rows
#print(sdc_data.tail(5))

# get key statistics for data
#print(sdc_data.describe())

# show all columns names
#print(sdc_data.columns)

# inspect the sample size
len_paper = 43816
len_sample = len(sdc_data)

print(f'length of neede sample (from paper): {len_paper}\n'
      f'length of given sample: {len_sample}\n'
     f'difference of sample size: {len_paper-len_sample}')

# inspect the date range
min_max_print(sdc_data, 'IssueDate')

length of neede sample (from paper): 43816
length of given sample: 43709
difference of sample size: 107
[IssueDate] min: 1973-01-02, max: 2019-12-13


### +++ Comment +++
the given SDC data sample is not exactly the same as in the inspected paper. 
One reason could be the extraction process of the countries
Lowry et al. picked by excluding, not by including the choosen countries

data horizont is enough, data is needed from 01.01.1973 until 31.12.2016

### SDC data preparation

- Sort dataFrame
- choose only fulfilled IPOs
- convert date information
- convert price information
- create year variable



In [58]:
# sort dataFrame
sdc_data = sdc_data.sort_index()

# convert NAs in OrigIPO
sdc_data = convert_NAs(sdc_data, 'OrigIPO', print_bool = False)

# print values of IPO and OrigIPO
# print(collections.Counter(sdc_data['OrigIPO']))
# print(collections.Counter(sdc_data['IPO']))

# keep only IPOs
sdc_data = sdc_data.loc[(sdc_data['IPO']=='Yes') &
                        (sdc_data['OrigIPO']!='No'), :]

print(f'By including only IPOs, the data sample gets reduced from {len_sample} to {len(sdc_data)} samples.')

# find all columns which contains some date information
date_cols = find_char_in_colnames(sdc_data, 'Date', print_bool= False)

# find all columns which contains some price information
price_cols = find_char_in_colnames(sdc_data, 'Price', print_bool= False)

# find all columns which contains some share information
share_cols = find_char_in_colnames(sdc_data, 'Share', print_bool= False)

# find all columns which contains some overallotment information
overall_cols = find_char_in_colnames(sdc_data, 'Overall', print_bool= False)

# find all columns which contains some round information
round_cols = find_char_in_colnames(sdc_data, 'Round', print_bool= False)

# convert and clean date columns
for date_col in date_cols:
    sdc_data = convert_date(sdc_data, date_col, format='%Y-%m-%d', errors='coerce', print_bool = False)

# convert and clean price columns
price_cols = np.setdiff1d(price_cols, ['HistoryHighFilingPrice', 'HistoryLowFilingPrice'])
for price_col in price_cols:
    sdc_data = convert_price(data = sdc_data, column = price_col, errors='coerce', print_bool = False)

# convert proceeds column
sdc_data = convert_price(sdc_data, 'ProceedsAmtInThisMktMil', print_bool= False)

# create year variable
sdc_data['Year'] = sdc_data['IssueDate'].dt.year

# select date range
sdc_data = sdc_data[(sdc_data['IssueDate'] > '1973-1-1') & (sdc_data['IssueDate'] <= '2016-12-31')]
print('\n date range has been reduced to') 
min_max_print(sdc_data, 'IssueDate')

By including only IPOs, the data sample gets reduced from 43709 to 15247 samples.

 date range has been reduced to
[IssueDate] min: 1973-01-02 00:00:00, max: 2016-12-27 00:00:00


In [59]:
# clean the types of securites
length_before = len(sdc_data)

ex_types= ["Units", "Ltd Prtnr Int", "MLP-Common Shs", "Shs Benficl Int",
             "Ltd Liab Int", "Stock Unit", "Trust Units", "Beneficial Ints"]
sdc_data = sdc_data[~sdc_data.Type.isin(ex_types)]

### drop REIT, Units, ADR, penny stocks and CEF ###

# drop REIT - Real Estate Investment Trust
sdc_data = sdc_data[sdc_data['REIT'].isna()]

# drop Unit 
sdc_data = sdc_data[~(sdc_data['Unit'] == 'Yes')]

# drop Depositary (ADR)
sdc_data = sdc_data.loc[sdc_data['ADR']== 'No', :]

# filter the offer prices (drop penny stocks)
sdc_data[sdc_data['OfferPrice'].notna()]
sdc_data = sdc_data[sdc_data['OfferPrice']>5]

# filter CEF
sdc_data = sdc_data[sdc_data['CEF'] == 'No']

length_after = len(sdc_data)
print(f'before: {length_before} rows. After: {length_after} rows. \n'
     f'{length_before - length_after} samples are dropped.')

before: 14524 rows. After: 10026 rows. 
4498 samples are dropped.


In [60]:
# explore CUSIP
cusip_cols = find_char_in_colnames(sdc_data, 'CUSIP')

### creating 8-digit CUSIP to match with CRSP
sdc_data['CUSIP8'] = sdc_data['CUSIP6'].astype(str) + '10'
CUSIP9_sliced = sdc_data['CUSIP9'].str.slice(0, 8)
sdc_data['CUSIP8'] = sdc_data['CUSIP8'].where(CUSIP9_sliced.isna(), CUSIP9_sliced)

columns containing <<CUSIP>> are:
['CUSIP6' 'CUSIP9']



In [61]:
CUSIP9_68 = sdc_data['CUSIP8'].str.slice(6, 8)
collections.Counter(CUSIP9_68)

Counter({'00': 1,
         '01': 1,
         '10': 9592,
         '11': 10,
         '12': 1,
         '13': 2,
         '15': 2,
         '20': 269,
         '30': 84,
         '40': 24,
         '50': 19,
         '60': 2,
         '70': 3,
         '80': 5,
         '83': 1,
         '85': 1,
         '87': 3,
         '88': 1,
         '90': 1,
         'AA': 2,
         'B9': 1,
         'HE': 1})

In [62]:
# save cleaned sdc_data
sdc_data.to_csv(cleanedFolderPath + "sdc_data_cleaned.csv", index=False)
sdc_data.to_pickle(cleanedFolderPath + "sdc_data_cleaned.pkl")
print('cleaned sdc_data has been saved.')

cleaned sdc_data has been saved.


### +++ Comment +++
the given SDC data sample size is 10.026 after preparation.

According to Lowry et al, 11.103 samples should be left after preparation.

The achieved sample size is close to the target size.