# Finance V: Research Topics in Finance, Risk- and Resource management 
## Replication of paper: ...

<blockquote>
    Author: Stefan Reimer <br>
    Date: 2019-12-28 <br>
    python version: 3.7 <br>
</blockquote>

In [85]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import collections

import logging
# Fetch the root logger and lower its threshold to DEBUG so that no log
# records are filtered out by level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# import defined functions
from src.functions.functions import (data_import_chunkwise, convert_NAs, get_duplicates, 
                                     find_char_in_colnames, convert_date, convert_price)

# Display settings for DataFrames rendered in this notebook:
# never truncate the column list, and show at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# 1 Load Data

In [86]:
#%%
# Path to the raw SDC export. It is a relative path, so it is resolved
# against the working directory the notebook is started from.
filePath = "data/initial_data/000_sdc_full.csv"

# Load the file through the project helper `data_import_chunkwise`
# (presumably reads the CSV in chunks, judging by its name -- confirm in
# src/functions/functions.py). The result is the raw frame used by all
# following cells.
sdc_data = data_import_chunkwise(filePath=filePath)

INFO:root:loading started...
INFO:root:loading finished.


The loaded data frame has 43709 rows and 86 columns.


# first data exploration

In [87]:
# Optional exploration commands, intentionally left commented out.
# Uncomment a single line to inspect `sdc_data`.

# show first 5 rows
#print(sdc_data.head(5))

# show last 5 rows
#print(sdc_data.tail(5))

# get key statistics for data
#print(sdc_data.describe())

# show all columns names
#print(sdc_data.columns)

In [88]:
# Question: Does the sample size match the paper?
# Are only North American IPOs given?

# Number of observations reported in the paper vs. rows actually loaded.
len_paper = 43816
len_sample = len(sdc_data)

# Assemble the three-line comparison first, then emit it in one print call.
size_report = (
    f'length of neede sample (from paper): {len_paper}\n'
    f'length of given sample: {len_sample}\n'
    f'difference: {len_paper-len_sample}'
)
print(size_report)

length of neede sample (from paper): 43816
length of given sample: 43709
difference: 107


Answer: some IPO data are missing.
The difference of 107 observations could result from the missing years 1973 and 1974.

TODO: ASK PHILIPP, WHICH DATA HAS BEEN EXTRACTED

There are a lot of missing values in the data.

# 2 Data preparation

- Sort dataFrame
- choose only fulfilled IPOs
- convert date information
- convert price information
- create year variable



In [97]:
# Order the rows by their index labels (row axis, ascending), which are
# the defaults of `sort_index` spelled out explicitly.
sdc_data = sdc_data.sort_index(axis=0, ascending=True)

In [None]:
# Clean the missing-value markers of the OrigIPO column with the project
# helper (exact replacement rules live in src/functions/functions.py).
sdc_data = convert_NAs(sdc_data, 'OrigIPO')

# Tabulate the distinct values of both IPO flags before filtering.
for flag_col in ('OrigIPO', 'IPO'):
    print(collections.Counter(sdc_data[flag_col]))

# Keep only rows flagged as IPO whose OrigIPO flag is not an explicit 'No'.
# Rows with a missing OrigIPO compare unequal to 'No' and are therefore kept.
is_ipo = sdc_data['IPO'] == 'Yes'
orig_ipo_not_no = sdc_data['OrigIPO'] != 'No'
sdc_data = sdc_data.loc[is_ipo & orig_ipo_not_no, :]

print(len(sdc_data))
# 16,454 should be left.
# TODO: the difference is getting bigger; maybe made changes before the filtering?

In [98]:
# Collect the column names that carry date, price, share, overallotment and
# round information. The helper is called once per keyword, in this order,
# with its own printing switched off.
date_cols, price_cols, share_cols, overall_cols, round_cols = (
    find_char_in_colnames(sdc_data, keyword, print_bool= False)
    for keyword in ('Date', 'Price', 'Share', 'Overall', 'Round')
)

In [106]:
# Shared keyword arguments for the two project conversion helpers.
date_options = dict(format='%Y-%m-%d', errors='coerce', print_bool=False)
price_options = dict(errors='coerce', print_bool=False)

# Convert and clean every date column found earlier, one column per call.
for date_col in date_cols:
    sdc_data = convert_date(sdc_data, date_col, **date_options)

# Convert and clean every price column found earlier, one column per call.
for price_col in price_cols:
    sdc_data = convert_price(data=sdc_data, column=price_col, **price_options)

# Create the year variable from the issue date. The `.dt` accessor requires
# IssueDate to be datetime-typed -- presumably guaranteed by the date loop
# above, since the column name contains 'Date'.
sdc_data['Year'] = sdc_data['IssueDate'].dt.year

In [101]:
# Question: Is the data horizon sufficient?
filing_dates = sdc_data['FilingDate']
date_start = filing_dates.min()
date_end = filing_dates.max()
print(f'The data start at {date_start} and ends at {date_end}')

# Answer:
## No, data is needed from 01.01.1973 until 31.12.2016
## additional data needed for 1973 until 1975

The data start at 1983-01-14 00:00:00 and ends at 2019-11-25 00:00:00


In [103]:
# Clean the types of securities: drop every row whose `Type` is one of the
# excluded security types and report how many rows were removed.
length_before = len(sdc_data)
ex_types = ["Units", "Ltd Prtnr Int", "MLP-Common Shs", "Shs Benficl Int",
            "Ltd Liab Int", "Stock Unit", "Trust Units", "Beneficial Ints"]
sdc_data = sdc_data[~sdc_data['Type'].isin(ex_types)]
length_after = len(sdc_data)

# Report the row counts measured in this cell. The previous version printed
# `na_amount_before` / `na_amount_after`, which are not defined anywhere in
# the notebook and raise a NameError on a fresh kernel.
print(f'before: {length_before} rows. After: {length_after} \n'
      f'{length_before - length_after} are deleted.')

#### Should be "16,454 obs before ---> 15,107 obs after" ####

before: 15268 rows. After: 14483 
785 are deleted.


In [104]:
### drop REIT, Units, ADR, penny stocks and CEF ###
# Row count before the filters of this cell.
print(len(sdc_data))

# drop REIT - Real Estate Investment Trust
# Only rows with a missing REIT entry are kept.
sdc_data = sdc_data[sdc_data['REIT'].isna()]

# drop Unit
# Rows with a missing Unit flag compare unequal to 'Yes' and are kept.
sdc_data = sdc_data[~(sdc_data['Unit'] == 'Yes')]

# drop Depositary (ADR)
### print(collections.Counter(sdc_data['Depositary']))
### TODO: get depositary & delete despositary!!! ####

# filter the offer prices (drop penny stocks)
# The comparison is False for missing prices, so rows without an OfferPrice
# are dropped here as well. A separate `notna()` selection whose result was
# discarded has been removed, because it had no effect.
# NOTE(review): `> 5` also drops offers priced at exactly 5 -- confirm
# against the paper whether the cut-off should be `>= 5`.
sdc_data = sdc_data[sdc_data['OfferPrice'] > 5]

# filter CEF
# Only an explicit 'No' is kept; rows with a missing CEF flag are dropped.
sdc_data = sdc_data[sdc_data['CEF'] == 'No']

# Row count after all filters of this cell.
print(len(sdc_data))

14483
11130


In [None]:
# from CUSIP

In [None]:
# CRSP data

#date, PERMNO, NCUSIP, PRC, RET, SHROUT, EXCHCD,
#SHRCD