# Finance V: Research Topics in Finance, Risk- and Resource management 
## Replication of paper: ...

<blockquote>
    Author: Stefan Reimer <br>
    Date: 2019-12-28 <br>
    python version: 3.7 <br>
</blockquote>

In [85]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import collections

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# import defined functions
from src.functions.functions import (data_import_chunkwise, convert_NAs, get_duplicates, 
                                     find_char_in_colnames, convert_date, convert_price)

# set the settings for displayed dataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# 1 Load Data

In [86]:
#%%
# Path to the raw SDC export, relative to the notebook's working directory.
filePath = "data/initial_data/000_sdc_full.csv"

# Load the file through the project helper (imported from src.functions.functions).
# NOTE(review): chunk size and dtype handling live inside the helper and are not visible here.
sdc_data = data_import_chunkwise(filePath=filePath)

INFO:root:loading started...
INFO:root:loading finished.


The loaded data frame has 43709 rows and 86 columns.


# first data exploration

In [3]:
# show first 5 rows
#print(sdc_data.head(5))

# show last 5 rows
#print(sdc_data.tail(5))

# get key statistics for data
#print(sdc_data.describe())

# show all columns names
#print(sdc_data.columns)

In [35]:
# Question: does the sample size match the one reported in the paper,
# i.e. are only North American IPOs given?
len_paper = 43816  # sample size taken from the paper
len_sample = sdc_data.shape[0]

# Assemble the three report lines first, then emit them with a single print.
report_lines = [
    f'length of neede sample (from paper): {len_paper}',
    f'length of given sample: {len_sample}',
    f'difference: {len_paper-len_sample}',
]
print('\n'.join(report_lines))

length of neede sample (from paper): 43816
length of given sample: 43709
difference: 107


Answer: some IPO data are missing.
The difference of 107 observations could result from the missing years 1973 and 1974.

TODO: ASK PHILIPP, WHICH DATA HAS BEEN EXTRACTED

In [36]:
# Run the project's NA conversion on OrigIPO (the helper reports how many NAs it created).
sdc_data = convert_NAs(sdc_data, 'OrigIPO')

# Show the value distribution of both IPO flag columns, OrigIPO first.
for flag_col in ('OrigIPO', 'IPO'):
    print(collections.Counter(sdc_data[flag_col]))

0 NAs have been created. 6659 valid values are left. 

Counter({nan: 37050, 'No': 5110, 'Yes': 1549})
Counter({'No': 28400, 'Yes': 15309})


In [37]:
# Keep only IPOs: the IPO flag must be 'Yes' and OrigIPO must not be 'No'.
# Rows with a missing OrigIPO pass the second condition, because NaN != 'No'.
is_ipo = sdc_data['IPO'].eq('Yes')
orig_ipo_not_no = sdc_data['OrigIPO'].ne('No')
sdc_data = sdc_data.loc[is_ipo & orig_ipo_not_no, :]
print(len(sdc_data))
# 16,454 should be left.
# TODO: the difference is getting bigger; maybe made changes before the filtering?

15268


In [8]:
# Question: which columns are missing most of the time?
# Share of missing values per column = NA count divided by the number of rows.
na_freq = sdc_data.isna().sum().div(sdc_data.shape[0])

# Only report columns where more than 85% of the values are missing.
print(na_freq.loc[na_freq > 0.85])

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


REIT                                    0.933286
BestEftFirmCmtBghtDl                    0.995218
SecondaryShsFiledSumOfAllMkts           0.889611
AmendedSecondaryShsFiledSumOfAllMkts    0.906999
TickerSpinOffParent                     0.973209
ManagerAgentsLawyersCode                0.997987
DomesticSyndicateMemberCode             0.905878
FirmName                                0.898053
FundName                                0.898053
RoundNumberOfInvesTors                  0.898053
DisclosedRoundTotalmil                  0.904299
RoundDate                               0.898053
TotalKnownAmtInvestedInCompany000       0.905786
SpinParExch                             0.973118
dtype: float64


A lot of missing values in the data.

In [9]:
# Share of records whose IPO flag is 'Yes'.
# NOTE(review): once the "keep only IPOs" filter has been applied to sdc_data, every
# remaining row has IPO == 'Yes', so this share is 1.0 on a top-to-bottom run.
IPO_Counter = collections.Counter(sdc_data['IPO'])
print(IPO_Counter)

# Only rows flagged 'Yes' or 'No' enter the denominator.
ipo_flag_total = IPO_Counter['Yes'] + IPO_Counter['No']

# IPO_yes_percentage holds a fraction in [0, 1] (name kept for compatibility).
# An empty frame yields NaN instead of raising ZeroDivisionError.
IPO_yes_percentage = IPO_Counter['Yes'] / ipo_flag_total if ipo_flag_total else float('nan')

# The '%' format type multiplies by 100 and appends the percent sign, so the printed
# number is a real percentage rather than a fraction labelled "percent".
print(f"{IPO_yes_percentage:.2%} of the data points finished the IPO.")

Counter({'No': 28400, 'Yes': 15309})
0.3502482326294356 percent of the data points finished the IPO.


In [10]:
# Question: how many values are missing among the companies that finished the IPO?
# Select the finished IPOs once and reuse the subset for both the count and the total.
finished_ipos = sdc_data.loc[sdc_data['IPO'] == 'Yes']
na_freq_of_finished_ipos = finished_ipos.isna().sum() / finished_ipos.shape[0]
print(na_freq_of_finished_ipos)

DealNumber                                 0.000000
CIK                                        0.568881
CUSIP6                                     0.000000
CUSIP9                                     0.210464
Issuer                                     0.000000
FilingDate                                 0.071200
IssueDate                                  0.000000
IPO                                        0.000000
OrigIPO                                    0.896140
Type                                       0.000000
REIT                                       0.970475
ADR                                        0.001894
Unit                                       0.104318
CEF                                        0.000261
MainSICCode                                0.000000
Units                                      0.000261
SpinOff                                    0.000000
ForeignIssue                               0.000000
TrackingStockIssue                         0.003985
BestEftFirmC

## Duplicate analysis

In [13]:
# Duplicate checks for selected columns; get_duplicates is a project helper that
# prints its own summary line for each call.
fulfilled_ipos = sdc_data.loc[sdc_data['IPO'] == 'Yes', :]

duplicated_issuers = get_duplicates(sdc_data, 'Issuer')
duplicated_IPO_issuers = get_duplicates(fulfilled_ipos, 'Issuer', ' (only fulfilled IPOs)')
duplicated_CUSIP6 = get_duplicates(sdc_data, 'CUSIP6')

# Number of index labels that occur more than once (first occurrence not counted).
duplicated_indices = sdc_data.index.duplicated().sum()
print(f'There are {duplicated_indices} duplicated indices.')

There are 22804 duplicates in Issuer.
There are 1430 duplicates in Issuer (only fulfilled IPOs).
There are 23050 duplicates in CUSIP6.
There are 0 duplicated indices.


In [13]:
# show number of duplicates for all columns
#for column in sdc_data.columns.values:
#    duplicated_issuers = get_duplicates(sdc_data, column)

In [14]:
# Inspect the rows flagged as duplicated issuers.
# Keep only the entries of the helper's result that are marked as duplicates (== 1).
duplicated_issuers_true = duplicated_issuers[duplicated_issuers == 1]

# Select those rows by index label, restrict to the identifying columns and
# sort by CUSIP9 so that rows of the same security end up next to each other.
in_duplicate_set = sdc_data.index.isin(duplicated_issuers_true.index)
duplicated_issuers_values = (
    sdc_data
    .loc[in_duplicate_set, ['Issuer', 'CUSIP9', 'IPO']]
    .sort_values('CUSIP9')
)
print(duplicated_issuers_values.head(70))
# inspect chosen duplicates
#sdc_data.loc[sdc_data['Issuer'] == 'AMAX Inc', :]
#sdc_data.loc[sdc_data['Issuer'] == 'ADMA Biologics Inc', :]

                             Issuer     CUSIP9  IPO
6564                       AAR Corp  000361105   No
18529                      AAR Corp  000361105   No
15564        ABC Rail Products Corp  000752105   No
15563        ABC Rail Products Corp  000752105   No
14811        ABC Rail Products Corp  000752105   No
7332                  ABM Gold Corp  000776104  Yes
16810  ABR Information Services Inc  00077R108   No
13867    ABT Building Products Corp  000782102   No
17835                      ACC Corp  000794107   No
17043                      ACC Corp  000794107   No
24962   A C Moore Arts & Crafts Inc  00086T103   No
13865           ACS Enterprises Inc  000872309   No
11750           ACS Enterprises Inc  000872309   No
22873                      ACTV Inc  00088E104   No
22872                      ACTV Inc  00088E104   No
12983                    ADESA Corp  000892109   No
39955            ADMA Biologics Inc  000899104   No
41606            ADMA Biologics Inc  000899104   No
43131       

# 2 Data preparation

- Sort dataFrame
- inspect groups of columns
- convert date types
- create additional variables

In [40]:
# Sort the dataFrame by its index labels (the earlier filters keep the original labels).
sdc_data = sdc_data.sort_index()

In [41]:
# Group the columns by topic: for each keyword, ask the project helper for the
# column names that contain it (date, price, share, overallotment and round
# information, in that order). print_bool=False keeps the helper silent.
date_cols, price_cols, share_cols, overall_cols, round_cols = (
    find_char_in_colnames(sdc_data, keyword, print_bool=False)
    for keyword in ('Date', 'Price', 'Share', 'Overall', 'Round')
)

In [42]:
# Convert the date columns with the project helper. All four use the same format;
# only the last two are converted with errors='coerce'.
# NOTE(review): in the recorded run AmendMentDate and RoundDate ended up with 0 valid
# values, so '%Y-%m-%d' presumably does not match their raw content -- confirm.
date_conversions = (
    ('IssueDate', {}),
    ('FilingDate', {}),
    ('AmendMentDate', {'errors': 'coerce'}),
    ('RoundDate', {'errors': 'coerce'}),
)
for date_col, extra_kwargs in date_conversions:
    sdc_data = convert_date(sdc_data, date_col, format='%Y-%m-%d', **extra_kwargs)

# create year variable from the issue date
sdc_data['Year'] = sdc_data['IssueDate'].dt.year

IssueDate has been converted. 
0 NAs have been created. 15268 valid values are left. 

FilingDate has been converted. 
0 NAs have been created. 14178 valid values are left. 

AmendMentDate has been converted. 
10684 NAs have been created. 0 valid values are left. 

RoundDate has been converted. 
4456 NAs have been created. 0 valid values are left. 



In [44]:
# Question: does the data cover a long enough time horizon?
# The horizon is measured on the filing date (missing filing dates are ignored by min/max).
filing_dates = sdc_data['FilingDate']
date_start = filing_dates.min()
date_end = filing_dates.max()
print(f'The data start at {date_start} and ends at {date_end}')

#Answer:
## No, data is needed from 01.01.1973 until 31.12.2016
## additional data needed for 1973 until 1975

The data start at 1983-01-14 00:00:00 and ends at 2019-11-25 00:00:00


***
<font color ='blue'> __TODO: Methods to keep AmendMentDate and RoundDate__ </font>
***

***
<font color ='blue'> __TODO: convert price information__ </font>
***

In [45]:
# Run the project's price conversion on every column whose name contains 'Price'
# (price_cols comes from find_char_in_colnames). The helper prints one report per column.
for price_col in price_cols:
    sdc_data = convert_price(data = sdc_data, column = price_col, errors='coerce') # errors='coerce' is forwarded to the helper

OfferPrice has been converted. 
0 NAs have been created. 15268 valid values are left. 

AmendedHighFilingPrice has been converted. 
0 NAs have been created. 10543 valid values are left. 

AmendedLowFilingPrice has been converted. 
0 NAs have been created. 10545 valid values are left. 

AmendedMiddleOfFilingPrice has been converted. 
0 NAs have been created. 10547 valid values are left. 

HighPriceOfFilingPriceRnge has been converted. 
5364 NAs have been created. 5201 valid values are left. 

LowPriceOfFilingPriceRnge has been converted. 
5364 NAs have been created. 5201 valid values are left. 

OriginalHighFilingPrice has been converted. 
0 NAs have been created. 8989 valid values are left. 

OriginalLowFilingPrice has been converted. 
0 NAs have been created. 8996 valid values are left. 

OriginalMiddleOfFilingPriceRange has been converted. 
0 NAs have been created. 9000 valid values are left. 



In [24]:
#sdc_data[price_cols].notna().sum()

#sdc_data[price_cols].isna().sum()

# IPO data points with total missing pricing information before IPO
#sdc_data.loc[(sdc_data["AmendedHighFilingPrice"].isna()) & 
#             (sdc_data["OriginalHighFilingPrice"].isna()) & 
#             (sdc_data["HighPriceOfFilingPriceRnge"].isna()) &
#             (sdc_data['IPO'] == 'Yes'), :]

# 1923 data rows are missing with price information

In [50]:
# Clean the types of securities: remove every row whose Type is in the exclusion list.
na_amount_before = sdc_data.shape[0]  # row count before filtering

ex_types = [
    "Units",
    "Ltd Prtnr Int",
    "MLP-Common Shs",
    "Shs Benficl Int",
    "Ltd Liab Int",
    "Stock Unit",
    "Trust Units",
    "Beneficial Ints",
]
is_excluded_type = sdc_data['Type'].isin(ex_types)
sdc_data = sdc_data.loc[~is_excluded_type]

na_amount_after = sdc_data.shape[0]  # row count after filtering
print(f'before: {na_amount_before} rows. After: {na_amount_after} \n'
      f'{na_amount_before - na_amount_after} are deleted.')

#### Should be "16,454 obs before ---> 15,107 obs after" ####

before: 15268 rows. After: 14483 
785 are deleted.


In [51]:
# List the column names of the filtered frame (displayed as the cell's last expression).
sdc_data.columns

Index(['DealNumber', 'CIK', 'CUSIP6', 'CUSIP9', 'Issuer', 'FilingDate',
       'IssueDate', 'IPO', 'OrigIPO', 'Type', 'REIT', 'ADR', 'Unit', 'CEF',
       'MainSICCode', 'Units', 'SpinOff', 'ForeignIssue', 'TrackingStockIssue',
       'BestEftFirmCmtBghtDl', 'OfferPrice',
       'SharesOutstandingAfterTheOffering', 'SharesOutstandingBeforeOffering',
       'SharesOfferedSumOfAllMkts', 'SharesOfrdIncOverSoldSumOfAllMkts',
       'PrimaryShsOfrdSumOfAllMkts', 'SecondaryShsOfrdSumOfAllMkts',
       'OverallotAmtOptionSumOfAllMktsMil', 'OverallotAmtSoldSumOfAllMktsmil',
       'TotGlobalOverallotmentSharesSold', 'SharesFiledSumOfAllMkts',
       'PrimaryShsFiledSumOfAllMkts', 'SecondaryShsFiledSumOfAllMkts',
       'AmendedShsFiledSumOfAllMkts', 'AmendedPrimaryShsFiledSumOfAllMkts',
       'AmendedSecondaryShsFiledSumOfAllMkts', 'AmendMentDate',
       'AmendHistShsFiledSumOfAllMkts', 'AmendHistSecShsFiledSumOfAllMkts',
       'AmendHistOverallotShsOptionSumOfAllMkts', 'TickerAtIssue',
   

In [84]:
### drop REIT, Units, ADR, penny stocks and CEF ###
# Row count before the filters.
print(len(sdc_data))

# drop REIT - Real Estate Investment Trust: keep only rows whose REIT value is missing
# NOTE(review): assumes every non-missing REIT value marks a REIT -- confirm
sdc_data = sdc_data[sdc_data['REIT'].isna()]

# drop Unit offerings; rows with a missing Unit flag are kept
sdc_data = sdc_data[~(sdc_data['Unit'] == 'Yes')]

# drop Depositary (ADR)
### print(collections.Counter(sdc_data['Depositary']))
### TODO: get depositary & delete despositary!!! ####

# filter the offer prices (drop penny stocks)
# The comparison is False for missing values, so rows without an offer price are
# dropped here as well; the former stand-alone notna() expression was never
# assigned and had no effect, so it has been removed.
# NOTE(review): '> 5' also removes offer prices of exactly 5 -- confirm whether the
# paper's cut-off is "below 5" (which would need '>= 5').
sdc_data = sdc_data[sdc_data['OfferPrice']>5]

# filter CEF: keep only rows explicitly flagged 'No' (rows with a missing CEF flag are dropped)
sdc_data = sdc_data[sdc_data['CEF'] == 'No']

# Row count after the filters.
print(len(sdc_data))

11130
11130


In [29]:
# How large is the price span? Span = high filing price minus low filing price,
# once for the amended and once for the original filing range.
sdc_data = sdc_data.assign(
    AmendedFilingPriceSpan=sdc_data['AmendedHighFilingPrice'] - sdc_data['AmendedLowFilingPrice'],
    OriginalFilingPriceSpan=sdc_data['OriginalHighFilingPrice'] - sdc_data['OriginalLowFilingPrice'],
)

In [30]:
# Non-missing span values as stand-alone Series with a fresh 0..n-1 index.
AmendedFilingPriceSpan = sdc_data['AmendedFilingPriceSpan'].dropna().reset_index(drop=True)
OriginalFilingPriceSpan = sdc_data['OriginalFilingPriceSpan'].dropna().reset_index(drop=True)

In [79]:
#collections.Counter(AmendedFilingPriceSpan)
# Number of original filing price spans in the Series.
print(len(OriginalFilingPriceSpan))
# Frequency of each distinct span value; displayed because it is the cell's last expression.
collections.Counter(OriginalFilingPriceSpan)
#plt.hist(AmendedFilingPriceSpan, bins=100)
#plt.show()

30258


Counter({0.0: 24056,
         2.0: 4086,
         4.0: 46,
         1.0: 600,
         3.0: 517,
         1.25: 17,
         0.5: 182,
         0.25: 67,
         1.5: 169,
         0.75: 39,
         0.6699999999999999: 7,
         2.5: 38,
         0.375: 6,
         3.3000000000000007: 1,
         6.0: 5,
         3.6000000000000014: 2,
         2.75: 2,
         2.25: 5,
         0.125: 2,
         5.0: 11,
         7.0: 1,
         0.625: 2,
         3.75: 1,
         8.0: 1,
         0.6500000000000004: 4,
         0.6600000000000001: 3,
         0.22499999999999998: 1,
         0.1499999999999999: 1,
         0.9199999999999999: 1,
         1.75: 4,
         2.8000000000000007: 1,
         0.33000000000000007: 5,
         2.1000000000000005: 1,
         2.5999999999999996: 1,
         2.08: 1,
         0.16999999999999993: 2,
         2.6999999999999993: 1,
         0.03: 1,
         0.6000000000000001: 1,
         1.700000000000001: 2,
         1.125: 1,
         0.199999999999

In [None]:
#date, PERMNO, NCUSIP, PRC, RET, SHROUT, EXCHCD,
#SHRCD



# Questions: which values do i need?
# IPO Flag (Y/N)
#sum(sdc_data.loc[:, "IPOFlag(Y/N)"] == "Yes")
# Out: 18224
#sum(sdc_data.loc[:, "IPOFlag(Y/N)"] == "No")
# Out: 31503

# US subset.
# NOTE(review): 'Nation' is not among the column names visible in this notebook -- confirm it exists.
sdc_data_usa = sdc_data.loc[sdc_data['Nation'] == "United States", :]

# US IPOs. The mask is built from sdc_data_usa itself (it used to come from the full
# sdc_data frame, whose index differs from the subset's). The flag column is 'IPO',
# the name used by the rest of this notebook.
sdc_data_usa_ipo = sdc_data_usa.loc[sdc_data_usa['IPO'] == 'Yes', :]

# Successively require the columns needed later and print the remaining shape after each step.
df = sdc_data_usa_ipo.copy()
print(df.shape)
for required_col in ('FilingDate', 'OfferPrice',
                     'OriginalHighFilingPrice', 'OriginalLowFilingPrice'):
    df = df.dropna(subset=[required_col])
    print(df.shape)
#df = df.dropna(subset=['LowPriceofFilingPriceRnge'])
#print(df.shape)
#df = df.dropna(subset=['HighPriceofFilingPriceRnge'])
#print(df.shape)

# Rows that have BOTH ends of the filing price range. Previously the second
# assignment overwrote the first, so only the low price was required; the column
# names now use the spelling reported by the price conversion ('...Of...Rnge').
# NOTE: despite its name, df_without_range holds the rows WITH range data.
df_without_range = df.dropna(subset=['HighPriceOfFilingPriceRnge',
                                     'LowPriceOfFilingPriceRnge'])

# Yearly observation counts at each stage of the sample selection.
# size() counts all rows of a year, count() only rows with a non-missing value.

# Data per Year?
data_per_year = sdc_data.groupby('Year')['IssueDate'].size()

# How many data for US per Year?
data_US_per_year = sdc_data_usa.groupby('Year')['IssueDate'].count()

# How many data for US IPOs per Year?
ipo_per_year = sdc_data_usa_ipo.groupby('Year')['IssueDate'].count()

# How many OfferPrices are per year given?
ipo_US_with_data_per_year = df.groupby('Year')['OfferPrice'].count()

# How many Ranges are per year given?
ipo_US_with_range_data_per_year = df_without_range.groupby('Year')['OfferPrice'].count()

# Outer-join all counts on Year. The resulting column names come from merge's
# automatic suffixes: IssueDate_x, IssueDate_y, IssueDate, OfferPrice_x, OfferPrice_y.
ipos = data_per_year
for yearly_counts in (data_US_per_year, ipo_per_year,
                      ipo_US_with_data_per_year, ipo_US_with_range_data_per_year):
    ipos = pd.merge(ipos, yearly_counts, how='outer', on="Year")

# create barplot with numbers of IPOs
# The series are drawn on top of each other in this order; each bar call carries its
# own label so the legend no longer needs manually collected handles. The unused
# `width` variable was dropped.
fig, ax = plt.subplots()
plotted_series = (
    ('IssueDate_x', 'data in SDC'),
    ('IssueDate_y', 'US data in SDC'),
    ('IssueDate', 'US IPOs in SDC'),
    ('OfferPrice_x', 'US IPOs in SDC with needed price data'),
    ('OfferPrice_y', 'US IPOs in SDC with needed range data'),
)
p1, p2, p3, p4, p5 = [ax.bar(ipos.index, ipos[column], label=label)
                      for column, label in plotted_series]

# Label the figure so it can be read on its own.
ax.set_xlabel('Year')
ax.set_ylabel('Number of observations')
ax.set_title('Observations per year by selection stage')
ax.legend()
ax.autoscale_view()
plt.show()


# TODO: set date range
# Build the subset from sdc_data; the previous right-hand side read the not yet
# defined name sdc_data_subset and raised a NameError.
sdc_data_subset = sdc_data.loc[sdc_data['Year'] > 2000]

# set Issue Date as index (the IssueDate column itself is kept)
sdc_data.index = sdc_data["IssueDate"]

# Non-missing values per column and year. The result is not assigned and only shows
# up when this statement is the last expression of its cell.
sdc_data.groupby(["Year"]).count()

# TODO: Offer Price
# TODO: Original Low Filing Price
# TODO: Original High Filing Price

#sum(sdc_data.loc[["Filing Date"].notna() , ["Filing Date"]] == sdc_data.index)
#sum(sdc_data.loc[sdc_data["Filing Date"].notna() , ["Filing Date"]].values== sdc_data.loc[sdc_data["Filing Date"].notna() , :].index)

# TODO: how many values are since 1983 given?

# TODO: Get only numeric values
#def to_numeric(df, column):
#    df[df[[column]].apply(lambda x: x[0].isdigit(), axis=1)]
#to_numeric(sdc_data, "High Price of Filing Price Range")

# Column spelling as reported by the price conversion earlier in the notebook.
high_range_col = "HighPriceOfFilingPriceRnge"

# Python type of one stored value. The first row is read by position, because the
# former integer label 49724 is no index label once the index is the issue date.
type(sdc_data[high_range_col].iloc[0])

# Coerce to numbers once and reuse the result: `test` holds the numeric values
# (NaN where conversion failed), `cleaned_data` the rows that converted.
test = pd.to_numeric(sdc_data[high_range_col], errors='coerce')
cleaned_data = sdc_data[test.notnull()]
test.isna().sum()

# quick modeling

# Observations issued strictly between 1983-01-01 and 1987-09-30.
issued_after_start = sdc_data["IssueDate"] > "1983-01-01"
issued_before_end = sdc_data["IssueDate"] < "1987-09-30"
sdc_data_1987 = sdc_data.loc[issued_after_start & issued_before_end, :]
# 5009 examples from 1983 to Sept 1987

# The IPO flag column is 'IPO' in the rest of this notebook; "IPOFlag(Y/N)" was a
# third spelling of the same flag.
# NOTE(review): the counts in the comments below were written for the old column
# name and may not match the current data.
is_ipo_1987 = sdc_data_1987["IPO"] == "Yes"
is_ipo_1987.sum()
# 2689 examples with IPO Flag

sdc_data_1987_IPO = sdc_data_1987.loc[is_ipo_1987, :]

sdc_data_1987_IPO.isna().sum()
# missing values
#Original Low Filing Price                        488
#Original High Filing Price                       489
#Low Price of Filing Price Range                2679
#High Price of Filing Price Range                2679

# Keep rows with an original high filing price. Then drop the IssueDate column
# before reset_index(), which turns the index back into a column.
# NOTE(review): this relies on the index having been set to IssueDate beforehand.
sdc_data_1987_IPO = sdc_data_1987_IPO.loc[sdc_data_1987_IPO["OriginalHighFilingPrice"].notna(), :]
sdc_data_1987_IPO = sdc_data_1987_IPO.drop(columns=["IssueDate"])
sdc_data_1987_IPO = sdc_data_1987_IPO.reset_index()


#sdc_data_1987_IPO = sdc_data[pd.to_numeric(sdc_data_1987_IPO["OfferPrice"])]
#sdc_data_1987_IPO = sdc_data[pd.to_numeric(sdc_data_1987_IPO["OriginalHighFilingPrice"])]

#features = np.array[("IssueDate", "IPOFlag(Y/N", "OfferPrice", "OriginalLowFilingPrice", "OriginalHighFilingPrice")]
#results = smf.ols('OfferPrice ~ OriginalLowFilingPrice + OriginalHighFilingPrice', data=sdc_data_1987_IPO).fit()
#print(results.summary())

#X = sdc_data_1987_IPO.loc[:, ["OriginalLowFilingPrice", "OriginalHighFilingPrice"]].values
#y = sdc_data_1987_IPO["OfferPrice"]
#X = sm.add_constant(X)

# NOTE: df is an alias of sdc_data, not a copy.
df = sdc_data

# Pattern for prices written like "1.000,000": digits, a literal dot, digits, a comma,
# digits. The dot is escaped now; unescaped it matched any character.
regex = r"\d+\.\d+,\d+"


def _rows_not_matching(frame, column, pattern):
    """Return the rows of `frame` whose non-missing `column` value does not contain `pattern`.

    Values are compared through their string form, so the check also works on columns
    that were already converted to numbers (where the `.str` accessor is unavailable).
    Rows with a missing value are never returned.
    """
    values = frame[column]
    contains_pattern = values.astype(str).str.contains(pat=pattern, regex=True)
    return frame[(values.notna() & ~contains_pattern).to_numpy()]


# The three results are not assigned; only the last expression of a cell is displayed.
_rows_not_matching(df, "OfferPrice", regex)
_rows_not_matching(df, "OriginalHighFilingPrice", regex)
_rows_not_matching(df, "OriginalLowFilingPrice", regex)

# Test two indizes
#2019-08-20 1.000,000
#2019-08-16 ,810
#2019-08-15 1,250
#2019-07-26 ,75
#2019-04-05 7.500,000

# Rows issued on the example dates above.
# NOTE(review): this relies on the index holding the issue dates.
sample = df.loc[df.index.isin(["2019-08-20",
                               "2019-08-16",
                               "2019-08-15",
                               "2019-07-26",
                               "2019-04-05"]), :]

#test = df[pd.to_numeric(df["OriginalLowFilingPrice"], errors='coerce')]

verb = df.loc[df['Issuer'] == "Verb Technology Co Inc", :]


# Price columns of the sample rows; the two range columns use the spelling reported
# by the price conversion ('...Rnge'); the '...Range' spelling is not among the
# converted price columns.
test = sample.loc[:, ['OfferPrice',
               'OriginalHighFilingPrice',
               'OriginalLowFilingPrice',
               'LowPriceOfFilingPriceRnge',
               'HighPriceOfFilingPriceRnge']]

