# Finance V: Research Topics in Finance, Risk- and Resource management 
## Replication of paper: ...

<blockquote>
    Author: Stefan Reimer <br>
    Date: 2019-12-28 <br>
    python version: 3.7 <br>
</blockquote>

In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import collections

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# import defined functions
from src.functions.functions import data_import_chunkwise, replace_hardcoded_columnames

# set the settings for displayed dataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# 1 Load Data

In [193]:
#%%
# define path to files & load data chunkwise
filePath = "data/initial_data/000_sdc_full.csv"

sdc_data = data_import_chunkwise(filePath=filePath)

INFO:root:loading started...
INFO:root:loading finished.


The loaded data frame has 43709 rows and 86 columns.


# first data exploration

In [3]:
# show first 5 rows
sdc_data.head(5)

Unnamed: 0,DealNumber,CIK,CUSIP6,CUSIP9,Issuer,FilingDate,IssueDate,IPO,OrigIPO,Type,REIT,ADR,Unit,CEF,MainSICCode,Units,SpinOff,ForeignIssue,TrackingStockIssue,BestEftFirmCmtBghtDl,OfferPrice,SharesOutstandingAfterTheOffering,SharesOutstandingBeforeOffering,SharesOfferedSumOfAllMkts,SharesOfrdIncOverSoldSumOfAllMkts,PrimaryShsOfrdSumOfAllMkts,SecondaryShsOfrdSumOfAllMkts,OverallotAmtOptionSumOfAllMktsMil,OverallotAmtSoldSumOfAllMktsmil,TotGlobalOverallotmentSharesSold,SharesFiledSumOfAllMkts,PrimaryShsFiledSumOfAllMkts,SecondaryShsFiledSumOfAllMkts,AmendedShsFiledSumOfAllMkts,AmendedPrimaryShsFiledSumOfAllMkts,AmendedSecondaryShsFiledSumOfAllMkts,AmendMentDate,AmendHistShsFiledSumOfAllMkts,AmendHistSecShsFiledSumOfAllMkts,AmendHistOverallotShsOptionSumOfAllMkts,TickerAtIssue,TickerUltimateParents,TickerCurrent,TickerSpinOffParent,AllManagers_x,NonBookrunners,AllManagersParentsCode,AllMgrRoleCode,Managers_x,CoManagers,LeadManager,ManagerAgentsLawyersCode,DomesticSyndicateMemberCode,Bookrunners_x,BookrunnersParent,VentureBacked,FirmName,FundName,RoundNumberOfInvesTors,DisclosedRoundTotalmil,RoundDate,TotalKnownAmtInvestedInCompany000,AllManagers_y,Bookrunners_y,LeadManagersLongName,LeadManagers,Managers_y,CoManagersLongName,State,Nation,CurrentExchangeLongDescriptio,AllExchangesWhereIssueWillBeListed_1,AllExchangesWhereIssueWillBeListed_2,ExchangeWhereIssuWillBeLi,AllExchangesWhereIssuersStockTrades,FirstTwoExchangesWhereIssueWillBeListed,SpinParExch,PrimaryExchangeWhereIssuersStockTrades,AmendedHighFilingPrice,AmendedLowFilingPrice,AmendedMiddleOfFilingPrice,HighPriceOfFilingPriceRnge,LowPriceOfFilingPriceRnge,OriginalHighFilingPrice,OriginalLowFilingPrice,OriginalMiddleOfFilingPriceRange
0,62319002,,655312,,Nolex Corp,,1973-01-02,Yes,,Common Shares,,No,No,No,5111,No,N,No,No,,7.0,500000.0,,500000.0,500000.0,500000.0,0.0,,,,,,,,,,,,,,NLX,NLX,,,SHAPIRO,,SHAPIRO,BM,SHAPIRO,,SHAPIRO,,,SHAPIRO,SHAPIRO,No,,,,,,,J. Shapiro,J. Shapiro Co.,J. Shapiro Co.,SHAPIRO,J. Shapiro Co.,,California,United States,New York,A,A,AMEX,American,AMEX,,American,,,,,,,,
1,62321002,,560787,,Major Electronics,,1973-01-04,No,,Common Shares,,No,No,No,3669,No,N,No,No,,16.75,150000.0,,225000.0,225000.0,150000.0,75000.0,,,,,,,,,,,,,,,,,,SEIDEN,,SEIDEN,BM,SEIDEN,,SEIDEN,,,SEIDEN,SEIDEN,No,,,,,,,Seiden & de Cuevas,Seiden & de Cuevas,Seiden & de Cuevas,SEIDEN,Seiden & de Cuevas,,Unknown,United States,,NM,NM,NASDQ,Nasdaq,NASDQ,,,,,,,,,,
2,62322002,,1032,,AES Technology Systems,,1973-01-05,Yes,,Common Shares,,No,No,No,3579,No,N,No,No,,10.5,150000.0,,260000.0,260000.0,150000.0,110000.0,,,,,,,,,,,,,,AEST,AEST,,,RICHARD-ELLIS//PASCUMA,PASCUMA,CB-RICHARD-ELL,BM,RICHARD-ELLIS,PASCUMA,RICHARD-ELLIS,,,RICHARD-ELLIS,CB-RICHARD-ELL,No,,,,,,,"Pascuma, Florsheim",CB Richard Ellis & Co,CB Richard Ellis & Co,RICHARD-ELLIS,"Pascuma, Florsheim & Co.","Pascuma, Florsheim & Co.",Illinois,United States,,O,O,OTC,OTC,OTC,,OTC,,,,,,,,
3,62324002,,893287,,Trans-National Leasing Inc,,1973-01-05,Yes,,Common Shares,,No,No,No,7515,No,N,No,No,,5.5,200000.0,,200000.0,200000.0,200000.0,0.0,,,,,,,,,,,,,,TNLS,TNLS,,,BROWN-ALLEN,,BROWN-ALLEN,BM,BROWN-ALLEN,,BROWN-ALLEN,,,BROWN-ALLEN,BROWN-ALLEN,No,,,,,,,"Brown, Allen & Co.","Brown, Allen & Co.","Brown, Allen & Co.",BROWN-ALLEN,"Brown, Allen & Co.",,Texas,United States,New York,O,O,OTC,OTC,OTC,,OTC,,,,,,,,
4,62326002,,221754,,Cotton Petroleum,,1973-01-09,No,,Common Shares,,No,No,No,1311,Yes,N,No,No,,14.0,500000.0,,500000.0,500000.0,500000.0,0.0,,,,,,,,,,,,,,,,,,G-H-WALKER,,BOA-MERRILL,BM,G-H-WALKER,,G-H-WALKER,,,G-H-WALKER,BOA-MERRILL,No,,,,,,,"G. H. Walker & Co., Inc.","G. H. Walker & Co., Incorporated","G. H. Walker & Co., Incorporated",G-H-WALKER,"G. H. Walker & Co., Incorporated",,Unknown,United States,,O,O,OTC,OTC,OTC,,,,,,,,,,


In [4]:
# show last 5 rows
sdc_data.tail(5)

Unnamed: 0,DealNumber,CIK,CUSIP6,CUSIP9,Issuer,FilingDate,IssueDate,IPO,OrigIPO,Type,REIT,ADR,Unit,CEF,MainSICCode,Units,SpinOff,ForeignIssue,TrackingStockIssue,BestEftFirmCmtBghtDl,OfferPrice,SharesOutstandingAfterTheOffering,SharesOutstandingBeforeOffering,SharesOfferedSumOfAllMkts,SharesOfrdIncOverSoldSumOfAllMkts,PrimaryShsOfrdSumOfAllMkts,SecondaryShsOfrdSumOfAllMkts,OverallotAmtOptionSumOfAllMktsMil,OverallotAmtSoldSumOfAllMktsmil,TotGlobalOverallotmentSharesSold,SharesFiledSumOfAllMkts,PrimaryShsFiledSumOfAllMkts,SecondaryShsFiledSumOfAllMkts,AmendedShsFiledSumOfAllMkts,AmendedPrimaryShsFiledSumOfAllMkts,AmendedSecondaryShsFiledSumOfAllMkts,AmendMentDate,AmendHistShsFiledSumOfAllMkts,AmendHistSecShsFiledSumOfAllMkts,AmendHistOverallotShsOptionSumOfAllMkts,TickerAtIssue,TickerUltimateParents,TickerCurrent,TickerSpinOffParent,AllManagers_x,NonBookrunners,AllManagersParentsCode,AllMgrRoleCode,Managers_x,CoManagers,LeadManager,ManagerAgentsLawyersCode,DomesticSyndicateMemberCode,Bookrunners_x,BookrunnersParent,VentureBacked,FirmName,FundName,RoundNumberOfInvesTors,DisclosedRoundTotalmil,RoundDate,TotalKnownAmtInvestedInCompany000,AllManagers_y,Bookrunners_y,LeadManagersLongName,LeadManagers,Managers_y,CoManagersLongName,State,Nation,CurrentExchangeLongDescriptio,AllExchangesWhereIssueWillBeListed_1,AllExchangesWhereIssueWillBeListed_2,ExchangeWhereIssuWillBeLi,AllExchangesWhereIssuersStockTrades,FirstTwoExchangesWhereIssueWillBeListed,SpinParExch,PrimaryExchangeWhereIssuersStockTrades,AmendedHighFilingPrice,AmendedLowFilingPrice,AmendedMiddleOfFilingPrice,HighPriceOfFilingPriceRnge,LowPriceOfFilingPriceRnge,OriginalHighFilingPrice,OriginalLowFilingPrice,OriginalMiddleOfFilingPriceRange
43704,3488205002,1412408.0,71944F,71944F106,Phreesia Inc,2019-12-10,2019-12-12,No,,Common Shares,,No,No,No,7372,No,N,No,No,,26.0,35872057.0,3.0,6750000.0,6750000.0,0.0,6750000.0,26.325,,,6000000.0,,6000000.0,6750000.0,,6750000.0,12/12/19,6750000.0,6750000.0,1012500.0,PHR,PHR,PHR,,JPM-SEC-LLC(JB)/WELLS-FARGO-SEC(JB),BAIRD/RAYMOND,JPM,JB,JPM-SEC-LLC,BAIRD/RAYMOND,JPM-SEC-LLC/WELLS-FARGO-SEC/WM-BLAIR,,,JPM-SEC-LLC,JPM,No,,,,,,,Wells Fargo Securities LLC|William Blair & Co|...,Wells Fargo Securities LLC|William Blair & Co|...,Wells Fargo Securities LLC|William Blair & Co|...,WELLS-FARGO-SEC|WM-BLAIR|ALLEN|PIPER-JAFFRAY,Wells Fargo Securities LLC|William Blair & Co|...,Raymond James & Associates Inc,New York,United States,New York,N,N,NYSE,New York,NYSE,,New York,28.67,28.67,28.67,28.67,28.67,28.93,28.93,28.93
43705,3481495002,1789760.0,8J3835,,Ciig Merger Corp,2019-11-22,2019-12-12,Yes,,Units,,No,Yes,Yes,6726,No,Y,No,No,,10.0,28968750.0,,22500000.0,22500000.0,22500000.0,0.0,33.75,,,22500000.0,22500000.0,,22500000.0,22500000.0,,12/12/19,22500000.0,,3375000.0,CIICU,,,,UBS-SEC(JB)/BARCLAYS-CAP-I(JB),,UBS-BANK,JB,UBS-SEC,,UBS-SEC/BARCLAYS-CAP-I,,,UBS-SEC,UBS-BANK,No,,,,,,,Barclays Capital Inc,Barclays Capital Inc,Barclays Capital Inc,BARCLAYS-CAP-I,Barclays Capital Inc,,New York,United States,,NM,NM,NASDQ,Nasdaq,NASDQ,,Nasdaq,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
43706,3481532002,1791091.0,8J3864,,Healthcare Merger Corp,2019-11-25,2019-12-12,Yes,,Units,,No,No,Yes,6726,No,Y,No,No,,10.0,0.0,,22000000.0,22000000.0,22000000.0,0.0,33.0,,,20000000.0,20000000.0,,22000000.0,22000000.0,,12/12/19,22000000.0,,3300000.0,HCCOU,,,,CAN-FITZ-CO,,CAN-FITZ,BM,CAN-FITZ-CO,,CAN-FITZ-CO,,,CAN-FITZ-CO,CAN-FITZ,No,,,,,,,Cantor Fitzgerald & Co,Cantor Fitzgerald & Co,Cantor Fitzgerald & Co,CAN-FITZ-CO,Cantor Fitzgerald & Co,,New York,United States,,NM,NM,NASDQ,Nasdaq,NASDQ,,Nasdaq,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
43707,3489650002,,913915,913915104,Universal Technical Institute,2019-12-12,2019-12-12,No,,Common Shares,,No,No,No,8249,No,N,No,No,,6.25,25633933.0,2.0,3601724.0,3601724.0,0.0,3601724.0,,,,3601724.0,,3601724.0,,,,,,,,UTI,UTI,UTI,,B-RILEY-FBR,,B-RILEY-FIN,BM,B-RILEY-FBR,,B-RILEY-FBR,,,B-RILEY-FBR,B-RILEY-FIN,No,,,,,,,B Riley FBR,B Riley FBR,B Riley FBR,B-RILEY-FBR,B Riley FBR,,Arizona,United States,New York,N,N,NYSE,New York,NYSE,,New York,,,,,,7.25,7.25,7.25
43708,3489048002,882291.0,00808Y,00808Y307,Aethlon Medical Inc,2019-11-15,2019-12-13,No,,Common Shares,,No,No,No,3826,No,N,No,No,,1.5,4670593.0,,3333334.0,3333334.0,3333334.0,0.0,0.75,,,,,,3333334.0,3333334.0,,13/12/19,3333334.0,,499999.0,AEMD,AEMD,AEMD,,H-C-WAINWRIGHT,,H-C-WAINWRIGHT,BM,H-C-WAINWRIGHT,,H-C-WAINWRIGHT,,,H-C-WAINWRIGHT,H-C-WAINWRIGHT,No,,,,,,,HC Wainwright & Co Inc,HC Wainwright & Co Inc,HC Wainwright & Co Inc,H-C-WAINWRIGHT,HC Wainwright & Co Inc,,California,United States,Nasdaq,NM,NM,NASDQ,Nasdaq,NASDQ,,Nasdaq,1.5,1.5,1.5,1.5,1.5,,,


In [5]:
# get key statistics for data
sdc_data.describe()

Unnamed: 0,DealNumber,CIK,OfferPrice,SharesOutstandingAfterTheOffering,SharesOutstandingBeforeOffering,SharesOfferedSumOfAllMkts,SharesOfrdIncOverSoldSumOfAllMkts,PrimaryShsOfrdSumOfAllMkts,SecondaryShsOfrdSumOfAllMkts,OverallotAmtSoldSumOfAllMktsmil
count,43709.0,24751.0,43705.0,43709.0,22749.0,43700.0,43702.0,43709.0,43709.0,21025.0
mean,1289522000.0,1009768.0,214.8717,119554300.0,22.512726,10763470.0,11367410.0,8217867.0,2430589.0,22.861588
std,1062256000.0,447523.3,25372.34,2109662000.0,350.294326,108949800.0,111028500.0,100154000.0,42508900.0,70.603186
min,451002.0,1750.0,0.0,0.0,1.0,1.0,1.0,-3000000.0,0.0,0.001
25%,308381000.0,832904.0,7.75,4125000.0,2.0,1560873.0,1698960.0,860000.0,0.0,2.912
50%,1095128000.0,1038572.0,14.625,17375490.0,3.0,3500000.0,3751748.0,2500000.0,0.0,8.625
75%,2278038000.0,1364479.0,23.5,50167180.0,8.0,7700000.0,8100000.0,6000000.0,200000.0,21.42
max,3490135000.0,1791091.0,5250000.0,286202500000.0,29080.0,9953283000.0,9953283000.0,9953283000.0,4891005000.0,3265.081


In [6]:
# show all columns names
sdc_data.columns

Index(['DealNumber', 'CIK', 'CUSIP6', 'CUSIP9', 'Issuer', 'FilingDate',
       'IssueDate', 'IPO', 'OrigIPO', 'Type', 'REIT', 'ADR', 'Unit', 'CEF',
       'MainSICCode', 'Units', 'SpinOff', 'ForeignIssue', 'TrackingStockIssue',
       'BestEftFirmCmtBghtDl', 'OfferPrice',
       'SharesOutstandingAfterTheOffering', 'SharesOutstandingBeforeOffering',
       'SharesOfferedSumOfAllMkts', 'SharesOfrdIncOverSoldSumOfAllMkts',
       'PrimaryShsOfrdSumOfAllMkts', 'SecondaryShsOfrdSumOfAllMkts',
       'OverallotAmtOptionSumOfAllMktsMil', 'OverallotAmtSoldSumOfAllMktsmil',
       'TotGlobalOverallotmentSharesSold', 'SharesFiledSumOfAllMkts',
       'PrimaryShsFiledSumOfAllMkts', 'SecondaryShsFiledSumOfAllMkts',
       'AmendedShsFiledSumOfAllMkts', 'AmendedPrimaryShsFiledSumOfAllMkts',
       'AmendedSecondaryShsFiledSumOfAllMkts', 'AmendMentDate',
       'AmendHistShsFiledSumOfAllMkts', 'AmendHistSecShsFiledSumOfAllMkts',
       'AmendHistOverallotShsOptionSumOfAllMkts', 'TickerAtIssue',
   

In [7]:
# Question: how many values are often missing?
na_freq = sdc_data.isna().sum()
na_freq = na_freq/sdc_data.shape[0]
print(na_freq[na_freq>0.85])

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


REIT                                    0.933286
BestEftFirmCmtBghtDl                    0.995218
SecondaryShsFiledSumOfAllMkts           0.889611
AmendedSecondaryShsFiledSumOfAllMkts    0.906999
TickerSpinOffParent                     0.973209
ManagerAgentsLawyersCode                0.997987
DomesticSyndicateMemberCode             0.905878
FirmName                                0.898053
FundName                                0.898053
RoundNumberOfInvesTors                  0.898053
DisclosedRoundTotalmil                  0.904299
RoundDate                               0.898053
TotalKnownAmtInvestedInCompany000       0.905786
SpinParExch                             0.973118
dtype: float64


A lot of missing values in the data.

In [8]:
IPO_Counter = collections.Counter(sdc_data['IPO'])
print(IPO_Counter)
IPO_yes_percentage = IPO_Counter['Yes']/(IPO_Counter['Yes']+IPO_Counter['No'])
print(f"{IPO_yes_percentage} percent of the data points finished the IPO.")

Counter({'No': 28400, 'Yes': 15309})
0.3502482326294356 percent of the data points finished the IPO.


In [9]:
# Question: Are many missing values in the companies, that finished the IPO?
na_freq_of_finished_ipos = sdc_data.loc[sdc_data['IPO']=='Yes'].isna().sum()
na_freq_of_finished_ipos = na_freq_of_finished_ipos/sdc_data.loc[sdc_data['IPO']=='Yes'].shape[0]
print(na_freq_of_finished_ipos)

DealNumber                                 0.000000
CIK                                        0.568881
CUSIP6                                     0.000000
CUSIP9                                     0.210464
Issuer                                     0.000000
FilingDate                                 0.071200
IssueDate                                  0.000000
IPO                                        0.000000
OrigIPO                                    0.896140
Type                                       0.000000
REIT                                       0.970475
ADR                                        0.001894
Unit                                       0.104318
CEF                                        0.000261
MainSICCode                                0.000000
Units                                      0.000261
SpinOff                                    0.000000
ForeignIssue                               0.000000
TrackingStockIssue                         0.003985
BestEftFirmC

## Duplicate analysis

In [11]:
def get_duplicates(df, column, comment=''):
    # function to print the sum of duplicates and return the duplicates
    duplicates = df.loc[:, [column]].duplicated()
    duplicates_sum = duplicates.sum()
    print(f'There are {duplicates_sum} duplicates in {column}{comment}.')
    return duplicates

In [25]:
# get the duplicates for choosen columns
duplicated_issuers = get_duplicates(sdc_data, 'Issuer')
duplicated_IPO_issuers = get_duplicates(sdc_data.loc[sdc_data['IPO']=='Yes', :], 'Issuer', ' (only fulfilled IPOs)')
duplicated_CUSIP6 = get_duplicates(sdc_data, 'CUSIP6')
duplicated_indices = sdc_data.index.duplicated().sum()
print(f'There are {duplicated_indices} duplicated indices.')

There are 22804 duplicates in Issuer.
There are 1430 duplicates in Issuer (only fulfilled IPOs).
There are 23050 duplicates in CUSIP6.
There are 0 duplicated indices.


In [23]:
# show number of duplicates for all columns
#for column in sdc_data.columns.values:
#    duplicated_issuers = get_duplicates(sdc_data, column)

In [24]:
# get and inspect the duplicated rows
duplicated_issuers_true = duplicated_issuers[duplicated_issuers == 1]
duplicated_issuers_values = sdc_data.loc[sdc_data.index.isin(duplicated_issuers_true.index), :][['Issuer', 'CUSIP9', 'IPO']]
duplicated_issuers_values = duplicated_issuers_values.sort_values('CUSIP9')
print(duplicated_issuers_values.head(70))

                               Issuer     CUSIP9  IPO
38390                AAC Holdings Inc  000307108  Yes
6564                         AAR Corp  000361105   No
18529                        AAR Corp  000361105   No
14289    ABC Bancorp,Moultrie,Georgia  000400101  Yes
15564          ABC Rail Products Corp  000752105   No
15563          ABC Rail Products Corp  000752105   No
14811          ABC Rail Products Corp  000752105   No
13587          ABC Rail Products Corp  000752105  Yes
7332                    ABM Gold Corp  000776104  Yes
16810    ABR Information Services Inc  00077R108   No
14323    ABR Information Services Inc  00077R108  Yes
12607      ABT Building Products Corp  000782102  Yes
13867      ABT Building Products Corp  000782102   No
10063                        ACC Corp  000794107   No
17835                        ACC Corp  000794107   No
17043                        ACC Corp  000794107   No
17142       ACC Consumer Finance Corp  00079H108  Yes
28464                ACCO Br

In [14]:
# inspect choosen duplicates
#sdc_data.loc[sdc_data['Issuer'] == 'AMAX Inc', :]
sdc_data.loc[sdc_data['Issuer'] == 'ADMA Biologics Inc', :]

Unnamed: 0,DealNumber,CIK,CUSIP6,CUSIP9,Issuer,FilingDate,IssueDate,IPO,OrigIPO,Type,REIT,ADR,Unit,CEF,MainSICCode,Units,SpinOff,ForeignIssue,TrackingStockIssue,BestEftFirmCmtBghtDl,OfferPrice,SharesOutstandingAfterTheOffering,SharesOutstandingBeforeOffering,SharesOfferedSumOfAllMkts,SharesOfrdIncOverSoldSumOfAllMkts,PrimaryShsOfrdSumOfAllMkts,SecondaryShsOfrdSumOfAllMkts,OverallotAmtOptionSumOfAllMktsMil,OverallotAmtSoldSumOfAllMktsmil,TotGlobalOverallotmentSharesSold,SharesFiledSumOfAllMkts,PrimaryShsFiledSumOfAllMkts,SecondaryShsFiledSumOfAllMkts,AmendedShsFiledSumOfAllMkts,AmendedPrimaryShsFiledSumOfAllMkts,AmendedSecondaryShsFiledSumOfAllMkts,AmendMentDate,AmendHistShsFiledSumOfAllMkts,AmendHistSecShsFiledSumOfAllMkts,AmendHistOverallotShsOptionSumOfAllMkts,TickerAtIssue,TickerUltimateParents,TickerCurrent,TickerSpinOffParent,AllManagers_x,NonBookrunners,AllManagersParentsCode,AllMgrRoleCode,Managers_x,CoManagers,LeadManager,ManagerAgentsLawyersCode,DomesticSyndicateMemberCode,Bookrunners_x,BookrunnersParent,VentureBacked,FirmName,FundName,RoundNumberOfInvesTors,DisclosedRoundTotalmil,RoundDate,TotalKnownAmtInvestedInCompany000,AllManagers_y,Bookrunners_y,LeadManagersLongName,LeadManagers,Managers_y,CoManagersLongName,State,Nation,CurrentExchangeLongDescriptio,AllExchangesWhereIssueWillBeListed_1,AllExchangesWhereIssueWillBeListed_2,ExchangeWhereIssuWillBeLi,AllExchangesWhereIssuersStockTrades,FirstTwoExchangesWhereIssueWillBeListed,SpinParExch,PrimaryExchangeWhereIssuersStockTrades,AmendedHighFilingPrice,AmendedLowFilingPrice,AmendedMiddleOfFilingPrice,HighPriceOfFilingPriceRnge,LowPriceOfFilingPriceRnge,OriginalHighFilingPrice,OriginalLowFilingPrice,OriginalMiddleOfFilingPriceRange
36853,2412570002,,899,899104,ADMA Biologics Inc,2012-03-29,2013-10-16,Yes,,Common Shares,,No,No,No,2836,No,Y,No,No,,8.5,9726884.0,,3352941.0,3855882.0,3352941.0,0.0,4.275,4.275,502941.0,1881161.0,,1881161.0,2666667.0,2666667.0,,05/06/12|03/07/12|10/08/12|13/08/12|11/02/13|0...,"1,881,161|1,969,026|1,969,026|1,969,026|-|2,10...","1,881,161|1,969,026|1,969,026|1,969,026|-|-|-|...","-|-|-|-|-|315,000|315,000|458,823|400,000|400,000",,,ADMA,,OPPENHEIMER-CO//LADENBURG/MAXIM-GROUP,LADENBURG/MAXIM-GROUP/LAIDLAW-UK,OPPEN-HOLD,BM,OPPENHEIMER-CO,LADENBURG/MAXIM-GROUP/LAIDLAW-UK,OPPENHEIMER-CO,,,OPPENHEIMER-CO,OPPEN-HOLD,Yes,Undisclosed Firm|Undisclosed Firm|Aisling Capi...,Undisclosed Fund|Undisclosed Fund|Aisling Capi...,5|5|5|5|1|1|5,"17,550.0|17,550.0|17,550.0|17,550.0|-|3,500.0|...",13/02/2012|13/02/2012|13/02/2012|13/02/2012|21...,27050.0,Ladenburg Thalmann & Co|Maxim Group LLC|Laidla...,Oppenheimer & Co Inc,Oppenheimer & Co Inc,OPPENHEIMER-CO,Ladenburg Thalmann & Co|Maxim Group LLC|Laidla...,Maxim Group LLC|Laidlaw & Co (UK) Ltd,New Jersey,United States,Nasdaq,O,O,OTC,OTC,OTC,,OTC,9.5,8.5,9.0,9.6|11.5|11.5|11.5|15.5|15.5|9.0|9.5|9.5,9.6|9.6|9.6|9.6|13.5|13.5|8.0|8.5|8.5,9.6,9.6,9.6
38924,2732514002,1368514.0,899,899104,ADMA Biologics Inc,2014-11-28,2015-03-13,No,,Common Shares,,No,No,No,2836,No,N,No,No,,8.0,10700573.0,,1225000.0,1408750.0,1225000.0,0.0,1.47,1.47,183750.0,,,,,,,,,,,ADMA,ADMA,ADMA,,RAYMOND//LAIDLAW-UK/MAXIM-GROUP,LAIDLAW-UK/MAXIM-GROUP,RAYJAM,BM,RAYMOND,LAIDLAW-UK/MAXIM-GROUP,RAYMOND,,,RAYMOND,RAYJAM,No,,,,,,,Laidlaw & Co (UK) Ltd|Maxim Group LLC,Raymond James & Associates Inc,Raymond James & Associates Inc,RAYMOND,Laidlaw & Co (UK) Ltd|Maxim Group LLC,Maxim Group LLC,New Jersey,United States,Nasdaq,NM,NM,NASDQ,Nasdaq,NASDQ,,OTC,,,,,,8.6,8.6,8.6
39955,2953009002,1368514.0,899,899104,ADMA Biologics Inc,2016-04-27,2016-04-28,No,,Common Shares,,No,No,No,2836,No,N,No,No,,6.5,12886741.0,1.0,1892308.0,2176154.0,1892308.0,0.0,1.845,1.845,283846.0,,,,,,,,,,,ADMA,ADMA,ADMA,,RAYMOND,,RAYJAM,BM,RAYMOND,,RAYMOND,,,RAYMOND,RAYJAM,No,,,,,,,Raymond James & Associates Inc,Raymond James & Associates Inc,Raymond James & Associates Inc,RAYMOND,Raymond James & Associates Inc,,New Jersey,United States,Nasdaq,NM,NM,NASDQ,Nasdaq,NASDQ,,OTC,,,,,,7.31,7.31,7.31
41606,3163507002,1368514.0,899,899104,ADMA Biologics Inc,2017-10-11,2017-11-08,No,,Common Shares,,No,No,No,2836,No,N,No,No,,2.15,34178988.0,1.0,16976744.0,16976744.0,16976744.0,0.0,5.475,,,,,,13616071.0,13616071.0,,08/11/17,13616071,,2546511,ADMA,ADMA,ADMA,,RAYMOND//LADENBURG,LADENBURG,RAYJAM,BM,RAYMOND,LADENBURG,RAYMOND,,,RAYMOND,RAYJAM,No,,,,,,,Ladenburg Thalmann & Co,Raymond James & Associates Inc,Raymond James & Associates Inc,RAYMOND,Ladenburg Thalmann & Co,Ladenburg Thalmann & Co,New Jersey,United States,Nasdaq,NM,NM,NASDQ,Nasdaq,NASDQ,,OTC,2.24,2.24,2.24,2.24,2.24,3.19,3.19,3.19
42218,3255062002,1368514.0,899,899104,ADMA Biologics Inc,2018-06-07,2018-06-08,No,,Common Shares,,No,No,No,2836,No,N,No,No,,4.78,46349514.0,3.0,8368200.0,9623430.0,8368200.0,0.0,6.0,6.0,1255230.0,8368200.0,8368200.0,,,,,,,,,ADMA,ADMA,ADMA,,RAYMOND/OPPENHEIMER-CO(JOINT),OPPENHEIMER-CO/CHARDAN-CAP-MKT,RAYJAM,BM,RAYMOND,CHARDAN-CAP-MKT,RAYMOND/OPPENHEIMER-CO,,,RAYMOND,RAYJAM,No,,,,,,,Oppenheimer & Co Inc|Chardan Capital Markets LLC,Raymond James & Associates Inc,Oppenheimer & Co Inc,OPPENHEIMER-CO,Oppenheimer & Co Inc|Chardan Capital Markets LLC,Chardan Capital Markets LLC,New Jersey,United States,Nasdaq,NM,NM,NASDQ,Nasdaq,NASDQ,,OTC,,,,,,4.78,4.78,4.78
43131,3393682002,1368514.0,899,899104,ADMA Biologics Inc,2019-05-15,2019-05-16,No,,Common Shares,,No,No,No,2836,No,N,No,No,,4.0,59290568.0,4.0,11250000.0,12937500.0,11250000.0,0.0,6.75,6.75,1687500.0,,,,11250000.0,11250000.0,,16/05/19,11250000,,1687500,ADMA,ADMA,ADMA,,JEFFERIES-LLC(JB)/RAYMOND(JB),OPPENHEIMER-CO,JEFFERIES-LLC,JB,JEFFERIES-LLC,,JEFFERIES-LLC/RAYMOND/OPPENHEIMER-CO,,,JEFFERIES-LLC,JEFFERIES-LLC,No,,,,,,,Raymond James & Associates Inc|Oppenheimer & C...,Raymond James & Associates Inc,Raymond James & Associates Inc|Oppenheimer & C...,RAYMOND|OPPENHEIMER-CO,Raymond James & Associates Inc|Oppenheimer & C...,,New Jersey,United States,Nasdaq,NM,NM,NASDQ,Nasdaq,NASDQ,,OTC,4.0,4.0,4.0,4.0,4.0,,,


# 2 Data preparation

- Sort dataFrame
- inspect groups of columns
- convert date types

- create additional variables

In [15]:
# sort dataFrame
sdc_data = sdc_data.sort_index()

In [39]:
sdc_data.dtypes

DealNumber                                          int64
CIK                                               float64
CUSIP6                                             object
CUSIP9                                             object
Issuer                                             object
FilingDate                                 datetime64[ns]
IssueDate                                  datetime64[ns]
IPO                                                object
OrigIPO                                            object
Type                                               object
REIT                                               object
ADR                                                object
Unit                                               object
CEF                                                object
MainSICCode                                        object
Units                                              object
SpinOff                                            object
ForeignIssue  

In [110]:
def find_char_in_colnames(df, char_to_find):
    """ function to find colnames in a dataFrame with matching subcharacters
    @param df: dataFrame to search the columns
    @ param char_to_find: character, which should be searched for as substring in the dataframe
    @return: array with matching columns
    """
    col_names = df.columns.values
    matching_col_names = col_names[[char_to_find in i for i in col_names]]
    print(
        f"columns containing <<{char_to_find}>> are:"
        "\n"
        f"{matching_col_names}"
        "\n")
    #print(matching_col_names)
    #print('\t')
    return matching_col_names

In [111]:
# find all columns which contains some date information
date_cols = find_char_in_colnames(sdc_data, 'Date')

# find all columns which contains some price information
price_cols = find_char_in_colnames(sdc_data, 'Price')

# find all columns which contains some share information
share_cols = find_char_in_colnames(sdc_data, 'Share')

# find all columns which contains some overallotment information
overall_cols = find_char_in_colnames(sdc_data, 'Overall')

# find all columns which contains some round information
round_cols = find_char_in_colnames(sdc_data, 'Round')

columns containing <<Date>> are:
['FilingDate' 'IssueDate' 'AmendMentDate' 'RoundDate']

columns containing <<Price>> are:
['OfferPrice' 'AmendedHighFilingPrice' 'AmendedLowFilingPrice'
 'AmendedMiddleOfFilingPrice' 'HighPriceOfFilingPriceRnge'
 'LowPriceOfFilingPriceRnge' 'OriginalHighFilingPrice'
 'OriginalLowFilingPrice' 'OriginalMiddleOfFilingPriceRange']

columns containing <<Share>> are:
['SharesOutstandingAfterTheOffering' 'SharesOutstandingBeforeOffering'
 'SharesOfferedSumOfAllMkts' 'SharesOfrdIncOverSoldSumOfAllMkts'
 'TotGlobalOverallotmentSharesSold' 'SharesFiledSumOfAllMkts']

columns containing <<Overall>> are:
['OverallotAmtOptionSumOfAllMktsMil' 'OverallotAmtSoldSumOfAllMktsmil'
 'TotGlobalOverallotmentSharesSold'
 'AmendHistOverallotShsOptionSumOfAllMkts']

columns containing <<Round>> are:
['RoundNumberOfInvesTors' 'DisclosedRoundTotalmil' 'RoundDate']



In [96]:
def convert_date(df, column, format='%Y-%m-%d', errors = 'raise'):
    """
    function to convert characters to pandas datetime column in a dataFrame and print NA information.
    
    @param df: dataFrame with column to convert
    @param column: column to convert to datetime
    @param format: look at pd.to_datetime documentation
    @param errors: look at pd.to_datetime documentation
    @return: dataFrame with converted column
    """
    na_amount_before = df[column].isna().sum()
    df[column] = pd.to_datetime(df[column], format=format, errors=errors)
    na_amount_after = df[column].isna().sum()
    nonna_amount_after = df[column].notna().sum()
    print(f"{na_amount_after - na_amount_before} NAs have been created. "
          f"{nonna_amount_after} valid values are left.")    
    return df

In [89]:
# covert dates
sdc_data = convert_date(sdc_data, 'IssueDate', format='%Y-%m-%d')
sdc_data = convert_date(sdc_data, 'FilingDate', format='%Y-%m-%d')
sdc_data = convert_date(sdc_data, 'AmendMentDate', format='%Y-%m-%d', errors='coerce')
sdc_data = convert_date(sdc_data, 'RoundDate', format='%Y-%m-%d', errors='coerce')

0 NAs have been created. 43709 valid values are left.
0 NAs have been created. 39393 valid values are left.
0 NAs have been created. 0 valid values are left.
0 NAs have been created. 0 valid values are left.


***
<font color ='blue'> __TODO: Methods to keep AmendMentDate and RoundDate__ </font>
***

***
<font color ='blue'> __TODO: convert price information__ </font>
***

In [195]:
def convert_price(data, column, errors = 'raise'):
    """
    function to convert characters to numeric column in a dataFrame and print NA information.
    
    @param df: dataFrame with column to convert
    @return: dataFrame with converted column
    """
    df = data.copy()
    na_amount_before = df[column].isna().sum()
    df.loc[:, column] = df[column].astype(str)
    df.loc[:, column] = df[column].str.replace(',', '')
    df.loc[:, column] = df[column].str.replace('nan', 'NaN')
    df.loc[:, column] = df[column].replace('NaN', np.NAN)
    df.loc[:, column] = pd.to_numeric(df.loc[:, column], errors=errors)
    na_amount_after = df[column].isna().sum()
    nonna_amount_after = df[column].notna().sum()
    print(f"{column} has been converted. \n"
          f"{na_amount_after - na_amount_before} NAs have been created. "
          f"{nonna_amount_after} valid values are left. \n")    
    return df

In [196]:
#def clean_and_invert_numeric(data: pd.DataFrame, column: str):
#    
#    data.loc[:, column] = data[column].str.replace('na', 'NaN')
#    
#    data.loc[:, column] = data[column].str.replace('NaNn', 'NaN')
#    
#    
#    data.loc[:, column] = pd.to_numeric(data.loc[:, column])
#    return data
#
#for column in ['OfferPrice', 'OriginalHighFilingPrice', 'OriginalLowFilingPrice']:
#    sdc_data_usa_ipo = clean_and_invert_numeric(sdc_data_usa_ipo, column)

In [197]:
for price_col in price_cols:#[[2]]:
    sdc_data = convert_price(data = sdc_data, column = price_col, errors='coerce') #

OfferPrice has been converted. 
0 NAs have been created. 43705 valid values are left. 

AmendedHighFilingPrice has been converted. 
0 NAs have been created. 17756 valid values are left. 

AmendedLowFilingPrice has been converted. 
0 NAs have been created. 17760 valid values are left. 

AmendedMiddleOfFilingPrice has been converted. 
0 NAs have been created. 17763 valid values are left. 

HighPriceOfFilingPriceRnge has been converted. 
6274 NAs have been created. 11537 valid values are left. 

LowPriceOfFilingPriceRnge has been converted. 
6274 NAs have been created. 11539 valid values are left. 

OriginalHighFilingPrice has been converted. 
0 NAs have been created. 30269 valid values are left. 

OriginalLowFilingPrice has been converted. 
0 NAs have been created. 30285 valid values are left. 

OriginalMiddleOfFilingPriceRange has been converted. 
0 NAs have been created. 30296 valid values are left. 



In [198]:
sdc_data[price_cols].dtypes

OfferPrice                          float64
AmendedHighFilingPrice              float64
AmendedLowFilingPrice               float64
AmendedMiddleOfFilingPrice          float64
HighPriceOfFilingPriceRnge          float64
LowPriceOfFilingPriceRnge           float64
OriginalHighFilingPrice             float64
OriginalLowFilingPrice              float64
OriginalMiddleOfFilingPriceRange    float64
dtype: object

In [194]:
#sdc_data[price_cols].notna().sum()

#sdc_data[price_cols].isna().sum()

# IPO data points with total missing pricing information before IPO
sdc_data.loc[(sdc_data["AmendedHighFilingPrice"].isna()) & 
             (sdc_data["OriginalHighFilingPrice"].isna()) & 
             (sdc_data["HighPriceOfFilingPriceRnge"].isna()) &
             (sdc_data['IPO'] == 'Yes'), :]

Unnamed: 0,DealNumber,CIK,CUSIP6,CUSIP9,Issuer,FilingDate,IssueDate,IPO,OrigIPO,Type,REIT,ADR,Unit,CEF,MainSICCode,Units,SpinOff,ForeignIssue,TrackingStockIssue,BestEftFirmCmtBghtDl,OfferPrice,SharesOutstandingAfterTheOffering,SharesOutstandingBeforeOffering,SharesOfferedSumOfAllMkts,SharesOfrdIncOverSoldSumOfAllMkts,PrimaryShsOfrdSumOfAllMkts,SecondaryShsOfrdSumOfAllMkts,OverallotAmtOptionSumOfAllMktsMil,OverallotAmtSoldSumOfAllMktsmil,TotGlobalOverallotmentSharesSold,SharesFiledSumOfAllMkts,PrimaryShsFiledSumOfAllMkts,SecondaryShsFiledSumOfAllMkts,AmendedShsFiledSumOfAllMkts,AmendedPrimaryShsFiledSumOfAllMkts,AmendedSecondaryShsFiledSumOfAllMkts,AmendMentDate,AmendHistShsFiledSumOfAllMkts,AmendHistSecShsFiledSumOfAllMkts,AmendHistOverallotShsOptionSumOfAllMkts,TickerAtIssue,TickerUltimateParents,TickerCurrent,TickerSpinOffParent,AllManagers_x,NonBookrunners,AllManagersParentsCode,AllMgrRoleCode,Managers_x,CoManagers,LeadManager,ManagerAgentsLawyersCode,DomesticSyndicateMemberCode,Bookrunners_x,BookrunnersParent,VentureBacked,FirmName,FundName,RoundNumberOfInvesTors,DisclosedRoundTotalmil,RoundDate,TotalKnownAmtInvestedInCompany000,AllManagers_y,Bookrunners_y,LeadManagersLongName,LeadManagers,Managers_y,CoManagersLongName,State,Nation,CurrentExchangeLongDescriptio,AllExchangesWhereIssueWillBeListed_1,AllExchangesWhereIssueWillBeListed_2,ExchangeWhereIssuWillBeLi,AllExchangesWhereIssuersStockTrades,FirstTwoExchangesWhereIssueWillBeListed,SpinParExch,PrimaryExchangeWhereIssuersStockTrades,AmendedHighFilingPrice,AmendedLowFilingPrice,AmendedMiddleOfFilingPrice,HighPriceOfFilingPriceRnge,LowPriceOfFilingPriceRnge,OriginalHighFilingPrice,OriginalLowFilingPrice,OriginalMiddleOfFilingPriceRange
0,62319002,,655312,,Nolex Corp,,1973-01-02,Yes,,Common Shares,,No,No,No,5111,No,N,No,No,,7.00,500000.0,,500000.0,500000.0,500000.0,0.0,,,,,,,,,,,,,,NLX,NLX,,,SHAPIRO,,SHAPIRO,BM,SHAPIRO,,SHAPIRO,,,SHAPIRO,SHAPIRO,No,,,,,,,J. Shapiro,J. Shapiro Co.,J. Shapiro Co.,SHAPIRO,J. Shapiro Co.,,California,United States,New York,A,A,AMEX,American,AMEX,,American,,,,,,,,
2,62322002,,001032,,AES Technology Systems,,1973-01-05,Yes,,Common Shares,,No,No,No,3579,No,N,No,No,,10.50,150000.0,,260000.0,260000.0,150000.0,110000.0,,,,,,,,,,,,,,AEST,AEST,,,RICHARD-ELLIS//PASCUMA,PASCUMA,CB-RICHARD-ELL,BM,RICHARD-ELLIS,PASCUMA,RICHARD-ELLIS,,,RICHARD-ELLIS,CB-RICHARD-ELL,No,,,,,,,"Pascuma, Florsheim",CB Richard Ellis & Co,CB Richard Ellis & Co,RICHARD-ELLIS,"Pascuma, Florsheim & Co.","Pascuma, Florsheim & Co.",Illinois,United States,,O,O,OTC,OTC,OTC,,OTC,,,,,,,,
3,62324002,,893287,,Trans-National Leasing Inc,,1973-01-05,Yes,,Common Shares,,No,No,No,7515,No,N,No,No,,5.50,200000.0,,200000.0,200000.0,200000.0,0.0,,,,,,,,,,,,,,TNLS,TNLS,,,BROWN-ALLEN,,BROWN-ALLEN,BM,BROWN-ALLEN,,BROWN-ALLEN,,,BROWN-ALLEN,BROWN-ALLEN,No,,,,,,,"Brown, Allen & Co.","Brown, Allen & Co.","Brown, Allen & Co.",BROWN-ALLEN,"Brown, Allen & Co.",,Texas,United States,New York,O,O,OTC,OTC,OTC,,OTC,,,,,,,,
17,62306002,,913821,913821104,Universal Security Instruments,,1973-01-16,Yes,,Common Shares,,No,No,No,3669,No,N,No,No,,13.00,84375.0,,150000.0,150000.0,150000.0,0.0,,,,,,,,,,,,,,UUU,UUU,UUU,,COENEN,,COENEN,BM,COENEN,,COENEN,,,COENEN,COENEN,No,,,,,,,"Coenen & Co., Inc.","Coenen & Co., Inc.","Coenen & Co., Inc.",COENEN,"Coenen & Co., Inc.",,Maryland,United States,American,A,A,AMEX,American,AMEX,,American,,,,,,,,
18,62305002,,929236,,WD-40 Co,,1973-01-16,Yes,,Common Shares,,No,No,No,2899,No,N,No,No,,16.50,0.0,,300000.0,300000.0,40000.0,260000.0,,,,,,,,,,,,,,WDFC,WDFC,WDFC,,BATEMAN,,WF,BM,BATEMAN,,BATEMAN,,,BATEMAN,WF,No,,,,,,,Bateman Eichler Hill Richards,Bateman Eichler Hill Richards Inc,Bateman Eichler Hill Richards Inc,BATEMAN,Bateman Eichler Hill Richards Inc,,California,United States,Nasdaq,NM,NM,NASDQ,Nasdaq,NASDQ,,Nasdaq,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43241,3399625002,,40266N,,Linx SA,2019-05-29,2019-06-25,Yes,,ADS,,Yes,No,No,7376,No,N,Yes,No,,9.40,166308960.0,16.0,27299898.0,27299898.0,0.0,27299898.0,,,,,,,29274601,,29274601,25/06/19,29274601,29274601,4391190,LINX3,LINX3,LIX,,GS(JB)/MS(JB)/JEFFERIES-LLC(JB)/ML(JB),,GS,JB,GS,,GS/MS/JEFFERIES-LLC/ML/ITAU-BBA-USA,,,GS/MS(JB),GS,No,,,,,,,Morgan Stanley & Co|Jefferies LLC|Merrill Lync...,Morgan Stanley & Co|Jefferies LLC|Merrill Lync...,Morgan Stanley & Co|Jefferies LLC|Merrill Lync...,MS|JEFFERIES-LLC|ML|ITAU-BBA-USA,Morgan Stanley & Co|Jefferies LLC|Merrill Lync...,,Foreign,Brazil,BOVESPA,N,N,NYSE,New York,NYSE,,BMFBOVESPA,,,,,,,,
43299,3398663002,1434265.0,K3967W,,Genmab A/S,2019-02-04,2019-07-17,Yes,,ADS,,Yes,No,No,2836,No,N,Yes,No,,17.75,94298868.0,6.0,28500000.0,32775000.0,28500000.0,0.0,75.881,75.881,4275000,,,,28500000,28500000,,09/07/19|16/07/19|17/07/19,"27,800,000|27,800,000|28,500,000",-|-|-,"4,170,000|4,170,000|-",GEN,GEN,GEN,,ML(JB)/MS(JB)/JEFFERIES-LLC(JB)/GUGG,GUGG/RBC-CAP-MKTS/DANSKE-MARKET/,BOA-MERRILL,JB,ML,DANSKE-MARKET/H-C-WAINWRIGHT/KEMPEN,ML/MS/JEFFERIES-LLC/GUGG/RBC-CAP-MKTS,,,ML/MS(JB),BOA-MERRILL,No,,,,,,,Morgan Stanley & Co|Jefferies LLC|Guggenheim S...,Morgan Stanley & Co|Jefferies LLC,Morgan Stanley & Co|Jefferies LLC|Guggenheim S...,MS|JEFFERIES-LLC|GUGG|RBC-CAP-MKTS,Morgan Stanley & Co|Jefferies LLC|Guggenheim S...,HC Wainwright & Co Inc|Kempen and Co NV,Foreign,Denmark,Copenhagen,NM,NM,NASDQ,Nasdaq,NASDQ,,OMX Copen,,,,,,,,
43326,3424433002,,5F7277,,Vista Oil & Gas SAB de CV,2019-07-02,2019-07-25,Yes,,ADS,,Yes,No,No,1311,No,N,Yes,No,,9.25,85929002.0,7.0,10000000.0,10000000.0,10000000.0,0.0,13.875,,,,,,10000000,10000000,,18/07/19,10000000,,1500000,VISTAA,VISTAA,VOG,,CITIGROUP-GM(JB)/CREDIT-SEC-USA(JB),,CITI,JB,CITIGROUP-GM,,CITIGROUP-GM/CREDIT-SEC-USA/ITAU-BBA-USA,,,CITIGROUP-GM,CITI,No,,,,,,,Credit Suisse Securities (USA)|Itau BBA USA Se...,Credit Suisse Securities (USA) LLC|Itau BBA US...,Credit Suisse Securities (USA) LLC|Itau BBA US...,CREDIT-SEC-USA|ITAU-BBA-USA|MS|SANTUS,Credit Suisse Securities (USA) LLC|Itau BBA US...,,Foreign,Mexico,Mexico,N,N,NYSE,New York,NYSE,,Mexico,,,,,,,,
43404,3449038002,1259942.0,07986A,,BELLUS Health Inc,2019-09-03,2019-09-04,Yes,,Common Shares,,No,No,No,2834,No,N,Yes,No,,7.10,55378660.0,4.0,9859155.0,11179451.0,9859155.0,0.0,10.500,9.374,1320296,,,,,,,,,,,BLU,,BLU,,JEFFERIES-LLC(JB)/COWEN-CO(JB)/GUGG(JB),BAIRD/BLOOM-BURTON,JEFFERIES-LLC,JB,JEFFERIES-LLC,BLOOM-BURTON,JEFFERIES-LLC/COWEN-CO/GUGG/BAIRD,,,JEFFERIES-LLC,JEFFERIES-LLC,No,,,,,,,Cowen & Co|Guggenheim Securities LLC|Robert W ...,Cowen & Co|Guggenheim Securities LLC,Cowen & Co|Guggenheim Securities LLC|Robert W ...,COWEN-CO|GUGG|BAIRD,Cowen & Co|Guggenheim Securities LLC|Robert W ...,Bloom Burton & Co,Foreign,Canada,Toronto,NM,NM,NASDQ,Nasdaq,NASDQ,,Toronto,,,,,,,,


In [200]:
price_cols

array(['OfferPrice', 'AmendedHighFilingPrice', 'AmendedLowFilingPrice',
       'AmendedMiddleOfFilingPrice', 'HighPriceOfFilingPriceRnge',
       'LowPriceOfFilingPriceRnge', 'OriginalHighFilingPrice',
       'OriginalLowFilingPrice', 'OriginalMiddleOfFilingPriceRange'],
      dtype=object)

In [202]:
# how large is the price span?
sdc_data.loc[:, 'AmendedFilingPriceSpan'] = sdc_data.loc[: , ['AmendedHighFilingPrice']] - sdc_data.loc[: , ['AmendedLowFilingPrice']]
sdc_data.loc[:, 'OriginalFilingPriceSpan'] = sdc_data.loc[: , ['OriginalHighFilingPrice']] - sdc_data.loc[: , ['OriginalLowFilingPrice']]

In [216]:
sdc_data.loc[: , ['AmendedHighFilingPrice']]


Unnamed: 0,AmendedHighFilingPrice
0,
1,
2,
3,
4,
...,...
43704,28.67
43705,10.00
43706,10.00
43707,


In [215]:
len(sdc_data['AmendedFilingPriceSpan'])


43709

In [207]:
sdc_data[np.isfinite(sdc_data['AmendedFilingPriceSpan'])]['AmendedFilingPriceSpan']

Series([], Name: AmendedFilingPriceSpan, dtype: float64)

In [94]:
# create year variable
sdc_data['Year'] = sdc_data['IssueDate'].dt.year

In [None]:


# Questions: which values do i need?
# IPO Flag (Y/N)
#sum(sdc_data.loc[:, "IPOFlag(Y/N)"] == "Yes")
# Out: 18224
#sum(sdc_data.loc[:, "IPOFlag(Y/N)"] == "No")
# Out: 31503

sdc_data_usa = sdc_data.loc[sdc_data['Nation'] == "United States", :]
sdc_data_usa_ipo = sdc_data_usa.loc[sdc_data['IPOFlagYN'] == 'Yes', :]

df = sdc_data_usa_ipo.copy()
print(df.shape)
df = df.dropna(subset=['FilingDate'])
print(df.shape)
df = df.dropna(subset=['OfferPrice'])
print(df.shape)
df = df.dropna(subset=['OriginalHighFilingPrice'])
print(df.shape)
df = df.dropna(subset=['OriginalLowFilingPrice'])
print(df.shape)
#df = df.dropna(subset=['LowPriceofFilingPriceRnge'])
#print(df.shape)
#df = df.dropna(subset=['HighPriceofFilingPriceRnge'])
#print(df.shape)

df_without_range = df.dropna(subset=['HighPriceofFilingPriceRnge'])
df_without_range = df.dropna(subset=['LowPriceofFilingPriceRnge'])

# Data per Year?
data_per_year = sdc_data.groupby('Year')['IssueDate'].size()

# How many data for US per Year?
data_US_per_year = sdc_data_usa.groupby('Year')['IssueDate'].count()

# How many data for US IPOs per Year?
ipo_per_year = sdc_data_usa_ipo.groupby('Year')['IssueDate'].count()

# How many OfferPrices are per year given?
ipo_US_with_data_per_year = df.groupby('Year')['OfferPrice'].count()

# How many Ranges are per year given?
ipo_US_with_range_data_per_year = df_without_range.groupby('Year')['OfferPrice'].count()

ipos = pd.merge(data_per_year, data_US_per_year, how='outer', on="Year")
ipos = pd.merge(ipos, ipo_per_year, how='outer', on="Year")
ipos = pd.merge(ipos, ipo_US_with_data_per_year, how='outer', on="Year")
ipos = pd.merge(ipos, ipo_US_with_range_data_per_year, how='outer', on='Year')
# create barplot with numbers of IPOs
fig, ax = plt.subplots()
width = 0.35
p1 = ax.bar(ipos.index, ipos['IssueDate_x'])
p2 = ax.bar(ipos.index, ipos['IssueDate_y'])
p3 = ax.bar(ipos.index, ipos['IssueDate'])
p4 = ax.bar(ipos.index, ipos['OfferPrice_x'])
p5 = ax.bar(ipos.index, ipos['OfferPrice_y'])

ax.legend((p1[0], p2[0], p3[0], p4[0], p5[0]),
          ('data in SDC',
           'US data in SDC',
           'US IPOs in SDC',
           'US IPOs in SDC with needed price data',
           'US IPOs in SDC with needed range data')
          )
ax.autoscale_view()
plt.show()


# TODO: set date range
sdc_data_subset = sdc_data_subset.loc[sdc_data['Year'] > 2000]

# set Issue Date as index
sdc_data.index = sdc_data["IssueDate"]

sdc_data.groupby(["Year"]).count()

# TODO: Offer Price
# TODO: Original Low Filing Price
# TODO: Original High Filing Price

#sum(sdc_data.loc[["Filing Date"].notna() , ["Filing Date"]] == sdc_data.index)
#sum(sdc_data.loc[sdc_data["Filing Date"].notna() , ["Filing Date"]].values== sdc_data.loc[sdc_data["Filing Date"].notna() , :].index)

# TODO: how many values are since 1983 given?

# TODO: Get only numeric values
#def to_numeric(df, column):
#    df[df[[column]].apply(lambda x: x[0].isdigit(), axis=1)]
#to_numeric(sdc_data, "High Price of Filing Price Range")

type(sdc_data["HighPriceOfFilingPriceRange"][49724])
test = pd.to_numeric(sdc_data["HighPriceOfFilingPriceRange"], errors='coerce')
cleaned_data = sdc_data[pd.to_numeric(sdc_data["HighPriceOfFilingPriceRange"], errors='coerce').notnull()]
test.isna().sum()

# quick modeling

sdc_data_1987 = sdc_data.loc[(sdc_data["IssueDate"] > "1983-01-01") & (sdc_data["IssueDate"] < "1987-09-30"), :]
# 5009 examples from 1983 to Sept 1987

sum(sdc_data_1987.loc[:, "IPOFlag(Y/N)"] == "Yes")
# 2689 examples with IPO Flag

sdc_data_1987_IPO = sdc_data_1987.loc[sdc_data_1987["IPOFlag(Y/N)"] == "Yes", :]

sdc_data_1987_IPO.isna().sum()
# missing values
#Original Low Filing Price                        488
#Original High Filing Price                       489
#Low Price of Filing Price Range                2679
#High Price of Filing Price Range                2679

sdc_data_1987_IPO = sdc_data_1987_IPO.loc[sdc_data_1987_IPO["OriginalHighFilingPrice"].notna(), :]
sdc_data_1987_IPO = sdc_data_1987_IPO.drop(columns=["IssueDate"])
sdc_data_1987_IPO = sdc_data_1987_IPO.reset_index()


#sdc_data_1987_IPO = sdc_data[pd.to_numeric(sdc_data_1987_IPO["OfferPrice"])]
#sdc_data_1987_IPO = sdc_data[pd.to_numeric(sdc_data_1987_IPO["OriginalHighFilingPrice"])]

#features = np.array[("IssueDate", "IPOFlag(Y/N", "OfferPrice", "OriginalLowFilingPrice", "OriginalHighFilingPrice")]
#results = smf.ols('OfferPrice ~ OriginalLowFilingPrice + OriginalHighFilingPrice', data=sdc_data_1987_IPO).fit()
#print(results.summary())

#X = sdc_data_1987_IPO.loc[:, ["OriginalLowFilingPrice", "OriginalHighFilingPrice"]].values
#y = sdc_data_1987_IPO["OfferPrice"]
#X = sm.add_constant(X)

df = sdc_data
regex = r"\d+.\d+,\d+"

df[~df["OfferPrice"].str.contains(pat=regex, regex=True).fillna(value = True).values]
df[~df["OriginalHighFilingPrice"].str.contains(pat=regex, regex=True).fillna(value = True).values]
df[~df["OriginalLowFilingPrice"].str.contains(pat=regex, regex=True).fillna(value = True).values]

# Test two indizes
#2019-08-20 1.000,000
#2019-08-16 ,810
#2019-08-15 1,250
#2019-07-26 ,75
#2019-04-05 7.500,000

sample = df.loc[df.index.isin(["2019-08-20",
                               "2019-08-16",
                               "2019-08-15",
                               "2019-07-26",
                               "2019-04-05"]), :]

#test = df[pd.to_numeric(df["OriginalLowFilingPrice"], errors='coerce')]

verb = df.loc[df['Issuer'] == "Verb Technology Co Inc", :]


test = sample.loc[:, ['OfferPrice',
               'OriginalHighFilingPrice',
               'OriginalLowFilingPrice',
               'LowPriceOfFilingPriceRange',
               'HighPriceOfFilingPriceRange']]


