# Finance V: Research Topics in Finance, Risk- and Resource management 
## Replication of paper: Lowry, Michaely & Volkova (2017)

<blockquote>
    Author: Stefan Reimer <br>
    Date: 2019-12-28 <br>
    Python version: 3.7 <br>
</blockquote>

In [2]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import collections

import logging
# grab the root logger (no name given) and let DEBUG-level records pass its level filter
logger = logging.getLogger(name=None)
logger.setLevel("DEBUG")

# import defined functions
from src.functions.functions import (data_import_chunkwise, convert_NAs, get_duplicates, 
                                     find_char_in_colnames, convert_date, convert_price)

# DataFrame display settings: no limit on the number of columns shown, at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

### Load cleaned SDC and CRSP data

In [3]:
#%%
# folders holding the initial and the cleaned data sets
initialFolderPath = "data/initial_data/"
cleanedFolderPath = "data/cleaned_data/"

# both pickled inputs are read from the cleaned-data folder
sdcPath = f"{cleanedFolderPath}sdc_data_cleaned.pkl"
crspPath = f"{cleanedFolderPath}crsp_only_ret_c.pkl"

# read each pickle in full; for SDC the existing index is moved into a regular column
sdc = pd.read_pickle(sdcPath).reset_index()
crsp = pd.read_pickle(crspPath)

# Match CRSP and SDC Data

In [48]:
# SDC can hold several rows for the same CUSIP8; sort so that repeats sit next to each other
sdc = sdc.sort_values(by='CUSIP8')
# True for every repeated CUSIP8 except its first occurrence (pandas default keep='first')
duplicates = sdc['CUSIP8'].duplicated()
# spot check of one CUSIP8 -- not rendered, since a cell only shows its last expression
sdc[sdc['CUSIP8'] == '00154710']
# rows flagged as repeats
sdc[duplicates]

Unnamed: 0,level_0,index,DealNumber,CIK,CUSIP6,CUSIP9,Issuer,FilingDate,IssueDate,IPO,OrigIPO,Type,REIT,ADR,Unit,CEF,MainSICCode,Units,SpinOff,ForeignIssue,TrackingStockIssue,BestEftFirmCmtBghtDl,OfferPrice,SharesOutstandingAfterTheOffering,SharesOutstandingBeforeOffering,SharesOfferedSumOfAllMkts,SharesOfrdIncOverSoldSumOfAllMkts,PrimaryShsOfrdSumOfAllMkts,SecondaryShsOfrdSumOfAllMkts,OverallotAmtOptionSumOfAllMktsMil,OverallotAmtSoldSumOfAllMktsmil,TotGlobalOverallotmentSharesSold,SharesFiledSumOfAllMkts,PrimaryShsFiledSumOfAllMkts,SecondaryShsFiledSumOfAllMkts,AmendedShsFiledSumOfAllMkts,AmendedPrimaryShsFiledSumOfAllMkts,AmendedSecondaryShsFiledSumOfAllMkts,AmendMentDate,AmendHistShsFiledSumOfAllMkts,AmendHistSecShsFiledSumOfAllMkts,AmendHistOverallotShsOptionSumOfAllMkts,TickerAtIssue,TickerUltimateParents,TickerCurrent,TickerSpinOffParent,AllManagers_x,NonBookrunners,AllManagersParentsCode,AllMgrRoleCode,Managers_x,CoManagers,LeadManager,ManagerAgentsLawyersCode,DomesticSyndicateMemberCode,Bookrunners_x,BookrunnersParent,VentureBacked,FirmName,FundName,RoundNumberOfInvesTors,DisclosedRoundTotalmil,RoundDate,TotalKnownAmtInvestedInCompany000,AllManagers_y,Bookrunners_y,LeadManagersLongName,LeadManagers,Managers_y,CoManagersLongName,State,Nation,CurrentExchangeLongDescriptio,AllExchangesWhereIssueWillBeListed_1,AllExchangesWhereIssueWillBeListed_2,ExchangeWhereIssuWillBeLi,AllExchangesWhereIssuersStockTrades,FirstTwoExchangesWhereIssueWillBeListed,SpinParExch,PrimaryExchangeWhereIssuersStockTrades,AmendedHighFilingPrice,AmendedLowFilingPrice,AmendedMiddleOfFilingPrice,HighPriceOfFilingPriceRnge,LowPriceOfFilingPriceRnge,OriginalHighFilingPrice,OriginalLowFilingPrice,OriginalMiddleOfFilingPriceRange,Year,CUSIP8


In [49]:
# keep a single row per CUSIP8: the first occurrence in the current row order
sdc = sdc.drop_duplicates(subset=['CUSIP8'], keep='first')

In [58]:
# preview only the first rows instead of rendering the whole CRSP frame
crsp.head(10)

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,NCUSIP,CUSIP,PRC,RET,SHROUT,NCUSIP6,PERMNO_start_date,NCUSIP_start_date,permno,BEGDAT,BEGPRC,BEGVOL,IssueDate,CUSIP6,C,dif_dt_isdt
15161,10003.0,1986-01-14,11.0,3.0,39031810,39031810,-18.5000,C,1900.0,390318,1986-01-04,1986-01-04,10003.0,1986-01-14,1986-01-14,1986-01-14,1986-01-14,390318,True,0
22872,10008.0,1986-01-16,10.0,3.0,36547310,36547310,-14.0625,C,2945.0,365473,1986-01-04,1986-01-04,10008.0,1986-01-16,1986-01-16,1986-01-16,1986-01-16,365473,True,0
38463,10015.0,1983-09-20,10.0,3.0,00016510,00016510,-6.5625,C,3568.0,000165,1983-01-11,1983-01-11,10015.0,1983-09-20,1983-09-20,1983-09-20,1983-09-20,000165,True,0
43017,10017.0,1986-01-24,10.0,3.0,20670910,20670910,-21.8750,C,11468.0,206709,1986-01-04,1986-01-04,10017.0,1986-01-24,1986-01-24,1986-01-24,1986-01-24,206709,True,0
46173,10019.0,1986-01-24,11.0,3.0,44950710,44950710,-11.5625,C,6175.0,449507,1986-01-04,1986-01-04,10019.0,1986-01-24,1986-01-24,1986-01-24,1986-01-24,449507,True,0
56325,10025.0,1986-01-30,11.0,3.0,00103110,00103110,-12.6250,C,2506.0,001031,1986-01-04,1986-01-04,10025.0,1986-01-30,1986-01-30,1986-01-30,1986-01-30,001031,True,0
129184,10046.0,1986-02-14,11.0,3.0,20225530,91354910,-15.2500,C,300.0,202255,1986-01-04,1986-01-04,10046.0,1986-02-14,1986-02-14,1986-02-14,1986-02-11,202255,True,3
175786,10064.0,1986-02-20,11.0,3.0,90212810,90212810,-6.0625,C,3201.0,902128,1986-01-04,1986-01-04,10064.0,1986-02-20,1986-02-20,1986-02-20,1986-02-19,902128,True,1
192622,10067.0,1986-02-20,10.0,3.0,44903510,44903510,-10.3750,C,3223.0,449035,1986-01-04,1986-01-04,10067.0,1986-02-20,1986-02-20,1986-02-20,1986-02-20,449035,True,0
194499,10071.0,1986-02-28,11.0,3.0,01390210,01390210,29.2500,C,9203.0,013902,1986-01-04,1986-01-04,10071.0,1986-02-28,1986-02-28,1986-02-28,1986-02-28,013902,True,0


In [61]:
# Drop the columns in the CRSP frame that hold SDC's own values; SDC carries columns with
# the same names, so keeping them would produce suffixed duplicates in the merge.
# errors='ignore' keeps this cell re-runnable: without it a second execution raises
# KeyError because the columns have already been removed from `crsp`.
crsp = crsp.drop(columns=['CUSIP6', 'IssueDate'], errors='ignore')

In [62]:
# left join: every SDC row is kept and gets the CRSP row(s) whose NCUSIP equals its CUSIP8
data_merged = sdc.merge(crsp, how='left', left_on='CUSIP8', right_on='NCUSIP')
# alternative that was tried: join on CUSIP6 instead
#data_merged = pd.merge(sdc, crsp, how='left', left_on='CUSIP6', right_on='CUSIP6')

In [63]:
# discard rows without a CRSP price (SDC rows with no CRSP match have NaN here after the left join)
data_merged = data_merged.dropna(subset=['PRC'])
# number of rows that remain
len(data_merged)

6239

In [64]:
# persist the merged frame next to the other cleaned data sets
data_merged.to_pickle(f"{cleanedFolderPath}data_prepared.pkl")

### Remove extra variables
# NOTE(review): `sdc_data` is not defined in the visible part of this notebook (the frames
# used above are `sdc`, `crsp` and `data_merged`) -- confirm which frame this step targets.
# All columns are dropped in a single call, so the frame is copied once rather than once
# per column and is left untouched if any of the columns is missing (KeyError).
# Further candidates listed by the author but not dropped here:
# First_CRSP_date_ncusip, First_CRSP_date_ncusip6, Permno_ncusip, Permno_ncusip6, dif, Depositary
sdc_data = sdc_data.drop(
    columns=['REIT', 'Unit', 'CEF', 'CUSIP6', 'CUSIP8', 'CUSIP9', 'IPO', 'OrigIPO']
)