In [6]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import collections

import logging
# getLogger() without a name returns the root logger, so the DEBUG level set
# here also applies to messages that imported libraries propagate to the root.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# import defined functions
from src.functions.functions import (data_import_chunkwise, convert_NAs, get_duplicates, 
                                     find_char_in_colnames, convert_date, convert_price)

# Display settings for DataFrames: show every column (None removes the limit)
# and render at most 100 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## functions 

In [2]:
def create_date_difference(df, date1, date2, new_col_name='dif', info_print = True):
  """
  Add a column holding the difference ``date1 - date2`` in whole days.

  The column is written into ``df`` itself (the passed frame is modified) and
  the same frame object is returned.

  @param df: dataframe containing the two date columns
  @param date1: name of the date column that date2 is subtracted from
  @param date2: name of the date column that is subtracted from date1
  @param new_col_name: name of the column that receives the difference in days
  @param info_print: if True, print how many rows fall into each difference bucket
  @return: df with the added ``new_col_name`` column
  """
  # Compute the day count in a single expression and assign it as a plain
  # column. Writing the timedelta first and then overwriting it through
  # .loc[:, col] would depend on how pandas converts the existing column's dtype.
  df[new_col_name] = (df[date1] - df[date2]).dt.days
  if info_print:
    dif = df[new_col_name]
    # The buckets are disjoint and contiguous, so every non-missing value is
    # counted exactly once (a difference of exactly -10 belongs to the
    # (-30, -10] bucket).
    difminus60 = (dif <= -60.0).sum()
    difminus30 = ((dif > -60.0) & (dif <= -30.0)).sum()
    difminus10 = ((dif > -30.0) & (dif <= -10.0)).sum()
    difminus1 = ((dif > -10.0) & (dif < -1.0)).sum()
    difunder7 = ((dif >= -1.0) & (dif <= 7.0)).sum()
    difover7 = ((dif > 7.0) & (dif < 15.0)).sum()
    difover15 = (dif >= 15.0).sum()
    # The labels state the exact interval that each count was computed with.
    print(f'{new_col_name}:  dif<=-60: {difminus60}, -60 < dif <= -30: {difminus30}, -30 < dif <= -10: {difminus10} \n , -10 < dif < -1: {difminus1}, -1 <= dif <= 7: {difunder7}, 7 < dif < 15: {difover7}, 15 <= dif: {difover15} \n')
  return df

def print_histogram(df, bins='auto', normed=False, xlim=(-500, 100), title=""):
  """
  Draw a histogram of the given values and show it.

  @param df: values to plot; handed to plt.hist unchanged
  @param bins: bin specification forwarded to plt.hist
  @param normed: if True, plot a normalised (density) histogram instead of raw counts
  @param xlim: (lower, upper) limits of the x axis
  @param title: title of the plot
  """
  # Matplotlib removed the 'normed' keyword of hist(); 'density' is its
  # replacement. The public parameter keeps its old name so existing calls
  # with normed=... continue to work.
  plt.hist(df, bins=bins, density=normed)
  plt.xlim(xlim)
  plt.title(title)
  plt.show()



## data loading

In [3]:
# folders holding the raw and the cleaned data sets
initialFolderPath = "data/initial_data/"
cleanedFolderPath = "data/cleaned_data/"

# read the merged CRSP data (one pickle, loaded in a single call)
filePath = f"{cleanedFolderPath}crsp_data_merged.pkl"
crsp = pd.read_pickle(filePath)

In [4]:
# number of distinct PERMNO and NCUSIP values (a missing value counts as one)
print(crsp['PERMNO'].nunique(dropna=False))
print(crsp['NCUSIP'].nunique(dropna=False))

9102
9851


## inspect RET == 'C' as a CRSP marker for IPOs

In [5]:
# boolean marker: True on every row whose RET entry is the literal 'C'
crsp['C'] = crsp['RET'].eq('C')

In [None]:
# exemplary case, in which RET == 'C' marks the IssueDate
# crsp.loc[crsp['PERMNO']==93428, :].head(50)
# 93272, 93428: fits the date
# 93330: shifted, PRC at 04.01. is googled PRC at 01.04.

In [None]:
# add 'dif_dt_isdt' = date - IssueDate in days (bucket summary suppressed)
crsp = create_date_difference(
    df=crsp,
    date1='date',
    date2='IssueDate',
    new_col_name='dif_dt_isdt',
    info_print=False,
)

In [7]:
# inspect the relation in 'C' and 'dif_dt_isdt'
# The four buckets below are disjoint, so their counts add up to the number of
# non-missing differences; [-1, 7] is the same window that is used to select
# the rows that are kept.
dif_dt_isdt = crsp.loc[crsp['C'], 'dif_dt_isdt']
print(len(dif_dt_isdt))                                    # rows flagged 'C'
print((dif_dt_isdt > 7).sum())                             # dif > 7
print(((dif_dt_isdt >= -1) & (dif_dt_isdt <= 7)).sum())    # -1 <= dif <= 7
print(((dif_dt_isdt >= -10) & (dif_dt_isdt < -1)).sum())   # -10 <= dif < -1
print((dif_dt_isdt < -10).sum())                           # dif < -10

In [29]:
# check, how many 'C' values per NCUSIP are given
# The aggregation is named by string: passing the builtin sum is deprecated in
# recent pandas, while 'sum' selects the same groupby reduction.
C_freq = crsp.groupby('NCUSIP').agg({'C': 'sum'})
collections.Counter(C_freq['C'])

Counter({0.0: 779, 1.0: 7734, 2.0: 1259, 3.0: 60, 4.0: 16, 5.0: 3})

In [41]:
# keep the rows flagged 'C' whose difference lies in the closed window [-1, 7]
# and store them next to the other cleaned data
in_window = crsp['dif_dt_isdt'].between(-1, 7)
crsp_only_c = crsp.loc[crsp['C'] & in_window, :]
crsp_only_c.to_pickle(cleanedFolderPath + 'crsp_only_ret_c.pkl')

In [42]:
# show the rows that were just written to 'crsp_only_ret_c.pkl'
crsp_only_c

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,NCUSIP,CUSIP,PRC,RET,SHROUT,NCUSIP6,PERMNO_start_date,NCUSIP_start_date,permno,BEGDAT,BEGPRC,BEGVOL,IssueDate,CUSIP6,C,dif_dt_isdt
15161,10003.0,1986-01-14,11.0,3.0,39031810,39031810,-18.5000,C,1900.0,390318,1986-01-04,1986-01-04,10003.0,1986-01-14,1986-01-14,1986-01-14,1986-01-14,390318,True,0
22872,10008.0,1986-01-16,10.0,3.0,36547310,36547310,-14.0625,C,2945.0,365473,1986-01-04,1986-01-04,10008.0,1986-01-16,1986-01-16,1986-01-16,1986-01-16,365473,True,0
38463,10015.0,1983-09-20,10.0,3.0,00016510,00016510,-6.5625,C,3568.0,000165,1983-01-11,1983-01-11,10015.0,1983-09-20,1983-09-20,1983-09-20,1983-09-20,000165,True,0
43017,10017.0,1986-01-24,10.0,3.0,20670910,20670910,-21.8750,C,11468.0,206709,1986-01-04,1986-01-04,10017.0,1986-01-24,1986-01-24,1986-01-24,1986-01-24,206709,True,0
46173,10019.0,1986-01-24,11.0,3.0,44950710,44950710,-11.5625,C,6175.0,449507,1986-01-04,1986-01-04,10019.0,1986-01-24,1986-01-24,1986-01-24,1986-01-24,449507,True,0
56325,10025.0,1986-01-30,11.0,3.0,00103110,00103110,-12.6250,C,2506.0,001031,1986-01-04,1986-01-04,10025.0,1986-01-30,1986-01-30,1986-01-30,1986-01-30,001031,True,0
129184,10046.0,1986-02-14,11.0,3.0,20225530,91354910,-15.2500,C,300.0,202255,1986-01-04,1986-01-04,10046.0,1986-02-14,1986-02-14,1986-02-14,1986-02-11,202255,True,3
175786,10064.0,1986-02-20,11.0,3.0,90212810,90212810,-6.0625,C,3201.0,902128,1986-01-04,1986-01-04,10064.0,1986-02-20,1986-02-20,1986-02-20,1986-02-19,902128,True,1
192622,10067.0,1986-02-20,10.0,3.0,44903510,44903510,-10.3750,C,3223.0,449035,1986-01-04,1986-01-04,10067.0,1986-02-20,1986-02-20,1986-02-20,1986-02-20,449035,True,0
194499,10071.0,1986-02-28,11.0,3.0,01390210,01390210,29.2500,C,9203.0,013902,1986-01-04,1986-01-04,10071.0,1986-02-28,1986-02-28,1986-02-28,1986-02-28,013902,True,0


## other data exploration

In [None]:
# NOTE(review): if crsp already carries 'PERMNO_start_date' / 'NCUSIP_start_date'
# (e.g. from the loaded pickle or an earlier run of this cell), the merges below
# add suffixed '_x' / '_y' copies instead of replacing them - confirm before re-running.

# create column with first PERMNO date
# ('min' is passed by name; handing the builtin min to agg is deprecated in recent pandas)
first_PERMNO_date = crsp.groupby('PERMNO').agg(PERMNO_start_date=('date', 'min'))
crsp = pd.merge(crsp, first_PERMNO_date, how='left', left_on='PERMNO', right_on='PERMNO')

# create column with first NCUSIP date
first_NCUSIP_date = crsp.groupby('NCUSIP').agg(NCUSIP_start_date=('date', 'min'))
crsp = pd.merge(crsp, first_NCUSIP_date, how='left', left_on='NCUSIP', right_on='NCUSIP')

In [None]:
# Question: How many PERMNO are containing multiple NCUSIP?
# One row per (PERMNO, NCUSIP) pair with the earliest date of that pair.
# 'min' is passed by name because the builtin min is deprecated as an agg
# argument; the former mid-cell .head(100) displayed nothing and was removed.
PERMNO_NCUSIP = (crsp.groupby(['PERMNO', 'NCUSIP'])
                     .agg(PERMNO_start_date=('date', 'min'))
                     .reset_index())

# get unique amount of NCUSIP, PERMNO, CUSIP in CRSP
NCUSIP_crsp = pd.Series(crsp['NCUSIP'].unique())
print(len(NCUSIP_crsp))

PERMNO_crsp = pd.Series(crsp['PERMNO'].unique())
print(len(PERMNO_crsp))

CUSIP_crsp = pd.Series(crsp['CUSIP'].unique())
print(len(CUSIP_crsp))

## create additional difference values

In [6]:
# remove the per-observation columns that the aggregation below does not use
crsp = crsp.drop(
    ['date', 'SHRCD', 'EXCHCD', 'CUSIP', 'PRC', 'RET',
     'SHROUT', 'CUSIP6', 'permno', 'BEGPRC', 'BEGVOL'],
    axis='columns',
)

In [7]:
# aggregate time series to only descriptive, time invariate attributes
# Only columns that are still present are aggregated: 'BEGPRC' and 'BEGVOL' are
# removed by the preceding drop cell, and asking agg() for a missing column
# raises an error. 'min' is passed by name because the builtin min is
# deprecated as an agg argument in recent pandas.
static_columns = ['PERMNO_start_date', 'NCUSIP_start_date',
                  'BEGDAT', 'BEGPRC', 'BEGVOL', 'IssueDate']
agg_spec = {col: 'min' for col in static_columns if col in crsp.columns}
crsp = crsp.groupby(['PERMNO', 'NCUSIP']).agg(agg_spec)
crsp = crsp.reset_index()
print(len(crsp))

In [58]:
# one difference column per start-date candidate, each measured against the
# IssueDate; the bucket summary is printed for every column
for start_col, dif_col in [('BEGDAT', 'dif_issue_beg'),
                           ('PERMNO_start_date', 'dif_issue_permno'),
                           ('NCUSIP_start_date', 'dif_issue_ncusip')]:
    crsp = create_date_difference(df=crsp, date1=start_col, date2='IssueDate',
                                  new_col_name=dif_col, info_print=True)

dif_issue_beg:  dif<=-60: 252, -60 > dif <= -30: 7, -30 <= dif < -10: 2 
 , -10 < dif < -1: 10, -1 <= dif <= 7: 15918, 7 < dif < 15: 109, 15 <= dif: 266 

dif_issue_name:  dif<=-60: 260, -60 > dif <= -30: 5, -30 <= dif < -10: 8 
 , -10 < dif < -1: 4, -1 <= dif <= 7: 9368, 7 < dif < 15: 73, 15 <= dif: 7759 

dif_issue_permno:  dif<=-60: 13610, -60 > dif <= -30: 1645, -30 <= dif < -10: 748 
 , -10 < dif < -1: 104, -1 <= dif <= 7: 930, 7 < dif < 15: 7, 15 <= dif: 416 

dif_issue_st_date:  dif<=-60: 353, -60 > dif <= -30: 7, -30 <= dif < -10: 0 
 , -10 < dif < -1: 4, -1 <= dif <= 7: 4134, 7 < dif < 15: 3910, 15 <= dif: 9070 

dif_issue_ncusip:  dif<=-60: 11652, -60 > dif <= -30: 1325, -30 <= dif < -10: 656 
 , -10 < dif < -1: 75, -1 <= dif <= 7: 808, 7 < dif < 15: 7, 15 <= dif: 2939 



Bei PERMNO und NCUSIP der 'CRSP daily data' sind nur wenige IPOs tatsächlich innerhalb von -1 bis +7 Tagen um das Startdatum.

Der BEGDAT der 'CRSP Header Information' ist in nahezu allen Fällen innerhalb der Range.

## additional: merge stocknames

In [None]:
# merge stocknames to crsp
# import stocknames
# stocknames_path = initialFolderPath + 'stocknames.sas7bdat'
# stocknames = pd.read_sas(stocknames_path)

# crsp = pd.merge(crsp, stocknames[['NAMEDT', 'NAMEENDDT', 'ST_DATE', 'PERMNO']], how='left', left_on='PERMNO', right_on='PERMNO')

#crsp = create_date_difference(df = crsp, date1 = 'NAMEDT', date2 = 'IssueDate', 
#                        new_col_name='dif_issue_name', info_print = True)

#crsp = create_date_difference(df = crsp, date1 = 'ST_DATE', date2 = 'IssueDate', 
#                        new_col_name='dif_issue_st_date', info_print = True)

Der NAMEDT der 'Stocknames' ist in ca. der Hälfte der Fälle innerhalb der Range.

Das ST_DATE der 'Stocknames' liegt in etwa 1/4 bis 1/3 der Fälle innerhalb der Range, ansonsten überwiegend oberhalb.

## backup

##### test CRSP with first values (dropped by NCUSIP6)
# NOTE(review): 'sdc_data' is not defined in any cell visible in this notebook -
# confirm where it is loaded before running this backup code.
cusip_no_dup = crsp.drop_duplicates('NCUSIP6')
data_merged = pd.merge(sdc_data, cusip_no_dup, left_on=['CUSIP8'], right_on=['NCUSIP'], how='left')
# fixed: the helper is called create_date_difference and its keyword is date2
data_merged = create_date_difference(df = data_merged, date1 = 'date',
                                     date2 = 'IssueDate', new_col_name = 'dif')

print_histogram(df = data_merged.loc[(data_merged['dif']>-400) & (data_merged['dif']<200), 'dif'],
                bins = 100,
                normed = True,
                xlim = (-500, 100),
                title = 'Histogram of difference between first day SDC vs. CRSP')

##### plot the distribution of the difference from all days to the day of begin
##### for each company
# NOTE(review): no visible cell creates a 'dif' column on crsp - confirm which
# difference column this backup code is meant to use.
# The titles are built from adjacent string literals; a backslash continuation
# inside the literal would put the next line's indentation into the title.
print_histogram(df = crsp['dif'],
                bins = 100,
                normed = True,
                xlim = (-10000, 30000),
                title = 'Histogram of difference between first day CRSP vs. '
                        'and begin date')

##### select only price between -1 and 7 days around the date of begin
crsp = crsp.loc[(crsp['dif']>=-1) & (crsp['dif']<=7), :]

##### keep only the first value for each company
crsp = crsp.drop_duplicates('PERMNO', keep='first')

##### plot the distribution of the difference from all days to the day of begin
##### for each company after selecting
print_histogram(df = crsp['dif'],
                bins = 8,
                normed = True,
                xlim = (-10, 10),
                title = 'Histogram of difference between first day CRSP '
                        'vs. and begin date')