# ICLINK

In [10]:
import wrds
import pandas as pd

conn = wrds.Connection(wrds_username='xiaomowu')

Loading library list...
Done


In [11]:
#########################
# Step 1: Link by CUSIP #
#########################

# 1.1 IBES: Get the list of IBES Tickers for US firms in IBES
_ibes1 = conn.raw_sql("""
                      select ticker, cusip, cname, sdates from ibes.id
                      where usfirm=1 and cusip != ''
                      """)
len(_ibes1)

86242

In [12]:
# Create first and last 'start dates' for a given cusip
# Use agg min and max to find the first and last date per group
# then rename to fdate and ldate respectively

_ibes1_date = _ibes1.groupby(['ticker','cusip']).sdates.agg(['min', 'max'])\
.reset_index().rename(columns={'min':'fdate', 'max':'ldate'})

# merge fdate ldate back to _ibes1 data
_ibes2 = pd.merge(_ibes1, _ibes1_date,how='left', on =['ticker','cusip'])
_ibes2 = _ibes2.sort_values(by=['ticker','cusip','sdates'])

# keep only the most recent company name
# determined by having sdates = ldate
_ibes2 = _ibes2.loc[_ibes2.sdates == _ibes2.ldate].drop(['sdates'], axis=1)

In [13]:
# 1.2 CRSP: Get all permno-ncusip combinations
_crsp1 = conn.raw_sql("""
                      select permno, ncusip, comnam, namedt, nameenddt
                      from crsp.stocknames
                      where ncusip != ''
                      """)

In [14]:
# first namedt
_crsp1_fnamedt = _crsp1.groupby(['permno','ncusip']).namedt.min().reset_index()

_crsp1_fnamedt

Unnamed: 0,permno,ncusip,namedt
0,10000.0,68391610,1986-01-07
1,10001.0,29269V10,2009-08-04
2,10001.0,29274A10,1993-11-22
3,10001.0,29274A20,2008-02-05
4,10001.0,36720410,2010-07-09
...,...,...,...
44159,93433.0,92870X10,2013-04-10
44160,93433.0,92870X30,2013-04-24
44161,93434.0,78513510,2010-06-14
44162,93435.0,82936G20,2010-06-14


In [15]:
# last nameenddt
_crsp1_lnameenddt = _crsp1.groupby(['permno','ncusip']).nameenddt.max().reset_index()
_crsp1_lnameenddt

Unnamed: 0,permno,ncusip,nameenddt
0,10000.0,68391610,1987-06-11
1,10001.0,29269V10,2010-07-08
2,10001.0,29274A10,2008-02-04
3,10001.0,29274A20,2009-08-03
4,10001.0,36720410,2017-08-03
...,...,...,...
44159,93433.0,92870X10,2013-04-23
44160,93433.0,92870X30,2016-12-22
44161,93434.0,78513510,2018-12-31
44162,93435.0,82936G20,2012-05-18


In [16]:
# merge both 
_crsp1_dtrange = pd.merge(_crsp1_fnamedt, _crsp1_lnameenddt, \
                          on = ['permno','ncusip'], how='inner')
_crsp1_dtrange

Unnamed: 0,permno,ncusip,namedt,nameenddt
0,10000.0,68391610,1986-01-07,1987-06-11
1,10001.0,29269V10,2009-08-04,2010-07-08
2,10001.0,29274A10,1993-11-22,2008-02-04
3,10001.0,29274A20,2008-02-05,2009-08-03
4,10001.0,36720410,2010-07-09,2017-08-03
...,...,...,...,...
44159,93433.0,92870X10,2013-04-10,2013-04-23
44160,93433.0,92870X30,2013-04-24,2016-12-22
44161,93434.0,78513510,2010-06-14,2018-12-31
44162,93435.0,82936G20,2010-06-14,2012-05-18


In [17]:
# replace namedt and nameenddt with the version from the dtrange
_crsp1 = _crsp1.drop(['namedt'],axis=1).rename(columns={'nameenddt':'enddt'})
_crsp2 = pd.merge(_crsp1, _crsp1_dtrange, on =['permno','ncusip'], how='inner')
_crsp2

Unnamed: 0,permno,ncusip,comnam,enddt,namedt,nameenddt
0,10000.0,68391610,OPTIMUM MANUFACTURING INC,1987-06-11,1986-01-07,1987-06-11
1,10001.0,39040610,GREAT FALLS GAS CO,1993-11-21,1986-01-09,1993-11-21
2,10001.0,29274A10,ENERGY WEST INC,2008-02-04,1993-11-22,2008-02-04
3,10001.0,29274A20,ENERGY WEST INC,2009-08-03,2008-02-05,2009-08-03
4,10001.0,29269V10,ENERGY INC,2009-12-17,2009-08-04,2010-07-08
...,...,...,...,...,...,...
55733,93433.0,92870X30,VOLTARI CORP,2016-12-22,2013-04-24,2016-12-22
55734,93434.0,78513510,S & W SEED CO,2018-12-31,2010-06-14,2018-12-31
55735,93435.0,82936G20,SINO CLEAN ENERGY INC,2012-05-18,2010-06-14,2012-05-18
55736,93436.0,88160R10,TESLA MOTORS INC,2017-02-01,2010-06-29,2018-12-31


In [19]:
# keep only most recent company name
_crsp2 = _crsp2.loc[_crsp2.enddt ==_crsp2.nameenddt].drop(['enddt'], axis=1)
_crsp2

Unnamed: 0,permno,ncusip,comnam,namedt,nameenddt
0,10000.0,68391610,OPTIMUM MANUFACTURING INC,1986-01-07,1987-06-11
1,10001.0,39040610,GREAT FALLS GAS CO,1986-01-09,1993-11-21
2,10001.0,29274A10,ENERGY WEST INC,1993-11-22,2008-02-04
3,10001.0,29274A20,ENERGY WEST INC,2008-02-05,2009-08-03
5,10001.0,29269V10,ENERGY INC,2009-08-04,2010-07-08
...,...,...,...,...,...
55732,93433.0,92870X10,VOLTARI CORP,2013-04-10,2013-04-23
55733,93433.0,92870X30,VOLTARI CORP,2013-04-24,2016-12-22
55734,93434.0,78513510,S & W SEED CO,2010-06-14,2018-12-31
55735,93435.0,82936G20,SINO CLEAN ENERGY INC,2010-06-14,2012-05-18


In [21]:
# Link by full cusip, company names and dates
_link1_1 = pd.merge(_ibes2, _crsp2, how='inner', left_on='cusip', right_on='ncusip')\
.sort_values(['ticker','permno','ldate'])
len(_link1_1)
_link1_1

26250

Unnamed: 0,ticker,cusip,cname,fdate,ldate,permno,ncusip,comnam,namedt,nameenddt
0,0000,87482X10,TALMER BANCORP,2014-02-20,2014-03-20,14471.0,87482X10,TALMER BANCORP INC,2014-02-12,2016-08-31
1,0001,26878510,EP ENGR CORP,2014-02-20,2019-06-20,14392.0,26878510,E P ENERGY CORP,2014-01-17,2018-12-31
2,0004,02504D10,AMERICAN CAPITAL,2014-02-20,2014-02-20,14418.0,02504D10,AMERICAN CAPITAL SR FLOATING LT,2014-01-16,2018-08-24
3,000R,14163310,CARECOM,2014-02-20,2014-02-20,14378.0,14163310,CARE COM INC,2014-01-24,2018-12-31
4,000V,15117E10,CELLADON,2014-03-20,2014-03-20,14423.0,15117E10,CELLADON CORP,2014-01-30,2016-03-22
...,...,...,...,...,...,...,...,...,...,...
26245,ZXIS,98876010,Z-AXIS,2004-06-17,2004-10-14,83970.0,98876010,Z AXIS CORP,1983-10-26,1985-11-01
26246,ZXZX,16951E10,CHINA ZENIX,2011-07-14,2018-07-19,12720.0,16951E10,CHINA ZENIX AUTO INTL LTD,2011-05-12,2018-06-13
26248,ZY,98919510,ZAYRE CORP,1977-12-15,1984-11-15,40539.0,98919510,ZAYRE CORP,1968-01-02,1989-06-20
26247,ZY,87254010,TJX,1989-07-20,2016-06-16,40539.0,87254010,T J X COMPANIES INC NEW,1989-06-21,2018-12-31


In [22]:
# Keep link with most recent company name
_link1_1_tmp = _link1_1.groupby(['ticker','permno']).ldate.max().reset_index()
_link1_2 = pd.merge(_link1_1, _link1_1_tmp, how='inner', on =['ticker', 'permno', 'ldate'])
_link1_2

Unnamed: 0,ticker,cusip,cname,fdate,ldate,permno,ncusip,comnam,namedt,nameenddt
0,0000,87482X10,TALMER BANCORP,2014-02-20,2014-03-20,14471.0,87482X10,TALMER BANCORP INC,2014-02-12,2016-08-31
1,0001,26878510,EP ENGR CORP,2014-02-20,2019-06-20,14392.0,26878510,E P ENERGY CORP,2014-01-17,2018-12-31
2,0004,02504D10,AMERICAN CAPITAL,2014-02-20,2014-02-20,14418.0,02504D10,AMERICAN CAPITAL SR FLOATING LT,2014-01-16,2018-08-24
3,000R,14163310,CARECOM,2014-02-20,2014-02-20,14378.0,14163310,CARE COM INC,2014-01-24,2018-12-31
4,000V,28249U10,EIGER,2016-04-14,2016-04-14,14423.0,28249U10,EIGER BIOPHARMACEUTICALS INC,2016-03-23,2018-12-31
...,...,...,...,...,...,...,...,...,...,...
19772,ZVX,98950E40,ZEVEX INTL INC,1997-12-18,2002-03-14,85520.0,98950E40,ZEVEX INTERNATIONAL INC,1997-05-20,2007-03-16
19773,ZXIS,98876010,Z-AXIS,2004-06-17,2004-10-14,83970.0,98876010,Z AXIS CORP,1983-10-26,1985-11-01
19774,ZXZX,16951E10,CHINA ZENIX,2011-07-14,2018-07-19,12720.0,16951E10,CHINA ZENIX AUTO INTL LTD,2011-05-12,2018-06-13
19775,ZY,87254010,TJX,1989-07-20,2016-06-16,40539.0,87254010,T J X COMPANIES INC NEW,1989-06-21,2018-12-31


In [24]:
from fuzzywuzzy import fuzz

_link1_2['name_ratio'] = _link1_2.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1)

name_ratio_p10 = _link1_2.name_ratio.quantile(0.10)

def score1(row):
    if (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']) & (row['name_ratio'] >= name_ratio_p10):
        score = 0
    elif (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']):
        score = 1
    elif row['name_ratio'] >= name_ratio_p10:
        score = 2
    else:
        score = 3
    return score

# assign size portfolio
_link1_2['score']=_link1_2.apply(score1, axis=1)
_link1_2 = _link1_2[['ticker','permno','cname','comnam','name_ratio','score']]
_link1_2 = _link1_2.drop_duplicates()



In [25]:
len(_link1_2)
_link1_2

19777

Unnamed: 0,ticker,permno,cname,comnam,name_ratio,score
0,0000,14471.0,TALMER BANCORP,TALMER BANCORP INC,100,0
1,0001,14392.0,EP ENGR CORP,E P ENERGY CORP,67,0
2,0004,14418.0,AMERICAN CAPITAL,AMERICAN CAPITAL SR FLOATING LT,100,0
3,000R,14378.0,CARECOM,CARE COM INC,74,0
4,000V,14423.0,EIGER,EIGER BIOPHARMACEUTICALS INC,100,0
...,...,...,...,...,...,...
19772,ZVX,85520.0,ZEVEX INTL INC,ZEVEX INTERNATIONAL INC,78,0
19773,ZXIS,83970.0,Z-AXIS,Z AXIS CORP,100,2
19774,ZXZX,12720.0,CHINA ZENIX,CHINA ZENIX AUTO INTL LTD,100,0
19775,ZY,40539.0,TJX,T J X COMPANIES INC NEW,15,1


In [26]:
##########################
# Step 2: Link by TICKER #
##########################

# Find links for the remaining unmatched cases using Exchange Ticker 

# Identify remaining unmatched cases 
_nomatch1 = pd.merge(_ibes2[['ticker']], _link1_2[['permno','ticker']], on='ticker', how='left')
print(len(_nomatch1))

_nomatch1 = _nomatch1.loc[_nomatch1.permno.isnull()].drop(['permno'], axis=1).drop_duplicates()
print(len(_nomatch1))

37452
4065


In [27]:
_ibes2.iloc[0:2]

Unnamed: 0,ticker,cusip,cname,fdate,ldate
1,0,87482X10,TALMER BANCORP,2014-02-20,2014-03-20
5,1,26878510,EP ENGR CORP,2014-02-20,2019-06-20


In [28]:
ibesid = conn.raw_sql(""" select ticker, cname, oftic, sdates, cusip from ibes.id """)
ibesid = ibesid.loc[ibesid.oftic.notna()]

In [30]:
# Add IBES identifying information

ibesid = conn.raw_sql(""" select ticker, cname, oftic, sdates, cusip from ibes.id """)
ibesid = ibesid.loc[ibesid.oftic.notna()]

In [31]:
_nomatch2 = pd.merge(_nomatch1, ibesid, how='inner', on=['ticker'])
len(_nomatch2)
_nomatch2

15781

Unnamed: 0,ticker,cname,oftic,sdates,cusip
0,002B,MEDIENT STUDIOS,MDNT,2014-04-17,58471D10
1,002B,MOON RIVER,MDNT,2014-11-20,58471D10
2,006E,RIGHTSCORP,RIHT,2014-07-17,76658A10
3,008Y,BURCON NUTRASCIE,BUR,2014-09-18,1208311X
4,008Y,BURCON NUTRASCIE,BUR,2015-07-16,1208311X
...,...,...,...,...,...
15776,ZU03,ZUNICOM,ZNCMD,2019-08-15,98981X40
15777,ZWEB,GLOBAL WEB INC,ZWEB,2001-12-20,37938Q10
15778,ZYNX,ZYNEX MED,ZYNX,2005-07-14,98986510
15779,ZYNX,ZYNEX MED,ZYNX,2006-04-20,98986510


In [32]:
_nomatch3 = _nomatch2.groupby(['ticker', 'oftic']).sdates.agg(['min', 'max'])\
.reset_index().rename(columns={'min':'fdate', 'max':'ldate'})
print(len(_nomatch3))

_nomatch3 = pd.merge(_nomatch2, _nomatch3, how='left', on=['ticker','oftic'])
print(len(_nomatch3))


8287
15781


In [35]:
_nomatch3 = _nomatch3.loc[_nomatch3.sdates == _nomatch3.ldate]
print(len(_nomatch3))
_nomatch3

8287


Unnamed: 0,ticker,cname,oftic,sdates,cusip,fdate,ldate
1,002B,MOON RIVER,MDNT,2014-11-20,58471D10,2014-04-17,2014-11-20
2,006E,RIGHTSCORP,RIHT,2014-07-17,76658A10,2014-07-17,2014-07-17
6,008Y,BURCON NUTRASCIE,BUR,2016-11-17,1208311X,2014-09-18,2016-11-17
7,008Y,BURCON NUTRASCIE,BUROF,2018-05-17,1208311X,2018-05-17,2018-05-17
9,00A8,5BARZ,BARZ,2014-12-18,33833F10,2014-10-16,2014-12-18
...,...,...,...,...,...,...,...
15774,ZU03,ZUNICOM INC,ZNCM,2004-05-20,98981X10,2004-04-15,2004-05-20
15776,ZU03,ZUNICOM,ZNCMD,2019-08-15,98981X40,2012-10-18,2019-08-15
15777,ZWEB,GLOBAL WEB INC,ZWEB,2001-12-20,37938Q10,2001-12-20,2001-12-20
15779,ZYNX,ZYNEX MED,ZYNX,2006-04-20,98986510,2005-07-14,2006-04-20


In [36]:
# Get entire list of CRSP stocks with Exchange Ticker information
_crsp_n1 = conn.raw_sql(""" select ticker, comnam, permno, ncusip, namedt, nameenddt
                            from crsp.stocknames """)

In [37]:
_crsp_n1 = _crsp_n1.loc[_crsp_n1.ticker.notna()].sort_values(by=['permno','ticker','namedt'])

In [38]:
len(_crsp_n1)

56374

In [39]:
_crsp_n1_namedt = _crsp_n1.groupby(['permno','ticker']).namedt.min().reset_index().rename(columns={'min':'namedt'})
_crsp_n1_nameenddt = _crsp_n1.groupby(['permno','ticker']).nameenddt.max().reset_index().rename(columns={'max':'nameenddt'})

_crsp_n1_dt = pd.merge(_crsp_n1_namedt, _crsp_n1_nameenddt, how = 'inner', on=['permno','ticker'])

_crsp_n1 = _crsp_n1.rename(columns={'namedt': 'namedt_ind', 'nameenddt':'nameenddt_ind'})

_crsp_n2 = pd.merge(_crsp_n1, _crsp_n1_dt, how ='left', on = ['permno','ticker'])

_crsp_n2 = _crsp_n2.rename(columns={'ticker':'crsp_ticker'})
_crsp_n2 = _crsp_n2.loc[_crsp_n2.nameenddt_ind == _crsp_n2.nameenddt].drop(['namedt_ind', 'nameenddt_ind'], axis=1)


In [40]:
len(_crsp_n2)
_crsp_n2.iloc[0:2]

41039

Unnamed: 0,crsp_ticker,comnam,permno,ncusip,namedt,nameenddt
0,OMFGA,OPTIMUM MANUFACTURING INC,10000.0,68391610,1986-01-07,1987-06-11
3,EGAS,GAS NATURAL INC,10001.0,36720410,2009-08-04,2017-08-03


In [41]:
_link2_1 = pd.merge(_nomatch3, _crsp_n2, how='inner', left_on=['oftic'], right_on=['crsp_ticker'])
_link2_1 = _link2_1.loc[(_link2_1.ldate>=_link2_1.namedt) & (_link2_1.fdate<=_link2_1.nameenddt)]

len(_link2_1)

305

In [42]:
_link2_1['name_ratio'] = _link2_1.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1)

_link2_2 = _link2_1
_link2_2['cusip6'] = _link2_2.apply(lambda x: x.cusip[:6], axis=1)
_link2_2['ncusip6'] = _link2_2.apply(lambda x: x.ncusip[:6], axis=1)

len(_link2_2)

305

In [43]:
def score2(row):
    if (row['cusip6']==row['ncusip6']) & (row['name_ratio'] >= name_ratio_p10):
        score = 0
    elif (row['cusip6']==row['ncusip6']):
        score = 4
    elif row['name_ratio'] >= name_ratio_p10:
        score = 5
    else:
        score = 6
    return score

# assign size portfolio
_link2_2['score']=_link2_2.apply(score2, axis=1)

In [44]:
len(_link2_2)

305

In [47]:
# Some companies may have more than one TICKER-PERMNO link
# so re-sort and keep the case (PERMNO & Company name from CRSP)
# that gives the lowest score for each IBES TICKER 

_link2_2 = _link2_2[['ticker','permno','cname','comnam', 'name_ratio', 'score']].sort_values(by=['ticker','score'])
_link2_2_score = _link2_2.groupby(['ticker']).score.min().reset_index()
len(_link2_2)
len(_link2_2_score)

305

268

In [49]:
_link2_3 = pd.merge(_link2_2, _link2_2_score, how='inner', on=['ticker', 'score'])
len(_link2_3)
_link2_3 = _link2_3[['ticker','permno','cname','comnam','score']].drop_duplicates()
len(_link2_3)

293

288

In [50]:
iclink = _link1_2.append(_link2_3)
len(iclink)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


20065

# PAED

In [53]:
#####################################
# Post Earnings Announcement Drift  #
# June 2019                         #
# Qingyi (Freda) Song Drechsler     #
#####################################

import pandas as pd
import numpy as np
import wrds
import matplotlib.pyplot as plt
import pickle as pkl
from dateutil.relativedelta import *

In [None]:
###################
# Connect to WRDS #
###################
conn=wrds.Connection()

# set sample date range
begdate = '01/01/2010'
enddate = '12/31/2018'

# set CRSP date range a bit wider to guarantee collecting all information
crsp_begdate = '01/01/2009'
crsp_enddate = '12/31/2019'

#################################
# Step 0: Read in ICLINK output #
#################################

# iclink.pkl is the output from the python program iclink
# it contains the linking between crsp and ibes
with open('iclink.pkl', 'rb') as f:
    iclink = pkl.load(f)

In [51]:
##################################
# Step 1. S&P 500 Index Universe #
##################################

# All companies that were ever included in S&P 500 index as an example 
# Linking Compustat GVKEY and IBES Tickers using ICLINK               
# For unmatched GVKEYs, use header IBTIC link in Compustat Security file 

_sp500 = conn.raw_sql(""" select gvkey from comp.idxcst_his where gvkeyx='000003' """)

_ccm = conn.raw_sql(""" select gvkey, lpermco as permco, lpermno as permno, linkdt, linkenddt 
                        from crsp.ccmxpf_linktable 
                        where usedflag=1 and linkprim in ('P', 'C')""")

_ccm[['permco', 'permno']] = _ccm[['permco', 'permno']].astype(int)
_ccm['linkdt'] = pd.to_datetime(_ccm['linkdt'])
_ccm['linkenddt'] = pd.to_datetime(_ccm['linkenddt'])

_sec = conn.raw_sql(""" select ibtic, gvkey from comp.security """)


import datetime
today = datetime.date.today()

In [55]:
iclink_hq = iclink.loc[(iclink.score <=1)]
len(iclink_hq)

19209

In [56]:
# Fill linkenddt missing value (.E in SAS dataset) with today's date
_ccm['linkenddt'] = _ccm.linkenddt.fillna(today)

# Start the sequence of left join
gvkey = pd.merge(_sp500, _ccm, how='left', on=['gvkey'])
gvkey = pd.merge(gvkey, _sec.loc[_sec.ibtic.notna()], how='left', on=['gvkey'])

# high quality links from iclink
# score = 0 or 1
iclink_hq = iclink.loc[(iclink.score <=1)]

gvkey = pd.merge(gvkey, iclink_hq, how='left', on=['permno'])

# fill missing ticker with ibtic
gvkey.ticker = np.where(gvkey.ticker.notnull(), gvkey.ticker, gvkey.ibtic)

# Keep relevant columns and drop duplicates if there is any
gvkey = gvkey[['gvkey', 'permco', 'permno', 'linkdt', 'linkenddt','ticker']]

gvkey = gvkey.drop_duplicates()

In [None]:
len(gvkey)
gvkey

In [None]:
# date ranges from gvkey

# min linkdt for ticker and permno combination
gvkey_mindt = gvkey.groupby(['ticker','permno']).linkdt.min().reset_index()
gvkey_mindt
# max linkenddt for ticker and permno combination
gvkey_maxdt = gvkey.groupby(['ticker','permno']).linkenddt.max().reset_index()
gvkey_maxdt
# link date range 
gvkey_dt = pd.merge(gvkey_mindt, gvkey_maxdt, how='inner', on=['ticker','permno'])


In [62]:
len(gvkey_dt)
gvkey_dt

1735

Unnamed: 0,ticker,permno,linkdt,linkenddt
0,004W,14714.0,2014-06-06,2019-11-13
1,00C6,14939.0,2014-11-03,2019-11-13
2,00VP,15703.0,2015-10-15,2019-11-13
3,01AB,16342.0,2016-10-17,2019-11-13
4,01AF,16347.0,2016-11-01,2019-11-13
...,...,...,...,...
1730,ZION,84129.0,1974-01-01,2019-11-13
1731,ZMH,89070.0,2001-08-07,2019-11-13
1732,ZOTS,13788.0,2013-02-01,2019-11-13
1733,ZRN,45970.0,1968-03-21,1998-06-30


In [77]:
#######################################
# Step 2. Extract Estimates from IBES #
#######################################

# Extract estimates from IBES Unadjusted file and select    
# the latest estimate for a firm within broker-analyst group
# "fpi in (6,7)" selects quarterly forecast for the current 
# and the next fiscal quarter   

# set sample date range
begdate = '01/01/2010'
enddate = '12/31/2018'

ibes_temp = conn.raw_sql(f"""
                        select ticker, estimator, analys, pdf, fpi, value, fpedats, revdats, revtims, anndats, anntims
                        from ibes.detu_epsus 
                        where fpedats between '{begdate}' and '{enddate}'
                        and (fpi='6' or fpi='7')
                        """, date_cols = ['revdats', 'anndats', 'fpedats'])

In [78]:
# merge to get date range linkdt and linkenddt to fulfill date requirement
ibes_temp = pd.merge(ibes_temp, gvkey_dt, how='left', on=['ticker'])
ibes_temp=ibes_temp.loc[(ibes_temp.linkdt<=ibes_temp.anndats) & (ibes_temp.anndats <= ibes_temp.linkenddt)]
len(ibes_temp)
ibes_temp

1142624

Unnamed: 0,ticker,estimator,analys,pdf,fpi,value,fpedats,revdats,revtims,anndats,anntims,permno,linkdt,linkenddt
12776,004W,2342.0,82561.0,D,6,0.21,2014-06-30,2014-08-04,24801.0,2014-06-13,24300.0,14714.0,2014-06-06,2019-11-13
12777,004W,3102.0,75222.0,D,6,0.17,2014-06-30,2014-06-18,9973.0,2014-06-17,61800.0,14714.0,2014-06-06,2019-11-13
12778,004W,31.0,72523.0,D,6,0.13,2014-06-30,2014-07-01,38312.0,2014-07-01,27000.0,14714.0,2014-06-06,2019-11-13
12779,004W,98.0,90646.0,D,6,0.17,2014-06-30,2014-07-01,30994.0,2014-07-01,22320.0,14714.0,2014-06-06,2019-11-13
12780,004W,228.0,79979.0,D,6,0.16,2014-06-30,2014-07-01,47972.0,2014-07-01,22920.0,14714.0,2014-06-06,2019-11-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2964934,ZY,282.0,131673.0,D,7,1.21,2018-10-31,2018-07-09,38223.0,2018-07-09,34440.0,40539.0,1964-05-12,2019-11-13
2964935,ZY,50623.0,507196.0,D,7,1.26,2018-10-31,2018-07-30,21952.0,2018-07-30,21780.0,40539.0,1964-05-12,2019-11-13
2964936,ZY,51534.0,509906.0,D,7,1.22,2018-10-31,2018-08-08,488.0,2018-08-08,360.0,40539.0,1964-05-12,2019-11-13
2964937,ZY,51243.0,637123.0,D,7,1.27,2018-10-31,2018-08-13,14847.0,2018-08-13,10800.0,40539.0,1964-05-12,2019-11-13


In [81]:
# Count number of estimates reported on primary/diluted basis 

p_sub = ibes_temp[['ticker','fpedats','pdf']].loc[ibes_temp.pdf=='P']
d_sub = ibes_temp[['ticker','fpedats','pdf']].loc[ibes_temp.pdf=='D']

p_count = p_sub.groupby(['ticker','fpedats']).pdf.count().reset_index().rename(columns={'pdf':'p_count'})
d_count = d_sub.groupby(['ticker','fpedats']).pdf.count().reset_index().rename(columns={'pdf':'d_count'})

ibes = pd.merge(ibes_temp, d_count, how = 'left', on=['ticker', 'fpedats'])
ibes = pd.merge(ibes, p_count, how='left', on =['ticker','fpedats'])
ibes['d_count'] = ibes.d_count.fillna(0)
ibes['p_count'] = ibes.p_count.fillna(0)

In [82]:
len(ibes)
ibes

1142624

Unnamed: 0,ticker,estimator,analys,pdf,fpi,value,fpedats,revdats,revtims,anndats,anntims,permno,linkdt,linkenddt,d_count,p_count
0,004W,2342.0,82561.0,D,6,0.21,2014-06-30,2014-08-04,24801.0,2014-06-13,24300.0,14714.0,2014-06-06,2019-11-13,20.0,0.0
1,004W,3102.0,75222.0,D,6,0.17,2014-06-30,2014-06-18,9973.0,2014-06-17,61800.0,14714.0,2014-06-06,2019-11-13,20.0,0.0
2,004W,31.0,72523.0,D,6,0.13,2014-06-30,2014-07-01,38312.0,2014-07-01,27000.0,14714.0,2014-06-06,2019-11-13,20.0,0.0
3,004W,98.0,90646.0,D,6,0.17,2014-06-30,2014-07-01,30994.0,2014-07-01,22320.0,14714.0,2014-06-06,2019-11-13,20.0,0.0
4,004W,228.0,79979.0,D,6,0.16,2014-06-30,2014-07-01,47972.0,2014-07-01,22920.0,14714.0,2014-06-06,2019-11-13,20.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142619,ZY,282.0,131673.0,D,7,1.21,2018-10-31,2018-07-09,38223.0,2018-07-09,34440.0,40539.0,1964-05-12,2019-11-13,65.0,0.0
1142620,ZY,50623.0,507196.0,D,7,1.26,2018-10-31,2018-07-30,21952.0,2018-07-30,21780.0,40539.0,1964-05-12,2019-11-13,65.0,0.0
1142621,ZY,51534.0,509906.0,D,7,1.22,2018-10-31,2018-08-08,488.0,2018-08-08,360.0,40539.0,1964-05-12,2019-11-13,65.0,0.0
1142622,ZY,51243.0,637123.0,D,7,1.27,2018-10-31,2018-08-13,14847.0,2018-08-13,10800.0,40539.0,1964-05-12,2019-11-13,65.0,0.0


In [83]:
# Determine whether most analysts report estimates on primary/diluted basis
# following Livnat and Mendenhall (2006)                                   

ibes['basis']=np.where(ibes.p_count>ibes.d_count, 'P', 'D')

ibes = ibes.sort_values(by=['ticker','fpedats','estimator','analys','anndats', 'anntims', 'revdats', 'revtims'])\
.drop(['linkdt', 'linkenddt','p_count','d_count', 'pdf', 'fpi'], axis=1)

len(ibes)
ibes

1142624

Unnamed: 0,ticker,estimator,analys,value,fpedats,revdats,revtims,anndats,anntims,permno,basis
2,004W,31.0,72523.0,0.130,2014-06-30,2014-07-01,38312.0,2014-07-01,27000.0,14714.0,D
3,004W,98.0,90646.0,0.170,2014-06-30,2014-07-01,30994.0,2014-07-01,22320.0,14714.0,D
14,004W,100.0,76979.0,0.099,2014-06-30,2014-08-04,65067.0,2014-07-01,23220.0,14714.0,D
4,004W,228.0,79979.0,0.160,2014-06-30,2014-07-01,47972.0,2014-07-01,22920.0,14714.0,D
12,004W,260.0,71551.0,0.110,2014-06-30,2014-07-24,39318.0,2014-07-01,960.0,14714.0,D
...,...,...,...,...,...,...,...,...,...,...,...
1142574,ZY,91263.0,593895.0,1.200,2018-10-31,2018-11-12,33141.0,2018-08-22,4500.0,40539.0,D
1142616,ZY,91560.0,620865.0,1.210,2018-10-31,2018-06-25,40735.0,2018-06-23,300.0,40539.0,D
1142578,ZY,91560.0,620865.0,1.220,2018-10-31,2018-11-13,34736.0,2018-09-04,5880.0,40539.0,D
1142613,ZY,91613.0,630583.0,1.230,2018-10-31,2018-06-12,62746.0,2018-06-12,55500.0,40539.0,D


In [84]:
# Keep the latest observation for a given analyst
# Group by company fpedats estimator analys then pick the last record in the group

ibes_1 = ibes.groupby(['ticker','fpedats','estimator','analys']).apply(lambda x: x.index[-1]).to_frame().reset_index()

In [85]:
ibes_1

Unnamed: 0,ticker,fpedats,estimator,analys,0
0,004W,2014-06-30,31.0,72523.0,2
1,004W,2014-06-30,98.0,90646.0,3
2,004W,2014-06-30,100.0,76979.0,14
3,004W,2014-06-30,228.0,79979.0,4
4,004W,2014-06-30,260.0,71551.0,12
...,...,...,...,...,...
449155,ZY,2018-10-31,60902.0,502726.0,1142579
449156,ZY,2018-10-31,88989.0,626091.0,1142584
449157,ZY,2018-10-31,91263.0,593895.0,1142574
449158,ZY,2018-10-31,91560.0,620865.0,1142578


In [86]:
# reset index to the old dataframe index for join in the next step
ibes_1=ibes_1.set_index(0)
ibes_1

Unnamed: 0_level_0,ticker,fpedats,estimator,analys
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,004W,2014-06-30,31.0,72523.0
3,004W,2014-06-30,98.0,90646.0
14,004W,2014-06-30,100.0,76979.0
4,004W,2014-06-30,228.0,79979.0
12,004W,2014-06-30,260.0,71551.0
...,...,...,...,...
1142579,ZY,2018-10-31,60902.0,502726.0
1142584,ZY,2018-10-31,88989.0,626091.0
1142574,ZY,2018-10-31,91263.0,593895.0
1142578,ZY,2018-10-31,91560.0,620865.0


In [87]:
# Inner join with the last analyst record per group
ibes = pd.merge(ibes, ibes_1[['analys']], left_index=True, right_index=True)

# drop duplicate column
ibes=ibes.drop(['analys_y'], axis=1).rename(columns={'analys_x': 'analys'})
len(ibes)
ibes

449160

Unnamed: 0,ticker,estimator,analys,value,fpedats,revdats,revtims,anndats,anntims,permno,basis
2,004W,31.0,72523.0,0.130,2014-06-30,2014-07-01,38312.0,2014-07-01,27000.0,14714.0,D
3,004W,98.0,90646.0,0.170,2014-06-30,2014-07-01,30994.0,2014-07-01,22320.0,14714.0,D
14,004W,100.0,76979.0,0.099,2014-06-30,2014-08-04,65067.0,2014-07-01,23220.0,14714.0,D
4,004W,228.0,79979.0,0.160,2014-06-30,2014-07-01,47972.0,2014-07-01,22920.0,14714.0,D
12,004W,260.0,71551.0,0.110,2014-06-30,2014-07-24,39318.0,2014-07-01,960.0,14714.0,D
...,...,...,...,...,...,...,...,...,...,...,...
1142579,ZY,60902.0,502726.0,1.200,2018-10-31,2018-11-19,4078.0,2018-09-11,3840.0,40539.0,D
1142584,ZY,88989.0,626091.0,1.220,2018-10-31,2018-11-01,41989.0,2018-11-01,22320.0,40539.0,D
1142574,ZY,91263.0,593895.0,1.200,2018-10-31,2018-11-12,33141.0,2018-08-22,4500.0,40539.0,D
1142578,ZY,91560.0,620865.0,1.220,2018-10-31,2018-11-13,34736.0,2018-09-04,5880.0,40539.0,D


In [88]:
#######################################
# Step 3. Link Estimates with Actuals #
#######################################

# Link Unadjusted estimates with Unadjusted actuals and CRSP permnos  
# Keep only the estimates issued within 90 days before the report date

# Getting actual piece of data
ibes_act = conn.raw_sql(f"""
                        select ticker, anndats as repdats, value as act, pends as fpedats, pdicity
                        from ibes.actu_epsus 
                        where pends between '{begdate}' and '{enddate}'
                        and pdicity='QTR'
                        """, date_cols = ['repdats', 'fpedats'])

In [89]:
# Join with the estimate piece of the data
ibes1 = pd.merge(ibes, ibes_act, how='left', on = ['ticker','fpedats'])
ibes1['dgap'] = ibes1.repdats - ibes1.anndats
ibes1

Unnamed: 0,ticker,estimator,analys,value,fpedats,revdats,revtims,anndats,anntims,permno,basis,repdats,act,pdicity,dgap
0,004W,31.0,72523.0,0.130,2014-06-30,2014-07-01,38312.0,2014-07-01,27000.0,14714.0,D,2014-08-07,0.35,QTR,37 days
1,004W,98.0,90646.0,0.170,2014-06-30,2014-07-01,30994.0,2014-07-01,22320.0,14714.0,D,2014-08-07,0.35,QTR,37 days
2,004W,100.0,76979.0,0.099,2014-06-30,2014-08-04,65067.0,2014-07-01,23220.0,14714.0,D,2014-08-07,0.35,QTR,37 days
3,004W,228.0,79979.0,0.160,2014-06-30,2014-07-01,47972.0,2014-07-01,22920.0,14714.0,D,2014-08-07,0.35,QTR,37 days
4,004W,260.0,71551.0,0.110,2014-06-30,2014-07-24,39318.0,2014-07-01,960.0,14714.0,D,2014-08-07,0.35,QTR,37 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449155,ZY,60902.0,502726.0,1.200,2018-10-31,2018-11-19,4078.0,2018-09-11,3840.0,40539.0,D,2018-11-20,0.63,QTR,70 days
449156,ZY,88989.0,626091.0,1.220,2018-10-31,2018-11-01,41989.0,2018-11-01,22320.0,40539.0,D,2018-11-20,0.63,QTR,19 days
449157,ZY,91263.0,593895.0,1.200,2018-10-31,2018-11-12,33141.0,2018-08-22,4500.0,40539.0,D,2018-11-20,0.63,QTR,90 days
449158,ZY,91560.0,620865.0,1.220,2018-10-31,2018-11-13,34736.0,2018-09-04,5880.0,40539.0,D,2018-11-20,0.63,QTR,77 days


In [90]:
ibes1['flag'] = np.where( (ibes1.dgap>=datetime.timedelta(days=0)) & (ibes1.dgap<=datetime.timedelta(days=90)) & (ibes1.repdats.notna()) & (ibes1.anndats.notna()), 1, 0)

ibes1 = ibes1.loc[ibes1.flag==1].drop(['flag', 'dgap', 'pdicity'], axis=1)
len(ibes1)
ibes1

311905

Unnamed: 0,ticker,estimator,analys,value,fpedats,revdats,revtims,anndats,anntims,permno,basis,repdats,act
0,004W,31.0,72523.0,0.130,2014-06-30,2014-07-01,38312.0,2014-07-01,27000.0,14714.0,D,2014-08-07,0.35
1,004W,98.0,90646.0,0.170,2014-06-30,2014-07-01,30994.0,2014-07-01,22320.0,14714.0,D,2014-08-07,0.35
2,004W,100.0,76979.0,0.099,2014-06-30,2014-08-04,65067.0,2014-07-01,23220.0,14714.0,D,2014-08-07,0.35
3,004W,228.0,79979.0,0.160,2014-06-30,2014-07-01,47972.0,2014-07-01,22920.0,14714.0,D,2014-08-07,0.35
4,004W,260.0,71551.0,0.110,2014-06-30,2014-07-24,39318.0,2014-07-01,960.0,14714.0,D,2014-08-07,0.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
449155,ZY,60902.0,502726.0,1.200,2018-10-31,2018-11-19,4078.0,2018-09-11,3840.0,40539.0,D,2018-11-20,0.63
449156,ZY,88989.0,626091.0,1.220,2018-10-31,2018-11-01,41989.0,2018-11-01,22320.0,40539.0,D,2018-11-20,0.63
449157,ZY,91263.0,593895.0,1.200,2018-10-31,2018-11-12,33141.0,2018-08-22,4500.0,40539.0,D,2018-11-20,0.63
449158,ZY,91560.0,620865.0,1.220,2018-10-31,2018-11-13,34736.0,2018-09-04,5880.0,40539.0,D,2018-11-20,0.63


In [93]:
# Select all relevant combinations of Permnos and Date

ibes1_dt1 = ibes1[['permno', 'anndats']].drop_duplicates()

ibes1_dt2 = ibes1[['permno', 'repdats']].drop_duplicates().rename(columns={'repdats':'anndats'})

ibes_anndats = pd.concat([ibes1_dt1, ibes1_dt2]).drop_duplicates()

In [96]:
# Adjust all estimate and earnings announcement dates to the closest
# preceding trading date in CRSP to ensure that adjustment factors won't
# be missing after the merge  

# unique anndats from ibes
uniq_anndats = ibes_anndats[['anndats']].drop_duplicates()

# unique trade dates from crsp.dsi
crsp_dats = conn.raw_sql(""" 
                            select date 
                            from crsp.dsi 
                         """, date_cols=['date'])

# Create up to 5 days prior dates relative to anndats

for i in range(0, 5):
    uniq_anndats[i] = uniq_anndats.anndats - datetime.timedelta(days=i)

In [99]:
crsp_dats.iloc[:1]

Unnamed: 0,date
0,1925-12-31


In [100]:
uniq_anndats

Unnamed: 0,anndats,0,1,2,3,4
0,2014-07-01,2014-07-01,2014-06-30,2014-06-29,2014-06-28,2014-06-27
9,2014-06-13,2014-06-13,2014-06-12,2014-06-11,2014-06-10,2014-06-09
10,2014-06-17,2014-06-17,2014-06-16,2014-06-15,2014-06-14,2014-06-13
14,2014-08-05,2014-08-05,2014-08-04,2014-08-03,2014-08-02,2014-08-01
17,2014-07-31,2014-07-31,2014-07-30,2014-07-29,2014-07-28,2014-07-27
...,...,...,...,...,...,...
182476,2019-03-01,2019-03-01,2019-02-28,2019-02-27,2019-02-26,2019-02-25
340652,2019-03-18,2019-03-18,2019-03-17,2019-03-16,2019-03-15,2019-03-14
357749,2019-03-05,2019-03-05,2019-03-04,2019-03-03,2019-03-02,2019-03-01
390324,2019-03-14,2019-03-14,2019-03-13,2019-03-12,2019-03-11,2019-03-10


In [101]:
# reshape (transpose) the df for later join with crsp trading dates

expand_anndats = uniq_anndats.set_index('anndats').stack().reset_index().\
rename(columns={'level_1':'prior', 0:'prior_date'})

expand_anndats

Unnamed: 0,anndats,prior,prior_date
0,2014-07-01,0,2014-07-01
1,2014-07-01,1,2014-06-30
2,2014-07-01,2,2014-06-29
3,2014-07-01,3,2014-06-28
4,2014-07-01,4,2014-06-27
...,...,...,...
15430,2019-03-15,0,2019-03-15
15431,2019-03-15,1,2019-03-14
15432,2019-03-15,2,2019-03-13
15433,2019-03-15,3,2019-03-12


In [102]:
# merge with crsp trading dates
tradedates = pd.merge(expand_anndats, crsp_dats, how='left', left_on=['prior_date'], right_on=['date'])
tradedates

Unnamed: 0,anndats,prior,prior_date,date
0,2014-07-01,0,2014-07-01,2014-07-01
1,2014-07-01,1,2014-06-30,2014-06-30
2,2014-07-01,2,2014-06-29,NaT
3,2014-07-01,3,2014-06-28,NaT
4,2014-07-01,4,2014-06-27,2014-06-27
...,...,...,...,...
15430,2019-03-15,0,2019-03-15,NaT
15431,2019-03-15,1,2019-03-14,NaT
15432,2019-03-15,2,2019-03-13,NaT
15433,2019-03-15,3,2019-03-12,NaT


In [103]:
# create the dgap (days gap) variable for min selection
tradedates['dgap'] = tradedates.anndats-tradedates.date

# choosing the row with the smallest dgap for a given anndats
tradedates = tradedates.loc[tradedates.groupby('anndats')['dgap'].idxmin()]

tradedates = tradedates[['anndats', 'date']]
tradedates

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """


Unnamed: 0,anndats,date
14130.0,2009-11-05,2009-11-05
14140.0,2009-11-06,2009-11-06
14135.0,2009-11-09,2009-11-09
13100.0,2009-11-16,2009-11-16
15030.0,2009-11-19,2009-11-19
...,...,...
,NaT,NaT
,NaT,NaT
,NaT,NaT
,NaT,NaT


In [104]:
len(tradedates)

3087