In [2]:
import pandas as pd
import wrds

# Open the WRDS session used by every `conn.raw_sql(...)` query in this notebook.
# pandas is imported here because later cells call `pd.merge`; without this
# import a fresh-kernel "Run All" stops with NameError at the first merge.
conn = wrds.Connection(wrds_username='xiaomowu')

Loading library list...
Done


In [3]:
#########################
# Step 1: Link by CUSIP #
#########################

# 1.1 IBES: Get the list of IBES Tickers for US firms in IBES
# Pulls ticker, cusip, company name and start date from ibes.id, restricted
# in SQL to rows flagged usfirm=1 that have a non-empty cusip.
_ibes1 = conn.raw_sql("""
                      select ticker, cusip, cname, sdates from ibes.id
                      where usfirm=1 and cusip != ''
                      """)
# Last expression of the cell: displays the number of rows fetched.
len(_ibes1)

86242

In [4]:
# For every (ticker, cusip) pair, record the earliest and the latest
# `sdates` value as `fdate` and `ldate`.
_ibes1_date = (
    _ibes1.groupby(['ticker', 'cusip'])['sdates']
    .agg(['min', 'max'])
    .rename(columns={'min': 'fdate', 'max': 'ldate'})
    .reset_index()
)

# Attach that date range to each IBES record and put the rows in a stable order.
_ibes2 = (
    _ibes1.merge(_ibes1_date, how='left', on=['ticker', 'cusip'])
    .sort_values(by=['ticker', 'cusip', 'sdates'])
)

# Only the record whose start date equals the pair's last start date is kept,
# i.e. the most recent company name; `sdates` then duplicates `ldate`.
_ibes2 = _ibes2[_ibes2['sdates'] == _ibes2['ldate']].drop(columns=['sdates'])

In [5]:
# 1.2 CRSP: Get all permno-ncusip combinations
# Pulls permno, ncusip, company name and the name-record date range from
# crsp.stocknames, skipping rows whose ncusip is an empty string.
_crsp1 = conn.raw_sql("""
                      select permno, ncusip, comnam, namedt, nameenddt
                      from crsp.stocknames
                      where ncusip != ''
                      """)

In [6]:
# Earliest `namedt` seen for each (permno, ncusip) pair.
_crsp1_fnamedt = (
    _crsp1.groupby(['permno', 'ncusip'])['namedt']
    .min()
    .reset_index()
)

_crsp1_fnamedt

Unnamed: 0,permno,ncusip,namedt
0,10000.0,68391610,1986-01-07
1,10001.0,29269V10,2009-08-04
2,10001.0,29274A10,1993-11-22
3,10001.0,29274A20,2008-02-05
4,10001.0,36720410,2010-07-09
...,...,...,...
44159,93433.0,92870X10,2013-04-10
44160,93433.0,92870X30,2013-04-24
44161,93434.0,78513510,2010-06-14
44162,93435.0,82936G20,2010-06-14


In [7]:
# Latest `nameenddt` seen for each (permno, ncusip) pair.
_crsp1_lnameenddt = (
    _crsp1.groupby(['permno', 'ncusip'])['nameenddt']
    .max()
    .reset_index()
)
_crsp1_lnameenddt

Unnamed: 0,permno,ncusip,nameenddt
0,10000.0,68391610,1987-06-11
1,10001.0,29269V10,2010-07-08
2,10001.0,29274A10,2008-02-04
3,10001.0,29274A20,2009-08-03
4,10001.0,36720410,2017-08-03
...,...,...,...
44159,93433.0,92870X10,2013-04-23
44160,93433.0,92870X30,2016-12-22
44161,93434.0,78513510,2018-12-31
44162,93435.0,82936G20,2012-05-18


In [8]:
# Combine first start date and last end date into one range per (permno, ncusip).
_crsp1_dtrange = _crsp1_fnamedt.merge(
    _crsp1_lnameenddt,
    how='inner',
    on=['permno', 'ncusip'],
)
_crsp1_dtrange

Unnamed: 0,permno,ncusip,namedt,nameenddt
0,10000.0,68391610,1986-01-07,1987-06-11
1,10001.0,29269V10,2009-08-04,2010-07-08
2,10001.0,29274A10,1993-11-22,2008-02-04
3,10001.0,29274A20,2008-02-05,2009-08-03
4,10001.0,36720410,2010-07-09,2017-08-03
...,...,...,...,...
44159,93433.0,92870X10,2013-04-10,2013-04-23
44160,93433.0,92870X30,2013-04-24,2016-12-22
44161,93434.0,78513510,2010-06-14,2018-12-31
44162,93435.0,82936G20,2010-06-14,2012-05-18


In [9]:
# Swap each row's own dates for the pair-level range: the row's `namedt` is
# discarded, its `nameenddt` survives as `enddt`, and the merge brings in the
# (permno, ncusip)-level `namedt` / `nameenddt` from `_crsp1_dtrange`.
# NOTE(review): `_crsp1` is rebound without its `namedt` column, so running
# this cell a second time raises KeyError until `_crsp1` is re-fetched.
_crsp1 = _crsp1.drop(columns=['namedt']).rename(columns={'nameenddt': 'enddt'})
_crsp2 = _crsp1.merge(_crsp1_dtrange, how='inner', on=['permno', 'ncusip'])
_crsp2

Unnamed: 0,permno,ncusip,comnam,enddt,namedt,nameenddt
0,10000.0,68391610,OPTIMUM MANUFACTURING INC,1987-06-11,1986-01-07,1987-06-11
1,10001.0,39040610,GREAT FALLS GAS CO,1993-11-21,1986-01-09,1993-11-21
2,10001.0,29274A10,ENERGY WEST INC,2008-02-04,1993-11-22,2008-02-04
3,10001.0,29274A20,ENERGY WEST INC,2009-08-03,2008-02-05,2009-08-03
4,10001.0,29269V10,ENERGY INC,2009-12-17,2009-08-04,2010-07-08
...,...,...,...,...,...,...
55733,93433.0,92870X30,VOLTARI CORP,2016-12-22,2013-04-24,2016-12-22
55734,93434.0,78513510,S & W SEED CO,2018-12-31,2010-06-14,2018-12-31
55735,93435.0,82936G20,SINO CLEAN ENERGY INC,2012-05-18,2010-06-14,2012-05-18
55736,93436.0,88160R10,TESLA MOTORS INC,2017-02-01,2010-06-29,2018-12-31


In [10]:
# Row count of `_crsp2` right after the date-range merge, before it is
# reduced to the most recent name per pair.
len(_crsp2)

55738

In [11]:
# Within each (permno, ncusip) pair keep the row whose own end date equals the
# pair's last end date, i.e. the most recent company name.
_crsp2 = _crsp2[_crsp2['enddt'] == _crsp2['nameenddt']].drop(columns=['enddt'])
_crsp2

Unnamed: 0,permno,ncusip,comnam,namedt,nameenddt
0,10000.0,68391610,OPTIMUM MANUFACTURING INC,1986-01-07,1987-06-11
1,10001.0,39040610,GREAT FALLS GAS CO,1986-01-09,1993-11-21
2,10001.0,29274A10,ENERGY WEST INC,1993-11-22,2008-02-04
3,10001.0,29274A20,ENERGY WEST INC,2008-02-05,2009-08-03
5,10001.0,29269V10,ENERGY INC,2009-08-04,2010-07-08
...,...,...,...,...,...
55732,93433.0,92870X10,VOLTARI CORP,2013-04-10,2013-04-23
55733,93433.0,92870X30,VOLTARI CORP,2013-04-24,2016-12-22
55734,93434.0,78513510,S & W SEED CO,2010-06-14,2018-12-31
55735,93435.0,82936G20,SINO CLEAN ENERGY INC,2010-06-14,2012-05-18


In [12]:
# Row count of `_crsp2` after keeping only rows whose `enddt` matched the
# pair-level `nameenddt`.
len(_crsp2)

44164

In [13]:
# Inner-join IBES and CRSP on the full CUSIP (IBES `cusip` == CRSP `ncusip`);
# company names and date ranges from both sides are carried along for scoring.
_link1_1 = (
    _ibes2.merge(_crsp2, how='inner', left_on='cusip', right_on='ncusip')
    .sort_values(['ticker', 'permno', 'ldate'])
)
len(_link1_1)
_link1_1

Unnamed: 0,ticker,cusip,cname,fdate,ldate,permno,ncusip,comnam,namedt,nameenddt
0,0000,87482X10,TALMER BANCORP,2014-02-20,2014-03-20,14471.0,87482X10,TALMER BANCORP INC,2014-02-12,2016-08-31
1,0001,26878510,EP ENGR CORP,2014-02-20,2019-06-20,14392.0,26878510,E P ENERGY CORP,2014-01-17,2018-12-31
2,0004,02504D10,AMERICAN CAPITAL,2014-02-20,2014-02-20,14418.0,02504D10,AMERICAN CAPITAL SR FLOATING LT,2014-01-16,2018-08-24
3,000R,14163310,CARECOM,2014-02-20,2014-02-20,14378.0,14163310,CARE COM INC,2014-01-24,2018-12-31
4,000V,15117E10,CELLADON,2014-03-20,2014-03-20,14423.0,15117E10,CELLADON CORP,2014-01-30,2016-03-22
...,...,...,...,...,...,...,...,...,...,...
26245,ZXIS,98876010,Z-AXIS,2004-06-17,2004-10-14,83970.0,98876010,Z AXIS CORP,1983-10-26,1985-11-01
26246,ZXZX,16951E10,CHINA ZENIX,2011-07-14,2018-07-19,12720.0,16951E10,CHINA ZENIX AUTO INTL LTD,2011-05-12,2018-06-13
26248,ZY,98919510,ZAYRE CORP,1977-12-15,1984-11-15,40539.0,98919510,ZAYRE CORP,1968-01-02,1989-06-20
26247,ZY,87254010,TJX,1989-07-20,2016-06-16,40539.0,87254010,T J X COMPANIES INC NEW,1989-06-21,2018-12-31


In [14]:
# For each (ticker, permno) link keep only the row(s) carrying the latest
# `ldate`, i.e. the link with the most recent company name.
_link1_1_tmp = (
    _link1_1.groupby(['ticker', 'permno'])['ldate']
    .max()
    .reset_index()
)
_link1_2 = _link1_1.merge(_link1_1_tmp, how='inner', on=['ticker', 'permno', 'ldate'])
_link1_2

Unnamed: 0,ticker,cusip,cname,fdate,ldate,permno,ncusip,comnam,namedt,nameenddt
0,0000,87482X10,TALMER BANCORP,2014-02-20,2014-03-20,14471.0,87482X10,TALMER BANCORP INC,2014-02-12,2016-08-31
1,0001,26878510,EP ENGR CORP,2014-02-20,2019-06-20,14392.0,26878510,E P ENERGY CORP,2014-01-17,2018-12-31
2,0004,02504D10,AMERICAN CAPITAL,2014-02-20,2014-02-20,14418.0,02504D10,AMERICAN CAPITAL SR FLOATING LT,2014-01-16,2018-08-24
3,000R,14163310,CARECOM,2014-02-20,2014-02-20,14378.0,14163310,CARE COM INC,2014-01-24,2018-12-31
4,000V,28249U10,EIGER,2016-04-14,2016-04-14,14423.0,28249U10,EIGER BIOPHARMACEUTICALS INC,2016-03-23,2018-12-31
...,...,...,...,...,...,...,...,...,...,...
19772,ZVX,98950E40,ZEVEX INTL INC,1997-12-18,2002-03-14,85520.0,98950E40,ZEVEX INTERNATIONAL INC,1997-05-20,2007-03-16
19773,ZXIS,98876010,Z-AXIS,2004-06-17,2004-10-14,83970.0,98876010,Z AXIS CORP,1983-10-26,1985-11-01
19774,ZXZX,16951E10,CHINA ZENIX,2011-07-14,2018-07-19,12720.0,16951E10,CHINA ZENIX AUTO INTL LTD,2011-05-12,2018-06-13
19775,ZY,87254010,TJX,1989-07-20,2016-06-16,40539.0,87254010,T J X COMPANIES INC NEW,1989-06-21,2018-12-31


In [15]:
from fuzzywuzzy import fuzz

# Fuzzy token-set similarity between the CRSP name and the IBES name of each link.
_link1_2['name_ratio'] = _link1_2.apply(
    lambda link: fuzz.token_set_ratio(link['comnam'], link['cname']),
    axis=1,
)

# 10th percentile of that similarity; the scoring functions use it as the
# cutoff for "names are similar enough".
name_ratio_p10 = _link1_2['name_ratio'].quantile(0.10)

def score1(row):
    """Grade a CUSIP-based IBES-CRSP link; lower is better.

    Reads `fdate`, `ldate`, `namedt`, `nameenddt` and `name_ratio` from `row`
    and compares the name ratio with the module-level `name_ratio_p10`.

    Returns:
        0 -- date ranges overlap and the names are similar
        1 -- date ranges overlap only
        2 -- names are similar only
        3 -- neither condition holds
    """
    # The IBES range [fdate, ldate] intersects the CRSP range [namedt, nameenddt].
    dates_overlap = (row['fdate'] <= row['nameenddt']) & (row['ldate'] >= row['namedt'])
    names_similar = row['name_ratio'] >= name_ratio_p10

    if dates_overlap:
        return 0 if names_similar else 1
    return 2 if names_similar else 3

# Score every CUSIP-based link, then keep just the identifying columns and
# collapse rows that became identical.
_link1_2['score'] = _link1_2.apply(score1, axis=1)
_link1_2 = _link1_2.loc[
    :, ['ticker', 'permno', 'cname', 'comnam', 'name_ratio', 'score']
].drop_duplicates()



In [16]:
# NOTE: a notebook cell displays only its last expression, so the len() value
# below is computed but not shown; the frame itself is what gets rendered.
len(_link1_2)
_link1_2

Unnamed: 0,ticker,permno,cname,comnam,name_ratio,score
0,0000,14471.0,TALMER BANCORP,TALMER BANCORP INC,100,0
1,0001,14392.0,EP ENGR CORP,E P ENERGY CORP,67,0
2,0004,14418.0,AMERICAN CAPITAL,AMERICAN CAPITAL SR FLOATING LT,100,0
3,000R,14378.0,CARECOM,CARE COM INC,74,0
4,000V,14423.0,EIGER,EIGER BIOPHARMACEUTICALS INC,100,0
...,...,...,...,...,...,...
19772,ZVX,85520.0,ZEVEX INTL INC,ZEVEX INTERNATIONAL INC,78,0
19773,ZXIS,83970.0,Z-AXIS,Z AXIS CORP,100,2
19774,ZXZX,12720.0,CHINA ZENIX,CHINA ZENIX AUTO INTL LTD,100,0
19775,ZY,40539.0,TJX,T J X COMPANIES INC NEW,15,1


In [17]:
##########################
# Step 2: Link by TICKER #
##########################

# IBES tickers that Step 1 could not link get a second chance via the
# exchange ticker.

# Left-join the CUSIP links onto all IBES tickers; a ticker without any link
# ends up with a missing `permno`.
_nomatch1 = _ibes2[['ticker']].merge(
    _link1_2[['permno', 'ticker']], on='ticker', how='left'
)
print(len(_nomatch1))

# Keep the tickers with no permno, one row per ticker.
_nomatch1 = _nomatch1[_nomatch1['permno'].isna()].drop(columns=['permno']).drop_duplicates()
print(len(_nomatch1))

37452
4065


In [18]:
# Peek at the first two rows of `_ibes2`.
_ibes2.iloc[0:2]

Unnamed: 0,ticker,cusip,cname,fdate,ldate
1,0,87482X10,TALMER BANCORP,2014-02-20,2014-03-20
5,1,26878510,EP ENGR CORP,2014-02-20,2019-06-20


In [19]:
# Fetch ticker, company name, official ticker (oftic), start date and cusip
# for every row of ibes.id (no usfirm / cusip restriction here).
ibesid = conn.raw_sql(""" select ticker, cname, oftic, sdates, cusip from ibes.id """)
# Rows without an official ticker are discarded.
ibesid = ibesid.loc[ibesid.oftic.notna()]

In [20]:
# Number of ibes.id rows that have a non-null official ticker.
len(ibesid)

222078

In [22]:
# Add IBES identifying information

# `ibesid` was already fetched from ibes.id by the earlier cell with the
# identical query; repeating that full-table download here only costs time.
# Reuse the loaded frame and (idempotently) make sure rows without an official
# ticker are excluded.
ibesid = ibesid.loc[ibesid.oftic.notna()]

In [23]:
# Attach every IBES identifier record (name, official ticker, start date,
# cusip) to the tickers that are still unmatched.
_nomatch2 = _nomatch1.merge(ibesid, how='inner', on='ticker')
len(_nomatch2)
_nomatch2

Unnamed: 0,ticker,cname,oftic,sdates,cusip
0,002B,MEDIENT STUDIOS,MDNT,2014-04-17,58471D10
1,002B,MOON RIVER,MDNT,2014-11-20,58471D10
2,006E,RIGHTSCORP,RIHT,2014-07-17,76658A10
3,008Y,BURCON NUTRASCIE,BUR,2014-09-18,1208311X
4,008Y,BURCON NUTRASCIE,BUR,2015-07-16,1208311X
...,...,...,...,...,...
15776,ZU03,ZUNICOM,ZNCMD,2019-08-15,98981X40
15777,ZWEB,GLOBAL WEB INC,ZWEB,2001-12-20,37938Q10
15778,ZYNX,ZYNEX MED,ZYNX,2005-07-14,98986510
15779,ZYNX,ZYNEX MED,ZYNX,2006-04-20,98986510


In [25]:
# First and last start date for each (ticker, oftic) pair, stored as
# `fdate` / `ldate`.
_nomatch3 = (
    _nomatch2.groupby(['ticker', 'oftic'])['sdates']
    .agg(['min', 'max'])
    .rename(columns={'min': 'fdate', 'max': 'ldate'})
    .reset_index()
)
print(len(_nomatch3))

# Put that date range back on every unmatched IBES record.
_nomatch3 = _nomatch2.merge(_nomatch3, how='left', on=['ticker', 'oftic'])
print(len(_nomatch3))


8287
15781


In [None]:
# For each (ticker, oftic) pair keep only the record whose start date is the
# pair's latest one (`ldate` is the max of `sdates` computed in the cell above).
_nomatch3 = _nomatch3.loc[_nomatch3.sdates == _nomatch3.ldate]
# The closing parenthesis was missing here, which made the whole cell a
# SyntaxError.
print(len(_nomatch3))
_nomatch3

In [28]:
# Get entire list of CRSP stocks with Exchange Ticker information
# Unlike the Step 1 query, this one has no WHERE clause: every row of
# crsp.stocknames is fetched, and `ticker` is added to the selected columns.
_crsp_n1 = conn.raw_sql(""" select ticker, comnam, permno, ncusip, namedt, nameenddt
                            from crsp.stocknames """)

In [33]:
# Drop CRSP name records that have no exchange ticker, then order the rest by
# permno, ticker and start date.
_crsp_n1 = _crsp_n1[_crsp_n1['ticker'].notna()].sort_values(
    by=['permno', 'ticker', 'namedt']
)

In [34]:
# Number of CRSP name records that carry an exchange ticker.
len(_crsp_n1)

56374

In [35]:
# Pair-level date range: earliest `namedt` and latest `nameenddt` for each
# (permno, ticker). reset_index() already names the value columns `namedt` /
# `nameenddt`, so the former rename of 'min' / 'max' did nothing and is dropped.
_crsp_n1_namedt = _crsp_n1.groupby(['permno', 'ticker']).namedt.min().reset_index()
_crsp_n1_nameenddt = _crsp_n1.groupby(['permno', 'ticker']).nameenddt.max().reset_index()

_crsp_n1_dt = _crsp_n1_namedt.merge(_crsp_n1_nameenddt, how='inner', on=['permno', 'ticker'])

# Keep each row's own dates under `*_ind` names so they do not collide with
# the pair-level columns brought in by the merge below.
# NOTE(review): `_crsp_n1` is rebound with the renamed columns, so this cell
# cannot be re-run without first re-running the cell that builds `_crsp_n1`.
_crsp_n1 = _crsp_n1.rename(columns={'namedt': 'namedt_ind', 'nameenddt': 'nameenddt_ind'})

_crsp_n2 = _crsp_n1.merge(_crsp_n1_dt, how='left', on=['permno', 'ticker'])

# Rename so the CRSP ticker cannot be confused with the IBES `ticker` later on.
_crsp_n2 = _crsp_n2.rename(columns={'ticker': 'crsp_ticker'})

# Keep only the row whose own end date is the pair's last end date (the most
# recent name record); the per-row dates are no longer needed afterwards.
_crsp_n2 = _crsp_n2.loc[_crsp_n2.nameenddt_ind == _crsp_n2.nameenddt].drop(
    ['namedt_ind', 'nameenddt_ind'], axis=1
)


In [37]:
# NOTE: only the last expression of a cell is displayed, so this len() value
# is computed but not shown.
len(_crsp_n2)
# Peek at the first two rows of `_crsp_n2`.
_crsp_n2.iloc[0:2]

Unnamed: 0,crsp_ticker,comnam,permno,ncusip,namedt,nameenddt
0,OMFGA,OPTIMUM MANUFACTURING INC,10000.0,68391610,1986-01-07,1987-06-11
3,EGAS,GAS NATURAL INC,10001.0,36720410,2009-08-04,2017-08-03


In [38]:
# Match the still-unlinked IBES records to CRSP on exchange ticker
# (IBES `oftic` == CRSP `crsp_ticker`).
_link2_1 = _nomatch3.merge(_crsp_n2, how='inner', left_on=['oftic'], right_on=['crsp_ticker'])

# Keep only matches whose IBES and CRSP date ranges overlap.
# `.copy()` makes the result an independent frame: later cells add columns to
# it, and doing that on a bare `.loc` slice triggers SettingWithCopyWarning.
_link2_1 = _link2_1.loc[
    (_link2_1.ldate >= _link2_1.namedt) & (_link2_1.fdate <= _link2_1.nameenddt)
].copy()

len(_link2_1)

305

In [39]:
# Fuzzy token-set similarity between the CRSP name and the IBES name of each
# ticker-based link. Built as a plain list so the assignment also works when
# there are no matches (a row-wise apply on an empty frame does not return a
# column).
_link2_1['name_ratio'] = [
    fuzz.token_set_ratio(crsp_name, ibes_name)
    for crsp_name, ibes_name in zip(_link2_1['comnam'], _link2_1['cname'])
]

# `_link2_2` is an alias of `_link2_1` (same object), so the columns added
# below appear under both names.
_link2_2 = _link2_1

# First six characters of each CUSIP, taken with the vectorized string
# accessor instead of a Python-level apply over every row.
_link2_2['cusip6'] = _link2_2['cusip'].str[:6]
_link2_2['ncusip6'] = _link2_2['ncusip'].str[:6]

len(_link2_2)

305

In [41]:
def score2(row):
    """Grade a ticker-based IBES-CRSP link; lower is better.

    Reads `cusip6`, `ncusip6` and `name_ratio` from `row` and compares the
    name ratio with the module-level `name_ratio_p10`.

    Returns:
        0 -- 6-character CUSIPs match and the names are similar
        4 -- 6-character CUSIPs match only
        5 -- names are similar only
        6 -- neither condition holds
    """
    same_cusip6 = row['cusip6'] == row['ncusip6']
    names_similar = row['name_ratio'] >= name_ratio_p10

    if same_cusip6:
        return 0 if names_similar else 4
    return 5 if names_similar else 6

# Score every ticker-based link with score2.
_link2_2['score'] = _link2_2.apply(score2, axis='columns')

In [42]:
# Row count of the ticker-based links after scoring.
len(_link2_2)

305