# init

**Data Pre-processing**
- 2011-2018 (8 years) . CC before 2011 has no indicator of the MD and QA parts.
- for 2012-2018, ~98% have MD/QA indicators; for 2011, ~93% has MD/QA

In [2]:
# library
library(feather)
library(stringdist)
library(tools)
library(pdftools)
options('sd_num_thread'=8)

# data_dir
DATA_DIR = "C:/Users/rossz/OneDrive/CC/data"
WRDS_DOWNLOAD_DIR = str_c(DATA_DIR, '/WRDS-download')

# WRDS

## establish connection

In [20]:
library(RPostgres)

# connect to wrds
wrds <- dbConnect(Postgres(),
                  host='wrds-pgdata.wharton.upenn.edu',
                  port=9737,
                  dbname='wrds',
                  sslmode='require',
                  user='xiaomowu',
                  password='SLCyz2018')

## unit test

In [17]:
# Determine the data libraries available at WRDS
res <- dbSendQuery(wrds, "select distinct table_schema
                   from information_schema.tables
                   where table_type ='VIEW'
                   or table_type ='FOREIGN TABLE'
                   order by table_schema")
data <- dbFetch(res, n=-1)
dbClearResult(res)
print(setDT(data)[, sort(table_schema)])

  [1] "aha"                "ahasamp"            "audit"             
  [4] "blab"               "block"              "boardex"           
  [7] "boardsmp"           "bvd"                "bvdsamp"           
 [10] "calcbnch"           "cboe"               "centris"           
 [13] "ciq"                "ciqsamp"            "cisdm"             
 [16] "clrvt"              "clrvtsmp"           "comp"              
 [19] "compa"              "compb"              "compdcur"          
 [22] "compg"              "comph"              "compm"             
 [25] "compmcur"           "compsamp"           "compseg"           
 [28] "compsnap"           "comscore"           "contrib"           
 [31] "crsp"               "crspa"              "crspm"             
 [34] "crspq"              "crspsamp"           "csmar"             
 [37] "dealscan"           "djones"             "dmef"              
 [40] "doe"                "emdb"               "etfg"              
 [43] "etfgsamp"           "eureka

In [6]:
# Determine the datasets within a given library
library = 'comp'
res <- dbSendQuery(wrds, sprintf("select distinct table_name
                   from information_schema.columns
                   where table_schema='%s'
                   order by table_name", library))
data <- dbFetch(res, n=-1)
dbClearResult(res)
setDT(data)[, sort(table_name)] %>% print()

  [1] "aco_amda"          "aco_imda"          "aco_indfnta"      
  [4] "aco_indfntq"       "aco_indfntytd"     "aco_indsta"       
  [7] "aco_indstq"        "aco_indstytd"      "aco_notesa"       
 [10] "aco_notesq"        "aco_notessa"       "aco_notesytd"     
 [13] "aco_pnfnda"        "aco_pnfndq"        "aco_pnfndytd"     
 [16] "aco_pnfnta"        "aco_pnfntq"        "aco_pnfntytd"     
 [19] "aco_transa"        "aco_transq"        "aco_transsa"      
 [22] "aco_transytd"      "adsprate"          "anncomp"          
 [25] "asec_amda"         "asec_imda"         "asec_notesa"      
 [28] "asec_notesq"       "asec_transa"       "asec_transq"      
 [31] "bank_aacctchg"     "bank_adesind"      "bank_afnd1"       
 [34] "bank_afnd2"        "bank_afnddc1"      "bank_afnddc2"     
 [37] "bank_afntind"      "bank_funda"        "bank_funda_fncd"  
 [40] "bank_fundq"        "bank_fundq_fncd"   "bank_iacctchg"    
 [43] "bank_idesind"      "bank_ifndq"        "bank_ifndytd"     
 [46] "ban

In [None]:
# Determine the variables (column headers) within a given dataset
library = 'ibes'
dataset = 'det_epsus'
res <- dbSendQuery(wrds, sprintf("select column_name
                   from information_schema.columns
                   where table_schema='%s'
                   and table_name='%s'
                   order by column_name", library, dataset))
data <- dbFetch(res, n=-1)
dbClearResult(res)
setDT(data)[, column_name] %>% print()

In [None]:
# To query the crsp.dsf dataset
library = "ibes"
dataset = 'ptgdet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
data <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
nrow(data)

## I/B/E/S

### Detail

In [None]:
library = 'ibes'
dataset = 'det_epsus' # eps
# dataset = 'det_xepsus' # non-eps
years = 2000:2018

for (year in years) {
    print(sprintf('Start %s at %s', year, now()))
    query = sprintf("select * from %s.%s where to_char(anndats,'yyyy')='%s'", library, dataset, year)
    save_name_short = sprintf('%s_%s_%s', library, dataset, year)
    save_name_full = sprintf('../data/WRDS-download/%s_%s_%s.rds', library, dataset, year)
    
    res <- dbSendQuery(wrds, query)
    data <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
    
    assign(save_name_short, data)
    sprintf('%s_%s: %s', dataset, year, nrow(data))
    saveRDS(data, save_name_full)
}

In [48]:
library = 'ibes'
dataset = 'detu_epsus' # eps
# dataset = 'det_xepsus' # non-eps

query = sprintf("select ticker, estimator, analys, pdf, fpi, value, fpedats, revdats, revtims, anndats, anntims from %s.%s where fpedats between '2000-01-01' and '2018-12-31' and (fpi='6' or fpi='7')", library, dataset, year)

res <- dbSendQuery(wrds, query)
ibes_detu_epsus<- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s', dataset, nrow(ibes_detu_epsus))
sv(ibes_detu_epsus, path=WRDS_DOWNLOAD_DIR)

-ibes_detu_epsus- saved  (22.79 secs)


### actuals

In [69]:
library = 'ibes'
dataset = 'actu_epsus' 

query = sprintf("select ticker, anndats as repdats, value as act, pends as fpedats, pdicity from %s.%s where pends between '2000-01-01' and '2018-12-31' and pdicity='QTR'", library, dataset, year)

res <- dbSendQuery(wrds, query)
ibes_actu_epsus<- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s', dataset, nrow(ibes_actu_epsus))
sv(ibes_actu_epsus, path=WRDS_DOWNLOAD_DIR)
ibes_actu_epsus[1]

-ibes_actu_epsus- saved  (0.82 secs)


ticker,repdats,act,fpedats,pdicity
0,2014-02-14,,2012-12-31,QTR


### Summary

In [9]:
library = 'ibes'
dataset = 'statsum_epsus' # summary eps
query = sprintf("select * from %s.%s", library, dataset)

res <- dbSendQuery(wrds, query)
ibes_statsum_epsus <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

ibes_statsum_epsus[1]
sv(ibes_statsum_epsus, path=WRDS_DOWNLOAD_DIR)

-statsum_epsus- saved  (43.43 secs)


In [None]:
library = 'ibes'
dataset = 'statsum_xepsus' # summary xeps

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_statsum_xepsus <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
ibes_statsum_xepsus[1]
sv(ibes_statsum_xepsus, path=WRDS_DOWNLOAD_DIR)

### Surprise

In [None]:
library = 'ibes'
dataset = 'surpsum' # summary surprise 

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_surpsum <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
ibes_surpsum[1]
sv(ibes_surpsum, path=WRDS_DOWNLOAD_DIR)

## price target

In [None]:
library = "ibes"
dataset = 'ptgdet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ptgdet <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
sv(ptgdet, path='/data/WRDS-download')

In [6]:
nrow(ptgdet)

In [5]:
ptgdet[1]

ticker,cusip,oftic,cname,actdats,estimid,alysnam,horizon,value,estcur,curr,amaskcd,usfirm,measure,acttims,anndats,anntims
0,87482X10,TLMR,TALMER BANCORP,2014-03-10,PRMDN082,PERMDENIED,12,16,USD,USD,538750,1,PTG,32043,2014-03-10,1200


## recommend

In [None]:
library = 'ibes'
dataset = 'recddet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
recddet <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
sv(recddet, path='/data/WRDS-download')

In [7]:
nrow(recddet)

In [8]:
recddet[1]

ticker,cusip,cname,oftic,actdats,estimid,analyst,ereccd,etext,ireccd,itext,emaskcd,amaskcd,usfirm,acttims,revdats,revtims,anndats,anntims
0,87482X10,TALMER BANCORP,TLMR,2014-03-10,PRMDN082,PERMDENIED,2,OUTPERFORM,2,BUY,50659,538750,1,32043,2016-01-26,34552,2014-03-10,1200


## stock price

In [None]:
library = 'comp'
dataset = 'secd' # factors
years = 2000:2018

system.time({
secd = list()
for (year in years) {
    print(sprintf('Start %s at %s', year, now()))
    query = sprintf("select * from %s.%s where to_char(datadate,'yyyy')='%s'", library, dataset, year)

    res <- dbSendQuery(wrds, query)
    data <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
    
    print(sprintf('%s_%s: %s obs', dataset, year, nrow(data)))
    secd[[as.character(year)]] = data
}
secd = rbindlist(secd, use=T)
})
sv(secd, path=WRDS_DOWNLOAD_DIR)

Download `CRSP`
It's *deprecated*!

In [None]:
library = 'crspq'
dataset = 'dsf' # factors
years = 2001:2018

for (year in years) {
    print(sprintf('Start %s at %s', year, now()))
    query = sprintf("select * from %s.%s where to_char(date,'yyyy')='%s'", library, dataset, year)
    save_name_short = sprintf('%s_%s', dataset, year)
    save_name_full = sprintf('../data/WRDS-download/%s_%s.rds', dataset, year)

    res <- dbSendQuery(wrds, query)
    data <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
    
    sprintf('%s_%s: %s', dataset, year, nrow(data))
    assign(save_name_short, data)
    saveRDS(data, save_name_full)
}

In [31]:
ld(dsf_2001, path=WRDS_DOWNLOAD_DIR)

-dsf_2001- loaded  (1.11 secs)


In [32]:
dsf_2001[1]

cusip,permno,permco,issuno,hexcd,hsiccd,date,bidlo,askhi,prc,vol,ret,bid,ask,shrout,cfacpr,cfacshr,openprc,numtrd,retx
36720410,10001,7953,10398,2,4925,2001-01-02,9.3125,9.875,9.875,3849,0.01282051,9.5,9.875,2498,1.5,1.5,9.3125,9,0.01282051


## factors

In [7]:
library = 'ff'
dataset = 'factors_daily' # factors

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
factors <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(data))
sv(factors, path='/data/WRDS-download')

-factors- saved  (0.09 secs)


In [6]:
factors[1]

date,mktrf,smb,hml,rf,umd
1926-07-01,0.001,-0.0024,-0.0028,9e-05,


## firm-id

### `ibes.id`

In [4]:
library = 'ibes'
dataset = 'id' # firm names

query = sprintf("select ticker, cusip, cname, sdates from %s.%s where usfirm=1 and cusip != ''", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_id <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(ibes_id))
sv(ibes_id, path=WRDS_DOWNLOAD_DIR)

query = sprintf("select ticker, cusip, cname, oftic, sdates from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_id2 <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ibes_id2))

ibes_id2[1]
sv(ibes_id2, path=WRDS_DOWNLOAD_DIR)

-ibes_id- saved  (0.15 secs)


ticker,cusip,cname,oftic,sdates
0,87482X10,TALMER BANCORP,TLMR,2014-02-20


-ibes_id2- saved  (0.54 secs)


### `comp.security`
The table being used:
- `comp_security`: link I/B/E/S to Compustat

In [391]:
library = 'compm'
dataset = 'security' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_security <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(comp_security))

comp_security[1]
sv(comp_security, path=WRDS_DOWNLOAD_DIR)

tic,gvkey,iid,cusip,dlrsni,dsci,epf,exchg,excntry,ibtic,isin,secstat,sedol,tpci,dldtei
AE.2,1000,1,32102,9,COM USD1,,12,USA,,,I,,0,1978-06-30


-comp_security- saved  (0.34 secs)


### `crsp.stocknames`

In [88]:
library = 'crsp'
dataset = 'stocknames' # firm names

query = sprintf("select permno, ncusip, comnam, namedt, nameenddt from %s.%s where ncusip != ''", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_stocknames <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_stocknames))
sv(crsp_stocknames, path=WRDS_DOWNLOAD_DIR)

query = sprintf("select ticker, comnam, permno, ncusip, namedt, nameenddt from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_stocknames2 <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_stocknames2))
crsp_stocknames2[1]
sv(crsp_stocknames2, path=WRDS_DOWNLOAD_DIR)

permno,ncusip,comnam,namedt,nameenddt
10000,68391610,OPTIMUM MANUFACTURING INC,1986-01-07,1987-06-11


-crsp_stocknames- saved  (0.17 secs)


### `crsp.ccm`

In [22]:
library = 'crsp'
dataset = 'ccmxpf_linktable' # firm names

query = sprintf("select gvkey, lpermco as permco, lpermno as permno, linkdt, linkenddt from %s.%s where usedflag=1 and linkprim in ('P', 'C')", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_ccmlink <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_ccmlink))
sv(crsp_ccmlink, path=WRDS_DOWNLOAD_DIR)
crsp_ccmlink[1]

-crsp_ccmlink- saved  (0.13 secs)


gvkey,permco,permno,linkdt,linkenddt
1000,23369,25881,1970-11-13,1978-06-30


## index-constituent

In [63]:
ld(comp_idx_prof, path=WRDS_DOWNLOAD_DIR)
ld(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)
ld(comp_security, path=WRDS_DOWNLOAD_DIR)
ld(crsp_stocknames, path=WRDS_DOWNLOAD_DIR)

-comp_idx_prof- already exists, will NOT load again!  (0 secs)
-comp_idx_cst- already exists, will NOT load again!  (0 secs)
-comp_security- already exists, will NOT load again!  (0 secs)
-crsp_stocknames- already exists, will NOT load again!  (0 secs)


In [74]:
x = unique(comp_idx_cst[gvkeyx=='000005'
    ][comp_security[, .(gvkey, iid, tic, cusip=str_sub(cusip, 1, 6), sedol)], 
      on=.(gvkey, iid), nomatch=0])
x
# x[1]
# crsp_stocknames[1]
# y = x[crsp_stocknames[, .(cusip=str_sub(ncusip, 1, 6), comnam)], on=.(cusip)]
# y[1]

gvkey,iid,gvkeyx,from,thru,tic,cusip,sedol
1300,1,5,1925-12-07,2008-02-18,HON,438516,2020459
1356,1,5,1959-06-01,2013-09-22,AA.3,013817,BYVZDB3
1447,1,5,1982-08-30,,AXP,025816,2026082
1487,1,5,2004-04-08,2008-09-21,AIG,026874,2027342
1581,1,5,1939-03-14,2004-04-07,T.2,001957,2064888
1690,1,5,2015-03-19,,AAPL,037833,2046251
2136,1,5,2004-04-08,,VZ,92343V,2090571
2285,1,5,1987-03-12,,BA,097023,2108601
2817,1,5,1991-05-06,,CAT,149123,2180201
2968,1,5,2001-01-02,,JPM,46625H,2190385


In [12]:
library = 'compa'
dataset = 'idx_index' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_idx_prof <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(comp_idx_prof))

comp_idx_prof[1]
sv(comp_idx_prof, path=WRDS_DOWNLOAD_DIR)

conm,gvkeyx,idx13key,idxcstflg,idxstat,indexcat,indexgeo,indexid,indextype,indexval,spii,spmi,tic,tici
S&P Industrials-Wed,1,0,N,A,S&P,USA,500,LGCAP,0,,,I0001,I0001


-comp_idx_prof- saved  (0.03 secs)


In [16]:
library = 'compa'
dataset = 'idxcst_his' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_idx_cst <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(comp_idx_cst))

comp_idx_cst[1]
sv(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)

gvkey,iid,gvkeyx,from,thru
1004,1,30824,1994-10-01,


-comp_idx_cst- saved  (0.18 secs)


## keydev

In [30]:
library = 'ciq'
dataset = 'wrds_keydev' # everything except for `headline` and `situation`

query = sprintf("select * from %s.%s 
    where keydeveventtypeid in (28, 48, 55, 61, 144)", library, dataset)
res <- dbSendQuery(wrds, query)
ciq_wrds_keydev <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ciq_wrds_keydev))

# `mostimportantdateutc` is in UTC but R doesn't recognize,
# so we need to set it explictly
ciq_wrds_keydev[, ':='(mostimportantdateutc=force_tz(mostimportantdateutc, 'UTC'))]

sv(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR)

In [None]:
library = 'ciq'
dataset = 'ciqkeydev' # `headline` and `situation`

query = sprintf("select * from %s.%s 
    where keydevid in 
        (select keydevid from ciq.wrds_keydev
        where keydeveventtypeid in (28, 48, 55, 61, 144))",
    library, dataset)
res <- dbSendQuery(wrds, query)
ciq_keydev <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ciq_keydev))

# `mostimportantdateutc` is in UTC but R doesn't recognize,
# so we need to set it explictly
ciq_keydev[, ':='(mostimportantdateutc=force_tz(mostimportantdateutc, 'UTC'))]

sv(ciq_keydev, path=WRDS_DOWNLOAD_DIR)

# Parse CC

## `ICLINK`

More explanation on score system:
- 0: BEST match: using (cusip, cusip dates and company names)       
          or (exchange ticker, company names and 6-digit cusip)     
- 1: Cusips and cusip dates match but company names do not match    
- 2: Cusips and company names match but cusip dates do not match    
- 3: Cusips match but cusip dates and company names do not match    
- 4: tickers and 6-digit cusips match but company names do not match   
- 5: tickers and company names match but 6-digit cusips do not match       
- 6: tickers match but company names and 6-digit cusips do not match  

### Link by `CUSIP`

**1.1 IBES: Get the list of IBES Tickers for US firms in IBES**

In [3]:
ld(ibes_id, path=WRDS_DOWNLOAD_DIR, force=T)

ibes_id = ibes_id[, ':='(fdate=min(sdates), ldate=max(sdates)), 
     keyby=.(ticker, cusip)
    ][order(ticker, cusip, sdates)
    ][sdates==ldate
    ][, .(ticker, cusip, cname, fdate, ldate)]
ibes_id[1]

-ibes_id- loaded  (0.12 secs)


ticker,cusip,cname,fdate,ldate
0,87482X10,TALMER BANCORP,2014-02-20,2014-03-20


**1.2 CRSP: Get all permno-ncusip combinations**

In [4]:
ld(crsp_stocknames, path=WRDS_DOWNLOAD_DIR, force=T)

crsp_stocknames = crsp_stocknames[, ':='(enddt=nameenddt)
    ][, ':='(namedt=min(namedt), nameenddt=max(nameenddt)), 
     keyby=.(permno, ncusip)
    ][enddt==nameenddt
    ][, ':='(enddt=NULL)]

-crsp_stocknames- loaded  (0.09 secs)


**1.3 Create CUSIP Link Table**

In [5]:
# Link by full cusip, company names and dates
# Keep link with most recent company name

cusip_link = crsp_stocknames[ibes_id, on=c('ncusip==cusip'), nomatch=0
    ][order(ticker, permno, -ldate),
    ][, .SD[1], keyby=.(ticker, permno)
    ][, ':='(name_ratio=stringsim(comnam, cname))]

In [6]:
# Calculate name matching ratio using

name_ratio_p10 = quantile(cusip_link$name_ratio, 0.1)

cusip_link = cusip_link[, ':='(score=ifelse((fdate<nameenddt) & (ldate>namedt) & (name_ratio>name_ratio_p10), 0, 
    ifelse((fdate<nameenddt) & (ldate>namedt), 1, 
    ifelse(name_ratio>name_ratio_p10, 2, 3))))
    ][, .(ticker, permno, cname, comnam, name_ratio, score)] %>% unique()

### Link by Ticker

**Find links for the remaining unmatched cases using Exchange Ticker**

In [7]:
# Identify remaining unmatched cases 
# Add IBES identifying information

# Create first and last 'start dates' for Exchange Tickers
# Label date range variables and keep only most recent company name

ld(ibes_id2, path=WRDS_DOWNLOAD_DIR)

nomatch = unique(cusip_link[, .(permno, ticker)
    ][ibes_id[, (ticker)], on=.(ticker), nomatch=NA
    ][is.na(permno)])[, ':='(permno=NULL)
    ][ibes_id2[!is.na(oftic)], on=.(ticker), nomatch=0
    ][, ':='(fdate=min(sdates), ldate=max(sdates)), 
     keyby=.(ticker, oftic)
    ][sdates==fdate]
nrow(nomatch)

-ibes_id2- loaded  (0.43 secs)


In [11]:
# Get entire list of CRSP stocks with Exchange Ticker information
# Arrange effective dates for link by Exchange Ticker

# Merge remaining unmatched cases using Exchange Ticker 
# Note: Use ticker date ranges as exchange tickers are reused overtime

# Score using company name using 6-digit CUSIP and company name spelling distance

# Some companies may have more than one TICKER-PERMNO link
# so re-sort and keep the case (PERMNO & Company name from CRSP)
# that gives the lowest score for each IBES TICKER 

ld(crsp_stocknames2, path=WRDS_DOWNLOAD_DIR)

cusip_link2 = crsp_stocknames2[!is.na(ticker)
    ][order(permno, ticker, namedt)
    ][, ':='(namedt_ind=namedt, nameenddt_ind=nameenddt)
    ][, ':='(namedt=min(namedt), nameenddt=max(nameenddt)), keyby=.(permno, ticker)
    ][nameenddt_ind==nameenddt
    ][, .(crsp_ticker=ticker, comnam, permno, ncusip, namedt, nameenddt)
    ][nomatch, on=c('crsp_ticker==oftic'), nomatch=0
    ][ldate>namedt & fdate<nameenddt
    ][, ':='(name_ratio=stringsim(comnam, cname), 
             cusip6=str_sub(cusip, 1, 6),
             ncusip6=str_sub(ncusip, 1, 6))
    ][, ':='(score=ifelse(cusip6==ncusip6 & name_ratio>name_ratio_p10, 0,
     ifelse(cusip6==ncusip6, 4,
     ifelse(name_ratio>name_ratio_p10, 5, 6))))
    ][order(ticker, score), .SD[1], keyby=.(ticker)
    ][, .(ticker, permno, cname, comnam, score)
    ] %>% unique()

cusip_link2[, .N]

-crsp_stocknames2- already exists, will NOT load again!  (0 secs)


In [12]:
cusip_link2[1]

ticker,permno,cname,comnam,score
008Y,13038,BURCON NUTRASCIE,BURCON NUTRASCIENCE CORP,0


### Combine two link tables

In [17]:
iclink = rbindlist(list(cusip_link, cusip_link2), fill=T)
sv(iclink)

-iclink- saved  (0.11 secs)


In [15]:
iclink[1:5]

ticker,permno,cname,comnam,name_ratio,score
0000,14471,TALMER BANCORP,TALMER BANCORP INC,0.7777778,0
0001,14392,EP ENGR CORP,E P ENERGY CORP,0.7333333,0
0004,14418,AMERICAN CAPITAL,AMERICAN CAPITAL SR FLOATING LT,0.516129,0
000R,14378,CARECOM,CARE COM INC,0.5833333,0
000V,14423,EIGER,EIGER BIOPHARMACEUTICALS INC,0.1785714,1


In [16]:
nrow(iclink)

## `SUE`

In [70]:
ld(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)
ld(comp_security, path=WRDS_DOWNLOAD_DIR)
ld(crsp_ccmlink, path=WRDS_DOWNLOAD_DIR)
ld(ibes_detu_epsus, path=WRDS_DOWNLOAD_DIR)
ld(ibes_actu_epsus, path=WRDS_DOWNLOAD_DIR)
ld(iclink)

-comp_idx_cst- already exists, will NOT load again!  (0 secs)
-comp_security- already exists, will NOT load again!  (0 secs)
-crsp_ccmlink- already exists, will NOT load again!  (0 secs)
-ibes_detu_epsus- already exists, will NOT load again!  (0 secs)
-ibes_actu_epsus- already exists, will NOT load again!  (0 secs)
-iclink- already exists, will NOT load again!  (0 secs)


In [44]:
##################################
# Step 1. S&P 500 Index Universe #
##################################

# All companies that were ever included in S&P 500 index as an example 
# Linking Compustat GVKEY and IBES Tickers using ICLINK               
# For unmatched GVKEYs, use header IBTIC link in Compustat Security file 

# Fill linkenddt missing value (.E in SAS dataset) with today's date

sp500 = comp_idx_cst[gvkeyx=='000003', .(gvkey)]
ccm = crsp_ccmlink[is.na(linkenddt), linkenddt := today()]
sec = comp_security[, .(ibtic, gvkey)]

# Start the sequence of left join
gvkey = ccm[sp500, on=.(gvkey)]
gvkey = sec[!is.na(ibtic)][gvkey, on=.(gvkey)]

# high quality links from iclink
# score = 0 or 1
# fill missing ticker with ibtic
# Keep relevant columns and drop duplicates if there is any

iclink_hq = iclink[score<=1]
gvkey = iclink_hq[gvkey, on=.(permno)
    ][is.na(ticker), ticker := ibtic
    ][, .(gvkey, permco, permno, linkdt, linkenddt, ticker)
    ] %>% unique()
# date ranges from gvkey

# min linkdt for ticker and permno combination
# max linkenddt for ticker and permno combination
# link date range 
gvkey_dt = gvkey[!is.na(ticker) & !is.na(permno)
    ][, .(linkdt=min(linkdt, na.rm=T), linkenddt=max(linkenddt, na.rm=T)), keyby=.(ticker, permno)]
nrow(gvkey_dt)
gvkey_dt[1:3]

ticker,permno,linkdt,linkenddt
004W,14714,2014-06-06,2019-11-14
00C6,14939,2014-11-03,2019-11-14
00VP,15703,2015-10-15,2019-11-14


In [68]:
#######################################
# Step 2. Extract Estimates from IBES #
#######################################

# Extract estimates from IBES Unadjusted file and select    
# the latest estimate for a firm within broker-analyst group
# "fpi in (6,7)" selects quarterly forecast for the current 
# and the next fiscal quarter  

ibes = gvkey_dt[ibes_detu_epsus[fpedats>=as.Date('2010-01-01')], 
     on=.(ticker), allow.cartesian=T
    ][linkdt<=anndats & anndats<=linkenddt]

# merge to get date range linkdt and linkenddt to fulfill date requirement
# Count number of estimates reported on primary/diluted basis 

# Determine whether most analysts report estimates on primary/diluted basis
# following Livnat and Mendenhall (2006)   

# Keep the latest observation for a given analyst
# Group by company fpedats estimator analys then pick the last record in the group

ibes[pdf=='P', ':='(p_count=.N), by=.(ticker, fpedats)]
ibes[pdf=='D', ':='(d_count=.N), by=.(ticker, fpedats)]

ibes = ibes[, ':='(p_count=fillna(p_count, 0), d_count=fillna(d_count, 0))
    ][, ':='(basis=ifelse(p_count>d_count, 'P', 'D'))
    ][order(ticker, fpedats, estimator, analys, -anndats, anntims, revdats, revtims)
    ][, .(ticker, fpedats, estimator, analys, anndats, anntims, revdats, revtims, value, permno, basis)
    ][, .SD[1], keyby=.(ticker, fpedats, estimator, analys)]

nrow(ibes)
tail(ibes)

ticker,fpedats,estimator,analys,anndats,anntims,revdats,revtims,value,permno,basis
ZY,2018-10-31,52135,563949,2018-10-15,68640,2018-11-14,73439,0.647,40539,D
ZY,2018-10-31,60902,502726,2018-09-11,3840,2018-11-19,4078,1.2,40539,D
ZY,2018-10-31,88989,626091,2018-11-01,22320,2018-11-01,41989,1.22,40539,D
ZY,2018-10-31,91263,593895,2018-08-22,4500,2018-11-12,33141,1.2,40539,D
ZY,2018-10-31,91560,620865,2018-09-04,5880,2018-11-13,34736,1.22,40539,D
ZY,2018-10-31,91613,630583,2018-11-15,52080,2018-11-16,15690,0.61,40539,D


In [74]:
#######################################
# Step 3. Link Estimates with Actuals #
#######################################

# Link Unadjusted estimates with Unadjusted actuals and CRSP permnos  
# Keep only the estimates issued within 90 days before the report date
# Join with the estimate piece of the data

ibes_act = ibes_actu_epsus
ibes_act = ibes_act[ibes, on=.(ticker, fpedats)
    ][, ':='(dgap=repdats-anndats)
    ][, ':='(flag=ifelse((dgap>=0) & (dgap<=90) & !is.na(repdats) & !is.na(anndats), 1, 0))
    ][flag==1
    ][, ':='(flag=NULL, dgap=NULL, pdicity=NULL)]

# Select all relevant combinations of Permnos and Date

ibes_act

ticker,repdats,act,fpedats,estimator,analys,anndats,anntims,revdats,revtims,value,permno,basis
004W,2014-08-07,0.35,2014-06-30,31,72523,2014-07-01,27000,2014-07-01,38312,0.13,14714,D
004W,2014-08-07,0.35,2014-06-30,98,90646,2014-07-01,22320,2014-07-01,30994,0.17,14714,D
004W,2014-08-07,0.35,2014-06-30,100,76979,2014-07-01,23220,2014-08-04,65067,0.099,14714,D
004W,2014-08-07,0.35,2014-06-30,228,79979,2014-07-01,22920,2014-07-01,47972,0.16,14714,D
004W,2014-08-07,0.35,2014-06-30,260,71551,2014-07-01,960,2014-07-24,39318,0.11,14714,D
004W,2014-08-07,0.35,2014-06-30,282,127095,2014-07-01,20160,2014-08-04,1188,0.13,14714,D


## `cc_transcript`

Unit test

In [453]:
path = 'C:/Users/rossz/OneDrive/CC/data/ciq-transcripts/ciq-sp500-2018-0201-0400/cc_2018/ETRADE Financial Corporation, Q4 2017 Earnings Call, Jan 25, 2018.pdf'
cc = pdf_text(path)
n_page = length(cc)

str_sub(cc[n_page], 1, 100)
header = str_extract(cc[n_page], '^.+(\\r\\n)*20\\d{2}\\r\\n')
header
cc = str_replace_all(cc, fixed(header), '')
str_sub(cc[5], 1, 100)

load data

In [1]:
cc_paths = list.files(
    'C:/Users/rossz/OneDrive/CC/data/ciq-transcripts',
    recursive=T,
    pattern='.pdf$',
    full=T)
cat('N of all transcripts:', length(cc_paths), '\n')

cc_paths = cc_paths[str_detect(cc_paths, regex('Earnings', ignore=T))]
cat('N of Earnings Call:', length(cc_paths), '\n')

N of all transcripts: 6478 
N of Earnings Call: 2917 


In [4]:
parse_one_page <- function(cc_path) {
    obs = list()
    
    cc = pdf_text(cc_path)
    n_page = length(cc)
    
    # cc_filename
    cc_filename = str_match(
        cc_path,
        regex('/([^/]+?)\\.pdf', ignore=T))[,2]
    
    # remove header
    header = str_extract(cc[n_page], '^.+(\\r\\n)*20\\d{2}\\r\\n')
    cc = str_replace_all(cc, fixed(header), '')
    
    # remove footer
    cc = str_replace_all(cc, '\\r\\n.+\\r\\n.+\\r\\n$', '')
    
    # determin which page is TOC
    toc_pagenum = str_which(cc, regex('contents[\\s\\S]+table of contents', ignore=T))
    if (length(toc_pagenum) > 1) {
        stop('len(toc_pagenum) > 1 !!')
    }
    
    # extract pagenum
    toc_page = cc[toc_pagenum]
    toc_participants = str_match(
        toc_page, 
        regex('contents[\\s\\S]+table of contents[\\s.]+(call participants.+)?', ignore=T))[,2]
    participants_pagenum = as.integer(str_match(toc_participants, '[\\. ]+(\\d+)')[,2])
    
    toc_md = str_match(
        toc_page,
        regex('contents[\\s\\S]+table of contents[\\s\\S]+(presentation.+)', ignore=T))[,2]
    md_pagenum = as.integer(str_match(toc_md, '[\\. ]+(\\d+)')[,2])
    
    toc_qa = str_match(
        toc_page,
        regex('contents[\\s\\S]+table of contents[\\s\\S]+(question and answer.+)', ignore=T))[,2]
    qa_pagenum = as.integer(str_match(toc_qa, '[\\. ]+(\\d+)')[,2])
    
    # extract {md, qa}
    if (!is.na(qa_pagenum)) {
        md = str_c(cc[md_pagenum:(qa_pagenum-1)], collapse=' ') %>% str_replace_all('\\r\\n', ' ')
        qa = str_c(cc[qa_pagenum:(n_page-1)], collapse='') %>% str_replace_all('\\r\\n', ' ')

        # test if `md` and `qa` parsed successfully
        # if failed, parse again
        if ((str_sub(md, 1, 12) != 'Presentation') | (str_sub(qa, 1, 19) != 'Question and Answer')) {
            md_pagenum = str_which(cc, '^Presentation')
            qa_pagenum = str_which(cc, '^Question and Answer')
            
            md = str_c(cc[md_pagenum:(qa_pagenum-1)], collapse='') %>% str_replace_all('\\r\\n', ' ')
            qa = str_c(cc[qa_pagenum:(n_page-1)], collapse='') %>% str_replace_all('\\r\\n', ' ')
            
            if ((str_sub(md, 1, 12) != 'Presentation') | (str_sub(qa, 1, 19) != 'Question and Answer')) {
                cat('MD/QA parsing error!:', cc_path, '\n')
                cat('\t MD:', str_sub(md, 1, 13), '\n')
                cat('\t QA:', str_sub(qa, 1, 20), '\n')
#                 cat('\t New MD:', str_sub(md, 1, 13), '\n')
#                 cat('\t New QA:', str_sub(qa, 1, 20), '\n')
            }

        }
        
    } else {
        md = str_c(cc[md_pagenum:(n_page-1)], collapse=' ') %>% str_replace_all('\\r\\n', ' ')
        qa = NA
        if (str_sub(md, 1, 12) != 'Presentation') {
            md_pagenum = str_which(cc, '^Presentation')
            md = str_c(cc[md_pagenum:(n_page-1)], collapse=' ') %>% str_replace_all('\\r\\n', ' ')
            
            if (str_sub(md, 1, 12) != 'Presentation') {
                cat('MD parsing error!:', cc_path, '\n')
                cat('\t MD:', str_sub(md, 1, 13), '\n')
                cat('\t New MD:', str_sub(md, 1, 13), '\n')
            }
        }
    }

    # get date
    date=mdy(
      str_match(cc_filename, regex('Call, (.+, 20\\d{2})', ignore=T))[,2])
    
    # prepare output
    obs$cc_filename = cc_filename
    obs$date = date
    # obs$participants_pagenum = participants_pagenum
    # obs$md_pagenum = md_pagenum
    # obs$qa_pagenum = qa_pagenum
    obs$md = md
    obs$qa = qa
    obs$cc_path = cc_path
    obs
    
}

# parse all pdf
ciq_transcripts = list()
for (i in sample(1:length(cc_paths), 10)) {
# for (i in 1:length(cc_paths)) {
    ciq_transcripts[[i]] = parse_one_page(cc_paths[i])
}
ciq_transcripts = rbindlist(ciq_transcripts)
ciq_transcripts[1][, ':='(md = str_sub(md, 1, 20), qa = str_sub(qa, 1, 20))][]

cc_filename,date,md,qa,cc_path
"Kimberly-Clark Corporation, Q1 2017 Earnings Call, Apr 24, 2017",2017-04-24,Presentation Operato,Question and Answer,"C:/Users/rossz/OneDrive/CC/data/ciq-transcripts/ciq-sp500-2017-1001-1200/cc_2017/Kimberly-Clark Corporation, Q1 2017 Earnings Call, Apr 24, 2017.pdf"


In [15]:
sv(ciq_transcripts)

-ciq_transcripts- saved  (5.15 secs)


## `cc_meta`

Notes: **linking `ibtic` and `gvkey`**
- one `gvkey` may have multiple `iid`, but in this situation, `iid` only have two variants: `01` and `01C`, so we assume these `iid` are the same.
- The case where `gvkey` has multiple `iid` only accounts for 2.2% of the data.
- Therefore, we assume `gvkey` and `ibtic` has one-to-one map
- `f_comp_security`: the final linking table

In [3]:
ld(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR)
ld(ciq_keydev, path=WRDS_DOWNLOAD_DIR)
ld(comp_security, path=WRDS_DOWNLOAD_DIR)

-ciq_wrds_keydev- loaded  (4.55 secs)
-ciq_keydev- loaded  (26.62 secs)
-comp_security- loaded  (0.22 secs)


In [5]:
f_comp_security = comp_security[!is.na(ibtic),
     .(gvkey, ibtic)
    ] %>% unique(by=c('gvkey'))
f_comp_security[1]

gvkey,ibtic
1001,AMFD


In [11]:
ciq_wrds_keydev[keydevid=='300677948']

keydevid,companyid,companyname,keydeveventtypeid,eventtype,keydevstatusid,statustype,keydevtoobjectroletypeid,objectroletype,announcedate,announcetime,enterdate,entertime,lastmodifieddate,mostimportantdateutc,gvkey
300677948,18511,3i Group plc,48,Earnings Calls,1,Active,1,Target,2015-05-11,0,2015-05-11,13560,2017-06-29 11:01:00,2015-05-14 09:00:00,210835


In [12]:
ciq_keydev[keydevid=='300677948']

headline,situation,keydevid,announceddate,entereddate,lastmodifieddate,mostimportantdateutc
"3i Group plc, Q4 2015 Earnings Call, May 14, 2015","3i Group plc, Q4 2015 Earnings Call, May 14, 2015",300677948,2015-05-11,2015-05-11 03:46:00,2017-06-29 11:01:00,2015-05-14 09:00:00


Notes on `cc_meta`
- there's only 1 obs where `headline_date` is missing (`keydevid==1853842`)

In [6]:
# `announceddate` (ciq_keydev) and `announcedate` (ciq_wrds_keydev) are the same
# same for `entereddate` and `enterdate`

extract_headline_date <- function(headline) {
    regex_date = regex('results.+?((January|February|March|April|May|June|July|August|September|October|November|December) ?\\d+,? ?20\\d{2}).*?;?', ignore=T)
    date = str_match(headline, regex_date)[,2]
    date = mdy(date)
}

cc_meta = ciq_keydev[
     ciq_wrds_keydev[keydeveventtypeid %in% c(28, 48)],
     on=.(keydevid), nomatch=0
    ][, .(keydeveventtypeid,
      headline, 
      headline_date=mdy(
        str_match(headline, regex('Calls?, (.+?20\\d{2})', ignore=T))[,2]),
      announcedate, 
      companyname, gvkey, keydevid,
      mostimportantdateutc)
    ][keydeveventtypeid==28,
      headline_date:=extract_headline_date(headline)
    ][f_comp_security, on=.(gvkey), nomatch=0]

nrow(cc_meta)
# cc_meta[keydeveventtypeid==28][1]

" 29 failed to parse."

In [28]:
cc_meta[keydevid=='289919480']

keydeveventtypeid,headline,headline_date,announcedate,companyname,gvkey,keydevid,mostimportantdateutc,i.mostimportantdateutc,ibtic
48,"Apple Inc., Q2 2015 Earnings Call, Apr 28, 2015",2015-04-28,2015-03-30,Apple Inc.,1690,289919480,2015-04-27 21:00:00,2015-04-27 21:00:00,AAPL


In [32]:
cc_meta[keydevid=='538707756', with_tz(mostimportantdateutc, 'America/New_York')]

[1] "2017-10-23 20:00:00 EDT"

In [44]:
cc_meta[keydeveventtypeid==48
    ][headline_date != date(with_tz(mostimportantdateutc, 'America/New_York'))
    ][1:100]

keydeveventtypeid,headline,headline_date,announcedate,companyname,gvkey,keydevid,mostimportantdateutc,i.mostimportantdateutc,ibtic
48,"AMR Corporation, Q1 2005 Earnings Call, Apr 20, 2005",2005-04-20,2005-04-04,American Airlines Group Inc.,1045,1551245,2005-04-20 00:00:00,2005-04-20 00:00:00,AMR
48,"Adams Resources & Energy, Inc., Q3 2017 Pre Recorded Earnings Call, Nov 10, 2017",2017-11-10,2017-11-07,"Adams Resources & Energy, Inc.",1121,542907441,2017-11-10 00:00:00,2017-11-10 00:00:00,AA3B
48,"Rio Tinto Alcan, Inc., Q1 2007 Earnings Call, Apr 24, 2007",2007-04-24,2007-04-24,Rio Tinto Alcan Inc.,1243,4021541,2007-04-24 00:00:00,2007-04-24 00:00:00,AL
48,"AFLAC Inc., Q4 2004 Earnings Call, Feb-03-2005",2005-02-03,2005-01-24,Aflac Incorporated,1449,1010210,2005-02-03 00:10:00,2005-02-03 00:10:00,AFL
48,"AFLAC Inc., Q4 2006 Earnings Call, Feb-01-2007",2007-02-01,2007-01-23,Aflac Incorporated,1449,3436575,2007-02-01 00:10:00,2007-02-01 00:10:00,AFL
48,"AFLAC Inc., Q4 2005 Earnings Call, Feb-01-2006",2006-02-01,2006-01-23,Aflac Incorporated,1449,2550730,2006-02-01 00:10:00,2006-02-01 00:10:00,AFL
48,"American International Group, Inc., Q3 2010 Earnings Call, Nov 05, 2010",2010-11-05,2010-11-02,"American International Group, Inc.",1487,115247098,2010-11-05 00:00:00,2010-11-05 00:00:00,AIG
48,"Andrew Corporation, Q3 2007 Earnings Call, Aug 01, 2007",2007-08-01,2006-12-23,Andrew LLC,1651,3386785,2007-08-01 00:00:00,2007-08-01 00:00:00,ANDW
48,"Andrew Corporation, Q4 2007 Earnings Call, Nov 01, 2007",2007-11-01,2006-12-23,Andrew LLC,1651,3386786,2007-11-01 00:00:00,2007-11-01 00:00:00,ANDW
48,"Apple Inc., Q2 2005 Earnings Call, Apr 13, 2005",2005-04-13,2005-03-28,Apple Inc.,1690,1533943,2005-04-13 00:00:00,2005-04-13 00:00:00,AAPL


In [33]:
ld(cc_2)

-cc_2- loaded  (29.32 secs)


In [36]:
cc_meta[keydeveventtypeid==48
    ][, 
      .(n_nomatch=sum(
          with_tz(headline_date, 'America/New_York') != date(mostimportantdateutc),
          na.rm=T),
        n=.N)]

n_nomatch,n
923,272704


In [13]:
cc_meta[keydeveventtypeid==48
    ][headline_date != date(mostimportantdateutc)
    ][1:10]

keydeveventtypeid,headline,headline_date,announcedate,companyname,gvkey,keydevid,mostimportantdateutc,i.mostimportantdateutc,ibtic
48,"Aflac Incorporated, Q4 2002 Earnings Calls, Feb 03, 2003",2003-02-03,2003-02-03,Aflac Incorporated,1449,586581035,2003-02-04 00:10:00,2003-02-04 00:10:00,AFL
48,"Aflac Incorporated, Q4 2001 Earnings Calls, Feb 04, 2002",2002-02-04,2002-02-04,Aflac Incorporated,1449,586581028,2002-02-05 00:10:00,2002-02-05 00:10:00,AFL
48,"Aflac Incorporated, Q4 2003 Earnings Calls, Feb 04, 2004",2004-02-04,2004-02-04,Aflac Incorporated,1449,584967140,2004-02-05 00:10:00,2004-02-05 00:10:00,AFL
48,"Apple Inc., Q2 2015 Earnings Call, Apr 28, 2015",2015-04-28,2015-03-30,Apple Inc.,1690,289919480,2015-04-27 21:00:00,2015-04-27 21:00:00,AAPL
48,"MUFG Americas Holdings Corporation, Q4 2002 Earnings Calls, Jan 17, 2003",2003-01-17,2003-01-17,MUFG Americas Holdings Corporation,2620,584965927,2003-01-18 04:30:00,2003-01-18 04:30:00,CFBK
48,"Cogo Group, Inc., Q2 2013 Earnings Call, Aug 15, 2013",2013-08-15,2013-08-08,"Viewtran Group, Inc.",3820,244293666,2013-08-16 00:00:00,2013-08-16 00:00:00,TRGI
48,"Cogo Group, Inc., Q1 2013 Earnings Call, May 15, 2013",2013-05-15,2013-05-06,"Viewtran Group, Inc.",3820,238221771,2013-05-16 00:00:00,2013-05-16 00:00:00,TRGI
48,"SORL Auto Parts, Inc., Q1 2019 Earnings Call, May 15, 2019",2019-05-15,2019-05-15,"SORL Auto Parts, Inc.",4341,613664010,2019-05-16 00:00:00,2019-05-16 00:00:00,SAUP
48,"SORL Auto Parts, Inc., Q3 2018 Earnings Call, Nov 14, 2018",2018-11-14,2018-11-12,"SORL Auto Parts, Inc.",4341,590956963,2018-11-15 00:01:00,2018-11-15 00:01:00,SAUP
48,"DMC Global Inc., Q2 2017 Earnings Call, Jul 28, 2017",2017-07-28,2017-07-13,DMC Global Inc.,4497,530301302,2017-07-27 21:00:00,2017-07-27 21:00:00,BOOM


In [9]:
cc_meta[keydeveventtypeid==48
    ][1]

keydeveventtypeid,headline,headline_date,announcedate,companyname,gvkey,keydevid,mostimportantdateutc,i.mostimportantdateutc,ibtic
48,"AAR Corp., Q2 2017 Earnings Call, Dec 21, 2016",2016-12-21,2016-12-12,AAR Corp.,1004,412785388,2016-12-21 21:45:00,2016-12-21 21:45:00,AIR


In [14]:
cc_meta[keydeveventtypeid==48
    ][, .(n=.N, n_na=sum(is.na(headline_date)))][]
cc_meta[keydeveventtypeid==28
    ][, .(n=.N, n_na=sum(is.na(headline_date)))][]
cc_meta[keydeveventtypeid==28][is.na(headline_date)][1]

n,n_na
272704,1


n,n_na
527683,105312


keydeveventtypeid,headline,headline_date,announcedate,companyname,gvkey,keydevid,ibtic
28,AAR Corp. Announces Financial Results for the Fourth Quarter of 2014,,2014-11-13,AAR Corp.,1004,280597404,AIR


In [12]:
headline = 'AAR Corp. Announces Unaudited Consolidated Earnings Results for the First Quarter Ended August 31, 2015'

regex_date = regex('results.+?((January|February|March|April|May|June|July|August|September|October|November|December) ?\\d+,? ?20\\d{2}).*?;?', ignore=T)[,2]

date = str_match(headline, regex_date)
date

0,1,2
"Results for the First Quarter Ended August 31, 2015","August 31, 2015",August


## `cc_eps`

`ibtic` is the unique IBES ticker that's good for linking

In [20]:
ld(ibes_statsum_epsus, path=WRDS_DOWNLOAD_DIR)
ld(ibes_surpsum, path=WRDS_DOWNLOAD_DIR)

-ibes_statsum_epsus- loaded  (24.98 secs)
-ibes_surpsum- already exists, will NOT load again!  (0 secs)


In [21]:
ibes_surpsum[1]

ticker,oftic,measure,fiscalp,pyear,pmon,usfirm,anndats,actual,surpmean,surpstdev,suescore
0,TLMR,BPS,QTR,2014,3,1,2014-05-06,9.97,9.81,0.22906,0.69852


In [26]:
nrow(ibes_statsum_epsus)
nrow(ibes_surpsum[measure=='EPS' & usfirm==1])

In [29]:
surpsum = ibes_surpsum[measure=='EPS' & usfirm==1
    ][order(ticker, anndats)]
surpsum[1:20]

ticker,oftic,measure,fiscalp,pyear,pmon,usfirm,anndats,actual,surpmean,surpstdev,suescore
0,TLMR,EPS,QTR,2014,3,1,2014-05-06,0.12,0.08,0.01414,2.82845
0,TLMR,EPS,QTR,2014,6,1,2014-08-06,0.27,0.13,0.01225,11.43137
0,TLMR,EPS,QTR,2014,9,1,2014-11-04,0.26,0.232,0.05848,0.47879
0,TLMR,EPS,ANN,2014,12,1,2015-01-30,1.21,1.194,0.03131,0.5111
0,TLMR,EPS,QTR,2014,12,1,2015-01-30,0.16,0.17167,0.01169,-0.998
0,TLMR,EPS,QTR,2015,3,1,2015-04-30,0.12,0.17833,0.01835,-3.17927
0,TLMR,EPS,QTR,2015,6,1,2015-07-30,0.23,0.214,0.00894,1.78891
0,TLMR,EPS,QTR,2015,9,1,2015-10-28,0.27,0.24,0.01414,2.12134
0,TLMR,EPS,ANN,2015,12,1,2016-01-26,1.02,0.856,0.0994,1.64993
0,TLMR,EPS,QTR,2015,12,1,2016-01-26,0.4,0.23,0.09055,1.87733


In [48]:
surpsum[ticker=='CMVT' & anndats==as.Date('2010-10-04')
    ][order(ticker, anndats, pyear, pmon)]

ticker,oftic,measure,fiscalp,pyear,pmon,usfirm,anndats,actual,surpmean,surpstdev,suescore
CMVT,CMVT,EPS,ANN,2006,1,1,2010-10-04,0.57,0.58,0.04761,-0.21004
CMVT,CMVT,EPS,QTR,2006,1,1,2010-10-04,0.14,0.1775,0.025,-1.5
CMVT,CMVT,EPS,QTR,2006,4,1,2010-10-04,-0.36,0.1675,0.06076,-8.68184
CMVT,CMVT,EPS,QTR,2006,7,1,2010-10-04,-0.45,0.195,0.02887,-22.34308
CMVT,CMVT,EPS,QTR,2006,10,1,2010-10-04,-0.14,0.18,0.02944,-10.86993
CMVT,CMVT,EPS,ANN,2007,1,1,2010-10-04,-1.55,0.698,0.03115,-72.17852
CMVT,CMVT,EPS,QTR,2007,1,1,2010-10-04,-0.88,0.1725,0.05315,-19.80207
CMVT,CMVT,EPS,QTR,2007,4,1,2010-10-04,-0.53,0.09125,0.03271,-18.99557
CMVT,CMVT,EPS,QTR,2007,7,1,2010-10-04,-0.82,0.16375,0.02134,-46.10104
CMVT,CMVT,EPS,QTR,2007,10,1,2010-10-04,-0.16,0.1825,0.02605,-13.1483


In [46]:
x = cc_eps[surpsum, on=.(ticker, fiscalp, anndats_act=anndats),
     nomatch=0,
     .(ticker, fpedats, anndats_act, actual, i.actual, pyear, pmon,
       surpmean, surpstdev, suescore)]
x[is.na(i.actual)]

ticker,fpedats,anndats_act,actual,i.actual,pyear,pmon,surpmean,surpstdev,suescore
AA0A,2017-12-31,2018-02-01,0.22,,2017,12,0.22667,0.00577,
BBWA,2017-12-31,2018-02-15,0.6,,2017,12,0.515,0.03535,
BBWA,2017-12-31,2018-02-15,0.43,,2017,12,0.34333,0.02517,
CMVT,2006-01-31,2010-10-04,0.14,,2008,4,0.1375,0.0685,
CMVT,2006-04-30,2010-10-04,-0.36,,2008,4,0.1375,0.0685,
CMVT,2006-01-31,2010-10-04,0.14,,2008,7,0.1525,0.06652,
CMVT,2006-04-30,2010-10-04,-0.36,,2008,7,0.1525,0.06652,
CMVT,2006-01-31,2010-10-04,0.14,,2008,10,0.155,0.06557,
CMVT,2006-04-30,2010-10-04,-0.36,,2008,10,0.155,0.06557,
CMVT,2006-01-31,2010-10-04,0.14,,2009,1,0.13,0.04359,


In [40]:
x[1:10]
x[, .(n=.N, n_nomatch=sum(actual != i.actual))]

ticker,fpedats,anndats_act,actual,i.actual,pyear,pmon
0,2014-03-31,2014-05-06,0.12,0.12,2014,3
0,2014-06-30,2014-08-06,0.27,0.27,2014,6
0,2014-09-30,2014-11-04,0.26,0.26,2014,9
0,2014-12-31,2015-01-30,1.21,1.21,2014,12
0,2014-12-31,2015-01-30,0.16,0.16,2014,12
0,2015-03-31,2015-04-30,0.12,0.12,2015,3
0,2015-06-30,2015-07-30,0.23,0.23,2015,6
0,2015-09-30,2015-10-28,0.27,0.27,2015,9
0,2015-12-31,2016-01-26,1.02,1.02,2015,12
0,2015-12-31,2016-01-26,0.4,0.4,2015,12


n,n_nomatch
378837,


In [5]:
cc_eps = ibes_statsum_epsus[
    !is.na(actual) & fpi %in% c(1, 6) & fpedats >= as.Date('2000-01-01')
    ][usfirm==1 & measure=='EPS' & curcode==curr_act & estflag=='P' & curcode=='USD'
    ][order(ticker, fpedats, fiscalp, -statpers), 
      .SD[1], 
      keyby=.(ticker, fpedats, fiscalp)
    ][, .(ticker, fpedats, statpers, anndats_act, 
      fiscalp, actual, medest, meanest, stdev, highest, lowest, 
      numest, numup, numdown, cname)
    ]
nrow(cc_eps)
cc_eps[1]

ticker,fpedats,statpers,anndats_act,fiscalp,actual,medest,meanest,stdev,highest,lowest,numest,numup,numdown,cname
0,2014-03-31,2014-04-17,2014-05-06,QTR,0.12,0.07,0.08,0.01,0.1,0.07,4,0,4,TALMER BANCORP


In [None]:
x = cc_eps[cc_meta[keydeveventtypeid==48], 
     on=c('ticker==ibtic'),
     nomatch=0
    ][order(ticker, fpedats, fiscalp)
    ][, .(ticker, fiscalp, fpedats, anndats_act, headline_date, headline, companyname, cname)]

In [21]:
x = cc_eps[cc_meta[keydeveventtypeid==48], 
     on=c('ticker==ibtic', 'anndats_act==announcedate'),
     nomatch=0
    ][order(ticker, fpedats, fiscalp)
    ][, .(ticker, fiscalp, fpedats, anndats_act, headline_date, headline, companyname, cname)]
# x[1:10]

x2 = x[!is.na(headline_date)
    ][, ':='(diff=abs(fpedats-headline_date))][order(-diff)]
x2[, .(n=.N, n_nomatch=sum(diff>=5, na.rm=T))
    ][, pct:=round(n_nomatch/n*100)][]
x2[diff>=5]

n,n_nomatch,pct
8694,8692,100


ticker,fiscalp,fpedats,anndats_act,headline_date,headline,companyname,cname,diff
CSC,ANN,2009-03-31,2009-05-20,2004-05-17,"DXC Technology Company, Q4 2004 Earnings Calls, May 17, 2004",DXC Technology Company,COMP SCIENCES,1779 days
CSC,QTR,2009-03-31,2009-05-20,2004-05-17,"DXC Technology Company, Q4 2004 Earnings Calls, May 17, 2004",DXC Technology Company,COMP SCIENCES,1779 days
GLPC,ANN,2004-12-31,2008-10-10,2008-10-14,"Impax Laboratories Inc., Q2 2008 Earnings Call, Oct-14-2008","Impax Laboratories, Inc.",IMPAX LAB INC,1383 days
GLPC,ANN,2005-12-31,2008-10-10,2008-10-14,"Impax Laboratories Inc., Q2 2008 Earnings Call, Oct-14-2008","Impax Laboratories, Inc.",IMPAX LAB INC,1018 days
SNRZ,QTR,2006-03-31,2008-03-24,2008-04-02,"Sunrise Senior Living Inc., Q4 2007 Earnings Call, Apr-02-2008","Sunrise Senior Living, LLC",SUNRISE SENIOR L,733 days
RWAG,ANN,2007-12-31,2008-02-22,2009-11-12,"RWE AG, Q3 2009 Earnings Call, Nov-12-2009",RWE Aktiengesellschaft,RWE,682 days
FFIV,QTR,2006-12-31,2007-01-24,2008-10-22,"F5 Networks, Inc., Q4 2008 Earnings Call, Oct-22-2008","F5 Networks, Inc.",F5 NETWORKS INC,661 days
RWAG,ANN,2007-12-31,2008-02-22,2009-08-13,"RWE AG, Q2 2009 Earnings Call, Aug-13-2009",RWE Aktiengesellschaft,RWE,591 days
SCHS,QTR,2007-10-31,2007-11-13,2009-06-11,"School Specialty Inc., Q4 2009 Earnings Call, Jun-11-2009","School Specialty, Inc.",SCHOOL SPECIALTY,589 days
FFIV,QTR,2006-12-31,2007-01-24,2008-07-23,"F5 Networks, Inc., Q3 2008 Earnings Call, Jul-23-2008","F5 Networks, Inc.",F5 NETWORKS INC,570 days


In [22]:
cc_eps[ticker=='GLPC' & fpedats==as.Date('2004-12-31')]

ticker,fpedats,statpers,anndats_act,fiscalp,actual,medest,meanest,stdev,highest,lowest,numest,numup,numdown,cname
GLPC,2004-12-31,2008-05-15,2008-10-10,ANN,-0.84,0.01,0.01,,0.01,0.01,1,0,0,IMPAX LAB INC


In [20]:
cc_meta[ibtic=='CSC' & announcedate==as.Date('2009-05-20')]

keydeveventtypeid,headline,headline_date,announcedate,companyname,gvkey,keydevid,ibtic
48,"DXC Technology Company, Q4 2004 Earnings Calls, May 17, 2004",2004-05-17,2009-05-20,DXC Technology Company,3336,586593426,CSC
28,"Computer Sciences Corporation Reports Unaudited Consolidated Earnings Results for the Fourth Quarter and Full Year Ended April 3, 2009; Provides Earnings Guidance for the Fiscal Year of 2010 ; Announces Impairment Charges",2009-04-03,2009-05-20,DXC Technology Company,3336,6639889,CSC


## Match `cc_transcripts` and `cc_meta`

In [20]:
match_transcripts_and_meta <- function(ciq_transcripts, cc_meta) {
    len_ciq_transcripts = nrow(ciq_transcripts)
    
    matched_headline = character(len_ciq_transcripts)
    matched_keydevid = character(len_ciq_transcripts)
    
    for (i in 1:len_ciq_transcripts) {
        transcript_date = ciq_transcripts[i, date]
        cc_filename = ciq_transcripts[i, cc_filename]
        lookup = cc_meta[date==transcript_date]
        rowid = amatch(
            toupper(cc_filename),
            toupper(lookup[, headline]),
            method='dl',
            maxDist=1,
            nthread=4)
        matched_headline[i] = lookup[rowid, headline]
        matched_keydevid[i] = lookup[rowid, keydevid]
    }
    
    ciq_transcripts[, .(cc_filename, 
                        matched_headline=matched_headline,
                        keydevid=matched_keydevid,
                        date, md, qa,
#                         participants_pagenum, md_pagenum, qa_pagenum,
                        cc_path)]
}

ciq_cc = match_transcripts_and_meta(ciq_transcripts, ciq_cc_meta)

In [21]:
ciq_cc[1:.N][, ':='(md=str_sub(md, 1, 10), qa=str_sub(qa, 1, 10))][]

cc_filename,matched_headline,keydevid,date,md,qa,cc_path
"Weyerhaeuser Company, Q1 2018 Earnings Call, Apr 27, 2018","Weyerhaeuser Company, Q1 2018 Earnings Call, Apr 27, 2018",557610842,2018-04-27,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-transcripts/ciq-sp500-2018-1401-1600/cc_2018/Weyerhaeuser Company, Q1 2018 Earnings Call, Apr 27, 2018.pdf"


## output `feather`

In [4]:
ld(cc_2)
cat('nrow(cc_2):', nrow(cc_2), '\n')
cc_2[1, .(date, title, id, n_words, source, source_code, md=str_sub(md, 1, 20), qa=str_sub(qa, 1, 20), len_md, len_qa)]

-cc_2- already exists, will NOT load again!  (0 secs)
nrow(cc_2): 106410 


date,title,id,n_words,source,source_code,md,qa,len_md,len_qa
2018-12-31,Q4 2018 RCI Hospitality Holdings Inc Earnings Call - Final,article-FNDW000020190109eecv00001,9559,CQ FD Disclosure,FNDW,Presentation OPERATO,Questions and Answer,13493,35988


write as `feather`

In [6]:
system.time({
write_feather(
    cc_2[year(date)>=2014, .(id, date, md, qa)],
    str_c(DATA_DIR,'/cc_5y.feather'))
})

   user  system elapsed 
   1.94    3.00   10.67 

write as `TSV`

In [4]:
system.time({fwrite(cc_2, 'data/cc_2.tsv', sep='\t')})

   user  system elapsed 
   2.19    1.50    7.56 