# init

**Data Pre-processing**
- 2011-2018 (8 years). CC before 2011 have no indicator of the MD and QA parts.
- For 2012-2018, ~98% have MD/QA indicators; for 2011, ~93% have MD/QA indicators.

In [2]:
# library
library(feather)
library(stringdist)
library(jsonlite)
library(pdftools)
options('sd_num_thread'=4)

# data_dir
# The data paths are only defined when the working directory ends in
# "onedrive/cc" (case-insensitive); otherwise just a reminder is printed.
if (tolower(str_sub(getwd(), -11)) != 'onedrive/cc') {
    cat('Please set working dir to "~/onedrive/cc"')
} else {
    DATA_DIR <- str_c(getwd(), '/data')
    WRDS_DOWNLOAD_DIR <- str_c(DATA_DIR, '/WRDS-download')
    cat('Current working directory: ', getwd(), sep = '')
}


Current working directory: C:/Users/rossz/Onedrive/CC

# WRDS

## establish connection

In [3]:
library(RPostgres)

# connect to wrds
# Credentials are read from the environment (e.g. WRDS_USER / WRDS_PASSWORD in
# ~/.Renviron) so they are never stored in the notebook. Any password that was
# previously committed here should be considered leaked and be rotated.
# `local()` keeps the credentials out of the global environment.
wrds <- local({
    wrds_user <- Sys.getenv('WRDS_USER')
    wrds_password <- Sys.getenv('WRDS_PASSWORD')
    if (!nzchar(wrds_user) || !nzchar(wrds_password)) {
        stop('Set the WRDS_USER and WRDS_PASSWORD environment variables before connecting to WRDS.',
             call. = FALSE)
    }
    dbConnect(Postgres(),
              host='wrds-pgdata.wharton.upenn.edu',
              port=9737,
              dbname='wrds',
              sslmode='require',
              user=wrds_user,
              password=wrds_password)
})

## unit test

In [None]:
# List the data libraries (schemas) available at WRDS
res <- dbSendQuery(wrds, "select distinct table_schema
                   from information_schema.tables
                   where table_type ='VIEW'
                   or table_type ='FOREIGN TABLE'
                   order by table_schema")
data <- dbFetch(res, n = -1)
dbClearResult(res)
# convert to data.table in place, then show the schema names sorted
setDT(data)
print(sort(data$table_schema))

In [6]:
# List the datasets (tables) within a given library
library <- 'comp'
res <- dbSendQuery(wrds, sprintf("select distinct table_name
                   from information_schema.columns
                   where table_schema='%s'
                   order by table_name", library))
data <- dbFetch(res, n = -1)
dbClearResult(res)
# convert to data.table in place, then show the table names sorted
setDT(data)
print(sort(data$table_name))

  [1] "aco_amda"          "aco_imda"          "aco_indfnta"      
  [4] "aco_indfntq"       "aco_indfntytd"     "aco_indsta"       
  [7] "aco_indstq"        "aco_indstytd"      "aco_notesa"       
 [10] "aco_notesq"        "aco_notessa"       "aco_notesytd"     
 [13] "aco_pnfnda"        "aco_pnfndq"        "aco_pnfndytd"     
 [16] "aco_pnfnta"        "aco_pnfntq"        "aco_pnfntytd"     
 [19] "aco_transa"        "aco_transq"        "aco_transsa"      
 [22] "aco_transytd"      "adsprate"          "anncomp"          
 [25] "asec_amda"         "asec_imda"         "asec_notesa"      
 [28] "asec_notesq"       "asec_transa"       "asec_transq"      
 [31] "bank_aacctchg"     "bank_adesind"      "bank_afnd1"       
 [34] "bank_afnd2"        "bank_afnddc1"      "bank_afnddc2"     
 [37] "bank_afntind"      "bank_funda"        "bank_funda_fncd"  
 [40] "bank_fundq"        "bank_fundq_fncd"   "bank_iacctchg"    
 [43] "bank_idesind"      "bank_ifndq"        "bank_ifndytd"     
 [46] "ban

In [None]:
# List the variables (column headers) within a given dataset
library <- 'ibes'
dataset <- 'det_epsus'
res <- dbSendQuery(wrds, sprintf("select column_name
                   from information_schema.columns
                   where table_schema='%s'
                   and table_name='%s'
                   order by column_name", library, dataset))
data <- dbFetch(res, n = -1)
dbClearResult(res)
# convert to data.table in place, then show the column names as returned
setDT(data)
print(data$column_name)

In [None]:
# Query a whole dataset (here `ibes.ptgdet`) and report how many rows it has
library = "ibes"
dataset = 'ptgdet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
data <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
nrow(data)

## I/B/E/S

### Detail

In [None]:
# Download the I/B/E/S detail file one announcement year at a time.
# Each year is kept in memory as `<library>_<dataset>_<year>` and also written
# to ../data/WRDS-download/ as an .rds file.
library = 'ibes'
dataset = 'det_epsus' # eps
# dataset = 'det_xepsus' # non-eps
years = 2000:2018

for (year in years) {
    print(sprintf('Start %s at %s', year, now()))
    query = sprintf("select * from %s.%s where to_char(anndats,'yyyy')='%s'", library, dataset, year)
    save_name_short = sprintf('%s_%s_%s', library, dataset, year)
    save_name_full = sprintf('../data/WRDS-download/%s_%s_%s.rds', library, dataset, year)
    
    res <- dbSendQuery(wrds, query)
    data <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
    
    assign(save_name_short, data)
    # values are not auto-printed inside a loop, so print the row count explicitly
    print(sprintf('%s_%s: %s', dataset, year, nrow(data)))
    saveRDS(data, save_name_full)
}

In [48]:
# Download the unadjusted I/B/E/S detail estimates (fpi 6 or 7) whose forecast
# period ends between 2000 and 2018.
library = 'ibes'
dataset = 'detu_epsus' # eps
# dataset = 'det_xepsus' # non-eps

# The format has exactly two placeholders (library, dataset); no year is
# interpolated, so no extra argument is passed to sprintf().
query = sprintf("select ticker, estimator, analys, pdf, fpi, value, fpedats, revdats, revtims, anndats, anntims from %s.%s where fpedats between '2000-01-01' and '2018-12-31' and (fpi='6' or fpi='7')", library, dataset)

res <- dbSendQuery(wrds, query)
ibes_detu_epsus<- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s', dataset, nrow(ibes_detu_epsus))
sv(ibes_detu_epsus, path=WRDS_DOWNLOAD_DIR)

-ibes_detu_epsus- saved  (22.79 secs)


### actuals

In [69]:
# Download the unadjusted I/B/E/S quarterly actuals for periods ending between
# 2000 and 2018.
library = 'ibes'
dataset = 'actu_epsus' 

# The format has exactly two placeholders (library, dataset); no year is
# interpolated, so no extra argument is passed to sprintf().
query = sprintf("select ticker, anndats as repdats, value as act, pends as fpedats, pdicity from %s.%s where pends between '2000-01-01' and '2018-12-31' and pdicity='QTR'", library, dataset)

res <- dbSendQuery(wrds, query)
ibes_actu_epsus<- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s', dataset, nrow(ibes_actu_epsus))
sv(ibes_actu_epsus, path=WRDS_DOWNLOAD_DIR)
ibes_actu_epsus[1]

-ibes_actu_epsus- saved  (0.82 secs)


ticker,repdats,act,fpedats,pdicity
0,2014-02-14,,2012-12-31,QTR


### Summary

In [9]:
# Download the whole I/B/E/S summary-statistics file for EPS
library <- 'ibes'
dataset <- 'statsum_epsus' # summary eps
query <- sprintf("select * from %s.%s", library, dataset)

res <- dbSendQuery(wrds, query)
ibes_statsum_epsus <- dbFetch(res, n = -1)
setDT(ibes_statsum_epsus)
dbClearResult(res)

# preview the first row, then save
ibes_statsum_epsus[1]
sv(ibes_statsum_epsus, path = WRDS_DOWNLOAD_DIR)

-statsum_epsus- saved  (43.43 secs)


In [None]:
# Download the whole I/B/E/S summary-statistics file for non-EPS measures
library <- 'ibes'
dataset <- 'statsum_xepsus' # summary xeps

query <- sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_statsum_xepsus <- dbFetch(res, n = -1)
setDT(ibes_statsum_xepsus)
dbClearResult(res)
# preview the first row, then save
ibes_statsum_xepsus[1]
sv(ibes_statsum_xepsus, path = WRDS_DOWNLOAD_DIR)

### Surprise

In [None]:
# Download the whole I/B/E/S surprise summary file
library <- 'ibes'
dataset <- 'surpsum' # summary surprise 

query <- sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_surpsum <- dbFetch(res, n = -1)
setDT(ibes_surpsum)
dbClearResult(res)
# preview the first row, then save
ibes_surpsum[1]
sv(ibes_surpsum, path = WRDS_DOWNLOAD_DIR)

## price target

In [None]:
# Download the whole I/B/E/S price-target detail file
library <- "ibes"
dataset <- 'ptgdet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ptgdet <- setDT(dbFetch(res, n = -1))
dbClearResult(res)
# NOTE(review): other cells save to WRDS_DOWNLOAD_DIR; confirm this literal
# path resolves to the same folder.
sv(ptgdet, path = '/data/WRDS-download')

In [6]:
nrow(ptgdet)

In [5]:
ptgdet[1]

ticker,cusip,oftic,cname,actdats,estimid,alysnam,horizon,value,estcur,curr,amaskcd,usfirm,measure,acttims,anndats,anntims
0,87482X10,TLMR,TALMER BANCORP,2014-03-10,PRMDN082,PERMDENIED,12,16,USD,USD,538750,1,PTG,32043,2014-03-10,1200


## recommend

In [None]:
# Download the whole I/B/E/S recommendation detail file
library <- 'ibes'
dataset <- 'recddet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
recddet <- setDT(dbFetch(res, n = -1))
dbClearResult(res)
# NOTE(review): other cells save to WRDS_DOWNLOAD_DIR; confirm this literal
# path resolves to the same folder.
sv(recddet, path = '/data/WRDS-download')

In [7]:
nrow(recddet)

In [8]:
recddet[1]

ticker,cusip,cname,oftic,actdats,estimid,analyst,ereccd,etext,ireccd,itext,emaskcd,amaskcd,usfirm,acttims,revdats,revtims,anndats,anntims
0,87482X10,TALMER BANCORP,TLMR,2014-03-10,PRMDN082,PERMDENIED,2,OUTPERFORM,2,BUY,50659,538750,1,32043,2016-01-26,34552,2014-03-10,1200


## stock price

In [None]:
# Download comp.secd year by year (filtering on `datadate`) and stack the
# yearly tables into one data.table `secd`.
library = 'comp'
dataset = 'secd' # stock prices
years = 2000:2018

system.time({
secd = list()
for (year in years) {
    print(sprintf('Start %s at %s', year, now()))
    query = sprintf("select * from %s.%s where to_char(datadate,'yyyy')='%s'", library, dataset, year)

    res <- dbSendQuery(wrds, query)
    data <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
    
    print(sprintf('%s_%s: %s obs', dataset, year, nrow(data)))
    secd[[as.character(year)]] = data
}
# spell out the argument name and use TRUE: `use=T` relied on partial
# argument matching and on the reassignable shortcut `T`
secd = rbindlist(secd, use.names=TRUE)
})
sv(secd, path=WRDS_DOWNLOAD_DIR)

Download `CRSP`
It's *deprecated*!

In [None]:
# (Deprecated) Download the CRSP daily stock file one year at a time.
# Each year is kept in memory as `<dataset>_<year>` and also written to
# ../data/WRDS-download/ as an .rds file.
library = 'crspq'
dataset = 'dsf' # daily stock file
years = 2001:2018

for (year in years) {
    print(sprintf('Start %s at %s', year, now()))
    query = sprintf("select * from %s.%s where to_char(date,'yyyy')='%s'", library, dataset, year)
    save_name_short = sprintf('%s_%s', dataset, year)
    save_name_full = sprintf('../data/WRDS-download/%s_%s.rds', dataset, year)

    res <- dbSendQuery(wrds, query)
    data <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
    
    # values are not auto-printed inside a loop, so print the row count explicitly
    print(sprintf('%s_%s: %s', dataset, year, nrow(data)))
    assign(save_name_short, data)
    saveRDS(data, save_name_full)
}

In [31]:
ld(dsf_2001, path=WRDS_DOWNLOAD_DIR)

-dsf_2001- loaded  (1.11 secs)


In [32]:
dsf_2001[1]

cusip,permno,permco,issuno,hexcd,hsiccd,date,bidlo,askhi,prc,vol,ret,bid,ask,shrout,cfacpr,cfacshr,openprc,numtrd,retx
36720410,10001,7953,10398,2,4925,2001-01-02,9.3125,9.875,9.875,3849,0.01282051,9.5,9.875,2498,1.5,1.5,9.3125,9,0.01282051


## factors

In [7]:
# Download the whole ff.factors_daily table
library = 'ff'
dataset = 'factors_daily' # factors

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
factors <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
# report the size of the table just downloaded (previously this read `data`,
# a leftover object from another cell)
sprintf('%s: %s', dataset, nrow(factors))
sv(factors, path='/data/WRDS-download')

-factors- saved  (0.09 secs)


In [6]:
factors[1]

date,mktrf,smb,hml,rf,umd
1926-07-01,0.001,-0.0024,-0.0028,9e-05,


## firm-id

### `ibes.id`

In [4]:
library <- 'ibes'
dataset <- 'id' # firm names

# (1) US firms with a non-empty cusip
query <- sprintf("select ticker, cusip, cname, sdates from %s.%s where usfirm=1 and cusip != ''", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_id <- dbFetch(res, n = -1)
setDT(ibes_id)
dbClearResult(res)
sprintf('nrow: %s', nrow(ibes_id))
sv(ibes_id, path = WRDS_DOWNLOAD_DIR)

# (2) all firms, additionally keeping the official ticker `oftic`
query <- sprintf("select ticker, cusip, cname, oftic, sdates from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_id2 <- dbFetch(res, n = -1)
setDT(ibes_id2)
dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ibes_id2))

ibes_id2[1]
sv(ibes_id2, path = WRDS_DOWNLOAD_DIR)

-ibes_id- saved  (0.15 secs)


ticker,cusip,cname,oftic,sdates
0,87482X10,TALMER BANCORP,TLMR,2014-02-20


-ibes_id2- saved  (0.54 secs)


### `comp.security`

In [391]:
# Download the whole Compustat security table
library <- 'compm'
dataset <- 'security' # firm names

query <- sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_security <- dbFetch(res, n = -1)
setDT(comp_security)
dbClearResult(res)
sprintf('%s: %s', dataset, nrow(comp_security))

# preview the first row, then save
comp_security[1]
sv(comp_security, path = WRDS_DOWNLOAD_DIR)

tic,gvkey,iid,cusip,dlrsni,dsci,epf,exchg,excntry,ibtic,isin,secstat,sedol,tpci,dldtei
AE.2,1000,1,32102,9,COM USD1,,12,USA,,,I,,0,1978-06-30


-comp_security- saved  (0.34 secs)


### `ciq.wrds_gvkey`

In [33]:
# Download the whole ciq.wrds_gvkey table (companyid / gvkey pairs)
library <- 'ciq'
dataset <- 'wrds_gvkey' # firm names

query <- sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ciq_wrds_gvkey <- dbFetch(res, n = -1)
setDT(ciq_wrds_gvkey)
dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ciq_wrds_gvkey))

# preview the first row, then save
ciq_wrds_gvkey[1]
sv(ciq_wrds_gvkey, path = WRDS_DOWNLOAD_DIR)

companyid,gvkey,startdate,enddate,companyname
18507,235716,,,2M Invest A/S


-ciq_wrds_gvkey- saved  (0.45 secs)


### `crsp.stocknames`

In [88]:
library <- 'crsp'
dataset <- 'stocknames' # firm names

# (1) name history restricted to rows with a non-empty ncusip
query <- sprintf("select permno, ncusip, comnam, namedt, nameenddt from %s.%s where ncusip != ''", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_stocknames <- dbFetch(res, n = -1)
setDT(crsp_stocknames)
dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_stocknames))
sv(crsp_stocknames, path = WRDS_DOWNLOAD_DIR)

# (2) full name history, additionally keeping `ticker`
query <- sprintf("select ticker, comnam, permno, ncusip, namedt, nameenddt from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_stocknames2 <- dbFetch(res, n = -1)
setDT(crsp_stocknames2)
dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_stocknames2))
crsp_stocknames2[1]
sv(crsp_stocknames2, path = WRDS_DOWNLOAD_DIR)

permno,ncusip,comnam,namedt,nameenddt
10000,68391610,OPTIMUM MANUFACTURING INC,1986-01-07,1987-06-11


-crsp_stocknames- saved  (0.17 secs)


### `crsp.ccm`

In [22]:
# Download the gvkey-permco-permno links flagged usedflag=1 with linkprim P or C
library <- 'crsp'
dataset <- 'ccmxpf_linktable' # link table

query <- sprintf("select gvkey, lpermco as permco, lpermno as permno, linkdt, linkenddt from %s.%s where usedflag=1 and linkprim in ('P', 'C')", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_ccmlink <- dbFetch(res, n = -1)
setDT(crsp_ccmlink)
dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_ccmlink))
sv(crsp_ccmlink, path = WRDS_DOWNLOAD_DIR)
crsp_ccmlink[1]

-crsp_ccmlink- saved  (0.13 secs)


gvkey,permco,permno,linkdt,linkenddt
1000,23369,25881,1970-11-13,1978-06-30


## index-cst

In [63]:
ld(comp_idx_prof, path=WRDS_DOWNLOAD_DIR)
ld(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)
ld(comp_security, path=WRDS_DOWNLOAD_DIR)
ld(crsp_stocknames, path=WRDS_DOWNLOAD_DIR)

-comp_idx_prof- already exists, will NOT load again!  (0 secs)
-comp_idx_cst- already exists, will NOT load again!  (0 secs)
-comp_security- already exists, will NOT load again!  (0 secs)
-crsp_stocknames- already exists, will NOT load again!  (0 secs)


In [11]:
# Download the Compustat index profile table (one row per index)
library <- 'compa'
dataset <- 'idx_index' # index profiles

query <- sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_idx_profile <- dbFetch(res, n = -1)
setDT(comp_idx_profile)
dbClearResult(res)

# show the S&P 500 entry, then save
comp_idx_profile[conm == 'S&P 500 Comp-Ltd']
sv(comp_idx_profile, path = WRDS_DOWNLOAD_DIR)

conm,gvkeyx,idx13key,idxcstflg,idxstat,indexcat,indexgeo,indexid,indextype,indexval,spii,spmi,tic,tici
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
S&P 500 Comp-Ltd,3,500,Y,A,S&P,USA,500,LGCAP,500,,10,I0003,I0003


-comp_idx_profile- saved  (0.01 secs)


In [26]:
# Download the Compustat historical index constituents table
library <- 'compa'
dataset <- 'idxcst_his' # index constituents

query <- sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_idx_cst <- dbFetch(res, n = -1)
setDT(comp_idx_cst)
dbClearResult(res)

# preview the first row, then save
comp_idx_cst[1]
sv(comp_idx_cst, path = WRDS_DOWNLOAD_DIR)

gvkey,iid,gvkeyx,from,thru
<chr>,<chr>,<chr>,<date>,<date>
1004,1,30824,1994-10-01,


-comp_idx_cst- saved  (0.1 secs)


## keydev

In [30]:
# Download the key-development events of types 28, 48, 55, 61 and 144
library <- 'ciq'
dataset <- 'wrds_keydev' # everything except for `headline` and `situation`

query <- sprintf("select * from %s.%s 
    where keydeveventtypeid in (28, 48, 55, 61, 144)", library, dataset)
res <- dbSendQuery(wrds, query)
ciq_wrds_keydev <- dbFetch(res, n = -1)
setDT(ciq_wrds_keydev)
dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ciq_wrds_keydev))

# `mostimportantdateutc` holds UTC clock times, but R does not know that,
# so the time zone is set explicitly
ciq_wrds_keydev[, mostimportantdateutc := force_tz(mostimportantdateutc, 'UTC')]

sv(ciq_wrds_keydev, path = WRDS_DOWNLOAD_DIR)

In [None]:
# Download the rows of ciq.ciqkeydev whose keydevid belongs to an event of
# type 28, 48, 55, 61 or 144 in ciq.wrds_keydev
library <- 'ciq'
dataset <- 'ciqkeydev' # `headline` and `situation`

query <- sprintf("select * from %s.%s 
    where keydevid in 
        (select keydevid from ciq.wrds_keydev
        where keydeveventtypeid in (28, 48, 55, 61, 144))",
    library, dataset)
res <- dbSendQuery(wrds, query)
ciq_keydev <- dbFetch(res, n = -1)
setDT(ciq_keydev)
dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ciq_keydev))

# `mostimportantdateutc` holds UTC clock times, but R does not know that,
# so the time zone is set explicitly
ciq_keydev[, mostimportantdateutc := force_tz(mostimportantdateutc, 'UTC')]

sv(ciq_keydev, path = WRDS_DOWNLOAD_DIR)

## CCM

`ccmxpf_lnkhist` from CCM provides a linktable between CRSP and Compustat

In [12]:
# Download the whole CCM link-history table
library <- 'crspq'
dataset <- 'ccmxpf_lnkhist'

query <- sprintf("select * from %s.%s",
    library, dataset)
res <- dbSendQuery(wrds, query)
ccm <- dbFetch(res, n = -1)
setDT(ccm)
dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ccm))
sv(ccm, path = WRDS_DOWNLOAD_DIR)

-ccm- saved  (0.28 secs)


# Coverage

In [11]:
ld(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)
ld(comp_security, path=WRDS_DOWNLOAD_DIR)

-comp_idx_cst- loaded  (0.1 secs)


In [12]:
SDATE = as.Date('2005-01-01')
EDATE = as.Date('2018-12-31')

In [44]:
# DJI
# Constituent spells of index gvkeyx '000005', inner-joined on (gvkey, iid)
# with the security identifiers; cusip is cut to its first 6 characters.
dji_cst <- comp_idx_cst[gvkeyx == '000005'
    ][comp_security[, .(gvkey, iid, tic, cusip = str_sub(cusip, 1, 6), sedol)],
      on = .(gvkey, iid), nomatch = 0] %>%
    unique()
dji_cst
sv(dji_cst)

gvkey,iid,gvkeyx,from,thru,tic,cusip,sedol
1300,1,5,1925-12-07,2008-02-18,HON,438516,2020459
1356,1,5,1959-06-01,2013-09-22,AA.3,013817,BYVZDB3
1447,1,5,1982-08-30,,AXP,025816,2026082
1487,1,5,2004-04-08,2008-09-21,AIG,026874,2027342
1581,1,5,1939-03-14,2004-04-07,T.2,001957,2064888
1690,1,5,2015-03-19,,AAPL,037833,2046251
2136,1,5,2004-04-08,,VZ,92343V,2090571
2285,1,5,1987-03-12,,BA,097023,2108601
2817,1,5,1991-05-06,,CAT,149123,2180201
2968,1,5,2001-01-02,,JPM,46625H,2190385


-dji_cst- saved  (0.01 secs)


# Y

## `SUE`

Variable Description
- `datadate`(COMP): End Date of Earnings Report(earlier than `rdq` and `repdats`)
- `fyearq`(COMP): fiscal year
- `fyr`(COMP): fiscal year - end month
- `fqtr`(COMP): fiscal quarter
- `repdats`(IBES): Report Date of Quarterly Earnings
- `rdq`(COMP): Report Date of Quarterly Earnings
- `leadrdq`(COMP): Report Date of NEXT Quarter's Earnings

- `numest`: Number of Forecasts
- `medest`: Median Forecast (based on estimates in the 90 days prior to the EAD)
- `basis`: Whether most analysts report estimates on primary(P) / diluted(D)
- `act`(IBES): actual earnings
- `se`(COMP): Standard Earnings (=act/price_close)
- `sest`: Standard Estimates (= se - sue3 = est/price_close)
- `sue1`: SUE based on a rolling seasonal random walk model (LM,p. 185)
- `sue2`: SUE accounting for  exclusion of special items
- `sue3`: SUE based on IBES reported analyst forecasts and actuals  

- `mcap`: Market Cap

In [59]:
# Read the SUE file; gvkey is read as character
sue <- fread('data/sue_final.csv', colClasses=c('gvkey'='character'))
sue[, permno := as.character(permno)]
# parse every date column with ymd()
sue[, c('datadate', 'rdq', 'rdq1', 'leadrdq1', 'repdats') := lapply(.SD, ymd),
    .SDcols = c('datadate', 'rdq', 'rdq1', 'leadrdq1', 'repdats')]
# standardized estimate = standardized earnings minus the IBES-based SUE
sue[, sest := se - sue3]
sue[1]
sv(sue)

gvkey,ticker,permno,conm,fyearq,fqtr,datadate,fyr,rdq,rdq1,leadrdq1,repdats,mcap,medest,act,numest,basis,sue1,sue2,sue3,se,sest
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<date>,<dbl>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,ADCT,50906,ADC TELECOMMUNICATIONS INC,2010,4,2010-09-30,9,2010-11-23,2010-11-23,2011-02-23,2010-11-23,1231.524,0.16,0.15,1,D,,,-0.000789266,0.01183899,0.01262826


-sue- saved  (0.25 secs)


# X

## Find SP 500

In [46]:
# A missing `thru` means the membership is still effective, so it is filled
# with today(); a missing `from` is filled with 1900-01-01.
ld(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)

# constituents of index gvkeyx '000003'
sp500_cst <- comp_idx_cst[gvkeyx == '000003']
sp500_cst[is.na(from), from := as.Date('1900-01-01')]
sp500_cst[is.na(thru), thru := today()]
# keep, per gvkey, the first row after sorting by (gvkey, iid)
sp500_cst <- sp500_cst[order(gvkey, iid)]
sp500_cst <- sp500_cst[, .SD[1], keyby = .(gvkey)]
sp500_cst <- unique(sp500_cst[, .(gvkey, from, thru)])

sv(sp500_cst)

-comp_idx_cst- already exists, will NOT load again!  (0 secs)
-sp500_cst- saved  (0.01 secs)


In [48]:
sp500_cst[, uniqueN(gvkey)]
sp500_cst[1]

gvkey,from,thru
<chr>,<date>,<date>
1010,1964-03-31,1984-06-30


## CAR

Compute the CAR based on the following two event types: (1) earnings announcement; (2) earnings call. Do as follows:

1. Create a dataset where the first col is `permno` and the second is `event_date`.
2. Feed the dataset to WRDS's online event service
3. Get results

Before generating the event samples, we need to first add `permno` to `ciq_wrds_keydev`. We'll use `ccm` to link these two variables.

In the next cell, I:
- only select "primary" links (LC,LU,LS) which are considered to be robust
- fill missing `linkdt` and `linkenddt`
- for every `gvkey`, only select its **first** issue.
- link to `ciq_wrds_keydev`
- only select events that: 
    - later than 2000-01-01
    - S&P 500 companies

In [3]:
ld(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR)
ld(ccm, path=WRDS_DOWNLOAD_DIR)
ld(sp500_cst)

-ciq_wrds_keydev- loaded  (4.41 secs)
-ccm- loaded  (0.1 secs)
-sp500_cst- loaded  (0 secs)


In [4]:
# Build one gvkey -> permno link per gvkey:
# - keep link types LC / LU / LS that are open-ended or end on/after 2000-01-01
# - fill a missing linkenddt with 2020-12-31 and a missing linkdt with 1990-01-01
# - per gvkey keep the first row after sorting by (gvkey, liid)
gvkey_permno_link = ccm[linktype %in% c('LC', 'LU', 'LS')
    ][is.na(linkenddt) | linkenddt>=as.Date('2000-01-01'), 
     .(gvkey, lpermno, liid, linkdt, linkenddt)
    ][is.na(linkenddt), ':='(linkenddt=as.Date('2020-12-31'))
    ][is.na(linkdt), ':='(linkdt=as.Date('1990-01-01'))
    ][order(gvkey, liid)
    ][, .SD[1], keyby=.(gvkey)
    ][order(gvkey, linkdt)
    ][, ':='(liid=NULL)] %>% 
    na.omit(cols=c('gvkey', 'lpermno')) %>% 
    unique(by=c('gvkey', 'lpermno'))

# Attach permno to every key-development event and keep events that
# - fall inside the link's validity window; both bounds are inclusive because
#   linkdt / linkenddt are the first / last effective dates of a link, so an
#   event on either boundary day is still covered
# - happen on or after 2000-01-01
# - belong to a gvkey that appears in sp500_cst
event_samples = ciq_wrds_keydev[gvkey_permno_link, on=.(gvkey), nomatch=0
    ][as.Date(mostimportantdateutc)>=linkdt & as.Date(mostimportantdateutc)<=linkenddt
    ][as.Date(mostimportantdateutc) >= as.Date('2000-01-01')
    ][gvkey %in% sp500_cst[, unique(gvkey)]
    ][, ':='(linkdt=NULL, linkenddt=NULL)]

Then we generate two event samples:
- `event_samples_earnings_call`: the earnings CALL (type 48)
- `event_samples_earnings_announce`: the earnings announcement (type 28)

In [5]:
# Each sample is a (permno, edate) table with edate formatted as mm/dd/YYYY,
# de-duplicated and written as pretty-printed JSON.
# NOTE: `edate` is a string, so order(permno, edate) sorts by month first,
# not chronologically.

# small test sample: at most the first 1000 earnings-call rows; head() is used
# instead of [1:1000] so no all-NA rows are added when fewer rows exist
event_samples_test = event_samples[keydeveventtypeid==48,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)] %>% head(1000) %>% unique()

toJSON(event_samples_test, pretty=TRUE) %>% write('data/car/event_samples_test.json')

# earnings calls (event type 48)
event_samples_earnings_call = event_samples[keydeveventtypeid==48,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)] %>% unique()

toJSON(event_samples_earnings_call, pretty=TRUE) %>% write('data/car/event_samples_earnings_call.json')

# earnings announcements (event type 28)
event_samples_earnings_announce = event_samples[keydeveventtypeid==28,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)] %>% unique()

toJSON(event_samples_earnings_announce, pretty=TRUE) %>% write('data/car/event_samples_earnings_announce.json')

## `cc_transcript`

In [8]:
# Collect the transcript PDFs (searched recursively) and keep only those whose
# path mentions "Earning(s) Call".
# The dot in the pattern is escaped so that only a real ".pdf" extension
# matches, and argument names are spelled out instead of relying on partial
# matching and on T.
cc_paths = list.files(
    'C:/Users/rossz/OneDrive/CC/data/ciq-dji',
    recursive=TRUE,
    pattern='\\.pdf$',
    full.names=TRUE)
cat('N of all transcripts:', length(cc_paths), '\n')

cc_paths = cc_paths[str_detect(cc_paths, regex('Earnings? Call', ignore_case=TRUE))]
cat('N of Earnings Call:', length(cc_paths), '\n')

N of all transcripts: 0 
N of Earnings Call: 0 


In [3]:
# Parse one earnings-call transcript PDF into its presentation (MD) and
# question-and-answer (QA) sections.
#
# Args:
#   cc_path: path to a transcript PDF. The call date is parsed from the file
#     name, which is expected to contain "Call, <date ending in 20xx>".
#
# Returns:
#   A list with `cc_filename`, `date`, `md`, `qa` and `cc_path`. `qa` is NA
#   when the table of contents lists no Q&A section; `md`/`qa` are NA when the
#   section pages cannot be located (a diagnostic is printed in that case).
#
# Raises:
#   An error when no table-of-contents page, or more than one, is found.
#
# The last PDF page is never included in `md` or `qa`.
parse_one_page <- function(cc_path) {
    # Join pages `from`..`to` with `sep` and turn line breaks into spaces.
    # Returns NA when the page range is unusable.
    join_pages <- function(pages, from, to, sep) {
        if (length(from) != 1 || length(to) != 1 || is.na(from) || is.na(to) || from > to) {
            return(NA_character_)
        }
        str_c(pages[from:to], collapse=sep) %>% str_replace_all('\\r\\n', ' ')
    }

    # TRUE only when `txt` is non-missing and begins with `title`.
    starts_with_title <- function(txt, title) {
        isTRUE(str_sub(txt, 1, nchar(title)) == title)
    }

    obs = list()
    
    cc = pdf_text(cc_path)
    n_page = length(cc)
    
    # cc_filename
    cc_filename = str_match(
        cc_path,
        regex('/([^/]+?)\\.pdf', ignore_case=TRUE))[,2]
    
    # remove header (taken from the last page and stripped from every page);
    # skipped when no header is found, because replacing with an NA pattern
    # would turn every page into NA
    header = str_extract(cc[n_page], '^.+(\\r\\n)*20\\d{2}\\r\\n')
    if (isTRUE(!is.na(header))) {
        cc = str_replace_all(cc, fixed(header), '')
    }
    
    # remove footer
    cc = str_replace_all(cc, '\\r\\n.+\\r\\n.+\\r\\n$', '')
    
    # determine which page is the TOC
    toc_pagenum = str_which(cc, regex('contents[\\s\\S]+table of contents', ignore_case=TRUE))
    if (length(toc_pagenum) > 1) {
        stop('len(toc_pagenum) > 1 !!')
    }
    if (length(toc_pagenum) == 0) {
        stop('No table-of-contents page found in: ', cc_path)
    }
    
    # extract the page numbers of the MD and QA sections from the TOC
    toc_page = cc[toc_pagenum]
    
    toc_md = str_match(
        toc_page,
        regex('contents[\\s\\S]+table of contents[\\s\\S]+(presentation.+)', ignore_case=TRUE))[,2]
    md_pagenum = as.integer(str_match(toc_md, '[\\. ]+(\\d+)')[,2])
    
    toc_qa = str_match(
        toc_page,
        regex('contents[\\s\\S]+table of contents[\\s\\S]+(question and answer.+)', ignore_case=TRUE))[,2]
    qa_pagenum = as.integer(str_match(toc_qa, '[\\. ]+(\\d+)')[,2])
    
    # extract {md, qa}
    # NOTE(review): the page separators below (' ' vs '') are kept exactly as
    # they were; confirm whether the difference is intentional.
    if (!is.na(qa_pagenum)) {
        md = join_pages(cc, md_pagenum, qa_pagenum-1, ' ')
        qa = join_pages(cc, qa_pagenum, n_page-1, '')

        # test if `md` and `qa` parsed successfully
        # if failed, locate the sections by their page titles and parse again
        if (!starts_with_title(md, 'Presentation') || !starts_with_title(qa, 'Question and Answer')) {
            md_pagenum = str_which(cc, '^Presentation')[1]
            qa_pagenum = str_which(cc, '^Question and Answer')[1]
            
            md = join_pages(cc, md_pagenum, qa_pagenum-1, '')
            qa = join_pages(cc, qa_pagenum, n_page-1, '')
            
            if (!starts_with_title(md, 'Presentation') || !starts_with_title(qa, 'Question and Answer')) {
                cat('MD/QA parsing error!:', cc_path, '\n')
                cat('\t MD:', str_sub(md, 1, 13), '\n')
                cat('\t QA:', str_sub(qa, 1, 20), '\n')
            }
        }
        
    } else {
        md = join_pages(cc, md_pagenum, n_page-1, ' ')
        qa = NA
        if (!starts_with_title(md, 'Presentation')) {
            # keep the first attempt so the diagnostic can show both versions
            first_md = md
            md_pagenum = str_which(cc, '^Presentation')[1]
            md = join_pages(cc, md_pagenum, n_page-1, ' ')
            
            if (!starts_with_title(md, 'Presentation')) {
                cat('MD parsing error!:', cc_path, '\n')
                cat('\t MD:', str_sub(first_md, 1, 13), '\n')
                cat('\t New MD:', str_sub(md, 1, 13), '\n')
            }
        }
    }

    # get date
    date=mdy(
      str_match(cc_filename, regex('Call, (.+20\\d{2})', ignore_case=TRUE))[,2])
    
    # prepare output
    obs$cc_filename = cc_filename
    obs$date = date
    # obs$md_pagenum = md_pagenum
    # obs$qa_pagenum = qa_pagenum
    obs$md = md
    obs$qa = qa
    obs$cc_path = cc_path
    obs
    
}

# parse all pdf
# Fail early with a clear message when no transcript was found: `1:length(x)`
# would otherwise iterate over c(1, 0) and try to parse a non-existent file.
if (length(cc_paths) == 0) {
    stop('`cc_paths` is empty: no earnings-call PDFs were found.', call. = FALSE)
}

cc_transcripts = vector('list', length(cc_paths))
# for (i in sample(seq_along(cc_paths), 10)) {
for (i in seq_along(cc_paths)) {
    cc_transcripts[[i]] = parse_one_page(cc_paths[i])
}

# stack the per-file results and drop calls after the sample end date
cc_transcripts = rbindlist(cc_transcripts)[date <= EDATE]

sv(cc_transcripts)

-cc_transcripts- saved  (8.16 secs)


In [4]:
cc_transcripts[1][, ':='(md = str_sub(md, 1, 20), qa = str_sub(qa, 1, 20))][]

cc_filename,date,md,qa,cc_path
"Honeywell International Inc., Q1 2008 Earnings Call, Apr-18-2008",2008-04-18,Presentation Operato,Question and Answer,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/001300-hon/Honeywell International Inc., Q1 2008 Earnings Call, Apr-18-2008.pdf"


## `cc_meta`

In [3]:
# Load the key-development tables and the security table, then drop events
# dated before 2005-01-01 (the whole step is timed).
system.time({
ld(ciq_wrds_keydev, path = WRDS_DOWNLOAD_DIR)
ld(ciq_keydev, path = WRDS_DOWNLOAD_DIR)
ld(comp_security, path = WRDS_DOWNLOAD_DIR)

ciq_wrds_keydev <- ciq_wrds_keydev[as.Date(mostimportantdateutc) >= as.Date('2005-01-01')]
ciq_keydev <- ciq_keydev[as.Date(mostimportantdateutc) >= as.Date('2005-01-01')]
})

-ciq_wrds_keydev- loaded  (11.63 secs)
-ciq_keydev- loaded  (53.35 secs)
-comp_security- loaded  (0.4 secs)


   user  system elapsed 
 53.370   1.258  68.959 

In [1]:
# `announceddate` (ciq_keydev) and `announcedate` (ciq_wrds_keydev) are the same
# same for `entereddate` and `enterdate`

# Extract the date written out in a headline after the word "results"
# (full month name, day, year starting with 20) and parse it with mdy().
#
# Args:
#   headline: character vector of headlines.
#
# Returns:
#   The parsed dates, one per headline; NA where no such date is found.
extract_headline_date <- function(headline) {
    regex_date = regex('results.+?((January|February|March|April|May|June|July|August|September|October|November|December) ?\\d+,? ?20\\d{2}).*?;?', ignore_case=TRUE)
    # return the parsed value directly; ending on an assignment made the
    # function return its result invisibly
    mdy(str_match(headline, regex_date)[,2])
}

# Earnings-call events (type 48) joined to their headlines on keydevid
cc_meta <- ciq_keydev[ciq_wrds_keydev[keydeveventtypeid %in% c(48)],
                      on = .(keydevid), nomatch = 0]

# keep the needed columns; the call date is parsed from the headline text
cc_meta <- cc_meta[, .(keydeveventtypeid,
                       headline,
                       headline_date = mdy(
                           str_match(headline, regex('Calls?, (.+?20\\d{2})', ignore_case = TRUE))[, 2]),
                       announcedate,
                       companyname, gvkey, keydevid,
                       mostimportantdateutc)]

# where the headline carries no date, fall back to the event time converted
# to New York time
cc_meta[is.na(headline_date),
        headline_date := as.Date(with_tz(mostimportantdateutc, 'America/New_York'))]
cc_meta[, ':='(headline_year = year(headline_date),
               headline_month = month(headline_date))]
setkey(cc_meta, headline_year, headline_month)

sv(cc_meta)
nrow(cc_meta)

ERROR: Error in eval(expr, envir, enclos): object 'ciq_keydev' not found


In [66]:
cc_meta[1]

keydeveventtypeid,headline,headline_date,announcedate,companyname,gvkey,keydevid,mostimportantdateutc,headline_year,headline_month
48,"AT&T Corp., Q4 2004 Earnings Call, Jan-20-2005",2005-01-20,2005-01-06,AT&T Corp.,1581,1031931,2005-01-20 13:15:00,2005,1


## `cc_transcripts` + `cc_meta`

In [2]:
ld(cc_transcripts)
ld(cc_meta)

-cc_transcripts- loaded  (0.96 secs)
-cc_meta- loaded  (1.49 secs)


In [50]:
# Match every parsed transcript to a key-development record by fuzzy-matching
# the transcript file name against the headlines dated in the same year and
# month (Damerau-Levenshtein distance, at most 5 edits, case-insensitive).
#
# Args:
#   cc_transcripts: data.table with columns `date`, `cc_filename`, `md`, `qa`
#     and `cc_path`.
#   cc_meta: data.table with columns `headline`, `headline_date`,
#     `headline_year`, `headline_month`, `keydevid`, `gvkey` and
#     `mostimportantdateutc`.
#
# Returns:
#   A data.table with one row per transcript (gvkey, headline_date, mid_utc,
#   headline, keydevid, md, qa, cc_path), ordered by gvkey and headline_date.
#   The matched fields are NA for transcripts without a close-enough headline.
match_transcripts_and_meta <- function(cc_transcripts, cc_meta) {
    len_cc_transcripts = nrow(cc_transcripts)
    
    matched_headline = character(len_cc_transcripts)
    matched_headline_date = rep(as.Date('1970-01-01'), len_cc_transcripts)
    matched_keydevid = numeric(len_cc_transcripts)
    matched_gvkey = character(len_cc_transcripts)
    # the buffer is created in UTC so the stored event times keep the time
    # zone that the column name `mid_utc` promises
    matched_mid_utc = rep(as.POSIXct('1970-01-01', tz='UTC'), len_cc_transcripts)
    
    # seq_len() so that an empty input runs zero iterations
    for (i in seq_len(len_cc_transcripts)) {
        transcript_date = cc_transcripts[i, date]
        transcript_year = year(transcript_date)
        transcript_month = month(transcript_date)
        
        cc_filename = cc_transcripts[i, cc_filename]
        # only headlines from the same year and month are candidates
        lookup = cc_meta[headline_year==transcript_year & headline_month==transcript_month]
        # lookup = cc_meta
        rowid = amatch(
            toupper(cc_filename),
            toupper(lookup[, headline]),
            method='dl',
            maxDist=5,
            nthread=4)
        # rowid is NA when nothing matches, which yields NA in every field
        matched_headline[i] = lookup[rowid, headline]
        matched_keydevid[i] = lookup[rowid, keydevid]
        matched_gvkey[i] = lookup[rowid, gvkey]
        matched_mid_utc[i] = lookup[rowid, mostimportantdateutc]
        matched_headline_date[i] = lookup[rowid, headline_date]
    }
    
    # return the table visibly (the function used to end on an assignment)
    cc_transcripts[, .(gvkey=matched_gvkey,
         headline_date=matched_headline_date,
         mid_utc=matched_mid_utc,
         headline=matched_headline,
         keydevid=matched_keydevid,
         md, qa,
         cc_path)
    ][order(gvkey, headline_date)]
}

cc = match_transcripts_and_meta(cc_transcripts, cc_meta)
sv(cc)

-cc- saved  (7.78 secs)


In [3]:
ld(cc)

-cc- loaded  (0.56 secs)


In [5]:
cc[is.na(keydevid)][, ':='(md=str_sub(md, 1, 10), qa=str_sub(qa, 1, 10))][]

gvkey,headline_date,mid_utc,headline,keydevid,md,qa,cc_path
,2010-08-12,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/005073-gm/General Motors Company, Q2 2010 Earnings Call, Aug-12-2010.pdf"
,2010-11-10,,,,Presentati,,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/005073-gm/General Motors Company, Q3 2010 Earnings Call, Nov 10, 2010.pdf"
,2019-06-27,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/007906-nke/NIKE, Inc., Q4 2019 Earnings Call, Jun 27, 2019.pdf"
,2019-06-27,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/011264-wba/Walgreens Boots Alliance, Inc., Q3 2019 Earnings Call, Jun 27, 2019.pdf"
,2019-07-15,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/003243-c/Citigroup Inc., Q2 2019 Earnings Call, Jul 15, 2019.pdf"
,2019-07-16,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/002968-jpm/JPMorgan Chase Co., Q2 2019 Earnings Call, Jul 16, 2019.pdf"
,2019-07-16,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/006266-jnj/Johnson Johnson, Q2 2019 Earnings Call, Jul 16, 2019.pdf"
,2019-07-16,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/114628-gs/The Goldman Sachs Group, Inc., Q2 2019 Earnings Call, Jul 16, 2019.pdf"
,2019-07-17,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/006066-ibm/International Business Machines Corporation, Q2 2019 Earnings Call, Jul 17, 2019.pdf"
,2019-07-17,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/007647-bac/Bank of America Corporation, Q2 2019 Earnings Call, Jul 17, 2019.pdf"


In [11]:
query = 'General Motors Company, Q2 2010 Earnings Call, Aug-12-2010'
key_dt = cc_meta[headline_year==2010 & headline_month==8]
key = key_dt[, headline]
rowid = amatch(
    toupper(query),
    toupper(key),
    method='dl',
    maxDist=5,
    nthread=4)
key_dt[rowid]

keydeveventtypeid,headline,headline_date,announcedate,companyname,gvkey,keydevid,mostimportantdateutc,headline_year,headline_month
48,"General Motors Company, Q2 2010 Earnings Call, Aug-12-2010",2010-08-12,2010-08-10,General Motors Company,,111581311,2010-08-12 14:00:00,2010,8


# `Dataset`

## `CC` + `SUE`

In [15]:
ld(sue)
ld(cc)

-sue- loaded  (0.1 secs)
-cc- already exists, will NOT load again!  (0 secs)


In [16]:
sue[1]

gvkey,ticker,permno,conm,fyearq,fqtr,datadate,fyr,rdq,rdq1,leadrdq1,repdats,mcap,medest,act,numest,basis,sue1,sue2,sue3,se
1013,ADCT,50906,ADC TELECOMMUNICATIONS INC,2010,4,2010-09-30,9,2010-11-23,2010-11-23,2011-02-23,2010-11-23,1231.524,0.16,0.15,1,D,,,-0.000789266,0.01183899


In [52]:
cc[1][, ':='(md=str_sub(md, 1, 10), qa=str_sub(qa, 1, 10))][]

gvkey,headline_date,mid_utc,headline,keydevid,md,qa,cc_path
1300,2007-07-19,2007-07-19 08:00:00,"Honeywell International Inc., Q2 2007 Earnings Call, Jul-19-2007",4527845,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/001300-hon/Honeywell International Inc., Q2 2007 Earnings Call, Jul-19-2007.pdf"


In [53]:
cc[is.na(gvkey)][, ':='(md=str_sub(md, 1, 10), qa=str_sub(qa, 1, 10))][]

gvkey,headline_date,mid_utc,headline,keydevid,md,qa,cc_path
,2010-08-12,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/005073-gm/General Motors Company, Q2 2010 Earnings Call, Aug-12-2010.pdf"
,2010-11-10,,,,Presentati,,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/005073-gm/General Motors Company, Q3 2010 Earnings Call, Nov 10, 2010.pdf"
,2019-06-27,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/007906-nke/NIKE, Inc., Q4 2019 Earnings Call, Jun 27, 2019.pdf"
,2019-06-27,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/011264-wba/Walgreens Boots Alliance, Inc., Q3 2019 Earnings Call, Jun 27, 2019.pdf"
,2019-07-15,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/003243-c/Citigroup Inc., Q2 2019 Earnings Call, Jul 15, 2019.pdf"
,2019-07-16,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/002968-jpm/JPMorgan Chase Co., Q2 2019 Earnings Call, Jul 16, 2019.pdf"
,2019-07-16,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/006266-jnj/Johnson Johnson, Q2 2019 Earnings Call, Jul 16, 2019.pdf"
,2019-07-16,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/114628-gs/The Goldman Sachs Group, Inc., Q2 2019 Earnings Call, Jul 16, 2019.pdf"
,2019-07-17,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/006066-ibm/International Business Machines Corporation, Q2 2019 Earnings Call, Jul 17, 2019.pdf"
,2019-07-17,,,,Presentati,Question a,"C:/Users/rossz/OneDrive/CC/data/ciq-dji/007647-bac/Bank of America Corporation, Q2 2019 Earnings Call, Jul 17, 2019.pdf"


In [19]:
x = sue[cc[, .(gvkey, headline_date, headline, keydevid)],
     on=c('gvkey', 'rdq==headline_date')]
x[1]

gvkey,ticker,permno,conm,fyearq,fqtr,datadate,fyr,rdq,rdq1,leadrdq1,repdats,mcap,medest,act,numest,basis,sue1,sue2,sue3,se,headline,keydevid
1300,ALD,10145,HONEYWELL INTERNATIONAL INC,2007,2,2007-06-30,12,2007-07-19,2007-07-19,2007-10-19,2007-07-19,42119.73,0.76,0.78,16,D,0.002665245,0.002919561,0.000355366,0.01385928,"Honeywell International Inc., Q2 2007 Earnings Call, Jul-19-2007",4527845
