# Init

In [16]:
# library
library(repr)
library(Matrix)
suppressMessages(suppressWarnings({
    library(jsonlite)
    library(utilr)
    # library(comet)
}))

WORK_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = str_c(WORK_DIR, '/data')
WRDS_DOWNLOAD_DIR = str_c(DATA_DIR, '/WRDS-download')
setwd(WORK_DIR)
cat(str_c('Current working directory: ', getwd(), '\n'))

# options for plot
options(repr.plot.width=7, repr.plot.height=4, repr.plot.res = 300)

Current working directory: /home/yu/OneDrive/CC


# WRDS

## establish connection

In [None]:
library(RPostgres)

# connect to wrds
wrds <- dbConnect(Postgres(),
                  host='wrds-pgdata.wharton.upenn.edu',
                  port=9737,
                  dbname='wrds',
                  sslmode='require',
                  user='xiaomowu')

## unit test

In [62]:
# Determine the data libraries available at WRDS
res <- dbSendQuery(wrds, "select distinct table_schema
                   from information_schema.tables
                   where table_type ='VIEW'
                   or table_type ='FOREIGN TABLE'
                   order by table_schema")
data <- dbFetch(res, n=-1)
dbClearResult(res)
print(setDT(data)[, sort(table_schema)])

  [1] "aha"                "ahasamp"            "audit"             
  [4] "auditsmp"           "blab"               "block"             
  [7] "boardex"            "boardsmp"           "bvd"               
 [10] "bvdsamp"            "calcbnch"           "cboe"              
 [13] "centris"            "ciq"                "ciqsamp"           
 [16] "cisdm"              "cisdmsmp"           "clrvt"             
 [19] "clrvtsmp"           "comp"               "compa"             
 [22] "compb"              "compdcur"           "compg"             
 [25] "comph"              "compm"              "compmcur"          
 [28] "compsamp"           "compseg"            "compsnap"          
 [31] "comscore"           "contrib"            "crsp"              
 [34] "crspa"              "crspm"              "crspq"             
 [37] "crspsamp"           "csmar"              "dealscan"          
 [40] "djones"             "dmef"               "doe"               
 [43] "emdb"               "etfg" 

In [None]:
# Determine the datasets within a given library
library = 'taqmsec'
res <- dbSendQuery(wrds, sprintf("select distinct table_name
                   from information_schema.columns
                   where table_schema='%s'
                   order by table_name", library))
data <- dbFetch(res, n=-1)
dbClearResult(res)
setDT(data)[, sort(table_name)] %>% print()

In [65]:
data[, unique(str_sub(table_name, 1, 4))]

In [None]:
# Determine the variables (column headers) within a given dataset
library = 'ibes'
dataset = 'det_epsus'
res <- dbSendQuery(wrds, sprintf("select column_name
                   from information_schema.columns
                   where table_schema='%s'
                   and table_name='%s'
                   order by column_name", library, dataset))
data <- dbFetch(res, n=-1)
dbClearResult(res)
setDT(data)[, column_name] %>% print()

In [None]:
# To query the crsp.dsf dataset
library = "taqmsec"
dataset = 'ctm_20140905'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
data <- dbFetch(res, n=10) %>% setDT()
dbClearResult(res)
data

In [71]:
data

date,time_m,ex,sym_root,sym_suffix,tr_scond,size,price,tr_stop_ind,tr_corr,tr_seqnum,tr_id,tr_source,tr_rf,part_time,trf_time,tte_ind
<date>,<time>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<time>,<time>,<chr>
2018-09-05,09:30:01.004,N,A,,O,12117,67.28,N,0,241801,52983593648973,C,,09:30:01.002,,0
2018-09-05,09:30:01.372,K,A,,F I,2,67.39,N,0,261701,52983525028193,C,,09:30:01.371,,1
2018-09-05,09:30:02.204,N,A,,,100,67.14,N,0,267501,52983593973839,C,,09:30:02.204,,0
2018-09-05,09:30:02.204,N,A,,I,52,67.13,N,0,267601,52983593973842,C,,09:30:02.204,,0
2018-09-05,09:30:06.343,D,A,,I,5,67.39,N,0,297301,71675223102037,C,T,09:30:01.372,09:30:06.343,0
2018-09-05,09:30:21.361,D,A,,4 I,1,67.28,N,0,462501,71675223816503,C,T,09:30:21.290,09:30:21.360,1
2018-09-05,09:30:21.362,Y,A,,F,100,67.38,N,0,462601,52983525027946,C,,09:30:21.361,,1
2018-09-05,09:30:22.390,P,A,,F,100,67.38,N,0,463701,52983525037641,C,,09:30:22.389,,1
2018-09-05,09:30:22.390,P,A,,Q,100,67.38,N,0,463801,52983525037642,C,,09:30:22.389,,0
2018-09-05,09:30:22.390,T,A,,F,100,67.38,N,0,463901,62879133546566,C,,09:30:22.390,,1


## CIQ-Transcripts

> Transcript meta data

In [None]:
# Last downlaod: 2021-1-6 (N obs: 1173989)
# from: beginning

library = "ciq"
dataset = 'wrds_transcript_detail'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ciq_transcript_detail <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ciq_transcript_detail)) %>% cat()
sv(ciq_transcript_detail, path=WRDS_DOWNLOAD_DIR)

ciq_transcript_detail[1]

> Transcript speaker

In [None]:
# Last downlaod: 2021-1-6 (N obs: 62783582)
# from: beginning

library = "ciq"
dataset = 'wrds_transcript_person'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ciq_transcript_speaker <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ciq_transcript_speaker)) %>% cat()
sv(ciq_transcript_speaker, path=WRDS_DOWNLOAD_DIR)

ciq_transcript_speaker[1]

> Transcript component

In [None]:
# Last downlaod: 2021-1-6 (N obs: 62958141)
# from: beginning

library = "ciq_transcripts"
dataset = 'ciqtranscriptcomponent'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ciq_transcript_component <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ciq_transcript_component)) %>% cat()
sv(ciq_transcript_component, path=WRDS_DOWNLOAD_DIR)

ciq_transcript_component[1]

> Transcript component_type_name

In [None]:
# Last downlaod: 2021-1-6 (N obs: 8)
# from: beginning

library = "ciq_transcripts"
dataset = 'ciqtranscriptcomponenttype'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ciq_transcript_componenttype <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ciq_transcript_componenttype)) %>% cat()
sv(ciq_transcript_componenttype, path=WRDS_DOWNLOAD_DIR)

ciq_transcript_componenttype[1]

## CIQ-people

In [None]:
# wrds_professional: career history of a persion
library = 'ciq'
dataset = 'wrds_professional'
res <- dbSendQuery(wrds, sprintf("select *
                   from %s.%s", library, dataset))
ciq_professional <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

ciq_professional[1]
sv(ciq_professional, path=WRDS_DOWNLOAD_DIR)

In [None]:
# ciqperson: person info
# - include name, email, prefix (gender), YOB, phone
library = 'ciq'
dataset = 'ciqperson'
res <- dbSendQuery(wrds, sprintf("select *
                   from %s.%s limit 10", library, dataset))
ciq_person <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

ciq_person[1]
sv(ciq_person, path=WRDS_DOWNLOAD_DIR)

In [36]:
ciq_person[1]

personid,firstname,middlename,lastname,emailaddress,prefix,suffix,salutation,yearborn,phonevalue
<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
65008,M.,John,Sterba,,Mr.,Jr.,,,212-832-7300


In [31]:
# ciqpersonbiography: biography
library = 'ciq'
dataset = 'ciqpersonbiography'
res <- dbSendQuery(wrds, sprintf("select *
                   from %s.%s limit 10", library, dataset))
ciq_person_biography <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

ciq_person_biography[1]
sv(ciq_person_biography, path=WRDS_DOWNLOAD_DIR)

personid,biography
<dbl>,<chr>
65008,"Mr. M. John Sterba, Jr., serves as a Director of Alimansky Capital Group Inc. and Bhirud Associates, Inc. Mr. Sterba served as Director of Bhirud Funds Inc. - Apex Mid Cap Growth Fund."


"ciq_person_biography" saved as "ciq_person_biography.feather" (9.3 KB) (0 secs)


## IBES-earning forcast

### Detail

In [None]:
# Last downlaod: 2021-1-15 (N obs: 18305258)
# from: 2006

# ADJUSTED

library = 'ibes'
dataset = 'det_epsus' # eps
# dataset = 'det_xepsus' # non-eps

query = sprintf("select * from %s.%s where anndats>='2006-01-01'::date", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_det_epsus <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ibes_det_epsus)) %>% cat()
sv(ibes_det_epsus, path=WRDS_DOWNLOAD_DIR)

ibes_det_epsus[1]

In [16]:
# Last downlaod: 2021-1-15 (N obs: 5337607)

# UNADJUSTED
# from 2000

library = 'ibes'
dataset = 'detu_epsus' # eps
# dataset = 'det_xepsus' # non-eps

query = sprintf("select ticker, estimator, analys, pdf, fpi, value, fpedats, revdats, revtims, anndats, anntims from %s.%s where fpedats>='2000-01-01'::date and (fpi='6' or fpi='7')", library, dataset, year)

res <- dbSendQuery(wrds, query)
ibes_detu_epsus<- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s', dataset, nrow(ibes_detu_epsus))
sv(ibes_detu_epsus, path=WRDS_DOWNLOAD_DIR)

"ibes_detu_epsus" saved as "ibes_detu_epsus.feather" (2.55 secs)


### actuals

In [17]:
# Last downlaod: 2021-1-15 (N obs: 504166)

# from 2000-01-01

library = 'ibes'
dataset = 'actu_epsus' 

query = sprintf("select ticker, anndats as repdats, value as act, pends as fpedats, pdicity from %s.%s where pends>='2000-01-01'::date and pdicity='QTR'", library, dataset, year)

res <- dbSendQuery(wrds, query)
ibes_actu_epsus<- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s', dataset, nrow(ibes_actu_epsus))
sv(ibes_actu_epsus, path=WRDS_DOWNLOAD_DIR)
ibes_actu_epsus[1]

"ibes_actu_epsus" saved as "ibes_actu_epsus.feather" (0.14 secs)


ticker,repdats,act,fpedats,pdicity
<chr>,<date>,<dbl>,<date>,<chr>
0,2014-02-14,,2012-12-31,QTR


### Summary

In [18]:
# Last downlaod: 2021-1-15 (N obs: 12417306)

# from beginning

library = 'ibes'
dataset = 'statsum_epsus' # summary eps
query = sprintf("select * from %s.%s", library, dataset)

res <- dbSendQuery(wrds, query)
ibes_statsum_epsus <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ibes_statsum_epsus)) %>% cat()

ibes_statsum_epsus[1]
sv(ibes_statsum_epsus, path=WRDS_DOWNLOAD_DIR)

ticker,cusip,oftic,cname,statpers,measure,fiscalp,fpi,estflag,curcode,numest,numup,numdown,medest,meanest,stdev,highest,lowest,usfirm,fpedats,actual,actdats_act,acttims_act,anndats_act,anntims_act,curr_act
<chr>,<chr>,<chr>,<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<date>,<dbl>,<date>,<dbl>,<date>,<dbl>,<chr>
0,87482X10,TLMR,TALMER BANCORP,2014-04-17,EPS,QTR,6,P,USD,4,0,4,0.07,0.08,0.01,0.1,0.07,1,2014-03-31,0.12,2014-05-12,40087,2014-05-06,38700,USD


"ibes_statsum_epsus" saved as "ibes_statsum_epsus.feather" (16.91 secs)


In [20]:
# Last downlaod: 2021-1-15 (N obs: 61639036)

# from beginning

library = 'ibes'
dataset = 'statsum_xepsus' # summary xeps

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_statsum_xepsus <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ibes_statsum_xepsus))

ibes_statsum_xepsus[1]
sv(ibes_statsum_xepsus, path=WRDS_DOWNLOAD_DIR)

ticker,cusip,oftic,cname,statpers,measure,fiscalp,fpi,estflag,curcode,numest,numup,numdown,medest,meanest,stdev,highest,lowest,usfirm,fpedats,actual,actdats_act,acttims_act,anndats_act,anntims_act,curr_act
<chr>,<chr>,<chr>,<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<date>,<dbl>,<date>,<dbl>,<date>,<dbl>,<chr>
0,87482X10,TLMR,TALMER BANCORP,2014-04-17,BPS,QTR,6,P,USD,4,0,0,9.84,9.81,0.23,10.03,9.53,1,2014-03-31,9.97,2014-05-12,40087,2014-05-06,38700,USD


"ibes_statsum_xepsus" saved as "ibes_statsum_xepsus.feather" (1.33 mins)


### Surprise

In [11]:
# Last downlaod: 2021-1-15 (N obs: 9813099)

# from beginning

library = 'ibes'
dataset = 'surpsum' # summary surprise 

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_surpsum <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ibes_surpsum)) %>% cat()
sv(ibes_surpsum, path=WRDS_DOWNLOAD_DIR)

ibes_surpsum[1]

surpsum: 9813099"ibes_surpsum" saved as "ibes_surpsum.feather" (367.2 MB) (5.66 secs)


ticker,oftic,measure,fiscalp,pyear,pmon,usfirm,anndats,actual,surpmean,surpstdev,suescore
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>
0,TLMR,BPS,QTR,2014,3,1,2014-05-06,9.97,9.81,0.22906,0.69852


## IBES-price target

In [None]:
# Last downlaod: 2021-1-15 (N obs: 5506116)

# from beginning

library = "ibes"
dataset = 'ptgdet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ibes_ptgdet <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ibes_ptgdet)) %>% cat()
sv(ibes_ptgdet, path='/data/WRDS-download')

ibes_ptgdet[1]

## recommend

In [None]:
# Last downlaod: 2021-1-15 (N obs: 2861784)

# from beginning

library = 'ibes'
dataset = 'recddet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ibes_recddet <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ibes_recddet)) %>% cat()
sv(ibes_recddet, path=WRDS_DOWNLOAD_DIR)

ibes_recddet[1]

## financial ratio

> **Warning!**

> Even though I updated on 2021-1-15, the latest observation I can get is still way earlier: 2019-12-31. Perhaps this database is updated yearly, I've submit a query to the customer service.

> Update: the database is scheduled to be updated on July 2021

There's no Python or R API to download financial-ratio. So I download the csv through web query and import as R datatable.

Download path: **CIQ -> North America (daily) -> Financial Ratios Firm Level**

- last update: 2021-1-15 (N obs: 1049363)
- From: 1999-12
- Date Format: YYMMDDs10. (e.g. 1984/07/25)
- `bm`: book-to-market
- `debt_assets`: Total Debt/Total Assets
- `debt_at` *(deprecated)*: Total Debt/Total Assets
- `roa`: return on assets

In [2]:
WRDS_DOWNLOAD_DIR = '/home/yu/OneDrive/CC/data/WRDS-download'

In [30]:
wrds_finratio = fread(str_c(WRDS_DOWNLOAD_DIR, '/financial-ratio-firm.csv'), colClass=c(rep('character', times=4), rep('double', 4)))[, ':='(adate=ymd(adate), qdate=ymd(qdate), public_date=ymd(public_date), debt_at=NULL)
    ][order(gvkey, public_date)]

sprintf('wrds_finratio: %s\n', nrow(wrds_finratio)) %>% cat()
sv(wrds_finratio, path=WRDS_DOWNLOAD_DIR)

wrds_finratio[1]

wrds_finratio: 1049363
"wrds_finratio" saved as "wrds_finratio.feather" (11.6 MB) (0.19 secs)


gvkey,adate,qdate,public_date,bm,roa,debt_assets
<chr>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>
1004,1999-05-31,1999-08-31,1999-12-31,0.649,0.135,0.552


## index price

In [24]:
# Last downlaod: 2021-1-15 (N obs: 7090166)

# from beginning

library = 'comp'
dataset = 'idx_daily' # factors

res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
idx_price <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(idx_price)) %>% cat()
sv(idx_price, path=WRDS_DOWNLOAD_DIR)

idx_price[1]

idx_daily: 7091295
"idx_price" saved as "idx_price.feather" (226.8 MB) (5.18 secs)


gvkeyx,dvpsxd,newnum,oldnum,prccd,prccddiv,prccddivn,prchd,prcld,datadate
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<date>
1,,,,1476.6,,,1502.92,1465.44,1999-01-04


## stock price

In [27]:
# Last downlaod: 2021-1-15 (N obs: )

# from 2000-01-01

library = 'comp'
dataset = 'secd' # factors

query = sprintf("select * from %s.%s where datadate>='2008-01-01'::date", library, dataset)

res <- dbSendQuery(wrds, query)
comp_secd <- setDT(dbFetch(res, n=-1))
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(comp_secd)) %>% cat()
sv(comp_secd, path=WRDS_DOWNLOAD_DIR)

comp_secd[1]

secd: 59295051
"comp_secd" saved as "comp_secd.feather" (6.5 GB) (3.87 mins)


gvkey,iid,datadate,tic,cusip,conm,curcddv,capgn,cheqv,div,divd,divdpaydateind,divrc,divsp,dvrated,paydateind,anncdate,capgnpaydate,cheqvpaydate,divdpaydate,divrcpaydate,divsppaydate,paydate,recorddate,curcdd,adrrc,ajexdi,cshoc,cshtrd,dvi,eps,epsmo,prccd,prchd,prcld,prcod,prcstd,trfd,exchg,secstat,tpci,cik,fic
<chr>,<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<date>,<date>,<date>,<date>,<date>,<date>,<date>,<date>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
1003,1,2008-01-02,ANTQ,354100,A.A.IMPORTINGCOINC,,,,,,,,,,,,,,,,,,,USD,,1,2683000,5000,,,,0.15,0.15,0.15,0.15,3,,19,I,0,730052,USA


Download `CRSP`
It's *deprecated*!

In [None]:
library = 'crspq'
dataset = 'dsf' # factors
years = 2001:2018

for (year in years) {
    print(sprintf('Start %s at %s', year, now()))
    query = sprintf("select * from %s.%s where to_char(date,'yyyy')='%s'", library, dataset, year)
    save_name_short = sprintf('%s_%s', dataset, year)
    save_name_full = sprintf('../data/WRDS-download/%s_%s.rds', dataset, year)

    res <- dbSendQuery(wrds, query)
    data <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
    
    sprintf('%s_%s: %s', dataset, year, nrow(data))
    assign(save_name_short, data)
    saveRDS(data, save_name_full)
}

## stock industry

In [29]:
# Last downlaod: 2021-1-15 (N obs: 7090166)

# from: beginning

library = 'comp'
dataset = 'co_industry' # factors

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_industry <- setDT(dbFetch(res, n=-1))
dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(idx_price)) %>% cat()
sv(comp_industry, path=WRDS_DOWNLOAD_DIR)

comp_industry[1]


co_industry: 7090166
"comp_industry" saved as "comp_industry.feather" (5.2 MB) (0.12 secs)


gvkey,consol,popsrc,fyr,naicsh,sich,year,datadate
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<date>
1001,C,D,12,722,,1985,1985-12-31


## FF-factors

Notes:
- `factors` are in decimal, not percentage point. So `smb=0.0024` means the return is 0.24 percentage points

In [31]:
# Last downlaod: 2021-1-15 (N obs: 24874)

# from: beginning

library = 'ff'
dataset = 'factors_daily' # factors

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ff_factors <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ff_factors)) %>% cat()
sv(ff_factors, path=WRDS_DOWNLOAD_DIR)

ff_factors[1]

factors_daily: 24874
"ff_factors" saved as "ff_factors.feather" (437.7 KB) (0 secs)


date,mktrf,smb,hml,rf,umd
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1926-07-01,0.001,-0.0024,-0.0028,9e-05,


## firm ID

### `ibes.id`

In [None]:
# Last downlaod: 

# from: beginning

library = 'ibes'
dataset = 'id' # firm names

query = sprintf("select ticker, cusip, cname, sdates from %s.%s where usfirm=1 and cusip != ''", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_id <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(ibes_id))
sv(ibes_id, path=WRDS_DOWNLOAD_DIR)

query = sprintf("select ticker, cusip, cname, oftic, sdates from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_id2 <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ibes_id2))

ibes_id2[1]
sv(ibes_id2, path=WRDS_DOWNLOAD_DIR)

### `comp.security`

In [32]:
# Last downlaod: 2021-1-15 (N obs: 58376)

# from: beginning

library = 'compm'
dataset = 'security' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_security <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(comp_security)) %>% cat()
sv(comp_security, path=WRDS_DOWNLOAD_DIR)

comp_security[1]

security: 58376
"comp_security" saved as "comp_security.feather" (4.5 MB) (0.12 secs)


tic,gvkey,iid,cusip,dlrsni,dsci,epf,exchg,excntry,ibtic,isin,secstat,sedol,tpci,dldtei
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<date>
AE.2,1000,1,32102,9,COM USD1,,12,USA,,,I,,0,1978-06-30


### `ciq.wrds_gvkey`

In [33]:
# Last downlaod: 2021-1-15 (N obs: 115536)

# from: beginning

library = 'ciq'
dataset = 'wrds_gvkey' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ciq_wrds_gvkey <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ciq_wrds_gvkey)) %>% cat()
sv(ciq_wrds_gvkey, path=WRDS_DOWNLOAD_DIR)

ciq_wrds_gvkey[1]

wrds_gvkey: 115536
"ciq_wrds_gvkey" saved as "ciq_wrds_gvkey.feather" (3.7 MB) (0.08 secs)


companyid,gvkey,startdate,enddate,companyname
<dbl>,<chr>,<date>,<date>,<chr>
18507,235716,,,2M Invest A/S


### `crsp.stocknames`

In [None]:
# Last downlaod: 

# from: beginning

library = 'crsp'
dataset = 'stocknames' # firm names

query = sprintf("select permno, ncusip, comnam, namedt, nameenddt from %s.%s where ncusip != ''", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_stocknames <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_stocknames))
sv(crsp_stocknames, path=WRDS_DOWNLOAD_DIR)

query = sprintf("select ticker, comnam, permno, ncusip, namedt, nameenddt from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_stocknames2 <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_stocknames2))
crsp_stocknames2[1]
sv(crsp_stocknames2, path=WRDS_DOWNLOAD_DIR)

### `crsp.ccm` (CRSP+COMPUSTAT)

In [34]:
# Last downlaod: 2021-1-15 (N obs: 32871)

# from: beginning

library = 'crsp'
dataset = 'ccmxpf_linktable' # firm names

query = sprintf("select gvkey, lpermco as permco, lpermno as permno, linkdt, linkenddt from %s.%s where usedflag=1 and linkprim in ('P', 'C')", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_ccmlink <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(crsp_ccmlink)) %>% cat()
sv(crsp_ccmlink, path=WRDS_DOWNLOAD_DIR)

crsp_ccmlink[1]

ccmxpf_linktable: 32871
"crsp_ccmlink" saved as "crsp_ccmlink.feather" (680.4 KB) (0.01 secs)


gvkey,permco,permno,linkdt,linkenddt
<chr>,<dbl>,<dbl>,<date>,<date>
1000,23369,25881,1970-11-13,1978-06-30


## index constitution

In [37]:
# Last downlaod: 2021-1-15 (N obs: 2133)

# from: beginning

library = 'compa'
dataset = 'idx_index' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_idx_profile <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(comp_idx_profile)) %>% cat()
sv(comp_idx_profile, path=WRDS_DOWNLOAD_DIR)

comp_idx_profile[conm=='S&P 500 Comp-Ltd']

idx_index: 2133
"comp_idx_profile" saved as "comp_idx_profile.feather" (178.5 KB) (0.04 secs)


conm,gvkeyx,idx13key,idxcstflg,idxstat,indexcat,indexgeo,indexid,indextype,indexval,spii,spmi,tic,tici
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
S&P 500 Comp-Ltd,3,500,Y,A,S&P,USA,500,LGCAP,500,,10,I0003,I0003


In [None]:
# Last downlaod: 2021-1-15 (N obs: 87146)

# from: beginning

library = 'compa'
dataset = 'idxcst_his' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_idx_cst <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(comp_idx_cst)) %>% cat()
sv(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)

comp_idx_cst[1]

idxcst_his: 87146
"comp_idx_cst" saved as "comp_idx_cst.feather" (1.4 MB) (0.03 secs)


gvkey,iid,gvkeyx,from,thru
<chr>,<chr>,<chr>,<date>,<date>
1004,1,30824,1994-10-01,


## keydev

In [39]:
# Last downlaod: 2021-1-15 (N obs: 3454300)

# from: beginning

# 28: Announcements of Earnings
# 48: Earnings Calls
# 55: Expected Earnings Release Date
# 61: Delayed Earnings Announcements
# 144: Expected Earnings Release Date (CIQ Expected)

library = 'ciq'
dataset = 'wrds_keydev' # everything except for `headline` and `situation`

query = sprintf("select * from %s.%s 
    where keydeveventtypeid in (28, 48, 55, 61, 144)", library, dataset)
res <- dbSendQuery(wrds, query)
ciq_wrds_keydev <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

# `mostimportantdateutc` is in UTC but R doesn't recognize,
# so we need to set it explictly
ciq_wrds_keydev[, ':='(mostimportantdateutc=force_tz(mostimportantdateutc, 'UTC'))]

sprintf('%s: %s\n', dataset, nrow(ciq_wrds_keydev)) %>% cat()
sv(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR)

ciq_wrds_keydev[1]

wrds_keydev: 3454300
"ciq_wrds_keydev" saved as "ciq_wrds_keydev.feather" (168.7 MB) (2.55 secs)


keydevid,companyid,companyname,keydeveventtypeid,eventtype,keydevstatusid,statustype,keydevtoobjectroletypeid,objectroletype,announcedate,announcetime,enterdate,entertime,lastmodifieddate,mostimportantdateutc,gvkey
<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<date>,<dbl>,<date>,<dbl>,<dttm>,<dttm>,<chr>
613802945,18511,3i Group plc,28,Announcements of Earnings,1,Active,1,Target,2019-05-16,21600,2019-05-16,27060,2019-05-16 07:31:00,2019-05-16 06:00:00,210835


In [40]:
# Last downlaod: 2021-1-15 (N obs: 3452193)

# from: beginning

# 28: Announcements of Earnings
# 48: Earnings Calls
# 55: Expected Earnings Release Date
# 61: Delayed Earnings Announcements
# 144: Expected Earnings Release Date (CIQ Expected)

library = 'ciq'
dataset = 'ciqkeydev' # `headline` and `situation`

query = sprintf("select * from %s.%s 
    where keydevid in 
        (select keydevid from ciq.wrds_keydev
        where keydeveventtypeid in (28, 48, 55, 61, 144))",
    library, dataset)
res <- dbSendQuery(wrds, query)
ciq_keydev <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

# `mostimportantdateutc` is in UTC but R doesn't recognize,
# so we need to set it explictly
ciq_keydev[, ':='(mostimportantdateutc=force_tz(mostimportantdateutc, 'UTC'))]

sprintf('%s: %s\n', dataset, nrow(ciq_keydev)) %>% cat()
sv(ciq_keydev, path=WRDS_DOWNLOAD_DIR)

ciq_keydev[1]

ciqkeydev: 3452193
"ciq_keydev" saved as "ciq_keydev.feather" (899.8 MB) (7.8 secs)


headline,situation,keydevid,announceddate,entereddate,lastmodifieddate,mostimportantdateutc
<chr>,<chr>,<dbl>,<dttm>,<dttm>,<dttm>,<dttm>
Awilco LNG ASA Reports Earnings Results for the First Quarter of 2017,"Awilco LNG ASA reported earnings results for the first quarter of 2017. For the quarter, the company reported net freight income of $0.8 million. LBITDA was $2.0 million. Loss before tax was $10.4 million. Loss was $10.4 million. Net cash used in operating activities was $1.6 million.",430225859,2017-05-05 07:30:00,2017-05-09 05:37:00,2017-07-22 03:57:00,2017-05-05 07:30:00


## CCM (CRSP+COMPUSTAT)

In [41]:
# Last downlaod: 2021-1-15 (N obs: 105148)

# from: beginning

library = 'crspq'
dataset = 'ccmxpf_lnkhist'

query = sprintf("select * from %s.%s",
    library, dataset)
res <- dbSendQuery(wrds, query)
ccm <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(ccm)) %>% cat()
sv(ccm, path=WRDS_DOWNLOAD_DIR)

ccm[1]

ccmxpf_lnkhist: 105148
"ccm" saved as "ccm.feather" (3.1 MB) (0.06 secs)


gvkey,linkprim,liid,linktype,lpermno,lpermco,linkdt,linkenddt
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<date>,<date>
1000,C,00X,NU,,,1961-01-01,1970-09-29


## money inflow

In [8]:
# Last downlaod: 2021-1-15 (N obs: 24836274)

# from: 2000-01-01

library = 'crsp'
dataset = 'holdings'

query = sprintf("select crsp_portno, report_dt, percent_tna, nbr_shares, market_val, cusip, permno from %s.%s where report_dt>='2000-01-01'::date",
    library, dataset)
                
res <- dbSendQuery(wrds, query)
crsp_holdings <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s\n', dataset, nrow(crsp_holdings)) %>% cat()
sv(crsp_holdings, path=WRDS_DOWNLOAD_DIR)

holdings: 24836274"crsp_holdings" saved as "crsp_holdings.feather" (4.6 GB) (55.24 secs)


## retail tracking

Steps:
- Download 'taq-permno-link.sas7bdat' from WRDS
    - Located at "Home -> Get Data -> Linking Suite by WRDS -> TAQ CRSP Link"
    - Select all variables and all companies

# Coverage

## Find SP 500

In [5]:
# thru is missing means it's still effective
# fill the missing with today()
ld(comp_idx_cst, ldtype='feather', path=WRDS_DOWNLOAD_DIR)

sp500_cst = comp_idx_cst[gvkeyx=='000003'
    ][is.na(from), ':='(from=as.Date('1900-01-01'))
    ][is.na(thru), ':='(thru=today())
    ][order(gvkey, iid)
    ][, .SD[1], keyby=.(gvkey)
    ][, .(gvkey, from, thru)] %>% unique()

sv(sp500_cst)

"comp_idx_cst.feather" (1.4 MB) loaded (0.05 secs)
"sp500_cst" saved as "sp500_cst.feather" (24.8 KB) (0.01 secs)


## Find DJI

In [5]:
ld(comp_idx_cst, ldtype='feather', path=WRDS_DOWNLOAD_DIR)
ld(comp_security, ldtype='feather', path=WRDS_DOWNLOAD_DIR)

# SDATE = as.Date('2005-01-01')
# EDATE = as.Date('2018-12-31')

# DJI
dji_cst = unique(comp_idx_cst[gvkeyx=='000005'
    ][comp_security[, .(gvkey, iid, tic, cusip=str_sub(cusip, 1, 6), sedol)], 
      on=.(gvkey, iid), nomatch=0])
dji_cst
sv(dji_cst)

"comp_idx_cst.feather" (1.4 MB) loaded (0.05 secs)
"comp_security.feather" (4.5 MB) loaded (0.15 secs)
"dji_cst" saved as "dji_cst.feather" (5.5 KB) (0.03 secs)


gvkey,iid,gvkeyx,from,thru,tic,cusip,sedol
<chr>,<chr>,<chr>,<date>,<date>,<chr>,<chr>,<chr>
1300,1,5,1925-12-07,2008-02-18,HON,438516,2020459
1356,1,5,1959-06-01,2013-09-22,AA.3,013817,BYVZDB3
1447,1,5,1982-08-30,,AXP,025816,2026082
1487,1,5,2004-04-08,2008-09-21,AIG,026874,2027342
1581,1,5,1939-03-14,2004-04-07,T.2,001957,2064888
1690,1,5,2015-03-19,,AAPL,037833,2046251
2136,1,5,2004-04-08,,VZ,92343V,2090571
2285,1,5,1987-03-12,,BA,097023,2108601
2817,1,5,1991-05-06,,CAT,149123,2180201
2968,1,5,2001-01-02,,JPM,46625H,2190385


# Y (CAR+SUE+Inflow+Revision)

## gvkey <-> permno

We'll use `ccm` to link these two variables.

In the next cell, I:
- only select "primary" links (LC,LU,LS) which are considered to be robust
- fill missing `linkdt` and `linkenddt`
- for every `gvkey`, only select its **first** issue.

In [91]:
# Last run: 2021-3-1

ld(ccm, path=WRDS_DOWNLOAD_DIR, force=T)

gvkey_permno_link = ccm[linktype %in% c('LC', 'LU', 'LS')
    ][is.na(linkenddt) | linkenddt>=as.Date('2000-01-01'), 
     .(gvkey, lpermno, liid, linkdt, linkenddt)
    ][is.na(linkenddt), ':='(linkenddt=today())
    ][is.na(linkdt), ':='(linkdt=as.Date('1990-01-01'))
    ][order(gvkey, liid)
    ][, .SD[1], keyby=.(gvkey)
    ][order(gvkey, linkdt)
    ][, ':='(liid=NULL)] %>% 
    na.omit(cols=c('gvkey', 'lpermno')) %>% 
    unique(by=c('gvkey', 'lpermno'))
sv(gvkey_permno_link)

"ccm.feather" (3.1 MB) loaded (0.02 secs) (2021-03-01 2:27 AM)
"gvkey_permno_link" saved as "gvkey_permno_link.feather" (312.9 KB) (0.01 secs, 2021-03-01 02:27:30)


## call <-> release

Task:
- link call and release
- method
  1. For any release, find all the calls within the [-180d, 180d] window, get `dt_find_post_call`
  2. For any call, find all the releases within the [-180d, 180d] window, get `dt_find_prev_release`
  3. merge `dt_find_post_call` and `dt_find_prev_release`, get `call_release_link`
  4. For every `call_keydevid`, find its **closest** **previous**`release_keydevid` within [-1d, 1d]

Warnings:
- In `call_release_link`:
    - `call_keydevid` is unique key
    - `release_keydevid` *not* unique key, because different calls may be matched to the *same* release.
- In `ciq_wrds_keydev`, one `keydevid` may have multiple obs because it may corresponds to multiple `gvkey`

Notes:
- `keyeventtypeid`
  - 28: earnings announcement
  - 48: earnings calls
  - 61: accnounce earnings release delay
  


In [7]:
ld(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR)
ld(ciq_keydev, path=WRDS_DOWNLOAD_DIR)
ld(gvkey_permno_link)
ld(sp500_cst)

"ciq_wrds_keydev" (168.7 MB) already in .GlobalEnv, will NOT load again! (0 secs)
"ciq_keydev" (899.8 MB) already in .GlobalEnv, will NOT load again! (0 secs)
"gvkey_permno_link" (312.9 KB) already in .GlobalEnv, will NOT load again! (0 secs)
"sp500_cst" (24.8 KB) already in .GlobalEnv, will NOT load again! (0 secs)


In [8]:
find_prev_release <- function(t, date, keydevid, keydeveventtypeid, companyname) {
    # t: the row_id of one call
    # n: the row_ids of the closet earnings release
    ns = which((date>=(date[t]-180)) & (date<=date[t]+180) & (keydeveventtypeid==28))
    if (length(ns) >= 1) { # has match
        return(list('release_date' = date[ns],
             'release_keydevid' = keydevid[ns],
             'call_date' = date[rep(t, length(ns))],
             'call_keydevid' = keydevid[rep(t, length(ns))],
             'companyname' = companyname[ns]))
    } else if (length(ns) == 0) {# fail to find a release
        return(list('release_date' = ymd(NA),
             'release_keydevid' = NA_real_,
             'call_date' = date[t],
             'call_keydevid' = keydevid[t],
             'companyname' = companyname[1]))
    }
}

dt_find_prev_release = ciq_wrds_keydev[!is.na(gvkey)
    ][keydeveventtypeid %in% c(28, 48)
    ][order(gvkey, mostimportantdateutc),
      .(gvkey, date=as.Date(mostimportantdateutc), keydevid, keydeveventtypeid, companyname, eventtype)
    ][, {ts = which(keydeveventtypeid==48)
      lapply(ts, partial(find_prev_release, date=date, 
                           keydevid=keydevid, keydeveventtypeid=keydeveventtypeid,
                           companyname=companyname)
              ) %>% rbindlist(fill=T, use=T)
     },
     keyby=.(gvkey)
    ]

In [9]:
find_post_call <- function(t, date, keydevid, keydeveventtypeid, companyname) {
    # t: the row_id of one announcement
    # ns: the row_id of the closest call
    ns = which((date>=(date[t]-180)) & (date<=date[t]+180) & (keydeveventtypeid==48))
    if (length(n) == 1) { # has match
        return(list('call_date' = date[ns],
             'call_keydevid' = keydevid[ns],
             'release_date' = date[rep(t, length(ns))],
             'release_keydevid' = keydevid[rep(t, length(ns))],
             'companyname' = companyname[ns]))
    } else if (length(n) == 0) {# fail to find a release
        return(list('call_date' = ymd(NA),
             'call_keydevid' = NA_real_,
             'release_date' = date[t],
             'release_keydevid' = keydevid[t],
             'companyname' = companyname[1]))
    }
}

dt_find_post_call = ciq_wrds_keydev[!is.na(gvkey)
    ][keydeveventtypeid %in% c(28, 48)
    ][order(gvkey, mostimportantdateutc),
      .(gvkey, date=as.Date(mostimportantdateutc), keydevid, keydeveventtypeid, companyname, eventtype)
    ][, {ts = which(keydeveventtypeid==28)
      lapply(ts, partial(find_post_call, date=date, 
                           keydevid=keydevid, keydeveventtypeid=keydeveventtypeid,
                           companyname=companyname)
            ) %>% rbindlist(fill=T, use=T)
     },
     keyby=.(gvkey)
    ]

In [10]:
call_release_link = unique(na.omit(rbindlist(list(dt_find_post_call, dt_find_prev_release), use=T)))[order(gvkey, release_date)
    ][(gvkey %in% sp500_cst$gvkey) & (call_date %between% c(ymd('2008-01-01'), ymd('2020-12-31')))
    ][, ':='(nday_release_lead=call_date-release_date)
    ][nday_release_lead %between% c(-1, 1)
    ][order(call_keydevid, nday_release_lead)
    ][, head(.SD,1), keyby=.(call_keydevid)]

n_identified_call = call_release_link[, uniqueN(call_keydevid)]
cat(sprintf("%s calls have been sucessfully matched with release event in range [-1d,1d].\n\n", n_identified_call))

sv(call_release_link)
call_release_link[1]

35524 (114.45%) calls have been sucessfully matched with release event in range [-1d,1d].

"call_release_link" saved as "call_release_link.feather" (1.1 MB) (0.01 secs)


call_keydevid,gvkey,call_date,release_date,release_keydevid,companyname,nday_release_lead
<dbl>,<chr>,<date>,<date>,<dbl>,<chr>,<drtn>
3101633,14489,2008-02-28,2008-02-28,5219816,Dell Technologies Inc.,0 days


## inflow

In [20]:
# inflow: in millions of dollars
inflow = crsp_holdings[!is.na(permno)
    ][, ':='(permno=as.character(permno))
    ][, .(mv=sum(market_val, na.rm=T)), keyby=.(permno, report_dt)
    ][order(permno, report_dt)
    ][, ':='(inflow=(mv-shift(mv))/1e6), keyby=permno
    ][!is.na(inflow)]
sv(inflow)

inflow[1]

"inflow" saved as "inflow.feather" (19.9 MB) (0.18 secs)


permno,report_dt,mv,inflow
<chr>,<date>,<dbl>,<dbl>
10001,2002-11-30,0,0


## revision

Task:
- $revision = \frac{EPS_{new} - EPS_{previous}}{\text{stock price 2 days before revision}}*100$
- compute revision of year-end earnings for the current FY, resutling in `revision`
- create `ibtic_gvkey_link` where `ibtic` is unique and one `gvkey` may have multiple `ibtic` matches
- add `gvkey` to `revision`

In [29]:
ld(ibes_det_epsus, path=WRDS_DOWNLOAD_DIR)
ld(comp_security, path=WRDS_DOWNLOAD_DIR) # stock ids
ld(comp_secd, path=WRDS_DOWNLOAD_DIR)

"ibes_det_epsus" (1002.7 MB) already in .GlobalEnv, will NOT load again! (0 secs)
"comp_security" (4.5 MB) already in .GlobalEnv, will NOT load again! (0 secs)
"comp_secd" (6.5 GB) already in .GlobalEnv, will NOT load again! (0 secs)


In [30]:
ibtic_gvkey_link = comp_security[!is.na(ibtic) & excntry=='USA', .(ibtic, gvkey, iid, dldtei)]
comp_secd_link = comp_secd[, .(gvkey, iid, price_date=datadate, price=prccd, join_date=datadate+2)]

In [31]:
ibes = ibes_det_epsus[anndats %between% c(ymd('2007-01-01'), ymd('2020-12-31')) & !is.na(cusip) & measure=='EPS' & usfirm==1 & fpi %in% c(1), 
     .(ticker, cname, anndats, revdats, fpedats, analys, pdf, fpi, value)
    ][order(analys, ticker, fpedats, anndats), ':='(join_date=anndats)
    ][ibtic_gvkey_link, on=.(ticker=ibtic), nomatch=NULL]

revision = comp_secd_link[ibes, on=.(gvkey, iid, join_date), roll=T, nomatch=NULL
    ][is.na(dldtei) | (!is.na(dldtei) & dldtei>=anndats)
    ][, ':='(join_date=NULL, dldtei=NULL)
    ][anndats-price_date<=30
    ][order(analys, ticker, fpedats, anndats)
    ][, .(gvkey, anndats, revdats, pdf, value, revision=(value-shift(value))/price*100), 
      keyby=.(analys, ticker, fpedats)
    ][!is.na(revision)
    ][revision %between% c(quantile(revision, 0.005), quantile(revision, 0.995))]

In [32]:
revision[, summary(revision)]

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-128.80583   -0.34351   -0.01539   -0.64624    0.18900   58.96226 

In [33]:
sv(revision)

"revision" saved as "revision.feather" (46.9 MB) (1.74 secs)


## retail

Last update: 2021-03-01

Steps:
- Compute retail-tracking.sas in WRDS Cloud
- Download `taq-crsp-link` from WRDS Cloud
- Join

In [2]:
# ----------------------------
# Link table: TAQ <-> CUSIP
# ----------------------------
taq_crsp_link_daily = read_sas('data/Retail tracking/taq-crsp-link-daily.sas7bdat')%>%as.data.table()

taq_crsp_link_daily = taq_crsp_link_daily[, .(permno=PERMNO, date=DATE, symbol=SYM_ROOT, cusip=CUSIP, match_lvl=MATCH_LVL)]

taq_crsp_link_daily[1]

permno,date,symbol,cusip,match_lvl
<dbl>,<date>,<chr>,<chr>,<dbl>
87432,2003-09-10,A,00846U10,1


In [3]:
# ----------------------------
# read: retail trades
# ----------------------------

read_retail <- function(filename, if_monthly=T) {
    x = read_sas(sprintf('data/Retail tracking/retail_%s.sas7bdat', filename)
                ) %>% as.data.table()
    if (!if_monthly) {
        setnames(x, 'SYM_ROOT', 'SYMBOL')
    }
    setnames(x, names(x), tolower(names(x)))
    x
}

library(haven)

retail_2007_2009 = read_retail('2007_2009')
retail_2010_2012 = read_retail('2010_2012')
retail_2013 = read_retail('2013')
retail_2014 = read_retail('2014')

retail_2015_2016 = read_retail('2015_2016', F)
retail_2017 = read_retail('2017', F)
retail_2018 = read_retail('2018', F)
retail_2018 = read_retail('2019_2020', F)


retail = rbindlist(list(retail_2007_2009, retail_2010_2012, 
                        retail_2013, retail_2014, retail_2015_2016, 
                        retail_2017, retail_2018), fill=T)

In [4]:
# -------------------------------
# Add: retail trades <-> `permno`
# -------------------------------
retail_cusip = retail[taq_crsp_link_daily, on=.(symbol, date), nomatch=NULL
    ][order(symbol, date, match_lvl)]

# -------------------------------
# Add: <-> `gvkey`
# -------------------------------
ld(gvkey_permno_link)

retail = gvkey_permno_link[retail_cusip, 
      on=c('lpermno==permno'), nomatch=NULL
    ][date>=linkdt & date<=linkenddt, 
      .(gvkey, date, total_vol, retail_buy_vol, retail_sell_vol)
    ][, lapply(.SD, sum, na.rm=T), keyby=.(gvkey, date)]

sv(retail)

"gvkey_permno_link.feather" (312.9 KB) loaded (0.02 secs) (2021-03-01 1:53 PM)
"retail" saved as "retail.feather" (173.3 MB) (1.84 secs, 2021-03-01 13:54:01)


## SUE

Primary key
- [`gvkey`, `rdq`]
- total obs: 43,822
- N of unique PK: 43,815

Variable Description
- `datadate`(COMP): End Date of Earnings Report(earlier than `rdq` and `repdats`)
- `fyearq`(COMP): fiscal year
- `fyr`(COMP): fiscal year - end month
- `fqtr`(COMP): fiscal quarter
    - only in [1, 2, 3, 4]
    - *no "year-end" earnings*
- `repdats`(IBES): Report Date of Quarterly Earnigns
- `rdq`(COMP): Report Date of Quarterly Earnings
- `leadrdq`(COMP): Report Date of NEXT Quarter's Earnings

- `numest`: Number of Forecasts
- `smedest`: standard median forecast (based on estimates in the 90 days prior to the EAD), = medest/price_close
- `sstdest`: standard error of Forecasts, = stdest/price_close
- `basis`: Whether most analysts report estimates on primary(P) / diluted(D)
- `act`(IBES): actual earnings
- `se`(COMP): Standard Earnings (=act/price_close)
- `sest`: Standard Estimates (= se-sue = est/price_close)
- `sue1`: SUE based on a rolling seasonal random walk model (LM,p. 185)
- `sue2`: SUE accounting for  exclusion of special items
- `sue3`: SUE based on IBES reported analyst forecasts and actuals  

- `mcap`: Market Cap

In [1]:
ld('sue_final', ldname='sue', force=T)

sue[, ':='(permno=as.character(permno),
           sest=se-sue3,
           price_close=act/se)
    ][, ':='(smedest=medest/price_close,
             sstdest=stdest/price_close)
    ][, ':='(stdest=NULL, medest=NULL)]

sue[1]
sv(sue)

"sue_final.feather" loaded as "sue" (0.04 secs)
"sue" saved as "sue.feather" (4.4 MB) (0.05 secs)


gvkey,ticker,permno,conm,fyearq,fqtr,datadate,fyr,rdq,rdq1,leadrdq1,repdats,mcap,act,numest,basis,sue1,sue2,sue3,se,sest,price_close,smedest,sstdest
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<date>,<dbl>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,ADCT,50906,ADCTELECOMMUNICATIONSINC,2010,4,2010-09-30,9,2010-11-23,2010-11-23,2011-01-31,2010-11-23,1231.524,0.15,1,D,,,-0.000789266,0.01183899,0.01262826,12.67,0.01262826,


## CAR

### prepare event list

Compute the CAR based on the following two event types: (1) earnings announcment; (2) earnings call. Do as follows:

1. Create a dataset where the first col is `permno` and the second is `event_date`.
2. Use a Python script (`compute-car.ipynb`) to compute CAR

In [2]:
ld(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR)
ld(gvkey_permno_link)
ld(sp500_cst)

"ciq_wrds_keydev.feather" (168.7 MB) loaded (3.3 secs)
"gvkey_permno_link.feather" (312.9 KB) loaded (0.01 secs)
"sp500_cst.feather" (24.8 KB) loaded (0 secs)


In [4]:
event_samples = ciq_wrds_keydev[gvkey_permno_link, on=.(gvkey), nomatch=0
    ][as.Date(mostimportantdateutc)>linkdt & as.Date(mostimportantdateutc)<linkenddt
    ][as.Date(mostimportantdateutc) >= as.Date('2000-01-01')
    ][gvkey %in% sp500_cst[, unique(gvkey)]
    ][, ':='(linkdt=NULL, linkenddt=NULL)]

Then we generate two event samples:
- `event_samples_earnings_call`: the earnings CALL (type 48)
- `event_samples_earnings_announce`: the earnings announcement (type28)

In [5]:
event_samples_test = event_samples[
      mostimportantdateutc>ymd('2019-10-01')
    ][keydeveventtypeid==48,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)
    ] %>% unique()

toJSON(event_samples_test, pretty=T) %>% write('data/CAR/event_samples_test.json')

In [6]:
event_samples_earnings_call = event_samples[keydeveventtypeid==48,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)] %>% unique()

toJSON(event_samples_earnings_call, pretty=T) %>% write('data/CAR/event_samples_earnings_call.json')

event_samples_earnings_announce = event_samples[keydeveventtypeid==28,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)] %>% unique()

toJSON(event_samples_earnings_announce, pretty=T) %>% write('data/CAR/event_samples_earnings_announce.json')

### ==>

> The ret/car are also in *decimals*

Output these variabls:
- `CAR[-1,-1]`
- `CAR[-2,-2]`
- `CAR[-30,-3]`
- `CAR[0,1]`
- `CAR[0,30]`
- `alpha[-125,-31]`
- `volatility[-125, -31]`

In [7]:
ld(cars_30d_call, ldname='car', path='data/CAR', force=T)

car = car[, ':='(permno=as.character(permno))
    ][order(permno, edate, rdate)
    ][, .(car_m1_m1=abret[isevt==1 & evttime==-1],
          car_m2_m2=abret[isevt==1 & evttime==-2],
          car_m30_m3=sum(abret[isevt==1 & evttime %between% c(-30,-3)], na.rm=T),
          
          car_0_1=sum(abret[isevt==1 & evttime %between% c(0,1)], na.rm=T),
          car_0_10=sum(abret[isevt==1 & evttime %between% c(0,10)], na.rm=T),
          car_0_20=sum(abret[isevt==1 & evttime %between% c(0,20)], na.rm=T),
          car_0_30=sum(abret[isevt==1 & evttime %between% c(0,30)], na.rm=T),
          
          ret_m30_m3=prod(1+ret[isevt==1 & evttime %between% c(-30, -3)], na.rm=T)-1,
          ret_m2_m2=prod(1+ret[isevt==1 & evttime==-2], na.rm=T)-1,
          ret_m1_m1=prod(1+ret[isevt==1 & evttime==-1], na.rm=T)-1,
          
          ret_0_1=ret[isevt==1 & evttime %between% c(0,1)],
          ret_0_10=prod(1+ret[isevt==1 & evttime %between% c(0,10)], na.rm=T)-1,
          ret_0_20=prod(1+ret[isevt==1 & evttime %between% c(0,20)], na.rm=T)-1,
          ret_0_30=prod(1+ret[isevt==1 & evttime %between% c(0,30)], na.rm=T)-1,
          
          alpha=alpha[isevt==1 & evttime==0],
          beta_mktrf=beta_mktrf[isevt==1 & evttime==0],
          beta_smb=beta_smb[isevt==1 & evttime==0],
          beta_hml=beta_hml[isevt==1 & evttime==0],
          volatility=sd(ret[isevt==0], na.rm=T), # sd(ret[-125,-1])
          volatility2=sd(ret[isevt==0 & evttime %between% c(95, 125)], na.rm=T)), # sd(ret[-30,-1])
      keyby=.(permno, edate)]

car[1]
sv(car)

"cars_30d_call.feather" (76.3 MB) loaded as "car" (0.25 secs)
"car" saved as "car.feather" (8.8 MB) (0.04 secs)


permno,edate,car_m1_m1,car_m2_m2,car_m30_m3,car_0_1,car_0_10,car_0_20,car_0_30,ret_m30_m3,ret_m2_m2,ret_m1_m1,ret_0_1,ret_0_10,ret_0_20,ret_0_30,alpha,beta_mktrf,beta_smb,beta_hml,volatility,volatility2
<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
10078,2004-10-15,-0.002767933,0.005908036,-0.005490097,-0.009563919,0.08891284,0.07693825,0.1722624,0.05167966,-0.01228506,-0.0124378,0,0.1335013,0.2241814,0.3879094,-0.0005621021,1.351072,2.327937,-0.3718033,0.0332523,0.03211976


# X (transcripts)

## Clean ciq_transcript_detail

Task:
- Check if there are unknown `transcriptcollectiontypeid`

In [4]:
ld(sp500_cst)
ld(call_release_link)
ld(gvkey_permno_link)
ld(ciq_transcript_detail, path=WRDS_DOWNLOAD_DIR)
ld(ciq_transcript_component, path=WRDS_DOWNLOAD_DIR)
ld(ciq_wrds_gvkey, path=WRDS_DOWNLOAD_DIR)

"sp500_cst" (24.8 KB) already in .GlobalEnv, will NOT load again! (0 secs)
"call_release_link" (1.1 MB) already in .GlobalEnv, will NOT load again! (0 secs)
"gvkey_permno_link" (312.9 KB) already in .GlobalEnv, will NOT load again! (0 secs)
"ciq_transcript_detail.feather" (54.6 MB) loaded (1.87 secs)
"ciq_transcript_component.feather" (25 GB) loaded (4.52 mins)
"ciq_wrds_gvkey.feather" (3.7 MB) loaded (0.12 secs)


In [5]:
version_in_data = ciq_transcript_detail[, unique(transcriptcollectiontypeid)]
if (sum(!(version_in_data %in% c(8, 1, 2, 7, 11, 10, 6, 9, 13))) > 0) {
    cat('Unknown `transcriptcollectiontypeid` in data!!!')
} else {cat('All `transcriptcollectiontypeid` are known')}

All `transcriptcollectiontypeid` are known

Task 1: For every `keydevid`, select one `transcriptid` (which believed to be most precise)
- Check: priority order of "audited > proofed > edited > spellchecked"
- We only analyze SP500


Task 2: 
- Add `ciq_wrds_gvkey:gvkey` to `ciq_transcript_detail`
  - merge `ciq_wrds_gvkey` and `ciq_transcript_detail_sp500` by `companyid`
  - one `companyid` may corresp to multiple `gvkey`, we kee ALL the matches

Task 3: 
- Add `size`, `bw_adj`, and `mom` from `dgtw` to `ciq_transcript_detail_sp500`

Warnings:
> In the final `f_ciq_transcript_detail_sp500`:
>   - `transcriptid` and `keydevid` are one-to-one mapped
>   - unique keys: `['transcriptid', 'gvkey']` or `['keydevid', 'gvkey']`
>   - n_row: 35593
>   - unique_n_row: 35077

In [6]:
latest_transcriptcollectiontypeid = ciq_transcript_detail[
     (keydeveventtypeid==48) & 
     (companyid %in% ciq_wrds_gvkey[gvkey %in% sp500_cst$gvkey, companyid])
    ][, { 
      latest_transcript_version = NA_real_
      for (tid in c(8, 1, 2, 7)) {
         if (tid %in% transcriptcollectiontypeid) {
             latest_transcript_version = tid
             break
         } else { 
         for (tid in c(11, 10, 6, 9, 13)) {
             if (tid %in% transcriptcollectiontypeid) {
                 latest_transcript_version = tid 
                 break
             }
         }}
      } 
      list(latest_transcript_version=latest_transcript_version)  
      }, 
      keyby=.(keydevid)]

f_ciq_transcript_detail_sp500 = ciq_transcript_detail[     
     (keydeveventtypeid==48) & 
     (companyid %in% ciq_wrds_gvkey[gvkey %in% sp500_cst$gvkey, companyid])
    ][latest_transcriptcollectiontypeid, on=.(keydevid), nomatch=0
    ][transcriptcollectiontypeid==latest_transcript_version
    ][order(keydevid, -transcriptcreationdate_utc, -transcriptcreationtime_utc)
    ][, .SD[1],
      keyby=.(keydevid)
    ][unique(ciq_wrds_gvkey[, .(companyid, gvkey)], by=c('companyid', 'gvkey')),
      on=.(companyid),
      nomatch=0
    ] %>% unique()

f_ciq_transcript_detail_sp500[, .N]
f_ciq_transcript_detail_sp500[1]
sv(f_ciq_transcript_detail_sp500)

keydevid,companyid,transcriptid,headline,mostimportantdateutc,keydeveventtypeid,keydeveventtypename,companyname,transcriptcollectiontypeid,transcriptcollectiontypename,transcriptpresentationtypeid,transcriptpresentationtypename,transcriptcreationdate_utc,transcriptcreationtime_utc,audiolengthsec,isdelayed_flag,delayreasontypeid,delayreasontypename,latest_transcript_version,gvkey
<dbl>,<dbl>,<dbl>,<chr>,<date>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>
4855823,18671,504,"Albemarle Corp., Q3 2007 Earnings Call, Oct-23-2007",2007-10-23,48,Earnings Calls,Albemarle Corporation,6,SA Edited Copy,5,Final,2008-06-20,1898,,0,,,6,29751


"f_ciq_transcript_detail_sp500" saved as "f_ciq_transcript_detail_sp500.feather" (2.4 MB) (0.05 secs)


Task:
- check All `f_ciq_transcript_detail_sp500:transcriptid` are identified in `ciq_transcript_component:transcriptid`

In [7]:
if (sum(f_ciq_transcript_detail_sp500[, transcriptid] %in% ciq_transcript_component[, unique(transcriptid)]) == nrow(f_ciq_transcript_detail_sp500)) {
    cat('All `f_ciq_transcript_detail_sp500:transcriptid` are identified in `ciq_transcript_component:transcriptid`')
} else {cat('Unknown `transcriptid` found in f_ciq_transcript_detail_sp500')}

All `f_ciq_transcript_detail_sp500:transcriptid` are identified in `ciq_transcript_component:transcriptid`

## Clean ciq_transcript_component

Task:
> Select obs in `ciq_transcript_component` where `ciq_transcript_component:transcriptid` in `f_ciq_transcript_detail_sp500:transcriptid`
>
> Save results to `f_ciq_transcript_component_sp500`
>
> The output will be used in `C-preEncode.ipynb`: Section 3 (Sentencized)

In [9]:
ld(f_ciq_transcript_detail_sp500)

"f_ciq_transcript_detail_sp500" (2.4 MB) already in .GlobalEnv, will NOT load again! (0 secs)


In [10]:
ld(ciq_transcript_component, path=WRDS_DOWNLOAD_DIR)
ld(f_ciq_transcript_detail_sp500)

"ciq_transcript_component" (25 GB) already in .GlobalEnv, will NOT load again! (0 secs)
"f_ciq_transcript_detail_sp500" (2.4 MB) already in .GlobalEnv, will NOT load again! (0 secs)


In [11]:
f_ciq_transcript_component_sp500 = ciq_transcript_component[
     transcriptid %in% f_ciq_transcript_detail_sp500$transcriptid
    ][order(transcriptid, componentorder)]

In [12]:
sv(f_ciq_transcript_component_sp500)

"f_ciq_transcript_component_sp500" saved as "f_ciq_transcript_component_sp500.feather" (975 MB) (6.68 secs)


In [13]:
f_ciq_transcript_component_sp500[, .N]
f_ciq_transcript_component_sp500[, uniqueN(transcriptcomponentid)]

In [14]:
# write component to disk for Python to sentencize
sv(f_ciq_transcript_component_sp500)

"f_ciq_transcript_component_sp500" saved as "f_ciq_transcript_component_sp500.feather" (975 MB) (6.7 secs)


In [15]:
f_ciq_transcript_component_sp500[, uniqueN(transcriptid)]

## Prepare input for spaCy

> Warnings
>
> `transcriptcomponentid` doesn't necessarily follow the correct order of speech. You must use `componentorder` to have the correct order 

In [2]:
ld(f_ciq_transcript_component_sp500)
ld(ciq_transcript_speaker, path=WRDS_DOWNLOAD_DIR)
ld(ciq_transcript_detail, path=WRDS_DOWNLOAD_DIR)

"f_ciq_transcript_component_sp500.feather" (975 MB) loaded (14.18 secs)
"ciq_transcript_speaker.feather" (7.8 GB) loaded (2.48 mins)
"ciq_transcript_detail.feather" (54.6 MB) loaded (0.95 secs)


In [3]:
# Combine "transcript_speaker" and "transcript_detail" 
text_component_sp500 = ciq_transcript_speaker[, .(transcriptcomponentid, speakertypeid)
    ][f_ciq_transcript_component_sp500, on=.(transcriptcomponentid)]

sv(text_component_sp500)

"text_component_sp500" saved as "text_component_sp500.feather" (978 MB) (6.34 secs)


In [None]:
# Warnings:
#  This methods is deprecated. Concatination no longer needed 

# Concat all components that share the same transcriptid,
# split by ' [EOC] ' (End Of Component)
# [EOC] is safe because I did a full text check in f_ciq_transcript_component_sp500 and 
# no occurance found.
text_component_sp500 = text_component_sp500[order(transcriptid, componentorder)
    ][, .(componentorder=list(componentorder),
          componentid=list(transcriptcomponentid),
          componenttypeid=list(transcriptcomponenttypeid),
          speakerid=list(transcriptpersonid),
          speakertypeid=list(speakertypeid),
          text=str_c(componenttext, collapse=' [EOC] ')),
      keyby=.(transcriptid)]

In [26]:
sv(text_component_sp500)

"text_component_sp500" saved as "text_component_sp500.feather" (971.8 MB) (5.18 secs)


# (X,Y)

## filter `sue`

Task:
- remove obs from `sue` where `act` is NA

Filter:
- `sue[!is.na(act)]`   44690->42307

Notes:
- `sue` is computed for the SP500 universe
- ~500 obs in `f_sue` has NA `sue3` because`numest=0`
- some `lagrdq1` in `f_sue` are NA
- I divide `mcap` by 1000
    
Warnings
- Because we need *lead-one earnings*, the last obs of each `gvkey` is removed. 
- Therefore `f_sue` will have *LESS* obs than `sue`
- the latest `rdq` in `f_sue` is *2019-10-13* (updated: 2020-2-5)
- When computing `se_lead1`, you MUST use price at `t`, not simply shifting future `se` backward:`se_lead1=se_(t+1)/price_close_(t)`

In [11]:
ld(sue, force=T)
ld(sp500_cst, force=T)

"sue.feather" (4.4 MB) loaded (0.02 secs, 2021-02-26 17:27:21)
"sp500_cst.feather" (24.8 KB) loaded (0 secs, 2021-02-26 17:27:21)


In [12]:
sprintf('N sue: %s\n', nrow(sue)) %>% cat()

f_sue = sue[!is.na(act)] # 42440, ['gvkey', 'rdq'] as unique key

rdq_lag_lead_pair = f_sue[, .(gvkey, lagrdq1=rdq1, rdq1=leadrdq1)]

f_sue = rdq_lag_lead_pair[f_sue, on=.(gvkey, rdq1)
    ][gvkey %in% sp500_cst[, unique(gvkey)],
      .(gvkey, permno, datadate, fyearq, fqtr, fyr, rdq, rdq1, repdats, lagrdq1, leadrdq1,
        mcap=log(mcap), act, smedest, numest, sstdest, sue=sue3, se, sest, price_close)
    ][order(gvkey, rdq)
    ] %>% unique()

sprintf('N f_sue: %s\n', nrow(f_sue)) %>% cat()
sv(f_sue)
f_sue[1]

N sue: 45849
N f_sue: 43579
"f_sue" saved as "f_sue.feather" (3.2 MB) (0.03 secs, 2021-02-26 17:27:22)


gvkey,permno,datadate,fyearq,fqtr,fyr,rdq,rdq1,repdats,lagrdq1,leadrdq1,mcap,act,smedest,numest,sstdest,sue,se,sest,price_close
<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,50906,2005-01-31,2005,1,10,2005-02-28,2005-02-28,2005-02-28,,2005-06-01,7.64057,0.01,0.003891051,8,0,0,0.003891051,0.003891051,2.57


In [None]:
f_sue[, .(year=as.factor(year(rdq1)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## sue <-> call_release_link

Task: merge with `call_release_link`
- select row with minimal `abs(rdq-call_date)`
- Output: `f_sue_keydevid`

Filter:
- release event in `sue` has no match in `call_release_link` within [-1d,1d]. 42690 -> 29065

Warnings:
- n_obs of `f_sue_keydevid` is larger than that of `f_sue` because in `call_release_link` there exist multiple `call_keydevid` for the SAME `release_keydevid`
    - e.g. `ciq_wrds_keydev[keydevid %in% c(404527363, 404314219, 404314224)]`

In [13]:
ld(call_release_link, force=T)
ld(f_sue, force=T)

"call_release_link.feather" (1.1 MB) loaded (0.02 secs, 2021-02-26 17:27:54)
"f_sue.feather" (3.2 MB) loaded (0.01 secs, 2021-02-26 17:27:54)


In [14]:
sprintf('N f_sue: %s\n', nrow(f_sue)) %>% cat()

f_sue_keydevid = call_release_link[, ':='(join_date=release_date)
    ][copy(f_sue)[, ':='(join_date=rdq)], on=.(gvkey, join_date), roll='nearest', nomatch=NULL
    ][, ':='(ciq_release_date=rdq)
    ][, ':='(calldate_rdq_gap=call_date-rdq)
    ][calldate_rdq_gap %between% c(-1, 1)
    ][order(gvkey, rdq)
    ][, ':='(join_date=NULL)
    ][order(gvkey, rdq)
    ][, ':='(docid=str_c(gvkey, '-', ciq_release_date))
    ] %>% unique()

sprintf('N f_sue_keydevid: %s\n', nrow(f_sue_keydevid)) %>% cat()

f_sue_keydevid[1]
sv(f_sue_keydevid) 

N f_sue: 43579
N f_sue_keydevid: 33642


call_keydevid,gvkey,call_date,release_date,release_keydevid,companyname,nday_release_lead,permno,datadate,fyearq,⋯,smedest,numest,sstdest,sue,se,sest,price_close,ciq_release_date,calldate_rdq_gap,docid
<dbl>,<chr>,<date>,<date>,<dbl>,<chr>,<drtn>,<chr>,<date>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<date>,<drtn>,<chr>
5152678,1013,2008-03-05,2008-03-05,5241424,CommScope Connectivity LLC,0 secs,50906,2008-01-31,2008,⋯,0.01217039,17,0.001547043,0.006761325,0.01893171,0.01217039,14.79,2008-03-05,0 days,001013-2008-03-05


"f_sue_keydevid" saved as "f_sue_keydevid.feather" (3.5 MB) (0.03 secs, 2021-02-26 17:28:09)


In [None]:
f_sue_keydevid[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## <-> car

Task: merge with `CAR`
- `car` also includes `alpha`, `beta` and `volatility`
- I multiplied `car` by 100!!!

Filter:
- events in `f_sue_keydevid` are not in `car`. 29065 -> 26967

In [16]:
ld(f_sue_keydevid, force=T)
ld(car, force=T)

"f_sue_keydevid.feather" (3.5 MB) loaded (0.01 secs, 2021-02-26 17:28:21)
"car.feather" (8.8 MB) loaded (0.03 secs, 2021-02-26 17:28:21)


In [17]:
sprintf('N f_sue_keydevid: %s\n', nrow(f_sue_keydevid)) %>% cat()

car_cols = c('car_m1_m1', 'car_m2_m2', 'car_m30_m3', 'car_0_1', 'car_0_10', 'car_0_20', 'car_0_30',
             'ret_m1_m1', 'ret_m2_m2', 'ret_m30_m3', 'ret_0_1', 'ret_0_10', 'ret_0_20', 'ret_0_30',
             'alpha', 'volatility')

f_sue_keydevid_car = car[, ':='(join_date=edate)
    ][f_sue_keydevid[, ':='(join_date=call_date)], on=.(permno, join_date), nomatch=NULL
    ][, ':='(join_date=NULL)
    ][, (car_cols) := lapply(.SD, function(x) 100*x), .SDcols=car_cols
    ][, {
      res = list()
      for (i in 1:.N) {
          leadrdq1_tmp = leadrdq1[i]
          lagrdq1_tmp = lagrdq1[i]
          lead_selector=fillna(rdq1==leadrdq1_tmp, na.value=F)
          lag_selector=fillna(rdq1==lagrdq1_tmp, na.value=F)
          
          se_lead1 = se[lead_selector][1]
          se_lag1 = se[lag_selector][1]
          sue_lag1 = sue[lag_selector][1]
          sue_lead1 = sue[lead_selector][1]
          sest_lead1 = sest[lead_selector][1]
          sest_lag1 = sest[lag_selector][1]
          
          car_0_10_lead1 = car_0_10[lead_selector][1]
          car_0_10_lag1 = car_0_10[lag_selector][1]
          car_0_20_lead1 = car_0_20[lead_selector][1]
          car_0_20_lag1 = car_0_20[lag_selector][1]
          car_0_30_lead1 = car_0_30[lead_selector][1]
          car_0_30_lag1 = car_0_30[lag_selector][1]
          

          res[[i]] = list(docid=docid[i], permno=permno[i], datadate=datadate[i], fyearq=fyearq[i], fqtr=fqtr[i],
                          fyr=fyr[i], rdq=rdq[i], rdq1=rdq1[i], repdats=repdats[i], ciq_release_date=ciq_release_date[i],
                          call_date=call_date[i],
                          leadrdq1=leadrdq1[i], release_keydevid=release_keydevid[i],
                          call_keydevid=call_keydevid[i], companyname=companyname[i],
                          mcap=mcap[i], sue=sue[i], sue_lag1=sue_lag1, sue_lead1=sue_lead1,
                          se=se[i], se_lag1=se_lag1, se_lead1=se_lead1, 
                          sest=sest[i], sest_lag1=sest_lag1, sest_lead1=sest_lead1,
                          smedest=smedest[i], numest=numest[i], sstdest=sstdest[i],
                          ret_m1_m1=ret_m1_m1[i], ret_m2_m2=ret_m2_m2[i], ret_m30_m3=ret_m30_m3[i],
                          ret_0_10=ret_0_10[i], ret_0_20=ret_0_20[i], ret_0_30=ret_0_30[i],
                          car_m1_m1=car_m1_m1[i], car_m2_m2=car_m2_m2[i], car_m30_m3=car_m30_m3[i], 
                          car_0_10=car_0_10[i], car_0_10_lead1=car_0_10_lead1, car_0_10_lag1=car_0_10_lag1,
                          car_0_20=car_0_20[i], car_0_20_lead1=car_0_20_lead1, car_0_20_lag1=car_0_20_lag1,
                          car_0_30=car_0_30[i], car_0_30_lead1=car_0_30_lead1, car_0_30_lag1=car_0_30_lag1,
                          alpha=alpha[i], beta_mktrf=beta_mktrf[i], beta_smb=beta_smb[i],
                          beta_hml=beta_hml[i], volatility=volatility[i])}
      res = rbindlist(res)
      }, 
      keyby=.(gvkey)
    ][order(gvkey, ciq_release_date)
    ] %>% unique()

sprintf('N f_sue_keydevid_car: %s\n', nrow(f_sue_keydevid_car)) %>% cat()

f_sue_keydevid_car[1]
sv(f_sue_keydevid_car)

N f_sue_keydevid: 33642
N f_sue_keydevid_car: 31147


gvkey,docid,permno,datadate,fyearq,fqtr,fyr,rdq,rdq1,repdats,⋯,car_0_20_lead1,car_0_20_lag1,car_0_30,car_0_30_lead1,car_0_30_lag1,alpha,beta_mktrf,beta_smb,beta_hml,volatility
<chr>,<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<date>,<date>,<date>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,001013-2008-03-05,50906,2008-01-31,2008,1,10,2008-03-05,2008-03-05,2008-03-05,⋯,4.561394,,-6.801334,0.1948368,,-0.08446115,0.9738662,-0.2997048,1.193008,3.171002


"f_sue_keydevid_car" saved as "f_sue_keydevid_car.feather" (9.4 MB) (0.03 secs, 2021-02-26 17:28:41)


In [None]:
# Plot: number of calls by year
f_sue_keydevid_car[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## <-> finratio

Filter:
- `gvkey` in `f_sue_keydevid_car` are not in `wrds_finratio`. 
- `abs(pubdate_calldate_gap)<=60`. 
- 26967 -> 24470

In [19]:
ld(wrds_finratio, path=WRDS_DOWNLOAD_DIR, force=T)
ld(f_sue_keydevid_car, force=T)

"wrds_finratio.feather" (11.6 MB) loaded (0.06 secs, 2021-02-26 17:28:51)
"f_sue_keydevid_car.feather" (9.4 MB) loaded (0.01 secs, 2021-02-26 17:28:51)


In [20]:
sprintf('N f_sue_keydevid_car: %s\n', nrow(f_sue_keydevid_car)) %>% cat()

f_sue_keydevid_car_finratio = wrds_finratio[, ':='(join_date=public_date)
    ][f_sue_keydevid_car[, ':='(join_date=call_date)], on=.(gvkey, join_date), roll='nearest'
    ][, ':='(pubdate_calldate_gap=public_date-call_date)
    ][!is.na(pubdate_calldate_gap)
    ][abs(pubdate_calldate_gap)<=60
    ][order(gvkey, ciq_release_date)
    ][, ':='(bm=nafill(bm, 'locf'), roa=nafill(roa, 'locf'), debt_assets=nafill(debt_assets, 'locf'))
    ][, ':='(pubdate_calldate_gap=NULL, adate=NULL, qdate=NULL, public_date=NULL, join_date=NULL)
    ] %>% unique()

sprintf('N f_sue_keydevid_car_finratio: %s\n', nrow(f_sue_keydevid_car_finratio)) %>% cat()

f_sue_keydevid_car_finratio[1]
sv(f_sue_keydevid_car_finratio)

N f_sue_keydevid_car: 31147
N f_sue_keydevid_car_finratio: 27253


gvkey,bm,roa,debt_assets,docid,permno,datadate,fyearq,fqtr,fyr,⋯,car_0_20_lead1,car_0_20_lag1,car_0_30,car_0_30_lead1,car_0_30_lag1,alpha,beta_mktrf,beta_smb,beta_hml,volatility
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,0.477,0.099,0.429,001013-2008-03-05,50906,2008-01-31,2008,1,10,⋯,4.561394,,-6.801334,0.1948368,,-0.08446115,0.9738662,-0.2997048,1.193008,3.171002


"f_sue_keydevid_car_finratio" saved as "f_sue_keydevid_car_finratio.feather" (8.6 MB) (0.02 secs, 2021-02-26 17:29:07)


In [None]:
# Plot: number of calls by year
f_sue_keydevid_car_finratio[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## <-> volume

Filter:
- Some gvkey has no `iid=01` in `comp_secd`
- 24470 -> 24451

Notes:
- For any `gvkey`, we only use `iid=01`
- In `comp_secd` some records have `volume==0`, we just keep it.

In [22]:
ld(comp_secd, path=WRDS_DOWNLOAD_DIR)
ld(f_sue_keydevid_car_finratio, force=T)

"comp_secd.feather" (6.5 GB) loaded (25.41 secs, 2021-02-26 17:29:47)
"f_sue_keydevid_car_finratio.feather" (8.6 MB) loaded (0.02 secs, 2021-02-26 17:29:48)


In [23]:
sprintf('N f_sue_keydevid_car_finratio: %s\n', nrow(f_sue_keydevid_car_finratio)) %>% cat()

f_sue_keydevid_car_finratio_vol = comp_secd[datadate>ymd('2005-01-01'), .(gvkey, iid, volume=cshtrd/1e6, price_date=datadate)
    ][order(gvkey, price_date, iid)
    ][, head(.SD,1), keyby=.(gvkey, price_date)
    ][, ':='(join_date=price_date)
    ][, .(gvkey, join_date, volume, price_date, iid)
    ][f_sue_keydevid_car_finratio[, ':='(join_date=ciq_release_date)], on=.(gvkey, join_date), nomatch=NULL
    ][, ':='(releasedate_datadate_gap=ciq_release_date-price_date)
    ][abs(releasedate_datadate_gap)<=0
    ][order(-abs(releasedate_datadate_gap))
    ][, ':='(join_date=NULL, releasedate_datadate_gap=NULL, price_date=NULL, iid=NULL)
    ] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol: %s\n', nrow(f_sue_keydevid_car_finratio_vol)) %>% cat()
f_sue_keydevid_car_finratio_vol[1]
sv(f_sue_keydevid_car_finratio_vol)

N f_sue_keydevid_car_finratio: 27253
N f_sue_keydevid_car_finratio_vol: 27231


gvkey,volume,bm,roa,debt_assets,docid,permno,datadate,fyearq,fqtr,⋯,car_0_20_lead1,car_0_20_lag1,car_0_30,car_0_30_lead1,car_0_30_lag1,alpha,beta_mktrf,beta_smb,beta_hml,volatility
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<date>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,3.996961,0.477,0.099,0.429,001013-2008-03-05,50906,2008-01-31,2008,1,⋯,4.561394,,-6.801334,0.1948368,,-0.08446115,0.9738662,-0.2997048,1.193008,3.171002


"f_sue_keydevid_car_finratio_vol" saved as "f_sue_keydevid_car_finratio_vol.feather" (8.8 MB) (0.03 secs, 2021-02-26 17:30:01)


In [None]:
# Plot: number of calls by year
f_sue_keydevid_car_finratio_vol[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## <-> transcriptid

Filter:
- `call_keydevid` must also exists in `f_ciq_transcript_detail_sp500`
- 24451 -> 24105

In [25]:
ld(f_ciq_transcript_detail_sp500, force=T)
ld(f_sue_keydevid_car_finratio_vol, force=T)

"f_ciq_transcript_detail_sp500.feather" (2.4 MB) loaded (0.09 secs, 2021-02-26 17:30:12)
"f_sue_keydevid_car_finratio_vol.feather" (8.8 MB) loaded (0.01 secs, 2021-02-26 17:30:12)


In [27]:
sprintf('N f_sue_keydevid_car_finratio_vol: %s\n', nrow(f_sue_keydevid_car_finratio_vol)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid = unique(f_ciq_transcript_detail_sp500[, .(join_keydevid=keydevid, transcriptid)])[f_sue_keydevid_car_finratio_vol[, ':='(join_keydevid=call_keydevid)], 
      on=.(join_keydevid),
      nomatch=NULL
    ][, .(gvkey, permno, datadate, fyear=fyearq, fqtr, ciq_release_date, ciq_call_date=call_date, rdq, rdq1, repdats,
          leadrdq1, release_keydevid, call_keydevid, transcriptid,
          mcap, sue, sue_lag1, sue_lead1, se, se_lag1, se_lead1,
          sest, sest_lag1, sest_lead1, numest, smedest, sstdest, volume,
          ret_m1_m1, ret_m2_m2, ret_m30_m3,
          ret_0_10, ret_0_20, ret_0_30,
          car_m1_m1, car_m2_m2, car_m30_m3, 
          car_0_10, car_0_10_lag1, car_0_10_lead1, car_0_20, car_0_20_lag1, car_0_20_lead1,
          car_0_30, car_0_30_lag1, car_0_30_lead1,
          bm, roa, debt_asset=debt_assets, alpha, beta_mktrf, beta_smb, beta_hml, volatility, docid)
    ][order(gvkey, ciq_release_date)
    ] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid[1]
sv(f_sue_keydevid_car_finratio_vol_transcriptid)

N f_sue_keydevid_car_finratio_vol: 27231
N f_sue_keydevid_car_finratio_vol_transcriptid: 26878


gvkey,permno,datadate,fyear,fqtr,ciq_release_date,ciq_call_date,rdq,rdq1,repdats,⋯,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility,docid
<chr>,<chr>,<date>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1013,50906,2008-01-31,2008,1,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,⋯,0.1948368,0.477,0.099,0.429,-0.08446115,0.9738662,-0.2997048,1.193008,3.171002,001013-2008-03-05


"f_sue_keydevid_car_finratio_vol_transcriptid" saved as "f_sue_keydevid_car_finratio_vol_transcriptid.feather" (8.7 MB) (0.03 secs, 2021-02-26 17:30:27)


In [None]:
# Plot: number of calls per year
f_sue_keydevid_car_finratio_vol_transcriptid[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## <-> similarity

Filter:
- `docid` must also exists in `similarity`
- 24105 -> 21822

Warnings:
- Must do this step after joining `transcripts`.
- before joining `similarity`, please first create `similarity.feather` in Python!

In [29]:
ld(similarity, force=T)
ld(similarity_finbert, force=T)
ld(f_sue_keydevid_car_finratio_vol_transcriptid, force=T)

"similarity.feather" (1022.2 KB) loaded (0 secs, 2021-02-26 17:30:37)
"similarity_finbert.feather" (334.8 KB) loaded (0 secs, 2021-02-26 17:30:37)
"f_sue_keydevid_car_finratio_vol_transcriptid.feather" (8.7 MB) loaded (0.01 secs, 2021-02-26 17:30:37)


In [32]:
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid)) %>% cat()

# add similarity_ngram
f_sue_keydevid_car_finratio_vol_transcriptid_sim = similarity[f_sue_keydevid_car_finratio_vol_transcriptid, on=.(transcriptid), nomatch=NULL] %>% unique()

# add similarity_finbert
f_sue_keydevid_car_finratio_vol_transcriptid_sim = similarity_finbert[f_sue_keydevid_car_finratio_vol_transcriptid_sim, on=.(transcriptid), nomatch=NULL] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim)) %>% cat()
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim)
f_sue_keydevid_car_finratio_vol_transcriptid_sim[1]

N f_sue_keydevid_car_finratio_vol_transcriptid: 26878
N f_sue_keydevid_car_finratio_vol_transcriptid_sim: 26878
"f_sue_keydevid_car_finratio_vol_transcriptid_sim" saved as "f_sue_keydevid_car_finratio_vol_transcriptid_sim.feather" (9.4 MB) (0.03 secs, 2021-02-26 17:32:20)


transcriptid,similarity_finbert,similarity_unigram,similarity_bigram,similarity_allgram,gvkey,permno,datadate,fyear,fqtr,⋯,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility,docid
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<date>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
320,0.9328347,0.8019376,0.3446537,0.7719722,1013,50906,2008-01-31,2008,1,⋯,0.1948368,0.477,0.099,0.429,-0.08446115,0.9738662,-0.2997048,1.193008,3.171002,001013-2008-03-05


In [33]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim[sue>quantile(sue,0.9,na.rm=T), mean(car_0_30, na.rm=T)]
f_sue_keydevid_car_finratio_vol_transcriptid_sim[sue<quantile(sue,0.1,na.rm=T), mean(car_0_30, na.rm=T)]

In [None]:
# Plot: number of calls by year
f_sue_keydevid_car_finratio_vol_transcriptid_sim[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## <-> inflow

Task:
- Track the [-3, 35] and [-35, 3] money inflow before or after each earnings call
    - use `foverlaps(query, subject)`
    - `query=f_sue`
    - `subject=inflow`

In [35]:
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim, force=T)
ld(inflow, force=T)

"f_sue_keydevid_car_finratio_vol_transcriptid_sim.feather" (9.4 MB) loaded (0.01 secs, 2021-02-26 17:32:32)
"inflow.feather" (19.9 MB) loaded (0.06 secs, 2021-02-26 17:32:32)


In [36]:
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim)) %>% cat()

# subject: inflow
inflow[, ':='(start=report_dt-95, end=report_dt+95)]
setkey(inflow, permno, start, end)

# query: f_sue
f_sue_keydevid_car_finratio_vol_transcriptid_sim[, ':='(start=ciq_call_date, end=ciq_call_date)]
setkey(f_sue_keydevid_car_finratio_vol_transcriptid_sim, permno, start, end)

# foverlaps(query, subject)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow = foverlaps(
      f_sue_keydevid_car_finratio_vol_transcriptid_sim, inflow,
      nomatch=NA)[, ':='(mv=NULL, i.start=NULL, i.end=NULL)
    ][, ':='(inflow_0_90=sum(inflow[(report_dt-ciq_call_date) %between% c(-3, 85)], na.rm=T)), by=docid
    ][, ':='(inflow_0_60=sum(inflow[(report_dt-ciq_call_date) %between% c(-3, 60)], na.rm=T)), by=docid
    ][, ':='(inflow_0_30=sum(inflow[(report_dt-ciq_call_date) %between% c(-3, 30)], na.rm=T)), by=docid
    ][, ':='(inflow_m30_0=sum(inflow[(report_dt-ciq_call_date) %between% c(-30, 3)], na.rm=T)), by=docid
    ][is.na(inflow_0_90), ':='(inflow_0_90=0)
    ][is.na(inflow_0_60), ':='(inflow_0_60=0)
    ][is.na(inflow_0_30), ':='(inflow_0_30=0)
    ][is.na(inflow_m30_0), ':='(inflow_m30_0=0)
    ][, ':='(report_dt=NULL, inflow=NULL, start=NULL, end=NULL)
    ][order(docid)] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow)) %>% cat()

sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow[1]

N f_sue_keydevid_car_finratio_vol_transcriptid_sim: 26878
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: 26878
"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow" saved as "f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow.feather" (10.2 MB) (0.03 secs, 2021-02-26 17:32:39)


permno,transcriptid,similarity_finbert,similarity_unigram,similarity_bigram,similarity_allgram,gvkey,datadate,fyear,fqtr,⋯,alpha,beta_mktrf,beta_smb,beta_hml,volatility,docid,inflow_0_90,inflow_0_60,inflow_0_30,inflow_m30_0
<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<date>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
50906,320,0.9328347,0.8019376,0.3446537,0.7719722,1013,2008-01-31,2008,1,⋯,-0.08446115,0.9738662,-0.2997048,1.193008,3.171002,001013-2008-03-05,74.69675,74.69675,197.6949,21.2308


In [None]:
# Plot: number of calls by year
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow[, 
      .(year=as.factor(year(ciq_release_date)))
    ] %>% plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## <-> revision

In [38]:
ld(revision, force=T)
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow, force=T)

"revision.feather" (46.9 MB) loaded (0.15 secs, 2021-02-26 17:32:46)
"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow.feather" (10.2 MB) loaded (0.01 secs, 2021-02-26 17:32:46)


Task
- How soon do anlysts revise their year-end earnings forecats after earnings call?

In [7]:
how_soon_revise = revision[, .(gvkey, anndats, analys, revision, join_date=anndats)
    ][f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow[, .(gvkey, ciq_call_date, sue, join_date=ciq_call_date)],
      on=.(gvkey, join_date>=join_date), allow.cartesian=T
    ][order(gvkey, ciq_call_date, anndats), .(gvkey, ciq_call_date, anndats, analys, revision)
    ][anndats-ciq_call_date<=30, .(t=(anndats-ciq_call_date)[1], n=.N), keyby=.(gvkey, ciq_call_date, anndats)]

In [None]:
# plot: How soon do anlysts revise their year-end earnings forecats after earnings call?
# ==> Most in the first 2 days
how_soon_revise[, .(n=sum(n)), keyby=.(t)] %>%
    plot_ly(x=~t, y=~n, type='bar') %>%
    plotly::layout(autosize=F, xaxis=list(title='N days after earnings call'),
                   yaxis=list(title='N revisions'))

Task:
- <-> revision

In [39]:
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow)) %>% cat()

# subject: inflow
revision[, ':='(start=anndats-95, end=anndats+95)]
setkey(revision, gvkey, start, end)

# query: f_sue
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow[, ':='(start=ciq_call_date, end=ciq_call_date)]
setkey(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow, gvkey, start, end)

# foverlaps(query, subject)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision = foverlaps(
      f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow, revision,
      nomatch=NA)[, ':='(i.start=NULL, i.end=NULL)
    ][, ':='(revision_0_90=median(revision[(anndats-ciq_call_date) %between% c(-1, 85)], na.rm=T)), by=docid
    ][, ':='(revision_0_60=median(revision[(anndats-ciq_call_date) %between% c(-1, 60)], na.rm=T)), by=docid
    ][, ':='(revision_0_30=median(revision[(anndats-ciq_call_date) %between% c(-1, 30)], na.rm=T)), by=docid
    ][, ':='(revision_m30_0=median(revision[(anndats-ciq_call_date) %between% c(-30, 1)], na.rm=T)), by=docid
    ][is.na(revision_0_90), ':='(revision_0_90=0)
    ][is.na(revision_0_60), ':='(revision_0_60=0)
    ][is.na(revision_0_30), ':='(revision_0_30=0)
    ][is.na(revision_m30_0), ':='(revision_m30_0=0)
    ][, ':='(anndats=NULL, analys=NULL, ticker=NULL, fpedats=NULL, revdats=NULL, value=NULL, revision=NULL, pdf=NULL, start=NULL, end=NULL)
    ][order(docid)] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision)) %>% cat()

sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision[1]

N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: 26878
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision: 26878
"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision" saved as "f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision.feather" (11 MB) (0.03 secs, 2021-02-26 17:33:15)


gvkey,permno,transcriptid,similarity_finbert,similarity_unigram,similarity_bigram,similarity_allgram,datadate,fyear,fqtr,⋯,volatility,docid,inflow_0_90,inflow_0_60,inflow_0_30,inflow_m30_0,revision_0_90,revision_0_60,revision_0_30,revision_m30_0
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,⋯,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,50906,320,0.9328347,0.8019376,0.3446537,0.7719722,2008-01-31,2008,1,⋯,3.171002,001013-2008-03-05,74.69675,74.69675,197.6949,21.2308,0.2643754,0.2291198,0.2962963,0.4074074


In [None]:
# Plot: number of calls by year
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## <-> retail

In [7]:
retail[1]

gvkey,date,total_vol,retail_buy_vol,retail_sell_vol
<chr>,<date>,<dbl>,<dbl>,<dbl>
1004,2007-01-03,288900,1500,11100


In [10]:
# -------------------------------
# compute the output: retail
# -------------------------------
ld(retail)
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision)

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision)) %>% cat()

# create start/end/key for foverlaps
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision[, ':='(start=ciq_call_date, end=ciq_call_date)]
setkey(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision, gvkey, start, end)

# create start/end/key for foverlaps
retail[, ':='(start=date-95, end=date+95)]
setkey(retail, gvkey, start, end)

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail = foverlaps(
      f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision, retail,
      nomatch=NA)[, ':='(i.start=NULL, i.end=NULL)
    ][, ':='(retail_tot_m30_0=sum(total_vol[(date-ciq_call_date) %between% c(-30, 0)], na.rm=T)), by=docid
    ][, ':='(retail_tot_m10_0=sum(total_vol[(date-ciq_call_date) %between% c(-10, 0)], na.rm=T)), by=docid
    ][, ':='(retail_tot_m3_0=sum(total_vol[(date-ciq_call_date) %between% c(-3, 0)], na.rm=T)), by=docid
    ][, ':='(retail_tot_0_3=sum(total_vol[(date-ciq_call_date) %between% c(0, 3)], na.rm=T)), by=docid
    ][, ':='(retail_tot_0_10=sum(total_vol[(date-ciq_call_date) %between% c(0, 10)], na.rm=T)), by=docid
    ][, ':='(retail_tot_0_30=sum(total_vol[(date-ciq_call_date) %between% c(0, 30)], na.rm=T)), by=docid
    ][, ':='(retail_buy_m30_0=sum(retail_buy_vol[(date-ciq_call_date) %between% c(-30, 0)], na.rm=T)), by=docid
    ][, ':='(retail_buy_m10_0=sum(retail_buy_vol[(date-ciq_call_date) %between% c(-10, 0)], na.rm=T)), by=docid
    ][, ':='(retail_buy_m3_0=sum(retail_buy_vol[(date-ciq_call_date) %between% c(-3, 0)], na.rm=T)), by=docid
    ][, ':='(retail_buy_0_3=sum(retail_buy_vol[(date-ciq_call_date) %between% c(0, 3)], na.rm=T)), by=docid
    ][, ':='(retail_buy_0_10=sum(retail_buy_vol[(date-ciq_call_date) %between% c(0, 10)], na.rm=T)), by=docid
    ][, ':='(retail_buy_0_30=sum(retail_buy_vol[(date-ciq_call_date) %between% c(0, 30)], na.rm=T)), by=docid    
    ][, ':='(retail_sell_m30_0=sum(retail_sell_vol[(date-ciq_call_date) %between% c(-30, 0)], na.rm=T)), by=docid
    ][, ':='(retail_sell_m10_0=sum(retail_sell_vol[(date-ciq_call_date) %between% c(-10, 0)], na.rm=T)), by=docid
    ][, ':='(retail_sell_m3_0=sum(retail_sell_vol[(date-ciq_call_date) %between% c(-3, 0)], na.rm=T)), by=docid
    ][, ':='(retail_sell_0_3=sum(retail_sell_vol[(date-ciq_call_date) %between% c(0, 3)], na.rm=T)), by=docid
    ][, ':='(retail_sell_0_10=sum(retail_sell_vol[(date-ciq_call_date) %between% c(0, 10)], na.rm=T)), by=docid
    ][, ':='(retail_sell_0_30=sum(retail_sell_vol[(date-ciq_call_date) %between% c(0, 30)], na.rm=T)), by=docid   
    ][, ':='(retail_net_m30_0=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(-30, 0)], na.rm=T)), by=docid
    ][, ':='(retail_net_m10_0=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(-10, 0)], na.rm=T)), by=docid
    ][, ':='(retail_net_m3_0=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(-3, 0)], na.rm=T)), by=docid
    ][, ':='(retail_net_0_3=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(0, 3)], na.rm=T)), by=docid
    ][, ':='(retail_net_0_10=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(0, 10)], na.rm=T)), by=docid
    ][, ':='(retail_net_0_30=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(0, 30)], na.rm=T)), by=docid
    ][, ':='(total_vol=NULL, retail_buy_vol=NULL, retail_sell_vol=NULL, start=NULL, end=NULL, date=NULL)
    ][order(docid)] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail[1]
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail)

retail (173.3 MB) already loaded, will NOT load again! (0 secs) (2021-03-01 2:05 PM)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision (11 MB) already loaded, will NOT load again! (0 secs) (2021-03-01 2:05 PM)
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision: 26878
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail: 26878


gvkey,permno,transcriptid,similarity_finbert,similarity_unigram,similarity_bigram,similarity_allgram,datadate,fyear,fqtr,⋯,retail_sell_m3_0,retail_sell_0_3,retail_sell_0_10,retail_sell_0_30,retail_net_m30_0,retail_net_m10_0,retail_net_m3_0,retail_net_0_3,retail_net_0_10,retail_net_0_30
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,50906,320,0.9328347,0.8019376,0.3446537,0.7719722,2008-01-31,2008,1,⋯,152820,244485,502171,899459,-135822,-81094,-83834,105341,172821,289406


"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail" saved as "f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail.feather" (14 MB) (0.03 secs, 2021-03-01 14:06:26)


In [12]:
# ----------------------------------------
# PLOT: retail trades change around the call day
# ----------------------------------------

dt = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail[((date-ciq_call_date) %between% c(-30, 30)), 
      .(retail_tot=sum(total_vol, na.rm=T), 
        retail_buy=sum(retail_buy_vol, na.rm=T), 
        retail_sell=sum(retail_sell_vol, na.rm=T)), 
      keyby=.(date-ciq_call_date)
    ][, c('retail_tot', 'retail_buy', 'retail_sell') := lapply(.SD, cumsum), .SDcols=is.numeric][]

dt %>%
    plot_ly(x=~date, y=~(retail_buy-retail_sell), type='scatter', mode='lines') %>%
#     add_trace(y=~retail_buy) %>%
#     add_trace(y=~(retail_buy-retail_sell)) %>%
    plotly::layout(autosize=F)

ERROR: Error: can only subtract from "Date" objects


## <-> sentiment

In [17]:
ld(sentiment)
ld(dt_sents_sp500)
ld(ciq_transcript_speaker, path=WRDS_DOWNLOAD_DIR)

sentiment (458.2 MB) already loaded, will NOT load again! (0 secs) (2021-03-01 2:11 PM)
dt_sents_sp500 (1 GB) already loaded, will NOT load again! (0 secs) (2021-03-01 2:11 PM)
"ciq_transcript_speaker.feather" (7.8 GB) loaded (1.42 mins) (2021-03-01 2:12 PM)


`speakertype`
- 1: Operator
- 2: Exeutives
- 3: Analyst
- 4: Shareholders
- 5: Attendees

`transcriptcomponenttypeid`
- 1: Presentation Operator Message
- 2: Presenter Speech
- 3: Question
- 4: Answer
- 5: Presentation Section (NULL)
- 6: Question and Answer Section (NULL)
- 7: Question and Answer Operator Message
- 8: Unknown Question and Answer Message

In [18]:
# Find (tid, sid) pair for each speaker type
md_manager = ciq_transcript_speaker[
      transcriptcomponenttypeid==2 & speakertypeid==2, 
      .(transcriptid, componentid=transcriptcomponentid)
    ][dt_sents_sp500[, .(transcriptid, componentid, sentenceid)],
      on=.(transcriptid, componentid), nomatch=NULL]

qa_analyst = ciq_transcript_speaker[
      transcriptcomponenttypeid==3 & speakertypeid==3, 
      .(transcriptid, componentid=transcriptcomponentid)
    ][dt_sents_sp500[, .(transcriptid, componentid, sentenceid)],
      on=.(transcriptid, componentid), nomatch=NULL]

qa_manager = ciq_transcript_speaker[
      transcriptcomponenttypeid==4 & speakertypeid==2, 
      .(transcriptid, componentid=transcriptcomponentid)
    ][dt_sents_sp500[, .(transcriptid, componentid, sentenceid)],
      on=.(transcriptid, componentid), nomatch=NULL]

qa_all = ciq_transcript_speaker[
      transcriptcomponenttypeid%in%c(3,4) & speakertypeid%in%c(2,3), 
      .(transcriptid, componentid=transcriptcomponentid)
    ][dt_sents_sp500[, .(transcriptid, componentid, sentenceid)],
      on=.(transcriptid, componentid), nomatch=NULL]

qa_all[1]

transcriptid,componentid,sentenceid
<int>,<int>,<int>
108,30190,290


In [19]:
# compute sentiment
md_manager = sentiment[, ':='(transcriptid=as.integer(transcriptid),
                 sentenceid=as.integer(sentenceid))
    ][md_manager, on=.(transcriptid, sentenceid), nomatch=NULL
    ][, .(sentiment_positive_md_manager=mean(positive, na.rm=T),
          sentiment_negative_md_manager=mean(negative, na.rm=T),
          sentiment_neutral_md_manager=mean(neutral, na.rm=T)),
      keyby=.(transcriptid)]

qa_analyst = sentiment[, ':='(transcriptid=as.integer(transcriptid),
                 sentenceid=as.integer(sentenceid))
    ][qa_analyst, on=.(transcriptid, sentenceid), nomatch=NULL
    ][, .(sentiment_positive_qa_analyst=mean(positive, na.rm=T),
          sentiment_negative_qa_analyst=mean(negative, na.rm=T),
          sentiment_neutral_qa_analyst=mean(neutral, na.rm=T)),
      keyby=.(transcriptid)]

qa_manager = sentiment[, ':='(transcriptid=as.integer(transcriptid),
                 sentenceid=as.integer(sentenceid))
    ][qa_manager, on=.(transcriptid, sentenceid), nomatch=NULL
    ][, .(sentiment_positive_qa_manager=mean(positive, na.rm=T),
          sentiment_negative_qa_manager=mean(negative, na.rm=T),
          sentiment_neutral_qa_manager=mean(neutral, na.rm=T)),
      keyby=.(transcriptid)]

qa_all = sentiment[, ':='(transcriptid=as.integer(transcriptid),
                 sentenceid=as.integer(sentenceid))
    ][qa_all, on=.(transcriptid, sentenceid), nomatch=NULL
    ][, .(sentiment_positive_qa_all=mean(positive, na.rm=T),
          sentiment_negative_qa_all=mean(negative, na.rm=T),
          sentiment_neutral_qa_all=mean(neutral, na.rm=T)),
      keyby=.(transcriptid)]

sentiment = md_manager[qa_manager, on=.(transcriptid), nomatch=NULL
    ][qa_analyst, on=.(transcriptid), nomatch=NULL
    ][qa_all, on=.(transcriptid), nomatch=NULL]

sentiment[1]

transcriptid,sentiment_positive_md_manager,sentiment_negative_md_manager,sentiment_neutral_md_manager,sentiment_positive_qa_manager,sentiment_negative_qa_manager,sentiment_neutral_qa_manager,sentiment_positive_qa_analyst,sentiment_negative_qa_analyst,sentiment_neutral_qa_analyst,sentiment_positive_qa_all,sentiment_negative_qa_all,sentiment_neutral_qa_all
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
108,0.4891194,0.1527988,0.3580818,0.366055,0.1893606,0.4445844,0.2922119,0.1317654,0.5760226,0.323859,0.1564491,0.519692


In [20]:
# load previous datatable
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail, force=T)

# <-> sentiment
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment = sentiment[f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail, on=.(transcriptid), nomatch=NULL]

# save
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: %s\n (%s variable)\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment), length(names(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment))) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment[1]
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment)

"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail.feather" (14 MB) loaded (0.02 secs) (2021-03-01 2:56 PM)
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail: 26878
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: 26327
 (103 variable)


transcriptid,sentiment_positive_md_manager,sentiment_negative_md_manager,sentiment_neutral_md_manager,sentiment_positive_qa_manager,sentiment_negative_qa_manager,sentiment_neutral_qa_manager,sentiment_positive_qa_analyst,sentiment_negative_qa_analyst,sentiment_neutral_qa_analyst,⋯,retail_sell_m3_0,retail_sell_0_3,retail_sell_0_10,retail_sell_0_30,retail_net_m30_0,retail_net_m10_0,retail_net_m3_0,retail_net_0_3,retail_net_0_10,retail_net_0_30
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
320,0.4169185,0.1822185,0.400863,0.3166729,0.1609967,0.5223304,0.2500814,0.1686604,0.5812582,⋯,152820,244485,502171,899459,-135822,-81094,-83834,105341,172821,289406


"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment" saved as "f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment.feather" (16.2 MB) (0.03 secs, 2021-03-01 14:56:00)


In [21]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment[sue>quantile(sue,0.9, na.rm=T), mean(car_0_30)]
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment[sue<quantile(sue,0.1, na.rm=T), mean(car_0_30)]

In [None]:
# Plot: number of calls by year
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## outlier

### find outlier companies

In [37]:
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment, force=T)
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment, filter, force=T)

# find the companies that 1) n_call>=10
gvkey_outlier = filter[, 
      .(n_tid=uniqueN(transcriptid), last_call_date=max(ciq_call_date)),
      keyby=.(gvkey)
    ][n_tid<10, unique(gvkey)]


remove_rate = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment[, sum(gvkey%in%gvkey_outlier)/.N]

sprintf('Will remove %s%%', round(remove_rate*100, 2))
sv(gvkey_outlier)

"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment.feather" (16.2 MB) loaded (0.03 secs) (2021-03-01 3:27 PM)
"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment.feather" (16.2 MB) loaded as "filter" (0.03 secs) (2021-03-01 3:27 PM)


"gvkey_outlier" saved as "gvkey_outlier.rds" (321 B) (0 secs, 2021-03-01 15:27:41)


### winsorize

Task:
- Remove top/bottom 1% CAR
- Add outlier flag. The flag is created in `C-benchmark.ipynb` (investigate OLS performance)


In [38]:
# print N obs Before processing
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: %s\n (%s variable)\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment), length(names(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment))) %>% cat()

# ----------------------------
# winsorize
# ----------------------------
wsrz <- function(x, level) {
    high = quantile(x, 1-level)
    low = quantile(x, level)
    
    x[x>=high] = high
    x[x<=low] = low
    x
}

# only cols in the following are winsorized.
# we only winsorize target variables
# the winsorization is carried out in place: I didn't create new variable
cols_to_be_winsorized = c('car_0_10', 'car_0_20', 'car_0_30')

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment[, 
     (cols_to_be_winsorized) := lapply(.SD, wsrz, 0.01),
     .SDcols=cols_to_be_winsorized]


# ----------------------------
# Add outlier flag
# ----------------------------
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier[,
      ':='(outlier_flag1 = ifelse(gvkey%in%gvkey_outlier, T, F))]

# print N obs After processing
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier: %s\n (%s variable)\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier), length(names(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier))) %>% cat()

# save
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier[1]
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier)

N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: 26327
 (103 variable)
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier: 26327
 (104 variable)


transcriptid,sentiment_positive_md_manager,sentiment_negative_md_manager,sentiment_neutral_md_manager,sentiment_positive_qa_manager,sentiment_negative_qa_manager,sentiment_neutral_qa_manager,sentiment_positive_qa_analyst,sentiment_negative_qa_analyst,sentiment_neutral_qa_analyst,⋯,retail_sell_0_3,retail_sell_0_10,retail_sell_0_30,retail_net_m30_0,retail_net_m10_0,retail_net_m3_0,retail_net_0_3,retail_net_0_10,retail_net_0_30,outlier_flag1
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
320,0.4169185,0.1822185,0.400863,0.3166729,0.1609967,0.5223304,0.2500814,0.1686604,0.5812582,⋯,244485,502171,899459,-135822,-81094,-83834,105341,172821,289406,True


"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier" saved as "f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier.feather" (16.2 MB) (0.04 secs, 2021-03-01 15:27:47)


In [None]:
# Plot: number of calls by year
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

## standardize

Some final touch

In [44]:
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier, force=T)


# Check all keys variables are not NA
cols = c('car_0_30', 'inflow_0_90', 'revision_0_90', 'transcriptid', 'alpha', 'car_m1_m1', 'car_m2_m2', 'car_m30_m3', 'sest', 'sue', 'numest', 'sstdest', 'smedest', 'mcap', 'roa', 'bm', 'debt_asset', 'volatility', 'volume', 'similarity_bigram', 'sentiment_negative_qa_analyst')

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier[!is.na(sue) & !is.na(alpha) & !is.na(sstdest)]

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier[,
      lapply(.SD, function(x) sum(is.na(x))/length(x)),
      .SDcols=cols]

"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier.feather" (16.2 MB) loaded (0.03 secs) (2021-03-01 3:44 PM)


car_0_30,inflow_0_90,revision_0_90,transcriptid,alpha,car_m1_m1,car_m2_m2,car_m30_m3,sest,sue,⋯,sstdest,smedest,mcap,roa,bm,debt_asset,volatility,volume,similarity_bigram,sentiment_negative_qa_analyst
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


Start standardizing...

In [45]:
standardize <- function(x) {
    # zero mean, unit variance
    stopifnot(length(x) == sum(!is.na(x)))
    (x-mean(x))/sd(x)
}

normalize <- function(x) {
    # 0-1 range
    (x - min(x)) / (max(x) - min(x))
}

binarize <- function(x) {
    # 1: up, 0: down
    as.integer(x>0)
}

rank <- function(x) {
    order(x)/length(x)
}

# --------------------------------------------
# rename aux variables
#     pick the aux variable versions that will
#     be used in model training
# --------------------------------------------
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier[, 
      ':='(inflow=inflow_0_90, revision=revision_0_90, retail=retail_net_0_3)]


# variables that will be standardized
cols_feature = c('alpha', 'sest', 'sue', 'numest', 'sstdest', 'smedest', 
                 'mcap', 'roa', 'bm', 'debt_asset', 'volatility', 'volume', 
                 'similarity_bigram', 'sentiment_negative_qa_analyst')

cols_target = c('inflow', 'revision', 'retail',
                'car_0_10', 'car_0_20', 'car_0_30',
                'car_m1_m1', 'car_m2_m2', 'car_m30_m3')

# print N obs before processing
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: %s\n (%s variable)\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier), length(names(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier))) %>% cat()

# normalize!
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand = copy(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier)[, 
    (str_c(cols_feature, '_stand')) := lapply(.SD, standardize), .SDcols=cols_feature
    ][, (str_c(cols_target, '_stand')) := lapply(.SD, standardize),
      .SDcols=cols_target
    ][, (str_c(cols_target, '_norm')) := lapply(.SD, normalize),
      .SDcols=cols_target
    ][, (str_c(cols_target, '_bin')) := lapply(.SD, binarize),
      .SDcols=cols_target
    ][, (str_c(cols_target, '_quintile')) := lapply(.SD, ntile, 5),
      .SDcols=cols_target
    ][, (str_c(cols_target, '_decimal')) := lapply(.SD, ntile, 10),
      .SDcols=cols_target
    ]


# print N obs After processing
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand: %s\n (%s variable)\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand), length(names(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand))) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand[1]
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand, targets_final_addretail)

N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: 24677
 (107 variable)
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand: 24677
 (166 variable)


transcriptid,sentiment_positive_md_manager,sentiment_negative_md_manager,sentiment_neutral_md_manager,sentiment_positive_qa_manager,sentiment_negative_qa_manager,sentiment_neutral_qa_manager,sentiment_positive_qa_analyst,sentiment_negative_qa_analyst,sentiment_neutral_qa_analyst,⋯,car_m30_m3_quintile,inflow_decimal,revision_decimal,retail_decimal,car_0_10_decimal,car_0_20_decimal,car_0_30_decimal,car_m1_m1_decimal,car_m2_m2_decimal,car_m30_m3_decimal
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
320,0.4169185,0.1822185,0.400863,0.3166729,0.1609967,0.5223304,0.2500814,0.1686604,0.5812582,⋯,2,5,9,9,1,2,3,8,2,4


"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand" saved as "targets_final_addretail.feather" (22.1 MB) (0.05 secs, 2021-03-01 15:44:19)


In [None]:
# Plot: number of calls by year
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=T)

In [None]:
# hist: inflow vs. revision
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_outlier_stand %>%
    plot_ly(alpha=0.6) %>%
    add_histogram(x=~inflow_norm, name='inflow') %>%
    add_histogram(x=~car_0_30_norm, name='car') %>%
    add_histogram(x=~revision, name='revision') %>%
    plotly::layout(barmode='overlay', autosize=F)

## plot (for test)

Task:
- Distribution of calls within a year

In [None]:
dt = read_feather('./data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_sentiment_text_norm_wsrz.feather') %>% as.data.table()

dt[, ':='(year=year(ciq_call_date))]

add_vline <- function(y) {
    list(type = "line", 
         y0 = y, y1 = y, xref = "paper", # i.e. y as a proportion of visible region
         x0 = 0, x1 = 1, 
         line = list(dash='dot', width=1))
}

copy(dt)[ciq_call_date %between% c(ymd('2017-10-01'), ymd('2018-12-31')), .(car_0_30_norm, ciq_call_date, date)] %>%
    plot_ly(x=~ciq_call_date, y=~car_0_30_norm, color=~as.factor(year(ciq_call_date)), type='scatter', mode='markers', alpha=0.3) %>%
    plotly::layout(autosize=F, xaxis=list(title='', type='date'),
                   yaxis=list(title='CAR'), shapes=list(add_vline(-2), add_vline(2)))

In [None]:
copy(dt)[, ':='(date=make_date(2000, month(ciq_call_date), day(ciq_call_date)))
    ][ciq_call_date %between% c(ymd('2008-01-01'), ymd('2018-12-31')), .(car_0_30_norm, ciq_call_date, date)] %>%
    plot_ly(x=~date, y=~car_0_30_norm, color='red', type='scatter', mode='markers', alpha=0.2) %>%
    plotly::layout(autosize=F, xaxis=list(title='', type='date', tickformat = "%b %d"),
                   yaxis=list(title='CAR'))

Only remove missing values here. For scaling, do it in Python.

In [209]:
targets_df = as.data.table(read_feather('data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text.feather'))

all_cols = names(targets_df)
text_cols = c('text_present', 'text_qa', 'text_ans', 'text_ques', 'text_all')
non_text_cols = all_cols[!all_cols %in% text_cols]

targets_df = targets_df[, ..non_text_cols]

index vs CAR

In [None]:
ld(idx_price, path=WRDS_DOWNLOAD_DIR)

spx_daily = idx_price[gvkeyx=='000003' & datadate %between% c(ymd('2008-01-01'), ymd('2018-10-01')), 
                    .(date=datadate, idx=prccd)] %>%
    plot_ly(x=~date, y=~idx, name='Index Return', type='scatter', mode='lines') %>%
    plotly::layout(autosize=F, legend=list(0.1, 0.9))

outlier = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text[, .(ciq_release_date, car_0_30)] %>%
    plot_ly(x=~ciq_release_date, y=~car_0_30, type='scatter', mode='markers', marker=list(size=1, line=list(width=1), alpha=0.5), name='CAR') %>%
    plotly::layout(autosize=F, legend=list(0.1, 0.9))

subplot(spx_daily, outlier, nrows=2, shareX=T)

Extreme values of CAR

In [None]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text[abs(car_0_30)>=60] %>%
    plot_ly(x=~ciq_release_date, y=~car_0_30, type='scatter', mode='markers', showlegend=T) %>% 
    plotly::layout(autosize=F)

In [None]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text %>%
    plot_ly(x=~car_0_30, type='histogram', nbinsx=300) %>% 
    plotly::layout(autosize=F)

In [5]:
dt = read_feather('./data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text.feather')

In [7]:
setDT(dt)
dt[, summary(inflow)]

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-110.91022   -2.07961   -0.60965   -2.03208   -0.07435   43.82143 

# Split train/val/test


Task: create rolling window
- start: `2008-01-01`
- end: `2019-12-31`
- training period: 3 years (12 quarters)
- predict period: next quarter

> `val` is randomly sampled from `train`, that is `val` and `train` are of same period

In [None]:
get_rolling_split_dates <- function(train_start, train_end, test_start, test_end, window_size) {
    rolling_split_dates = data.table(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end, window_size=window_size)
    
    i_window = 1
    while (test_end < ymd('2019-12-31')) {
        i_window = i_window + 1
        train_start = train_start %m+% months(3) 
        train_end = train_end %m+% months(4) %>% rollback()
        test_start = test_start %m+% months(3) 
        test_end = test_end %m+% months(4) %>% rollback()
        
        rolling_split_dates = rbindlist(
            list(rolling_split_dates, 
                 list(train_start=train_start, train_end=train_end, 
                      test_start=test_start, test_end=test_end,
                      window_size=window_size)))
    }
    return(rolling_split_dates)
}
train_start_1y = ymd('2008-01-01')
train_end_1y = ymd('2008-12-31')
test_start_1y = ymd('2009-01-01')
test_end_1y = ymd('2009-03-31')

train_start_2y = ymd('2008-01-01')
train_end_2y = ymd('2009-12-31')
test_start_2y = ymd('2010-01-01')
test_end_2y = ymd('2010-03-31')

train_start_3y = ymd('2008-01-01')
train_end_3y = ymd('2010-12-31')
test_start_3y = ymd('2011-01-01')
test_end_3y = ymd('2011-03-31')

train_start_4y = ymd('2008-01-01')
train_end_4y = ymd('2011-12-31')
test_start_4y = ymd('2012-01-01')
test_end_4y = ymd('2012-03-31')

train_start_5y = ymd('2008-01-01')
train_end_5y = ymd('2012-12-31')
test_start_5y = ymd('2013-01-01')
test_end_5y = ymd('2013-03-31')

train_start_6y = ymd('2008-01-01')
train_end_6y = ymd('2013-12-31')
test_start_6y = ymd('2014-01-01')
test_end_6y = ymd('2014-03-31')

train_start_7y = ymd('2008-01-01')
train_end_7y = ymd('2014-12-31')
test_start_7y = ymd('2015-01-01')
test_end_7y = ymd('2015-03-31')

train_start_8y = ymd('2008-01-01')
train_end_8y = ymd('2015-12-31')
test_start_8y = ymd('2016-01-01')
test_end_8y = ymd('2016-03-31')

train_start_9y = ymd('2008-01-01')
train_end_9y = ymd('2016-12-31')
test_start_9y = ymd('2017-01-01')
test_end_9y = ymd('2017-03-31')

rolling_split_dates = rbindlist(
    list(get_rolling_split_dates(train_start_1y, train_end_1y, test_start_1y, test_end_1y, window_size='1y'),
         get_rolling_split_dates(train_start_2y, train_end_2y, test_start_2y, test_end_2y, window_size='2y'),
         get_rolling_split_dates(train_start_3y, train_end_3y, test_start_3y, test_end_3y, window_size='3y'),
         get_rolling_split_dates(train_start_4y, train_end_4y, test_start_4y, test_end_4y, window_size='4y'),
         get_rolling_split_dates(train_start_5y, train_end_5y, test_start_5y, test_end_5y, window_size='5y'),
         get_rolling_split_dates(train_start_6y, train_end_6y, test_start_6y, test_end_6y, window_size='6y'),
         get_rolling_split_dates(train_start_7y, train_end_7y, test_start_7y, test_end_7y, window_size='7y'),
         get_rolling_split_dates(train_start_8y, train_end_8y, test_start_8y, test_end_8y, window_size='8y'),
         get_rolling_split_dates(train_start_9y, train_end_9y, test_start_9y, test_end_9y, window_size='9y')),
    use=T)

nonrolling_split = list(train_start=ymd('2008-01-01'),
                        train_end=ymd('2017-12-31'),
                        test_start=ymd('2018-01-01'),
                        test_end=ymd('2018-12-31'),
                        window_size='2008-2017')

rolling_split_dates = rbindlist(list(rolling_split_dates, nonrolling_split), fill=T)[order(window_size, test_start)
    ][, ':='(yqtr=fcase(month(test_start)==1, str_c(year(test_start)-1, '-q4'),
                        month(test_start)==4, str_c(year(test_start), '-q1'),
                        month(test_start)==7, str_c(year(test_start), '-q2'),
                        month(test_start)==10, str_c(year(test_start), '-q3')))
    ][window_size=='2008-2017', ':='(yqtr='2018')
    ][test_end<=ymd('2019-12-31')]

# print preview
rolling_split_dates[order(window_size, yqtr)]


# write to csv
sv(rolling_split_dates)
fwrite(rolling_split_dates, './data/split_dates.csv')