# Init

In [2]:
# library
library(repr)
library(Matrix)
suppressMessages(suppressWarnings({
    library(glmnet)
    library(jsonlite)
    library(utilr)
    # library(comet)
}))

# data_dir
if (tolower(str_sub(getwd(), -11)) == 'onedrive/cc') {
    DATA_DIR = str_c(getwd(), '/data')
    WRDS_DOWNLOAD_DIR = str_c(DATA_DIR, '/WRDS-download')
    cat(str_c('Current working directory: ', getwd()))
} else {
    cat(str_c('Please set working dir to "~/onedrive/cc"'))
}

# options for plot
options(repr.plot.width=7, repr.plot.height=4, repr.plot.res = 300)

Current working directory: C:/Users/rossz/Onedrive/CC

# WRDS

## establish connection

In [6]:
library(RPostgres)

# connect to wrds
wrds <- dbConnect(Postgres(),
                  host='wrds-pgdata.wharton.upenn.edu',
                  port=9737,
                  dbname='wrds',
                  sslmode='require',
                  user='xiaomowu')

## unit test

In [None]:
# Determine the data libraries available at WRDS
res <- dbSendQuery(wrds, "select distinct table_schema
                   from information_schema.tables
                   where table_type ='VIEW'
                   or table_type ='FOREIGN TABLE'
                   order by table_schema")
data <- dbFetch(res, n=-1)
dbClearResult(res)
print(setDT(data)[, sort(table_schema)])

In [None]:
# Determine the datasets within a given library
library = 'crsp'
res <- dbSendQuery(wrds, sprintf("select distinct table_name
                   from information_schema.columns
                   where table_schema='%s'
                   order by table_name", library))
data <- dbFetch(res, n=-1)
dbClearResult(res)
setDT(data)[, sort(table_name)] %>% print()

In [None]:
# Determine the variables (column headers) within a given dataset
library = 'ibes'
dataset = 'det_epsus'
res <- dbSendQuery(wrds, sprintf("select column_name
                   from information_schema.columns
                   where table_schema='%s'
                   and table_name='%s'
                   order by column_name", library, dataset))
data <- dbFetch(res, n=-1)
dbClearResult(res)
setDT(data)[, column_name] %>% print()

In [None]:
# To query the crsp.dsf dataset
library = "ibes"
dataset = 'ptgdet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
data <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
nrow(data)

## CIQ-Transcripts

> Transcript meta data

In [85]:
library = "ciq"
dataset = 'wrds_transcript_detail'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ciq_transcript_detail <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
sv(ciq_transcript_detail, path=WRDS_DOWNLOAD_DIR)
ciq_transcript_detail[1]

-ciq_transcript_detail- saved  (4.89 secs)


companyid,keydevid,transcriptid,headline,mostimportantdateutc,keydeveventtypeid,keydeveventtypename,companyname,transcriptcollectiontypeid,transcriptcollectiontypename,transcriptpresentationtypeid,transcriptpresentationtypename,transcriptcreationdate_utc,transcriptcreationtime_utc,audiolengthsec,isdelayed_flag,delayreasontypeid,delayreasontypename
<dbl>,<dbl>,<dbl>,<chr>,<date>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
,637187178,1855141,,,,,,7,Spellchecked Copy,4,Preliminary,2019-11-06,67695,3363,1,2,Cancelled- No Audio


In [86]:
nrow(ciq_transcript_detail)

> Transcript speaker

In [None]:
library = "ciq"
dataset = 'wrds_transcript_person'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ciq_transcript_speaker <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
ciq_transcript_speaker[1]
sv(ciq_transcript_speaker, path=WRDS_DOWNLOAD_DIR)

In [5]:
ciq_transcript_speaker[1]

transcriptid,transcriptcomponentid,componentorder,transcriptcomponenttypeid,transcriptcomponenttypename,transcriptpersonid,transcriptpersonname,proid,companyofperson,speakertypeid,speakertypename,componenttextpreview,word_count
<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
108,30184,1,1,Presentation Operator Message,1,Operator,,,1,Operator,"Good day ladies and gentlemen and welcome to the Second Quarter 2006 Agilent Technologies Inc. Earnings Conference Call. My name is Maria, and I will be your audio coordinator for today. (Operator Instructions).At this time, I would now turn the prese",54


> Transcript component

In [None]:
library = "ciq_transcripts"
dataset = 'ciqtranscriptcomponent'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ciq_transcript_component <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
ciq_transcript_component[1]
sv(ciq_transcript_component, path=WRDS_DOWNLOAD_DIR)

In [None]:
ld(ciq_transcript_component, path=WRDS_DOWNLOAD_DIR)
ciq_transcript_component[1]

> Transcript component_type_name

In [None]:
library = "ciq_transcripts"
dataset = 'ciqtranscriptcomponenttype'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ciq_transcript_componenttype <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
ciq_transcript_componenttype
sv(ciq_transcript_componenttype, path=WRDS_DOWNLOAD_DIR)

## I/B/E/S

### Detail

In [None]:
# ADJUSTED
# from 2006

library = 'ibes'
dataset = 'det_epsus' # eps
# dataset = 'det_xepsus' # non-eps

query = sprintf("select * from %s.%s where anndats>='2006-01-01'::date", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_det_epsus <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sv(ibes_det_epsus, path=WRDS_DOWNLOAD_DIR)

In [None]:
# UNADJUSTED

library = 'ibes'
dataset = 'detu_epsus' # eps
# dataset = 'det_xepsus' # non-eps

query = sprintf("select ticker, estimator, analys, pdf, fpi, value, fpedats, revdats, revtims, anndats, anntims from %s.%s where fpedats between '2000-01-01' and '2018-12-31' and (fpi='6' or fpi='7')", library, dataset, year)

res <- dbSendQuery(wrds, query)
ibes_detu_epsus<- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s', dataset, nrow(ibes_detu_epsus))
sv(ibes_detu_epsus, path=WRDS_DOWNLOAD_DIR)

### actuals

In [None]:
library = 'ibes'
dataset = 'actu_epsus' 

query = sprintf("select ticker, anndats as repdats, value as act, pends as fpedats, pdicity from %s.%s where pends between '2000-01-01' and '2018-12-31' and pdicity='QTR'", library, dataset, year)

res <- dbSendQuery(wrds, query)
ibes_actu_epsus<- setDT(dbFetch(res, n=-1)); dbClearResult(res)

sprintf('%s: %s', dataset, nrow(ibes_actu_epsus))
sv(ibes_actu_epsus, path=WRDS_DOWNLOAD_DIR)
ibes_actu_epsus[1]

### Summary

In [None]:
library = 'ibes'
dataset = 'statsum_epsus' # summary eps
query = sprintf("select * from %s.%s", library, dataset)

res <- dbSendQuery(wrds, query)
ibes_statsum_epsus <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

ibes_statsum_epsus[1]
sv(ibes_statsum_epsus, path=WRDS_DOWNLOAD_DIR)

In [None]:
library = 'ibes'
dataset = 'statsum_xepsus' # summary xeps

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_statsum_xepsus <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
ibes_statsum_xepsus[1]
sv(ibes_statsum_xepsus, path=WRDS_DOWNLOAD_DIR)

In [51]:
ld(ibes_statsum_epsus, path=WRDS_DOWNLOAD_DIR)

-ibes_statsum_epsus- loaded  (21.8 secs)


### Surprise

In [None]:
library = 'ibes'
dataset = 'surpsum' # summary surprise 

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_surpsum <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
ibes_surpsum[1]
sv(ibes_surpsum, path=WRDS_DOWNLOAD_DIR)

## price target

In [None]:
library = "ibes"
dataset = 'ptgdet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
ptgdet <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
sv(ptgdet, path='/data/WRDS-download')

In [None]:
nrow(ptgdet)

In [None]:
ptgdet[1]

## recommend

In [None]:
library = 'ibes'
dataset = 'recddet'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
recddet <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
sv(recddet, path='/data/WRDS-download')

In [None]:
nrow(recddet)

In [None]:
recddet[1]

## financial ratio

There's no Python or R API to download financial-ratio. So I download the csv through web query and import as R datatable.

Download path: **CIQ -> Financial Ratios Firm Level**

- last update: 2020-5-4
- `bm`: book-to-market
- `debt_assets`: Total Debt/Total Assets
- `debt_at` *(deprecated)*: Total Debt/Total Assets
- `roa`: return on assets

In [478]:
wrds_finratio = fread(str_c(WRDS_DOWNLOAD_DIR, '/financial-ratio-firm.csv'), colClass=c(rep('character', times=4), rep('double', 4)))[order(gvkey, public_date)
    ][, ':='(adate=ymd(adate), qdate=ymd(qdate), public_date=ymd(public_date), debt_at=NULL)]

sv(wrds_finratio, path=WRDS_DOWNLOAD_DIR)

-wrds_finratio- saved  (0.96 secs)


## index price

In [51]:
library = 'comp'
dataset = 'idx_daily' # factors

res <- dbSendQuery(wrds, sprintf("select * from %s.%s", library, dataset))
idx_price <- dbFetch(res, n=-1) %>% setDT()
dbClearResult(res)
sv(idx_price, path=WRDS_DOWNLOAD_DIR)

-idx_price- saved  (13.87 secs)


## stock price

- `2007-01-01` to `2020-01-01`
- all stocks from COMP
- took 47 min
- saved as `.feather`, 17.9 GB

In [7]:
library = 'comp'
dataset = 'secd' # factors

system.time({
    query = sprintf("select * from %s.%s where datadate>='2007-01-01'::date and datadate<='2020-01-01'", library, dataset)

    res <- dbSendQuery(wrds, query)
    comp_secd <- setDT(dbFetch(res, n=-1))
    dbClearResult(res)
})

write_feather(comp_secd, str_c(WRDS_DOWNLOAD_DIR, '/comp_secd.feather'))

"Expired, result set already closed"


   user  system elapsed 
 283.96   23.56 2839.59 

In [8]:
ld(comp_secd, path='./data/WRDS-download')

-comp_secd- loaded  (3.44 mins)


In [9]:
comp_secd[1]

gvkey,iid,datadate,tic,cusip,conm,curcddv,capgn,cheqv,div,divd,divdpaydateind,divsp,dvrated,paydateind,anncdate,capgnpaydate,cheqvpaydate,divdpaydate,divsppaydate,paydate,recorddate,curcdd,adrrc,ajexdi,cshoc,cshtrd,dvi,eps,epsmo,prccd,prchd,prcld,prcod,prcstd,trfd,exchg,secstat,tpci,cik,fic
<chr>,<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<date>,<date>,<date>,<date>,<date>,<date>,<date>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
1004,1,2000-01-03,AIR,361105,AAR CORP,,,,,,,,,,,,,,,,,USD,,1,27401000,109000,0.34,1.6,11,17.5625,18,17.5625,,3,1.469693,11,A,0,1750,USA


Download `CRSP`
It's *deprecated*!

In [None]:
library = 'crspq'
dataset = 'dsf' # factors
years = 2001:2018

for (year in years) {
    print(sprintf('Start %s at %s', year, now()))
    query = sprintf("select * from %s.%s where to_char(date,'yyyy')='%s'", library, dataset, year)
    save_name_short = sprintf('%s_%s', dataset, year)
    save_name_full = sprintf('../data/WRDS-download/%s_%s.rds', dataset, year)

    res <- dbSendQuery(wrds, query)
    data <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
    
    sprintf('%s_%s: %s', dataset, year, nrow(data))
    assign(save_name_short, data)
    saveRDS(data, save_name_full)
}

## stock industry

In [19]:
library = 'comp'
dataset = 'co_industry' # factors

system.time({
    query = sprintf("select * from %s.%s", library, dataset)
    res <- dbSendQuery(wrds, query)
    comp_industry <- setDT(dbFetch(res, n=-1))
    dbClearResult(res)
})


   user  system elapsed 
   0.58    0.04    3.52 

In [None]:
comp_industry[consol=='C', .(gvkey, naicsh, sich, year, datadate)]

In [None]:
# comp_industry[1]
comp_secd[90000:90010, .(gvkey, trfd, datadate, prccd)]
sp500_cst[1]

In [46]:
xym = comp_secd[(gvkey %in% sp500_cst$gvkey) & (iid=='01'), 
      .(gvkey, ticker=tic, name=conm, date=datadate, year=year(datadate), prcice=prccd, price_adjusted=prccd/ajexdi)
    ][comp_industry[consol=='C', .(gvkey, year, class_na=naicsh, class_sic=sich)], on=.(gvkey, year), nomatch=NULL
    ][, ':='(return=growth(price_adjusted))
    ][!is.na(return)
    ][order(gvkey, date)
    ][, ':='(year=NULL)] %>% unique()

In [54]:
sv(xym)

-xym- saved  (9.08 secs)


## FF-factors

Notes:
- `factors` are in decimal, not percentage point. So `smb=0.0024` means the return is 0.24 percentage points

In [None]:
library = 'ff'
dataset = 'factors_daily' # factors

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
factors <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(data))
sv(factors, path='/data/WRDS-download')

In [35]:
factors

date,mktrf,smb,hml,rf,umd
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1926-07-01,0.0010,-0.0024,-0.0028,9e-05,
1926-07-02,0.0045,-0.0032,-0.0008,9e-05,
1926-07-06,0.0017,0.0027,-0.0035,9e-05,
1926-07-07,0.0009,-0.0059,0.0003,9e-05,
1926-07-08,0.0021,-0.0036,0.0015,9e-05,
1926-07-09,-0.0071,0.0044,0.0056,9e-05,
1926-07-10,0.0062,-0.0050,-0.0015,9e-05,
1926-07-12,0.0004,0.0003,0.0054,9e-05,
1926-07-13,0.0048,-0.0026,-0.0023,9e-05,
1926-07-14,0.0004,0.0009,-0.0048,9e-05,


## firm ID

### `ibes.id`

In [None]:
library = 'ibes'
dataset = 'id' # firm names

query = sprintf("select ticker, cusip, cname, sdates from %s.%s where usfirm=1 and cusip != ''", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_id <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(ibes_id))
sv(ibes_id, path=WRDS_DOWNLOAD_DIR)

query = sprintf("select ticker, cusip, cname, oftic, sdates from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ibes_id2 <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ibes_id2))

ibes_id2[1]
sv(ibes_id2, path=WRDS_DOWNLOAD_DIR)

### `comp.security`

In [134]:
library = 'compm'
dataset = 'security' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_security <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(comp_security))

comp_security[1]
sv(comp_security, path=WRDS_DOWNLOAD_DIR)

tic,gvkey,iid,cusip,dlrsni,dsci,epf,exchg,excntry,ibtic,isin,secstat,sedol,tpci,dldtei
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<date>
AE.2,1000,1,32102,9,COM USD1,,12,USA,,,I,,0,1978-06-30


-comp_security- saved  (0.33 secs)


### `ciq.wrds_gvkey`

In [None]:
library = 'ciq'
dataset = 'wrds_gvkey' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
ciq_wrds_gvkey <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ciq_wrds_gvkey))

ciq_wrds_gvkey[1]
sv(ciq_wrds_gvkey, path=WRDS_DOWNLOAD_DIR)

In [12]:
ld(ciq_wrds_gvkey, path=WRDS_DOWNLOAD_DIR)
ciq_wrds_gvkey[str_detect(companyname, 'Goldman')]

-ciq_wrds_gvkey- already exists, will NOT load again!  (0 secs)


companyid,gvkey,startdate,enddate,companyname
<dbl>,<chr>,<date>,<date>,<chr>
363476,65135,,,Friede Goldman Halter Inc.
398625,114628,,,"The Goldman Sachs Group, Inc."
4463182,200484,,,Rich Goldman Holdings Limited
9176987,159155,,,Corts Trust For Goldman Sachs Capital I
9563181,266220,,,CABCO Series 2004-1 Trust (Goldman Sachs Capital I)
9863691,160346,,,Corts Trust II For Goldman Sachs Capital I
10097185,160560,,,CABCO Series 2004-101 Trust (Goldman Sachs Capital I)
10625760,216255,,,Strats Trust Goldman Sachs Group
28113913,161697,,,Goldman Sachs Trust - Goldman Sachs Equity Income Fund
28121135,161698,,,Goldman Sachs Variable Insurance Trust - Goldman Sachs U.S. Equity Insights Fund


### `crsp.stocknames`

In [None]:
library = 'crsp'
dataset = 'stocknames' # firm names

query = sprintf("select permno, ncusip, comnam, namedt, nameenddt from %s.%s where ncusip != ''", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_stocknames <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_stocknames))
sv(crsp_stocknames, path=WRDS_DOWNLOAD_DIR)

query = sprintf("select ticker, comnam, permno, ncusip, namedt, nameenddt from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_stocknames2 <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_stocknames2))
crsp_stocknames2[1]
sv(crsp_stocknames2, path=WRDS_DOWNLOAD_DIR)

### `crsp.ccm`

In [None]:
library = 'crsp'
dataset = 'ccmxpf_linktable' # firm names

query = sprintf("select gvkey, lpermco as permco, lpermno as permno, linkdt, linkenddt from %s.%s where usedflag=1 and linkprim in ('P', 'C')", library, dataset)
res <- dbSendQuery(wrds, query)
crsp_ccmlink <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('nrow: %s', nrow(crsp_ccmlink))
sv(crsp_ccmlink, path=WRDS_DOWNLOAD_DIR)
crsp_ccmlink[1]

In [358]:
ld(crsp_ccmlink, path=WRDS_DOWNLOAD_DIR)
crsp_ccmlink[1]

-crsp_ccmlink- already exists, will NOT load again!  (0 secs)


gvkey,permco,permno,linkdt,linkenddt
<chr>,<dbl>,<dbl>,<date>,<date>
1000,23369,25881,1970-11-13,1978-06-30


## index constitution

In [None]:
ld(comp_idx_prof, path=WRDS_DOWNLOAD_DIR)
ld(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)
ld(crsp_stocknames, path=WRDS_DOWNLOAD_DIR)

In [None]:
library = 'compa'
dataset = 'idx_index' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_idx_profile <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

comp_idx_profile[conm=='S&P 500 Comp-Ltd']
sv(comp_idx_profile, path=WRDS_DOWNLOAD_DIR)

In [None]:
library = 'compa'
dataset = 'idxcst_his' # firm names

query = sprintf("select * from %s.%s", library, dataset)
res <- dbSendQuery(wrds, query)
comp_idx_cst <- setDT(dbFetch(res, n=-1)); dbClearResult(res)

comp_idx_cst[1]
sv(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)

## keydev

Last download: 2020-04-23

In [107]:
library = 'ciq'
dataset = 'wrds_keydev' # everything except for `headline` and `situation`

query = sprintf("select * from %s.%s 
    where keydeveventtypeid in (28, 48, 55, 61, 144)", library, dataset)
res <- dbSendQuery(wrds, query)
ciq_wrds_keydev <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ciq_wrds_keydev))

# `mostimportantdateutc` is in UTC but R doesn't recognize,
# so we need to set it explictly
ciq_wrds_keydev[, ':='(mostimportantdateutc=force_tz(mostimportantdateutc, 'UTC'))]

sv(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR)

-ciq_wrds_keydev- saved  (14.6 secs)


In [108]:
library = 'ciq'
dataset = 'ciqkeydev' # `headline` and `situation`

query = sprintf("select * from %s.%s 
    where keydevid in 
        (select keydevid from ciq.wrds_keydev
        where keydeveventtypeid in (28, 48, 55, 61, 144))",
    library, dataset)
res <- dbSendQuery(wrds, query)
ciq_keydev <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ciq_keydev))

# `mostimportantdateutc` is in UTC but R doesn't recognize,
# so we need to set it explictly
ciq_keydev[, ':='(mostimportantdateutc=force_tz(mostimportantdateutc, 'UTC'))]

sv(ciq_keydev, path=WRDS_DOWNLOAD_DIR)

-ciq_keydev- saved  (56.69 secs)


## CCM

Notes:
- Last download: 2020-04-23
- `ccmxpf_lnkhist` from CCM provides a linktable between CRSP and Compustat

In [109]:
library = 'crspq'
dataset = 'ccmxpf_lnkhist'

query = sprintf("select * from %s.%s",
    library, dataset)
res <- dbSendQuery(wrds, query)
ccm <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(ccm))
sv(ccm, path=WRDS_DOWNLOAD_DIR)

-ccm- saved  (0.27 secs)


## money inflow

In [34]:
library = 'crsp'
dataset = 'holdings'

query = sprintf("select crsp_portno, report_dt, percent_tna, nbr_shares, market_val, cusip, permno from %s.%s where report_dt>'2008-01-01'::date",
    library, dataset)
                
res <- dbSendQuery(wrds, query)
holdings <- setDT(dbFetch(res, n=-1)); dbClearResult(res)
sprintf('%s: %s', dataset, nrow(holdings))

sv(holdings, path=WRDS_DOWNLOAD_DIR)

-holdings- saved  (6.62 mins)


## retail tracking

In [16]:
taq_year = 2003 # 2003~2020
taq_month = str_pad(1, 2, pad='0')
taq_day = str_pad(12, 2, pad='0')

library = str_c('taqm_', taq_year)
dataset = str_c('ctm', taq_year, taq_month, taq_day)

res <- dbSendQuery(wrds, sprintf("select * from %s.%s limit 10", library, dataset))

data <- dbFetch(res, n=-1)
dbClearResult(res)

setDT(data)


ERROR: Error: Failed to prepare query: ERROR:  permission denied for schema taqm_2003
LINE 1: select * from taqm_2003.ctm20030112 limit 10
                      ^



In [22]:
# Determine the data libraries available at WRDS
res <- dbSendQuery(wrds, "select distinct table_schema
                   from information_schema.tables
                   where table_type ='VIEW'
                   or table_type ='FOREIGN TABLE'
                   order by table_schema")
data <- dbFetch(res, n=-1)
dbClearResult(res)
print(setDT(data)[, sort(table_schema)])

  [1] "aha"                "ahasamp"            "audit"             
  [4] "blab"               "block"              "boardex"           
  [7] "boardsmp"           "bvd"                "bvdsamp"           
 [10] "calcbnch"           "cboe"               "centris"           
 [13] "ciq"                "ciqsamp"            "cisdm"             
 [16] "cisdmsmp"           "clrvt"              "clrvtsmp"          
 [19] "comp"               "compa"              "compb"             
 [22] "compdcur"           "compg"              "comph"             
 [25] "compm"              "compmcur"           "compsamp"          
 [28] "compseg"            "compsnap"           "comscore"          
 [31] "contrib"            "crsp"               "crspm"             
 [34] "crspq"              "crspsamp"           "csmar"             
 [37] "dealscan"           "djones"             "dmef"              
 [40] "doe"                "emdb"               "etfg"              
 [43] "etfgsamp"           "eureka

In [50]:
# Determine the datasets within a given library
library = 'taqmsec'
res <- dbSendQuery(wrds, sprintf("select distinct table_name
                   from information_schema.columns
                   where table_schema='%s'
                   order by table_name", library))
data <- dbFetch(res, n=-1)
dbClearResult(res)
setDT(data)
data

table_name
<chr>
cqm_20030910
cqm_20030911
cqm_20030912
cqm_20030915
cqm_20030916
cqm_20030917
cqm_20030918
cqm_20030919
cqm_20030922
cqm_20030923


In [44]:
data[, table(str_sub(table_name,1,3))]


  cqm   ctm   ix_   mas   nbb 
 4084  4111 12223  2238  4080 

In [49]:
# Determine the variables (column headers) within a given dataset
library = 'taqmsec'
dataset = 'ctm_20100706'
res <- dbSendQuery(wrds, sprintf("select * from %s.%s limit 10", library, dataset))
data <- dbFetch(res, n=-1)
dbClearResult(res)

data

ERROR: Error: Failed to prepare query: ERROR:  permission denied for schema taqm_2010
LINE 1: select * from taqm_2010.ctm_20100706 limit 10
                      ^



# Coverage

## Find SP 500

In [None]:
# thru is missing means it's still effective
# fill the missing with today()
ld(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)

sp500_cst = comp_idx_cst[gvkeyx=='000003'
    ][is.na(from), ':='(from=as.Date('1900-01-01'))
    ][is.na(thru), ':='(thru=today())
    ][order(gvkey, iid)
    ][, .SD[1], keyby=.(gvkey)
    ][, .(gvkey, from, thru)] %>% unique()

sv(sp500_cst)

In [None]:
sp500_cst[, uniqueN(gvkey)]
sp500_cst[1]

## Find DJI

In [None]:
ld(comp_idx_cst, path=WRDS_DOWNLOAD_DIR)
ld(comp_security, path=WRDS_DOWNLOAD_DIR)

In [None]:
SDATE = as.Date('2005-01-01')
EDATE = as.Date('2018-12-31')

In [None]:
# DJI
dji_cst = unique(comp_idx_cst[gvkeyx=='000005'
    ][comp_security[, .(gvkey, iid, tic, cusip=str_sub(cusip, 1, 6), sedol)], 
      on=.(gvkey, iid), nomatch=0])
dji_cst
sv(dji_cst)

# Y (CAR)

## `gvkey` <-> `permno`

We'll use `ccm` to link these two variables.

In the next cell, I:
- only select "primary" links (LC,LU,LS) which are considered to be robust
- fill missing `linkdt` and `linkenddt`
- for every `gvkey`, only select its **first** issue.

In [322]:
ld(ccm, path=WRDS_DOWNLOAD_DIR, force=T)

gvkey_permno_link = ccm[linktype %in% c('LC', 'LU', 'LS')
    ][is.na(linkenddt) | linkenddt>=as.Date('2000-01-01'), 
     .(gvkey, lpermno, liid, linkdt, linkenddt)
    ][is.na(linkenddt), ':='(linkenddt=as.Date('2020-12-31'))
    ][is.na(linkdt), ':='(linkdt=as.Date('1990-01-01'))
    ][order(gvkey, liid)
    ][, .SD[1], keyby=.(gvkey)
    ][order(gvkey, linkdt)
    ][, ':='(liid=NULL)] %>% 
    na.omit(cols=c('gvkey', 'lpermno')) %>% 
    unique(by=c('gvkey', 'lpermno'))
sv(gvkey_permno_link)

-ccm- loaded  (0.35 secs)
-gvkey_permno_link- saved  (0.04 secs)


## call <-> release

Task:
- link call and release
- method
  1. For any release, find all the calls within the [-180d, 180d] window, get `dt_find_post_call`
  2. For any call, find all the releases within the [-180d, 180d] window, get `dt_find_prev_release`
  3. merge `dt_find_post_call` and `dt_find_prev_release`, get `call_release_link`
  4. For every `call_keydevid`, find its **closest** **previous**`release_keydevid` within [-1d, 1d]

Warnings:
- In `call_release_link`:
    - `call_keydevid` is unique key
    - `release_keydevid` *not* unique key, because different calls may be matched to the *same* release.
- In `ciq_wrds_keydev`, one `keydevid` may have multiple obs because it may corresponds to multiple `gvkey`

Notes:
- `keyeventtypeid`
  - 28: earnings announcement
  - 48: earnings calls
  - 61: accnounce earnings release delay
  


In [10]:
ld(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR, force=T)
ld(ciq_keydev, path=WRDS_DOWNLOAD_DIR, force=T)
ld(gvkey_permno_link, force=T)

-ciq_wrds_keydev- loaded  (5.77 secs)
-ciq_keydev- loaded  (25.33 secs)
-gvkey_permno_link- loaded  (0.02 secs)


In [277]:
find_prev_release <- function(t, date, keydevid, keydeveventtypeid, companyname) {
    # t: the row_id of one call
    # n: the row_ids of the closet earnings release
    ns = which((date>=(date[t]-180)) & (date<=date[t]+180) & (keydeveventtypeid==28))
    if (length(ns) >= 1) { # has match
        return(list('release_date' = date[ns],
             'release_keydevid' = keydevid[ns],
             'call_date' = date[rep(t, length(ns))],
             'call_keydevid' = keydevid[rep(t, length(ns))],
             'companyname' = companyname[ns]))
    } else if (length(ns) == 0) {# fail to find a release
        return(list('release_date' = ymd(NA),
             'release_keydevid' = NA_real_,
             'call_date' = date[t],
             'call_keydevid' = keydevid[t],
             'companyname' = companyname[1]))
    }
}

dt_find_prev_release = ciq_wrds_keydev[!is.na(gvkey)
    ][keydeveventtypeid %in% c(28, 48)
    ][order(gvkey, mostimportantdateutc),
      .(gvkey, date=as.Date(mostimportantdateutc), keydevid, keydeveventtypeid, companyname, eventtype)
    ][, {ts = which(keydeveventtypeid==48)
      lapply(ts, partial(find_prev_release, date=date, 
                         keydevid=keydevid, keydeveventtypeid=keydeveventtypeid,
                         companyname=companyname)
            ) %>% rbindlist(fill=T, use=T)
     },
     keyby=.(gvkey)
    ]

In [278]:
find_post_call <- function(t, date, keydevid, keydeveventtypeid, companyname) {
    # t: the row_id of one announcement
    # ns: the row_id of the closest call
    ns = which((date>=(date[t]-180)) & (date<=date[t]+180) & (keydeveventtypeid==48))
    if (length(n) == 1) { # has match
        return(list('call_date' = date[ns],
             'call_keydevid' = keydevid[ns],
             'release_date' = date[rep(t, length(ns))],
             'release_keydevid' = keydevid[rep(t, length(ns))],
             'companyname' = companyname[ns]))
    } else if (length(n) == 0) {# fail to find a release
        return(list('call_date' = ymd(NA),
             'call_keydevid' = NA_real_,
             'release_date' = date[t],
             'release_keydevid' = keydevid[t],
             'companyname' = companyname[1]))
    }
}

dt_find_post_call = ciq_wrds_keydev[!is.na(gvkey)
    ][keydeveventtypeid %in% c(28, 48)
    ][order(gvkey, mostimportantdateutc),
      .(gvkey, date=as.Date(mostimportantdateutc), keydevid, keydeveventtypeid, companyname, eventtype)
    ][, {ts = which(keydeveventtypeid==28)
      lapply(ts, partial(find_post_call, date=date, 
                         keydevid=keydevid, keydeveventtypeid=keydeveventtypeid,
                         companyname=companyname)
            ) %>% rbindlist(fill=T, use=T)
     },
     keyby=.(gvkey)
    ]

In [385]:
call_release_link = unique(na.omit(rbindlist(list(dt_find_post_call, dt_find_prev_release), use=T)))[order(gvkey, release_date)
    ][(gvkey %in% sp500_cst$gvkey) & (call_date %between% c(ymd('2008-01-01'), ymd('2018-12-31')))
    ][, ':='(nday_release_lead=call_date-release_date)
    ][nday_release_lead %between% c(-1, 1)
    ][order(call_keydevid, nday_release_lead)
    ][, head(.SD,1), keyby=.(call_keydevid)]

n_identified_call = call_release_link[, uniqueN(call_keydevid)]
cat(sprintf("%s (%.2f%%) calls have been sucessfully matched with release event in range [-1d,1d].\n\n", n_identified_call, round(n_identified_call/31040*100, 2)))

sv(call_release_link)
call_release_link[1]

30657 (98.77%) calls have been sucessfully matched with release event in range [-1,1].

-call_release_link- saved  (0.09 secs)


call_keydevid,gvkey,call_date,release_date,release_keydevid,companyname,nday_release_lead
<dbl>,<chr>,<date>,<date>,<dbl>,<chr>,<drtn>
3101633,14489,2008-02-28,2008-02-28,5219816,Dell Technologies Inc.,0 days


## infolow

In [30]:
ld(holdings, path=WRDS_DOWNLOAD_DIR)

-holdings- loaded  (1.48 mins)


In [31]:
# inflow: in millions of dollars
inflow = holdings[!is.na(permno)
    ][, ':='(permno=as.character(permno))
    ][, .(mv=sum(market_val, na.rm=T)), keyby=.(permno, report_dt)
    ][order(permno, report_dt)
    ][, ':='(inflow=(mv-shift(mv))/1e6), keyby=permno
    ][!is.na(inflow)]
inflow[1]

permno,report_dt,mv,inflow
<chr>,<date>,<dbl>,<dbl>
10001,2010-04-30,89377.28,0.08315468


In [56]:
sv(inflow)

-inflow- saved  (0.89 secs)


## revision

Task:
- $revision = \frac{EPS_{new} - EPS_{previous}}{\text{stock price 2 days before revision}}*100$
- compute revision of year-end earnings for the current FY, resutling in `revision`
- create `ibtic_gvkey_link` where `ibtic` is unique and one `gvkey` may have multiple `ibtic` matches
- add `gvkey` to `revision`

In [2]:
ld(ibes_det_epsus, path=WRDS_DOWNLOAD_DIR)
ld(comp_security, path=WRDS_DOWNLOAD_DIR) # stock ids

comp_secd = read_feather(str_c(WRDS_DOWNLOAD_DIR, '/comp_secd.feather')) %>% as.data.table() # daily stock price

In [144]:
ibtic_gvkey_link = comp_security[!is.na(ibtic) & excntry=='USA', .(ibtic, gvkey, iid, dldtei)]
comp_secd_link = comp_secd[, .(gvkey, iid, price_date=datadate, price=prccd, join_date=datadate+2)]

In [158]:
ibes = ibes_det_epsus[anndats %between% c(ymd('2007-01-01'), ymd('2019-01-01')) & !is.na(cusip) & measure=='EPS' & usfirm==1 & fpi %in% c(1), 
     .(ticker, cname, anndats, revdats, fpedats, analys, pdf, fpi, value)
    ][order(analys, ticker, fpedats, anndats), ':='(join_date=anndats)
    ][ibtic_gvkey_link, on=.(ticker=ibtic), nomatch=NULL]

revision = comp_secd_link[ibes, on=.(gvkey, iid, join_date), roll=T, nomatch=NULL
    ][is.na(dldtei) | (!is.na(dldtei) & dldtei>=anndats)
    ][, ':='(join_date=NULL, dldtei=NULL)
    ][anndats-price_date<=30
    ][order(analys, ticker, fpedats, anndats)
    ][, .(gvkey, anndats, revdats, pdf, value, revision=(value-shift(value))/price*100), 
      keyby=.(analys, ticker, fpedats)
    ][!is.na(revision)
    ][revision %between% c(quantile(revision, 0.005), quantile(revision, 0.995))]

In [159]:
revision[, summary(revision)]

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-115.82106   -0.32189   -0.00989   -0.55922    0.19482   48.66412 

In [160]:
sv(revision)

-revision- saved  (3.64 secs)


## retail

In [9]:
library(haven)

# ----------------------------
# read: retail trades
# ----------------------------

retail2017 = read_sas('data/retail tracking/retail2017.sas7bdat') %>% as.data.table()

retail2015_2016 = read_sas('data/retail tracking/retail2015_2016.sas7bdat') %>% as.data.table()

retail = rbindlist(list(retail2017, retail2015_2016))

retail[, ':='(DATE_END=DATE)]
retail[1]

SYM_ROOT,DATE,total_vol,retail_buy_vol,retail_sell_vol,DATE_END
<chr>,<date>,<dbl>,<dbl>,<dbl>,<date>
A,2017-01-03,398438,17572,24209,2017-01-03


In [10]:
# -------------------------------
# Add: retail trades <-> `cusip`
# -------------------------------

cusip_sysroot = read_sas('data/retail tracking/cusip-taq_symbol.sas7bdat') %>% as.data.table()

cusip_sysroot = cusip_sysroot[is.na(first_available_date), ':='(first_available_date=ymd('1800-01-01'))
    ][first_available_date<=last_available_date, .(CUSIP, SYMBOL, first_available_date, last_available_date)
    ] %>% unique()

setkey(cusip_sysroot, SYMBOL, first_available_date, last_available_date)

retail_cusip = foverlaps(retail, cusip_sysroot, by.x=c('SYM_ROOT', 'DATE', 'DATE_END'), by.y=c('SYMBOL', 'first_available_date', 'last_available_date'), type='within', nomatch=NULL)
retail_cusip = retail_cusip[, .(sym_root=SYM_ROOT, cusip=CUSIP, date=DATE, total_vol, retail_buy_vol, retail_sell_vol)]
retail_cusip[1]

sym_root,cusip,date,total_vol,retail_buy_vol,retail_sell_vol
<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>
A,00846U101,2017-01-03,398438,17572,24209


In [15]:
retail_cusip[1]

sym_root,cusip,date,total_vol,retail_buy_vol,retail_sell_vol
<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>
A,00846U101,2017-01-03,398438,17572,24209


In [22]:
# -------------------------------
# Add: <-> `gvkey`
# -------------------------------

ld(comp_security, path=WRDS_DOWNLOAD_DIR)

retail_cusip_gvkey = comp_security[, .(gvkey, cusip)
    ][retail_cusip, on=.(cusip), nomatch=NULL
    ][, .(total_vol=sum(total_vol, na.rm=T),
          retail_buy_vol=sum(retail_buy_vol, na.rm=T),
          retail_sell_vol=sum(retail_sell_vol, na.rm=T)), 
      keyby=.(gvkey, date)]

retail_cusip_gvkey[1]
sv(retail_cusip_gvkey)

-comp_security- already exists, will NOT load again!  (0 secs)


gvkey,date,total_vol,retail_buy_vol,retail_sell_vol
<chr>,<date>,<dbl>,<dbl>,<dbl>
1004,2015-01-02,23105,4134,1186


-retail_cusip_gvkey- saved  (10.89 secs)


## `SUE`

Primary key
- [`gvkey`, `rdq`]
- total obs: 43,822
- N of unique PK: 43,815

Variable Description
- `datadate`(COMP): End Date of Earnings Report(earlier than `rdq` and `repdats`)
- `fyearq`(COMP): fiscal year
- `fyr`(COMP): fiscal year - end month
- `fqtr`(COMP): fiscal quarter
    - only in [1, 2, 3, 4]
    - *no "year-end" earnings*
- `repdats`(IBES): Report Date of Quarterly Earnigns
- `rdq`(COMP): Report Date of Quarterly Earnings
- `leadrdq`(COMP): Report Date of NEXT Quarter's Earnings

- `numest`: Number of Forecasts
- `smedest`: standard median forecast (based on estimates in the 90 days prior to the EAD), = medest/price_close
- `sstdest`: standard error of Forecasts, = stdest/price_close
- `basis`: Whether most analysts report estimates on primary(P) / diluted(D)
- `act`(IBES): actual earnings
- `se`(COMP): Standard Earnings (=act/price_close)
- `sest`: Standard Estimates (= se-sue = est/price_close)
- `sue1`: SUE based on a rolling seasonal random walk model (LM,p. 185)
- `sue2`: SUE accounting for  exclusion of special items
- `sue3`: SUE based on IBES reported analyst forecasts and actuals  

- `mcap`: Market Cap

In [346]:
sue = fread('data/sue_final.csv', colClasses=c('gvkey'='character'))
sue[, ':='(permno=as.character(permno),
           datadate=ymd(datadate),
           rdq=ymd(rdq),
           rdq1=ymd(rdq1),
           leadrdq1=ymd(leadrdq1),
           repdats=ymd(repdats),
           sest=se-sue3,
           price_close=act/se)
    ][, ':='(smedest=medest/price_close,
             sstdest=stdest/price_close)
    ][, ':='(stdest=NULL, medest=NULL)]
sue[1]
sv(sue)

gvkey,ticker,permno,conm,fyearq,fqtr,datadate,fyr,rdq,rdq1,leadrdq1,repdats,mcap,act,numest,basis,sue1,sue2,sue3,se,sest,price_close,smedest,sstdest
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<date>,<dbl>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,ADCT,50906,ADC TELECOMMUNICATIONS INC,2010,4,2010-09-30,9,2010-11-23,2010-11-23,2011-02-23,2010-11-23,1231.524,0.15,1,D,,,-0.000789266,0.01183899,0.01262826,12.67,0.01262826,


-sue- saved  (0.27 secs)


## `CAR`

### prepare event list

Compute the CAR based on the following two event types: (1) earnings announcment; (2) earnings call. Do as follows:

1. Create a dataset where the first col is `permno` and the second is `event_date`.
2. Use a Python script (`compute-car.ipynb`) to compute CAR

In [263]:
ld(ciq_wrds_keydev, path=WRDS_DOWNLOAD_DIR)
ld(gvkey_permno_link)
ld(sp500_cst)

-ciq_wrds_keydev- loaded  (10.7 secs)
-gvkey_permno_link- loaded  (0.02 secs)
-sp500_cst- loaded  (0.01 secs)


In [266]:
gvkey_permno_link

gvkey,lpermno,linkdt,linkenddt
<chr>,<dbl>,<date>,<date>
001004,54594,1972-04-24,2020-12-31
001013,50906,1979-03-16,2010-12-31
001034,65832,1984-02-15,2008-12-29
001036,65453,1983-12-06,2001-05-31
001038,66413,1983-08-18,2004-12-31
001043,80071,1993-12-06,2000-01-27
001045,21020,1962-01-31,2012-01-04
001050,11499,1980-11-28,2020-12-31
001056,62500,1977-06-30,2007-08-31
001062,26649,1965-01-29,2020-12-31


In [264]:
event_samples = ciq_wrds_keydev[gvkey_permno_link, on=.(gvkey), nomatch=0
    ][as.Date(mostimportantdateutc)>linkdt & as.Date(mostimportantdateutc)<linkenddt
    ][as.Date(mostimportantdateutc) >= as.Date('2000-01-01')
    ][gvkey %in% sp500_cst[, unique(gvkey)]
    ][, ':='(linkdt=NULL, linkenddt=NULL)]

Then we generate two event samples:
- `event_samples_earnings_call`: the earnings CALL (type 48)
- `event_samples_earnings_announce`: the earnings announcement (type28)

In [31]:
event_samples_test = event_samples[
      mostimportantdateutc>ymd('2019-01-01')
    ][keydeveventtypeid==48,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)
    ] %>% unique()

toJSON(event_samples_test, pretty=T) %>% write('data/car/event_samples_test.json')

In [32]:
event_samples_earnings_call = event_samples[keydeveventtypeid==48,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)] %>% unique()

toJSON(event_samples_earnings_call, pretty=T) %>% write('data/car/event_samples_earnings_call.json')

event_samples_earnings_announce = event_samples[keydeveventtypeid==28,
     .(permno=lpermno, edate=format(mostimportantdateutc, '%m/%d/%Y'))
    ][order(permno, edate)] %>% unique()

toJSON(event_samples_earnings_announce, pretty=T) %>% write('data/car/event_samples_earnings_announce.json')

### ==>

> The ret/car are also in *decimals*

Output these variabls:
- `CAR[-1,-1]`
- `CAR[-2,-2]`
- `CAR[-30,-3]`
- `CAR[0,1]`
- `CAR[0,30]`
- `alpha[-125,-31]`
- `volatility[-125, -31]`

In [153]:
car = fread('./data/CAR/cars_30d_call.csv', colClass=c('integer', rep('character', times=3), 'integer', rep('double', times=7)))[,
      ':='(edate=ymd(edate), rdate=ymd(rdate))
    ][order(permno, edate, rdate)
    ][, .(car_m1_m1=abret[isevt==1 & evttime==-1],
          car_m2_m2=abret[isevt==1 & evttime==-2],
          car_m30_m3=sum(abret[isevt==1 & evttime %between% c(-30,-3)], na.rm=T),
          
          car_0_1=sum(abret[isevt==1 & evttime %between% c(0,1)], na.rm=T),
          car_0_10=sum(abret[isevt==1 & evttime %between% c(0,10)], na.rm=T),
          car_0_20=sum(abret[isevt==1 & evttime %between% c(0,20)], na.rm=T),
          car_0_30=sum(abret[isevt==1 & evttime %between% c(0,30)], na.rm=T),
          
          ret_m30_m3=prod(1+ret[isevt==1 & evttime %between% c(-30, -3)], na.rm=T)-1,
          ret_m2_m2=prod(1+ret[isevt==1 & evttime==-2], na.rm=T)-1,
          ret_m1_m1=prod(1+ret[isevt==1 & evttime==-1], na.rm=T)-1,
          
          ret_0_1=ret[isevt==1 & evttime %between% c(0,1)],
          ret_0_10=prod(1+ret[isevt==1 & evttime %between% c(0,10)], na.rm=T)-1,
          ret_0_20=prod(1+ret[isevt==1 & evttime %between% c(0,20)], na.rm=T)-1,
          ret_0_30=prod(1+ret[isevt==1 & evttime %between% c(0,30)], na.rm=T)-1,
          
          alpha=alpha[isevt==1 & evttime==0],
          beta_mktrf=beta_mktrf[isevt==1 & evttime==0],
          beta_smb=beta_smb[isevt==1 & evttime==0],
          beta_hml=beta_hml[isevt==1 & evttime==0],
          volatility=sd(ret[isevt==0], na.rm=T), # sd(ret[-125,-1])
          volatility2=sd(ret[isevt==0 & evttime %between% c(95, 125)], na.rm=T)), # sd(ret[-30,-1])
      keyby=.(permno, edate)]

car[1]
sv(car)

permno,edate,car_m1_m1,car_m2_m2,car_m30_m3,car_0_1,car_0_10,car_0_20,car_0_30,ret_m30_m3,ret_m2_m2,ret_m1_m1,ret_0_1,ret_0_10,ret_0_20,ret_0_30,alpha,beta_mktrf,beta_smb,beta_hml,volatility,volatility2
<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
10078,2004-10-15,-0.003817367,0.004315403,-0.00785148,-0.01034447,0.07986508,0.06859604,0.16199,0.05167966,-0.01228506,-0.0124378,0,0.1335013,0.2241814,0.3879094,-0.0003937242,1.339511,2.318787,-0.5911424,0.0332523,0.03211976


-car- saved  (0.41 secs)


# X (transcripts)

## `ciq_transcript_detail`

Task:
- Check if there are unknown `transcriptcollectiontypeid`

In [209]:
ld(sp500_cst)
ld(call_release_link)
ld(gvkey_permno_link)
ld(ciq_transcript_detail, path=WRDS_DOWNLOAD_DIR)
ld(ciq_transcript_component, path=WRDS_DOWNLOAD_DIR)
ld(ciq_wrds_gvkey, path=WRDS_DOWNLOAD_DIR)

-sp500_cst- already exists, will NOT load again!  (0 secs)
-call_release_link- already exists, will NOT load again!  (0 secs)
-gvkey_permno_link- already exists, will NOT load again!  (0 secs)
-ciq_transcript_detail- already exists, will NOT load again!  (0 secs)
-ciq_transcript_component- already exists, will NOT load again!  (0 secs)
-ciq_wrds_gvkey- already exists, will NOT load again!  (0 secs)


In [210]:
version_in_data = ciq_transcript_detail[, unique(transcriptcollectiontypeid)]
if (sum(!(version_in_data %in% c(8, 1, 2, 7, 11, 10, 6, 9, 13))) > 0) {
    cat('Unknown `transcriptcollectiontypeid` in data!!!')
} else {cat('All `transcriptcollectiontypeid` are known')}

All `transcriptcollectiontypeid` are known

Task 1: For every `keydevid`, select one `transcriptid` (which believed to be most precise)
- Check: priority order of "audited > proofed > edited > spellchecked"
- We only analyze SP500


Task 2: 
- Add `ciq_wrds_gvkey:gvkey` to `ciq_transcript_detail`
  - merge `ciq_wrds_gvkey` and `ciq_transcript_detail_sp500` by `companyid`
  - one `companyid` may corresp to multiple `gvkey`, we kee ALL the matches

Task 3: 
- Add `size`, `bw_adj`, and `mom` from `dgtw` to `ciq_transcript_detail_sp500`

Warnings:
> In the final `f_ciq_transcript_detail_sp500`:
>   - `transcriptid` and `keydevid` are one-to-one mapped
>   - unique keys: `['transcriptid', 'gvkey']` or `['keydevid', 'gvkey']`
>   - n_row: 35593
>   - unique_n_row: 35077

In [211]:
latest_transcriptcollectiontypeid = ciq_transcript_detail[
     (keydeveventtypeid==48) & 
     (companyid %in% ciq_wrds_gvkey[gvkey %in% sp500_cst$gvkey, companyid])
    ][, { 
      latest_transcript_version = NA_real_
      for (tid in c(8, 1, 2, 7)) {
         if (tid %in% transcriptcollectiontypeid) {
             latest_transcript_version = tid
             break
         } else { 
         for (tid in c(11, 10, 6, 9, 13)) {
             if (tid %in% transcriptcollectiontypeid) {
                 latest_transcript_version = tid 
                 break
             }
         }}
      } 
      list(latest_transcript_version=latest_transcript_version)  
      }, 
      keyby=.(keydevid)]

f_ciq_transcript_detail_sp500 = ciq_transcript_detail[     
     (keydeveventtypeid==48) & 
     (companyid %in% ciq_wrds_gvkey[gvkey %in% sp500_cst$gvkey, companyid])
    ][latest_transcriptcollectiontypeid, on=.(keydevid), nomatch=0
    ][transcriptcollectiontypeid==latest_transcript_version
    ][order(keydevid, -transcriptcreationdate_utc, -transcriptcreationtime_utc)
    ][, .SD[1],
      keyby=.(keydevid)
    ][unique(ciq_wrds_gvkey[, .(companyid, gvkey)], by=c('companyid', 'gvkey')),
      on=.(companyid),
      nomatch=0
    ] %>% unique()

f_ciq_transcript_detail_sp500[, .N]
f_ciq_transcript_detail_sp500[1]
sv(f_ciq_transcript_detail_sp500)

keydevid,companyid,transcriptid,headline,mostimportantdateutc,keydeveventtypeid,keydeveventtypename,companyname,transcriptcollectiontypeid,transcriptcollectiontypename,transcriptpresentationtypeid,transcriptpresentationtypename,transcriptcreationdate_utc,transcriptcreationtime_utc,audiolengthsec,isdelayed_flag,delayreasontypeid,delayreasontypename,latest_transcript_version,gvkey
<dbl>,<dbl>,<dbl>,<chr>,<date>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>
4855823,18671,504,"Albemarle Corp., Q3 2007 Earnings Call, Oct-23-2007",2007-10-23,48,Earnings Calls,Albemarle Corporation,6,SA Edited Copy,5,Final,2008-06-20,1898,,0,,,6,29751


-f_ciq_transcript_detail_sp500- saved  (0.23 secs)


Task:
- check All `f_ciq_transcript_detail_sp500:transcriptid` are identified in `ciq_transcript_component:transcriptid`

In [213]:
if (sum(f_ciq_transcript_detail_sp500[, transcriptid] %in% ciq_transcript_component[, unique(transcriptid)]) == nrow(f_ciq_transcript_detail_sp500)) {
    cat('All `f_ciq_transcript_detail_sp500:transcriptid` are identified in `ciq_transcript_component:transcriptid`')
} else {cat('Unknown `transcriptid` found in f_ciq_transcript_detail_sp500')}

All `f_ciq_transcript_detail_sp500:transcriptid` are identified in `ciq_transcript_component:transcriptid`

## `ciq_transcript_component`

Task:
> - Select obs in `ciq_transcript_component` where `ciq_transcript_component:transcriptid` in `f_ciq_transcript_detail_sp500:transcriptid`
> - Save results to `f_ciq_transcript_component_sp500`

In [214]:
ld(ciq_transcript_component, path=WRDS_DOWNLOAD_DIR)
ld(f_ciq_transcript_detail_sp500)

-ciq_transcript_component- already exists, will NOT load again!  (0 secs)
-f_ciq_transcript_detail_sp500- already exists, will NOT load again!  (0 secs)


In [112]:
f_ciq_transcript_component_sp500 = ciq_transcript_component[
     transcriptid %in% f_ciq_transcript_detail_sp500$transcriptid
    ][order(transcriptid, componentorder)]

In [113]:
sv(f_ciq_transcript_component_sp500)

-f_ciq_transcript_component_sp500- saved  (1.48 mins)


In [12]:
f_ciq_transcript_component_sp500[, .N]
f_ciq_transcript_component_sp500[, uniqueN(transcriptcomponentid)]

In [10]:
# write component to disk for Python to sentencize
write_feather(f_ciq_transcript_component_sp500, 'f_ciq_transcript_component_sp500.feather')

## `detail`+`components`

Task
- We output five types of X:
    - all
    - presentation
    - question
    - answer
    - question and answer

In [215]:
ld(f_ciq_transcript_component_sp500)
ld(ciq_transcript_speaker, path=WRDS_DOWNLOAD_DIR)

-f_ciq_transcript_component_sp500- already exists, will NOT load again!  (0 secs)
-ciq_transcript_speaker- already exists, will NOT load again!  (0 secs)


In [119]:
x_all = f_ciq_transcript_component_sp500[order(transcriptid, componentorder)
    ][transcriptcomponentid %in% ciq_transcript_speaker[transcriptcomponenttypeid %in% c(2,3,4) & speakertypeid %in% c(2,3), unique(transcriptcomponentid)],
      .(text=str_c(componenttext, collapse=' ')),
      keyby=.(transcriptid)]

sv(x_all)

-x_all- saved  (1.32 mins)


In [120]:
x_present = f_ciq_transcript_component_sp500[order(transcriptid, componentorder)
    ][transcriptcomponentid %in% ciq_transcript_speaker[transcriptcomponenttypeid==2 & speakertypeid==2, unique(transcriptcomponentid)],
      .(text=str_c(componenttext, collapse=' ')),
      keyby=.(transcriptid)]

sv(x_present)

-x_present- saved  (32.07 secs)


In [121]:
x_ques = f_ciq_transcript_component_sp500[order(transcriptid, componentorder)
    ][transcriptcomponentid %in% ciq_transcript_speaker[transcriptcomponenttypeid %in% c(3) & speakertypeid %in% c(2,3), unique(transcriptcomponentid)],
      .(text=str_c(componenttext, collapse=' ')),
      keyby=.(transcriptid)]

sv(x_ques)

-x_ques- saved  (12.92 secs)


In [122]:
x_ans = f_ciq_transcript_component_sp500[order(transcriptid, componentorder)
    ][transcriptcomponentid %in% ciq_transcript_speaker[transcriptcomponenttypeid==4 & speakertypeid==2, unique(transcriptcomponentid)],
      .(text=str_c(componenttext, collapse=' ')),
      keyby=.(transcriptid)]

sv(x_ans)

-x_ans- saved  (34.51 secs)


In [123]:
x_qa = f_ciq_transcript_component_sp500[order(transcriptid, componentorder)
    ][transcriptcomponentid %in% ciq_transcript_speaker[transcriptcomponenttypeid %in% c(3,4) & speakertypeid %in% c(2,3), unique(transcriptcomponentid)],
      .(text=str_c(componenttext, collapse=' ')),
      keyby=.(transcriptid)]

sv(x_qa)

-x_qa- saved  (47.76 secs)


# (X,Y)

## filter `sue`

Task:
- remove obs from `sue` where `act` is NA

Filter:
- `sue[!is.na(act)]`   44690->42307

Notes:
- `sue` is computed for the SP500 universe
- ~500 obs in `f_sue` has NA `sue3` because`numest=0`
- some `lagrdq1` in `f_sue` are NA
- I divide `mcap` by 1000
    
Warnings
- Because we need *lead-one earnings*, the last obs of each `gvkey` is removed. 
- Therefore `f_sue` will have *LESS* obs than `sue`
- the latest `rdq` in `f_sue` is *2019-10-13* (updated: 2020-2-5)
- When computing `se_lead1`, you MUST use price at `t`, not simply shifting future `se` backward:`se_lead1=se_(t+1)/price_close_(t)`

In [8]:
ld(sue, force=T)
ld(sp500_cst, force=T)

-sue- loaded  (0.07 secs)
-sp500_cst- loaded  (0.01 secs)


In [11]:
sprintf('N sue: %s\n', nrow(sue)) %>% cat()

f_sue = sue[!is.na(act)] # 42440, ['gvkey', 'rdq'] as unique key

rdq_lag_lead_pair = f_sue[, .(gvkey, lagrdq1=rdq1, rdq1=leadrdq1)]

f_sue = rdq_lag_lead_pair[f_sue, on=.(gvkey, rdq1)
    ][gvkey %in% sp500_cst[, unique(gvkey)],
      .(gvkey, permno, datadate, fyearq, fqtr, fyr, rdq, rdq1, repdats, lagrdq1, leadrdq1,
        mcap=log(mcap), act, smedest, numest, sstdest, sue=sue3, se, sest, price_close)
    ][order(gvkey, rdq)
    ] %>% unique()

sprintf('N f_sue: %s\n', nrow(f_sue)) %>% cat()
sv(f_sue)
f_sue[1]

N sue: 44690
N f_sue: 42307
-f_sue- saved  (0.23 secs)


gvkey,permno,datadate,fyearq,fqtr,fyr,rdq,rdq1,repdats,lagrdq1,leadrdq1,mcap,act,smedest,numest,sstdest,sue,se,sest,price_close
<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,50906,2005-01-31,2005,1,10,2005-02-28,2005-02-28,2005-02-28,,2005-06-01,7.64057,0.01,0.003891051,8,0,0,0.003891051,0.003891051,2.57


In [None]:
f_sue[, .(year=as.factor(year(rdq1)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=F)

## `sue` <-> `call_release_link`

Task: merge with `call_release_link`
- select row with minimal `abs(rdq-call_date)`
- Output: `f_sue_keydevid`

Filter:
- release event in `sue` has no match in `call_release_link` within [-1d,1d]. 42690 -> 29065

Warnings:
- n_obs of `f_sue_keydevid` is larger than that of `f_sue` because in `call_release_link` there exist multiple `call_keydevid` for the SAME `release_keydevid`
    - e.g. `ciq_wrds_keydev[keydevid %in% c(404527363, 404314219, 404314224)]`

In [4]:
ld(call_release_link, force=T)
ld(f_sue, force=T)

-call_release_link- loaded  (0.05 secs)
-f_sue- loaded  (0.04 secs)


In [7]:
call_release_link[1]

call_keydevid,gvkey,call_date,release_date,release_keydevid,companyname,nday_release_lead
<dbl>,<chr>,<date>,<date>,<dbl>,<chr>,<drtn>
3101633,14489,2008-02-28,2008-02-28,5219816,Dell Technologies Inc.,0 days


In [14]:
sprintf('N f_sue: %s\n', nrow(f_sue)) %>% cat()

f_sue_keydevid = call_release_link[, ':='(join_date=release_date)
    ][copy(f_sue)[, ':='(join_date=rdq)], on=.(gvkey, join_date), roll='nearest', nomatch=NULL
    ][, ':='(ciq_release_date=rdq)
    ][, ':='(calldate_rdq_gap=call_date-rdq)
    ][calldate_rdq_gap %between% c(-1, 1)
    ][order(gvkey, rdq)
    ][, ':='(join_date=NULL)
    ][order(gvkey, rdq)
    ][, ':='(docid=str_c(gvkey, '-', ciq_release_date))]

sprintf('N f_sue_keydevid: %s\n', nrow(f_sue_keydevid)) %>% cat()

f_sue_keydevid[1]
sv(f_sue_keydevid) 

N f_sue: 42307
N f_sue_keydevid: 29065


call_keydevid,gvkey,call_date,release_date,release_keydevid,companyname,nday_release_lead,permno,datadate,fyearq,fqtr,fyr,rdq,rdq1,repdats,lagrdq1,leadrdq1,mcap,act,smedest,numest,sstdest,sue,se,sest,price_close,ciq_release_date,calldate_rdq_gap,docid
<dbl>,<chr>,<date>,<date>,<dbl>,<chr>,<drtn>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<date>,<drtn>,<chr>
5152678,1013,2008-03-05,2008-03-05,5241424,ADC Telecommunications Inc.,0 days,50906,2008-01-31,2008,1,10,2008-03-05,2008-03-05,2008-03-05,2007-12-12,2008-06-04,7.46124,0.28,0.01217039,17,0.001547043,0.006761325,0.01893171,0.01217039,14.79,2008-03-05,0 days,001013-2008-03-05


-f_sue_keydevid- saved  (0.28 secs)


In [None]:
f_sue_keydevid[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=F)

## <-> `car`

Task: merge with `CAR`
- `car` also includes `alpha`, `beta` and `volatility`
- I multiplied `car` by 100!!!

Filter:
- events in `f_sue_keydevid` are not in `car`. 29065 -> 26967

In [424]:
ld(f_sue_keydevid, force=T)
ld(car, force=T)

-f_sue_keydevid- loaded  (0.09 secs)
-car- loaded  (0.07 secs)


In [425]:
car[1]

permno,edate,car_m1_m1,car_m2_m2,car_m30_m3,car_0_1,car_0_10,car_0_20,car_0_30,ret_m30_m3,ret_m2_m2,ret_m1_m1,ret_0_1,ret_0_10,ret_0_20,ret_0_30,alpha,beta_mktrf,beta_smb,beta_hml,volatility,volatility2
<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
10078,2004-10-15,-0.003817367,0.004315403,-0.00785148,-0.01034447,0.07986508,0.06859604,0.16199,0.05167966,-0.01228506,-0.0124378,0,0.1335013,0.2241814,0.3879094,-0.0003937242,1.339511,2.318787,-0.5911424,0.0332523,0.03211976


In [427]:
sprintf('N f_sue_keydevid: %s\n', nrow(f_sue_keydevid)) %>% cat()

car_cols = c('car_m1_m1', 'car_m2_m2', 'car_m30_m3', 'car_0_1', 'car_0_10', 'car_0_20', 'car_0_30',
             'ret_m1_m1', 'ret_m2_m2', 'ret_m30_m3', 'ret_0_1', 'ret_0_10', 'ret_0_20', 'ret_0_30',
             'alpha', 'volatility')

f_sue_keydevid_car = car[, ':='(join_date=edate)
    ][f_sue_keydevid[, ':='(join_date=call_date)], on=.(permno, join_date), nomatch=NULL
    ][, ':='(join_date=NULL)
    ][, (car_cols) := lapply(.SD, function(x) 100*x), .SDcols=car_cols
    ][, {
      res = list()
      for (i in 1:.N) {
          leadrdq1_tmp = leadrdq1[i]
          lagrdq1_tmp = lagrdq1[i]
          lead_selector=fillna(rdq1==leadrdq1_tmp, na.value=F)
          lag_selector=fillna(rdq1==lagrdq1_tmp, na.value=F)
          
          se_lead1 = se[lead_selector][1]
          se_lag1 = se[lag_selector][1]
          sue_lag1 = sue[lag_selector][1]
          sue_lead1 = sue[lead_selector][1]
          sest_lead1 = sest[lead_selector][1]
          sest_lag1 = sest[lag_selector][1]
          
          car_0_10_lead1 = car_0_10[lead_selector][1]
          car_0_10_lag1 = car_0_10[lag_selector][1]
          car_0_20_lead1 = car_0_20[lead_selector][1]
          car_0_20_lag1 = car_0_20[lag_selector][1]
          car_0_30_lead1 = car_0_30[lead_selector][1]
          car_0_30_lag1 = car_0_30[lag_selector][1]
          

          res[[i]] = list(docid=docid[i], permno=permno[i], datadate=datadate[i], fyearq=fyearq[i], fqtr=fqtr[i],
                          fyr=fyr[i], rdq=rdq[i], rdq1=rdq1[i], repdats=repdats[i], ciq_release_date=ciq_release_date[i],
                          call_date=call_date[i],
                          leadrdq1=leadrdq1[i], release_keydevid=release_keydevid[i],
                          call_keydevid=call_keydevid[i], companyname=companyname[i],
                          mcap=mcap[i], sue=sue[i], sue_lag1=sue_lag1, sue_lead1=sue_lead1,
                          se=se[i], se_lag1=se_lag1, se_lead1=se_lead1, 
                          sest=sest[i], sest_lag1=sest_lag1, sest_lead1=sest_lead1,
                          smedest=smedest[i], numest=numest[i], sstdest=sstdest[i],
                          ret_m1_m1=ret_m1_m1[i], ret_m2_m2=ret_m2_m2[i], ret_m30_m3=ret_m30_m3[i],
                          ret_0_10=ret_0_10[i], ret_0_20=ret_0_20[i], ret_0_30=ret_0_30[i],
                          car_m1_m1=car_m1_m1[i], car_m2_m2=car_m2_m2[i], car_m30_m3=car_m30_m3[i], 
                          car_0_10=car_0_10[i], car_0_10_lead1=car_0_10_lead1, car_0_10_lag1=car_0_10_lag1,
                          car_0_20=car_0_20[i], car_0_20_lead1=car_0_20_lead1, car_0_20_lag1=car_0_20_lag1,
                          car_0_30=car_0_30[i], car_0_30_lead1=car_0_30_lead1, car_0_30_lag1=car_0_30_lag1,
                          alpha=alpha[i], beta_mktrf=beta_mktrf[i], beta_smb=beta_smb[i],
                          beta_hml=beta_hml[i], volatility=volatility[i])}
      res = rbindlist(res)
      }, 
      keyby=.(gvkey)
    ][order(gvkey, ciq_release_date)]

sprintf('N f_sue_keydevid_car: %s\n', nrow(f_sue_keydevid_car)) %>% cat()

f_sue_keydevid_car[1]
sv(f_sue_keydevid_car)

N f_sue_keydevid: 29065
N f_sue_keydevid_car: 53930


gvkey,docid,permno,datadate,fyearq,fqtr,fyr,rdq,rdq1,repdats,ciq_release_date,call_date,leadrdq1,release_keydevid,call_keydevid,companyname,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,smedest,numest,sstdest,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lead1,car_0_10_lag1,car_0_20,car_0_20_lead1,car_0_20_lag1,car_0_30,car_0_30_lead1,car_0_30_lag1,alpha,beta_mktrf,beta_smb,beta_hml,volatility
<chr>,<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,001013-2008-03-05,50906,2008-01-31,2008,1,10,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,ADC Telecommunications Inc.,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,0.01217039,17,0.001547043,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,0.7939703,-1.043903,-2.655725,-9.076894,11.50231,,-6.616572,4.588225,,-6.567197,0.3902708,,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002


-f_sue_keydevid_car- saved  (0.51 secs)


In [None]:
# Plot: number of calls by year
f_sue_keydevid_car[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=F)

## <-> `finratio`

Filter:
- `gvkey` in `f_sue_keydevid_car` are not in `wrds_finratio`. 
- `abs(pubdate_calldate_gap)<=60`. 
- 26967 -> 24470

In [160]:
ld(wrds_finratio, path=WRDS_DOWNLOAD_DIR, force=T)
ld(f_sue_keydevid_car, force=T)

-wrds_finratio- loaded  (0.29 secs)
-f_sue_keydevid_car- loaded  (0.11 secs)


In [161]:
sprintf('N f_sue_keydevid_car: %s\n', nrow(f_sue_keydevid_car)) %>% cat()

f_sue_keydevid_car_finratio = wrds_finratio[, ':='(join_date=public_date)
    ][f_sue_keydevid_car[, ':='(join_date=call_date)], on=.(gvkey, join_date), roll='nearest'
    ][, ':='(pubdate_calldate_gap=public_date-call_date)
    ][!is.na(pubdate_calldate_gap)
    ][abs(pubdate_calldate_gap)<=60
    ][order(gvkey, ciq_release_date)
    ][, ':='(bm=nafill(bm, 'locf'), roa=nafill(roa, 'locf'), debt_assets=nafill(debt_assets, 'locf'))
    ][, ':='(pubdate_calldate_gap=NULL, adate=NULL, qdate=NULL, public_date=NULL, join_date=NULL)]

sprintf('N f_sue_keydevid_car_finratio: %s\n', nrow(f_sue_keydevid_car_finratio)) %>% cat()

f_sue_keydevid_car_finratio[1]
sv(f_sue_keydevid_car_finratio)

N f_sue_keydevid_car: 53930
N f_sue_keydevid_car_finratio: 48936


gvkey,bm,roa,debt_assets,docid,permno,datadate,fyearq,fqtr,fyr,rdq,rdq1,repdats,ciq_release_date,call_date,leadrdq1,release_keydevid,call_keydevid,companyname,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,smedest,numest,sstdest,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lead1,car_0_10_lag1,car_0_20,car_0_20_lead1,car_0_20_lag1,car_0_30,car_0_30_lead1,car_0_30_lag1,alpha,beta_mktrf,beta_smb,beta_hml,volatility
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,0.477,0.099,0.429,001013-2008-03-05,50906,2008-01-31,2008,1,10,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,ADC Telecommunications Inc.,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,0.01217039,17,0.001547043,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,0.7939703,-1.043903,-2.655725,-9.076894,11.50231,,-6.616572,4.588225,,-6.567197,0.3902708,,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002


-f_sue_keydevid_car_finratio- saved  (0.49 secs)


## <-> volume

Filter:
- Some gvkey has no `iid=01` in `comp_secd`
- 24470 -> 24451

Notes:
- For any `gvkey`, we only use `iid=01`
- In `comp_secd` some records have `volume==0`, we just keep it.

In [162]:
ld(comp_secd, path=WRDS_DOWNLOAD_DIR)
ld(f_sue_keydevid_car_finratio, force=T)

-comp_secd- loaded  (3.16 mins)
-f_sue_keydevid_car_finratio- loaded  (3.77 secs)


In [163]:
sprintf('N f_sue_keydevid_car_finratio: %s\n', nrow(f_sue_keydevid_car_finratio)) %>% cat()

f_sue_keydevid_car_finratio_vol = comp_secd[datadate>ymd('2005-01-01'), .(gvkey, iid, volume=cshtrd/1e6, price_date=datadate)
    ][order(gvkey, price_date, iid)
    ][, head(.SD,1), keyby=.(gvkey, price_date)
    ][, ':='(join_date=price_date)
    ][, .(gvkey, join_date, volume, price_date, iid)
    ][f_sue_keydevid_car_finratio[, ':='(join_date=ciq_release_date)], on=.(gvkey, join_date), nomatch=NULL
    ][, ':='(releasedate_datadate_gap=ciq_release_date-price_date)
    ][abs(releasedate_datadate_gap)<=0
    ][order(-abs(releasedate_datadate_gap))
    ][, ':='(join_date=NULL, releasedate_datadate_gap=NULL, price_date=NULL, iid=NULL)]

sprintf('N f_sue_keydevid_car_finratio_vol: %s\n', nrow(f_sue_keydevid_car_finratio_vol)) %>% cat()
f_sue_keydevid_car_finratio_vol[1]
sv(f_sue_keydevid_car_finratio_vol)

N f_sue_keydevid_car_finratio: 48936
N f_sue_keydevid_car_finratio_vol: 48898


gvkey,volume,bm,roa,debt_assets,docid,permno,datadate,fyearq,fqtr,fyr,rdq,rdq1,repdats,ciq_release_date,call_date,leadrdq1,release_keydevid,call_keydevid,companyname,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,smedest,numest,sstdest,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lead1,car_0_10_lag1,car_0_20,car_0_20_lead1,car_0_20_lag1,car_0_30,car_0_30_lead1,car_0_30_lag1,alpha,beta_mktrf,beta_smb,beta_hml,volatility
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,3.996961,0.477,0.099,0.429,001013-2008-03-05,50906,2008-01-31,2008,1,10,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,ADC Telecommunications Inc.,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,0.01217039,17,0.001547043,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,0.7939703,-1.043903,-2.655725,-9.076894,11.50231,,-6.616572,4.588225,,-6.567197,0.3902708,,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002


-f_sue_keydevid_car_finratio_vol- saved  (0.51 secs)


## <-> `transcriptid`

Filter:
- `call_keydevid` must also exists in `f_ciq_transcript_detail_sp500`
- 24451 -> 24105

In [164]:
ld(f_ciq_transcript_detail_sp500, force=T)
ld(f_sue_keydevid_car_finratio_vol, force=T)

-f_ciq_transcript_detail_sp500- loaded  (0.08 secs)
-f_sue_keydevid_car_finratio_vol- loaded  (0.14 secs)


In [165]:
sprintf('N f_sue_keydevid_car_finratio_vol: %s\n', nrow(f_sue_keydevid_car_finratio_vol)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid = unique(f_ciq_transcript_detail_sp500[, .(join_keydevid=keydevid, transcriptid)])[f_sue_keydevid_car_finratio_vol[, ':='(join_keydevid=call_keydevid)], 
      on=.(join_keydevid),
      nomatch=NULL
    ][, .(gvkey, permno, datadate, fyear=fyearq, fqtr, ciq_release_date, ciq_call_date=call_date, rdq, rdq1, repdats,
          leadrdq1, release_keydevid, call_keydevid, transcriptid,
          mcap, sue, sue_lag1, sue_lead1, se, se_lag1, se_lead1,
          sest, sest_lag1, sest_lead1, numest, smedest, sstdest, volume,
          ret_m1_m1, ret_m2_m2, ret_m30_m3,
          ret_0_10, ret_0_20, ret_0_30,
          car_m1_m1, car_m2_m2, car_m30_m3, 
          car_0_10, car_0_10_lag1, car_0_10_lead1, car_0_20, car_0_20_lag1, car_0_20_lead1,
          car_0_30, car_0_30_lag1, car_0_30_lead1,
          bm, roa, debt_asset=debt_assets, alpha, beta_mktrf, beta_smb, beta_hml, volatility, docid)
    ][order(gvkey, ciq_release_date)]

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid[1]
sv(f_sue_keydevid_car_finratio_vol_transcriptid)

N f_sue_keydevid_car_finratio_vol: 48898
N f_sue_keydevid_car_finratio_vol_transcriptid: 48208


gvkey,permno,datadate,fyear,fqtr,ciq_release_date,ciq_call_date,rdq,rdq1,repdats,leadrdq1,release_keydevid,call_keydevid,transcriptid,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,numest,smedest,sstdest,volume,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility,docid
<chr>,<chr>,<date>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1013,50906,2008-01-31,2008,1,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,320,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,17,0.01217039,0.001547043,3.996961,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,0.7939703,-1.043903,-2.655725,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.477,0.099,0.429,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002,001013-2008-03-05


-f_sue_keydevid_car_finratio_vol_transcriptid- saved  (0.5 secs)


In [None]:
# Plot: number of calls per year
f_sue_keydevid_car_finratio_vol_transcriptid[, .(year=as.factor(year(ciq_release_date)))] %>%
    plot_ly(x=~year, type='histogram') %>%
    plotly::layout(autosize=F)

## <-> similarity

Filter:
- `docid` must also exists in `similarity`
- 24105 -> 21822

Warnings:
- Must do this step after joining `transcripts`.
- before joining `similarity`, please first create `similarity.feather` in Python!

In [166]:
similarity = as.data.table(read_feather('data/similarity.feather'))

ld(f_sue_keydevid_car_finratio_vol_transcriptid)

-f_sue_keydevid_car_finratio_vol_transcriptid- already exists, will NOT load again!  (0 secs)


In [167]:
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim = similarity[f_sue_keydevid_car_finratio_vol_transcriptid, on=.(docid), nomatch=NULL]

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim)) %>% cat()
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim)
f_sue_keydevid_car_finratio_vol_transcriptid_sim[1]

N f_sue_keydevid_car_finratio_vol_transcriptid: 48208
N f_sue_keydevid_car_finratio_vol_transcriptid_sim: 43644
-f_sue_keydevid_car_finratio_vol_transcriptid_sim- saved  (0.47 secs)


docid,similarity_unigram,similarity_bigram,similarity_allgram,gvkey,permno,datadate,fyear,fqtr,ciq_release_date,ciq_call_date,rdq,rdq1,repdats,leadrdq1,release_keydevid,call_keydevid,transcriptid,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,numest,smedest,sstdest,volume,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<date>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
001013-2008-03-05,0.7845072,0.0009473079,0.02925144,1013,50906,2008-01-31,2008,1,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,320,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,17,0.01217039,0.001547043,3.996961,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,0.7939703,-1.043903,-2.655725,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.477,0.099,0.429,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002


In [168]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim[sue>quantile(sue,0.9), mean(car_0_30)]
f_sue_keydevid_car_finratio_vol_transcriptid_sim[sue<quantile(sue,0.1), mean(car_0_30)]

## <-> `inflow`

Task:
- Track the [-3, 35] and [-35, 3] money inflow before or after each earnings call
    - use `foverlaps(query, subject)`
    - `query=f_sue`
    - `subject=inflow`

In [169]:
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim, force=T)
ld(inflow, force=T)

-f_sue_keydevid_car_finratio_vol_transcriptid_sim- loaded  (0.11 secs)
-inflow- loaded  (0.25 secs)


In [170]:
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim)) %>% cat()

# subject: inflow
inflow[, ':='(start=report_dt-95, end=report_dt+95)]
setkey(inflow, permno, start, end)

# query: f_sue
f_sue_keydevid_car_finratio_vol_transcriptid_sim[, ':='(start=ciq_call_date, end=ciq_call_date)]
setkey(f_sue_keydevid_car_finratio_vol_transcriptid_sim, permno, start, end)

# foverlaps(query, subject)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow = foverlaps(
      f_sue_keydevid_car_finratio_vol_transcriptid_sim, inflow,
      nomatch=NA)[, ':='(mv=NULL, i.start=NULL, i.end=NULL)
    ][, ':='(inflow_0_90=sum(inflow[(report_dt-ciq_call_date) %between% c(-3, 85)], na.rm=T)), by=docid
    ][, ':='(inflow_0_60=sum(inflow[(report_dt-ciq_call_date) %between% c(-3, 60)], na.rm=T)), by=docid
    ][, ':='(inflow_0_30=sum(inflow[(report_dt-ciq_call_date) %between% c(-3, 30)], na.rm=T)), by=docid
    ][, ':='(inflow_m30_0=sum(inflow[(report_dt-ciq_call_date) %between% c(-30, 3)], na.rm=T)), by=docid
    ][is.na(inflow_0_90), ':='(inflow_0_90=0)
    ][is.na(inflow_0_60), ':='(inflow_0_60=0)
    ][is.na(inflow_0_30), ':='(inflow_0_30=0)
    ][is.na(inflow_m30_0), ':='(inflow_m30_0=0)
    ][, ':='(report_dt=NULL, inflow=NULL, start=NULL, end=NULL)
    ][order(docid)] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow)) %>% cat()

sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow[1]

N f_sue_keydevid_car_finratio_vol_transcriptid_sim: 43644
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: 21822
-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow- saved  (0.43 secs)


permno,docid,similarity_unigram,similarity_bigram,similarity_allgram,gvkey,datadate,fyear,fqtr,ciq_release_date,ciq_call_date,rdq,rdq1,repdats,leadrdq1,release_keydevid,call_keydevid,transcriptid,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,numest,smedest,sstdest,volume,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility,inflow_0_90,inflow_0_60,inflow_0_30,inflow_m30_0
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<date>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
50906,001013-2008-03-05,0.7845072,0.0009473079,0.02925144,1013,2008-01-31,2008,1,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,320,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,17,0.01217039,0.001547043,3.996961,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,0.7939703,-1.043903,-2.655725,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.477,0.099,0.429,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002,149.3935,149.3935,395.3898,42.46161


## <-> `revision`

In [171]:
ld(revision, force=T)
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow, force=T)

-revision- loaded  (1 secs)
-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow- loaded  (0.06 secs)


> Task: How soon do anlysts revise their year-end earnings forecats after earnings call?

In [172]:
how_soon_revise = revision[, .(gvkey, anndats, analys, revision, join_date=anndats)
    ][f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow[, .(gvkey, ciq_call_date, sue, join_date=ciq_call_date)],
      on=.(gvkey, join_date>=join_date), allow.cartesian=T
    ][order(gvkey, ciq_call_date, anndats), .(gvkey, ciq_call_date, anndats, analys, revision)
    ][anndats-ciq_call_date<=30, .(t=(anndats-ciq_call_date)[1], n=.N), keyby=.(gvkey, ciq_call_date, anndats)]

In [None]:
# plot: How soon do anlysts revise their year-end earnings forecats after earnings call?
# ==> Most in the first 2 days
how_soon_revise[, .(n=sum(n)), keyby=.(t)] %>%
    plot_ly(x=~t, y=~n, type='bar') %>%
    plotly::layout(autosize=F, xaxis=list(title='N days after earnings call'),
                   yaxis=list(title='N revisions'))

Task:
- <-> `revision` 

In [173]:
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow)) %>% cat()

# subject: inflow
revision[, ':='(start=anndats-95, end=anndats+95)]
setkey(revision, gvkey, start, end)

# query: f_sue
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow[, ':='(start=ciq_call_date, end=ciq_call_date)]
setkey(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow, gvkey, start, end)

# foverlaps(query, subject)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision = foverlaps(
      f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow, revision,
      nomatch=NA)[, ':='(i.start=NULL, i.end=NULL)
    ][, ':='(revision_0_90=median(revision[(anndats-ciq_call_date) %between% c(-1, 85)], na.rm=T)), by=docid
    ][, ':='(revision_0_60=median(revision[(anndats-ciq_call_date) %between% c(-1, 60)], na.rm=T)), by=docid
    ][, ':='(revision_0_30=median(revision[(anndats-ciq_call_date) %between% c(-1, 30)], na.rm=T)), by=docid
    ][, ':='(revision_m30_0=median(revision[(anndats-ciq_call_date) %between% c(-30, 1)], na.rm=T)), by=docid
    ][is.na(revision_0_90), ':='(revision_0_90=0)
    ][is.na(revision_0_60), ':='(revision_0_60=0)
    ][is.na(revision_0_30), ':='(revision_0_30=0)
    ][is.na(revision_m30_0), ':='(revision_m30_0=0)
    ][, ':='(anndats=NULL, analys=NULL, ticker=NULL, fpedats=NULL, revdats=NULL, value=NULL, revision=NULL, pdf=NULL, start=NULL, end=NULL)
    ][order(docid)] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision)) %>% cat()

sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision[1]

N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: 21822
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision: 21822
-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision- saved  (0.44 secs)


gvkey,permno,docid,similarity_unigram,similarity_bigram,similarity_allgram,datadate,fyear,fqtr,ciq_release_date,ciq_call_date,rdq,rdq1,repdats,leadrdq1,release_keydevid,call_keydevid,transcriptid,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,numest,smedest,sstdest,volume,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility,inflow_0_90,inflow_0_60,inflow_0_30,inflow_m30_0,revision_0_90,revision_0_60,revision_0_30,revision_m30_0
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,50906,001013-2008-03-05,0.7845072,0.0009473079,0.02925144,2008-01-31,2008,1,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,320,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,17,0.01217039,0.001547043,3.996961,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,0.7939703,-1.043903,-2.655725,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.477,0.099,0.429,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002,149.3935,149.3935,395.3898,42.46161,0.1509813,0.1485884,0.1483683,0.1481481


## <-> `retail`

In [6]:
# -------------------------------
# compute the output: retail
# -------------------------------
ld(retail_cusip_gvkey)
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision)

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision)) %>% cat()

# create start/end/key for foverlaps
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision[, ':='(start=ciq_call_date, end=ciq_call_date)]
setkey(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision, gvkey, start, end)

# create start/end/key for foverlaps
retail_cusip_gvkey[, ':='(start=date-95, end=date+95)]
setkey(retail_cusip_gvkey, gvkey, start, end)

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail = foverlaps(
      f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision, retail_cusip_gvkey,
      nomatch=NA)[, ':='(i.start=NULL, i.end=NULL)
    ][, ':='(retail_tot_m30_0=sum(total_vol[(date-ciq_call_date) %between% c(-30, 0)], na.rm=T)), by=docid
    ][, ':='(retail_tot_m10_0=sum(total_vol[(date-ciq_call_date) %between% c(-10, 0)], na.rm=T)), by=docid
    ][, ':='(retail_tot_m3_0=sum(total_vol[(date-ciq_call_date) %between% c(-3, 0)], na.rm=T)), by=docid
    ][, ':='(retail_tot_0_3=sum(total_vol[(date-ciq_call_date) %between% c(0, 3)], na.rm=T)), by=docid
    ][, ':='(retail_tot_0_10=sum(total_vol[(date-ciq_call_date) %between% c(0, 10)], na.rm=T)), by=docid
    ][, ':='(retail_tot_0_30=sum(total_vol[(date-ciq_call_date) %between% c(0, 30)], na.rm=T)), by=docid
    ][, ':='(retail_buy_m30_0=sum(retail_buy_vol[(date-ciq_call_date) %between% c(-30, 0)], na.rm=T)), by=docid
    ][, ':='(retail_buy_m10_0=sum(retail_buy_vol[(date-ciq_call_date) %between% c(-10, 0)], na.rm=T)), by=docid
    ][, ':='(retail_buy_m3_0=sum(retail_buy_vol[(date-ciq_call_date) %between% c(-3, 0)], na.rm=T)), by=docid
    ][, ':='(retail_buy_0_3=sum(retail_buy_vol[(date-ciq_call_date) %between% c(0, 3)], na.rm=T)), by=docid
    ][, ':='(retail_buy_0_10=sum(retail_buy_vol[(date-ciq_call_date) %between% c(0, 10)], na.rm=T)), by=docid
    ][, ':='(retail_buy_0_30=sum(retail_buy_vol[(date-ciq_call_date) %between% c(0, 30)], na.rm=T)), by=docid    
    ][, ':='(retail_sell_m30_0=sum(retail_sell_vol[(date-ciq_call_date) %between% c(-30, 0)], na.rm=T)), by=docid
    ][, ':='(retail_sell_m10_0=sum(retail_sell_vol[(date-ciq_call_date) %between% c(-10, 0)], na.rm=T)), by=docid
    ][, ':='(retail_sell_m3_0=sum(retail_sell_vol[(date-ciq_call_date) %between% c(-3, 0)], na.rm=T)), by=docid
    ][, ':='(retail_sell_0_3=sum(retail_sell_vol[(date-ciq_call_date) %between% c(0, 3)], na.rm=T)), by=docid
    ][, ':='(retail_sell_0_10=sum(retail_sell_vol[(date-ciq_call_date) %between% c(0, 10)], na.rm=T)), by=docid
    ][, ':='(retail_sell_0_30=sum(retail_sell_vol[(date-ciq_call_date) %between% c(0, 30)], na.rm=T)), by=docid   
    ][, ':='(retail_net_m30_0=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(-30, 0)], na.rm=T)), by=docid
    ][, ':='(retail_net_m10_0=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(-10, 0)], na.rm=T)), by=docid
    ][, ':='(retail_net_m3_0=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(-3, 0)], na.rm=T)), by=docid
    ][, ':='(retail_net_0_3=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(0, 3)], na.rm=T)), by=docid
    ][, ':='(retail_net_0_10=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(0, 10)], na.rm=T)), by=docid
    ][, ':='(retail_net_0_30=sum((retail_buy_vol-retail_sell_vol)[(date-ciq_call_date) %between% c(0, 30)], na.rm=T)), by=docid
    ][, ':='(total_vol=NULL, retail_buy_vol=NULL, retail_sell_vol=NULL, start=NULL, end=NULL, date=NULL)
    ][order(docid)] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail[1]
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail)

-retail_cusip_gvkey- loaded  (1.35 secs)
-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision- loaded  (0.09 secs)
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision: 21822
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail: 21822


gvkey,permno,docid,similarity_unigram,similarity_bigram,similarity_allgram,datadate,fyear,fqtr,ciq_release_date,ciq_call_date,rdq,rdq1,repdats,leadrdq1,release_keydevid,call_keydevid,transcriptid,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,numest,smedest,sstdest,volume,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility,inflow_0_90,inflow_0_60,inflow_0_30,inflow_m30_0,revision_0_90,revision_0_60,revision_0_30,revision_m30_0,retail_tot_m30_0,retail_tot_m10_0,retail_tot_m3_0,retail_tot_0_3,retail_tot_0_10,retail_tot_0_30,retail_buy_m30_0,retail_buy_m10_0,retail_buy_m3_0,retail_buy_0_3,retail_buy_0_10,retail_buy_0_30,retail_sell_m30_0,retail_sell_m10_0,retail_sell_m3_0,retail_sell_0_3,retail_sell_0_10,retail_sell_0_30,retail_net_m30_0,retail_net_m10_0,retail_net_m3_0,retail_net_0_3,retail_net_0_10,retail_net_0_30
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,50906,001013-2008-03-05,0.7845072,0.0009473079,0.02925144,2008-01-31,2008,1,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,320,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,17,0.01217039,0.001547043,3.996961,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,0.7939703,-1.043903,-2.655725,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.477,0.099,0.429,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002,149.3935,149.3935,395.3898,42.46161,0.1509813,0.1485884,0.1483683,0.1481481,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail- saved  (0.56 secs)


In [None]:
# ----------------------------------------
# PLOT: retail trades change around the call day
# ----------------------------------------

dt = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail[!is.na(date) & ((date-ciq_call_date) %between% c(-30, 30)), 
      .(retail_tot=sum(total_vol, na.rm=T), 
        retail_buy=sum(retail_buy_vol, na.rm=T), 
        retail_sell=sum(retail_sell_vol, na.rm=T)), 
      keyby=.(date-ciq_call_date)
    ][, c('retail_tot', 'retail_buy', 'retail_sell') := lapply(.SD, cumsum), .SDcols=is.numeric][]

dt %>%
    plot_ly(x=~date, y=~(retail_buy-retail_sell), type='scatter', mode='lines') %>%
#     add_trace(y=~retail_buy) %>%
#     add_trace(y=~(retail_buy-retail_sell)) %>%
    plotly::layout(autosize=F)

## <-> `sentiment`

In [7]:
# generate sentiment
present_sentiment = as.data.table(read_parquet('data/present_sentiment.parquet',
      col_select=c('docid', 'chunk_i', 'chunk_positive', 'chunk_neutral', 
                   'chunk_negative', 'sentence_i', 'sentence_positive', 
                   'sentence_neutral', 'sentence_negative')
    ))[, ':='(n_sentence_in_chunk=max(sentence_i)), keyby=.(docid, chunk_i)
    ][n_sentence_in_chunk>=5, 
      .(present_positive_chunk=mean(chunk_positive), present_positive_sent=mean(sentence_positive, na.rm=T),
        present_neutral_chunk=mean(chunk_neutral), present_neutral_sent=mean(sentence_neutral, na.rm=T),
        present_negative_chunk=mean(chunk_negative), present_negative_sent=mean(sentence_negative, na.rm=T)),
      keyby=docid]

qa_sentiment = as.data.table(read_parquet('data/qa_sentiment.parquet', 
      col_select=c('docid', 'chunk_i', 'chunk_positive', 'chunk_neutral',
                   'chunk_negative', 'sentence_i', 'sentence_positive',
                   'sentence_neutral', 'sentence_negative')
    ))[, ':='(n_sentence_in_chunk=max(sentence_i)), keyby=.(docid, chunk_i)
    ][n_sentence_in_chunk>=5, 
      .(qa_positive_chunk=mean(chunk_positive), qa_positive_sent=mean(sentence_positive, na.rm=T),
        qa_neutral_chunk=mean(chunk_neutral), qa_neutral_sent=mean(sentence_neutral, na.rm=T),
        qa_negative_chunk=mean(chunk_negative), qa_negative_sent=mean(sentence_negative, na.rm=T)),
      keyby=docid]

sentiment = present_sentiment[qa_sentiment, on=.(docid), nomatch=NA]

sentiment[1, .(docid, md_positive=present_positive_sent, md_neutral=present_neutral_sent, md_negative=present_negative_sent，
               qa_positive=qa_positive_sent, qa_neutral=qa_neutral_sent, qa_negative=qa_negative_sent)]

docid,md_positive,md_neutral,md_negative,qa_positive,qa_neutral,qa_negative
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
001013-2008-03-05,0.2613174,0.6041317,0.1345509,0.2761852,0.5408889,0.1829259


In [8]:
# <-> sentiment
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail)

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment = sentiment[f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail, on=.(docid), nomatch=NULL]

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: %s\n (%s variable)\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment), length(names(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment))) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment[1]
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment)

-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail- already exists, will NOT load again!  (0 secs)
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail: 21822
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: 21816
 (102 variable)


docid,present_positive_chunk,present_positive_sent,present_neutral_chunk,present_neutral_sent,present_negative_chunk,present_negative_sent,qa_positive_chunk,qa_positive_sent,qa_neutral_chunk,qa_neutral_sent,qa_negative_chunk,qa_negative_sent,gvkey,permno,similarity_unigram,similarity_bigram,similarity_allgram,datadate,fyear,fqtr,ciq_release_date,ciq_call_date,rdq,rdq1,repdats,leadrdq1,release_keydevid,call_keydevid,transcriptid,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,numest,smedest,sstdest,volume,ret_m1_m1,ret_m2_m2,ret_m30_m3,ret_0_10,ret_0_20,ret_0_30,...,car_m30_m3,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility,inflow_0_90,inflow_0_60,inflow_0_30,inflow_m30_0,revision_0_90,revision_0_60,revision_0_30,revision_m30_0,retail_tot_m30_0,retail_tot_m10_0,retail_tot_m3_0,retail_tot_0_3,retail_tot_0_10,retail_tot_0_30,retail_buy_m30_0,retail_buy_m10_0,retail_buy_m3_0,retail_buy_0_3,retail_buy_0_10,retail_buy_0_30,retail_sell_m30_0,retail_sell_m10_0,retail_sell_m3_0,retail_sell_0_3,retail_sell_0_10,retail_sell_0_30,retail_net_m30_0,retail_net_m10_0,retail_net_m3_0,retail_net_0_3,retail_net_0_10,retail_net_0_30
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
001013-2008-03-05,0.5322156,0.2613174,0.1649701,0.6041317,0.3028144,0.1345509,0.5445926,0.2761852,0.1044074,0.5408889,0.351,0.1829259,1013,50906,0.7845072,0.0009473079,0.02925144,2008-01-31,2008,1,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,320,7.46124,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,17,0.01217039,0.001547043,3.996961,0.2971765,-1.536211,0.2199395,-12.37037,-6.222223,-6.96296,...,-2.655725,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.477,0.099,0.429,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002,149.3935,149.3935,395.3898,42.46161,0.1509813,0.1485884,0.1483683,0.1481481,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment- saved  (0.63 secs)


## <-> `transcripts`

Task
- add `x_present`, `x_qa`, `x_ans` and `x_ques`

Filter:
- If one `release_keydevid` have multiple `transcriptid`, merge them
- 21822 -> 21763

Note:
- unique key: ['gvkey', 'ciq_release_date']

In [1]:
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment, force=T)
ld(x_present)
ld(x_qa)
ld(x_ans)
ld(x_ques)
ld(x_all)

-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment- loaded  (0.09 secs)
-x_present- loaded  (4.32 secs)
-x_qa- loaded  (5.17 secs)
-x_ans- loaded  (4.95 secs)
-x_ques- loaded  (1.41 secs)
-x_all- loaded  (8.69 secs)


In [2]:
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment[
      x_present[, .(transcriptid, text_present=text)], on=.(transcriptid), nomatch=NULL
    ][x_qa[, .(transcriptid, text_qa=text)], on=.(transcriptid), nomatch=NULL
    ][x_ans[, .(transcriptid, text_ans=text)], on=.(transcriptid), nomatch=NULL
    ][x_ques[, .(transcriptid, text_ques=text)], on=.(transcriptid), nomatch=NULL
    ][x_all[, .(transcriptid, text_all=text)], on=.(transcriptid), nomatch=NULL
    ][order(gvkey, ciq_call_date)
    ][, c('bm', 'roa') := lapply(.SD, partial(nafill, type='locf')), .SDcols=c('bm', 'roa'), keyby=gvkey
    ][, {text_present=str_c(text_present, collapse=' ')
         text_qa=str_c(text_qa, collapse=' ')
         text_ques=str_c(text_ques, collapse=' ')
         text_ans=str_c(text_ans, collapse=' ')
         text_all=str_c(text_all, collapse=' ')
         list(docid=docid[1], ciq_call_date=ciq_call_date[1], fyear=fyear[1], fqtr=fqtr[1],
              release_keydevid=release_keydevid[1], call_keydevid=call_keydevid[1], transcriptid=transcriptid[1],
              ret_0_10=ret_0_10[1], ret_0_20=ret_0_20[1], ret_0_30=ret_0_30[1],
              car_0_10=car_0_10[1], car_0_10_lag1=car_0_10_lag1[1], car_0_10_lead1=car_0_10_lead1[1], 
              car_0_20=car_0_20[1], car_0_20_lag1=car_0_20_lag1[1], car_0_20_lead1=car_0_20_lead1[1], 
              car_0_30=car_0_30[1], car_0_30_lag1=car_0_30_lag1[1], car_0_30_lead1=car_0_30_lead1[1], 
              beta_mktrf=beta_mktrf[1],
              ret_m1_m1=ret_m1_m1[1], ret_m2_m2=ret_m2_m2[1], ret_m30_m3=ret_m30_m3[1],
              car_m1_m1=car_m1_m1[1], car_m2_m2=car_m2_m2[1], car_m30_m3=car_m30_m3[1],
              sest=sest[1], sest_lag1=sest_lag1[1], sest_lead1=sest_lead1[1],
              sue=sue[1], sue_lag1=sue_lag1[1], sue_lead1=sue_lead1[1],
              se=se[1], se_lag1=se_lag1[1], se_lead1=se_lead1[1],
              numest=numest[1], smedest=smedest[1], sstdest=sstdest[1], volume=volume[1], mcap=mcap[1], 
              bm=bm[1], roa=roa[1], debt_asset=debt_asset[1], alpha=alpha[1], volatility=volatility[1],
              revision=revision_0_90[1], inflow=inflow_0_90[1],
              similarity_unigram=similarity_unigram[1], similarity_bigram=similarity_bigram[1], similarity_allgram=similarity_allgram[1],
              present_positive_chunk=present_positive_chunk[1], present_positive_sent=present_positive_sent[1],
              present_neutral_chunk=present_neutral_chunk[1], present_neutral_sent=present_neutral_sent[1],
              present_negative_chunk=present_negative_chunk[1], present_negative_sent=present_negative_sent[1], 
              qa_positive_chunk=qa_positive_chunk[1], qa_positive_sent=qa_positive_sent[1], qa_neutral_chunk=qa_neutral_chunk[1],
              qa_neutral_sent=qa_neutral_sent[1], qa_negative_chunk=qa_negative_chunk[1], qa_negative_sent=qa_negative_sent[1],
              retail_tot_m30_0 = retail_tot_m30_0[1],
              retail_tot_m10_0 = retail_tot_m10_0[1],
              retail_tot_m3_0 = retail_tot_m3_0[1],
              retail_tot_0_3 = retail_tot_0_3[1],
              retail_tot_0_10 = retail_tot_0_10[1],
              retail_tot_0_30 = retail_tot_0_30[1],
              retail_buy_m30_0 = retail_buy_m30_0[1],
              retail_buy_m10_0 = retail_buy_m10_0[1],
              retail_buy_m3_0 = retail_buy_m3_0[1],
              retail_buy_0_3 = retail_buy_0_3[1],
              retail_buy_0_10 = retail_buy_0_10[1],
              retail_buy_0_30 = retail_buy_0_30[1],
              retail_sell_m30_0 = retail_sell_m30_0[1],
              retail_sell_m10_0 = retail_sell_m10_0[1],
              retail_sell_m3_0 = retail_sell_m3_0[1],
              retail_sell_0_3 = retail_sell_0_3[1],
              retail_sell_0_10 = retail_sell_0_10[1],
              retail_sell_0_30 = retail_sell_0_30[1],
              retail_net_m30_0 = retail_net_m30_0[1],
              retail_net_m10_0 = retail_net_m10_0[1],
              retail_net_m3_0 = retail_net_m3_0[1],
              retail_net_0_3 = retail_net_0_3[1],
              retail_net_0_10 = retail_net_0_10[1],
              retail_net_0_30 = retail_net_0_30[1],
              text_present=text_present, text_qa=text_qa, text_ques=text_ques, text_ans=text_ans, text_all=text_all)
        }, 
      keyby=.(gvkey, ciq_release_date)
    ] %>% na.omit(cols=c('docid', 'gvkey', 'ciq_release_date', 'car_0_30', 'sest', 'sue', 'se', 'numest', 'bm', 'roa', 'alpha', 'volatility'))

# fill NA in sentiment cols
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text[,
      ':='(qa_positive_chunk=fillna(qa_positive_chunk, mean(qa_positive_chunk, na.rm=T)),
           qa_neutral_chunk=fillna(qa_neutral_chunk, mean(qa_neutral_chunk, na.rm=T)),
           qa_negative_chunk=fillna(qa_negative_chunk, mean(qa_negative_chunk, na.rm=T)),
           present_positive_chunk=fillna(present_positive_chunk, mean(present_positive_chunk, na.rm=T)),
           present_neutral_chunk=fillna(present_neutral_chunk, mean(present_neutral_chunk, na.rm=T)),
           present_negative_chunk=fillna(present_neutral_chunk, mean(present_negative_chunk, na.rm=T)),
           qa_positive_sent=fillna(qa_positive_sent, mean(qa_positive_sent, na.rm=T)),
           qa_neutral_sent=fillna(qa_neutral_sent, mean(qa_neutral_sent, na.rm=T)),
           qa_negative_sent=fillna(qa_negative_sent, mean(qa_negative_sent, na.rm=T)),
           present_positive_sent=fillna(present_positive_sent, mean(present_positive_sent, na.rm=T)),
           present_neutral_sent=fillna(present_neutral_sent, mean(present_neutral_sent, na.rm=T)),
           present_negative_sent=fillna(present_neutral_sent, mean(present_negative_sent, na.rm=T)))]

# Some id varaibles are incorrectly represented as double. 
# Convert them back to int

id_cols = c('release_keydevid','call_keydevid','transcriptid')
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text[,
    (id_cols) := lapply(.SD, as.integer), .SDcols=id_cols]

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text: %s\n (%s variable)', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text), length(names(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text))) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text[1, -c('text_present', 'text_qa', 'text_ques', 'text_ans', 'text_all')]

write_feather(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text, './data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text.feather')

N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment: 21816
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text: 21763
 (93 variable)

gvkey,ciq_release_date,docid,ciq_call_date,fyear,fqtr,release_keydevid,call_keydevid,transcriptid,ret_0_10,ret_0_20,ret_0_30,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,beta_mktrf,ret_m1_m1,ret_m2_m2,ret_m30_m3,car_m1_m1,car_m2_m2,car_m30_m3,sest,sest_lag1,sest_lead1,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,numest,smedest,sstdest,volume,mcap,bm,roa,debt_asset,alpha,volatility,revision,inflow,similarity_unigram,similarity_bigram,similarity_allgram,present_positive_chunk,present_positive_sent,present_neutral_chunk,present_neutral_sent,present_negative_chunk,present_negative_sent,qa_positive_chunk,qa_positive_sent,qa_neutral_chunk,qa_neutral_sent,qa_negative_chunk,qa_negative_sent,retail_tot_m30_0,retail_tot_m10_0,retail_tot_m3_0,retail_tot_0_3,retail_tot_0_10,retail_tot_0_30,retail_buy_m30_0,retail_buy_m10_0,retail_buy_m3_0,retail_buy_0_3,retail_buy_0_10,retail_buy_0_30,retail_sell_m30_0,retail_sell_m10_0,retail_sell_m3_0,retail_sell_0_3,retail_sell_0_10,retail_sell_0_30,retail_net_m30_0,retail_net_m10_0,retail_net_m3_0,retail_net_0_3,retail_net_0_10,retail_net_0_30
<chr>,<date>,<chr>,<date>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,2008-03-05,001013-2008-03-05,2008-03-05,2008,1,5241424,5152678,320,-12.37037,-6.222223,-6.96296,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.9685945,0.2971765,-1.536211,0.2199395,0.7939703,-1.043903,-2.655725,0.01217039,,0.02175464,0.006761325,,0.006062767,0.01893171,,0.0278174,17,0.01217039,0.001547043,3.996961,7.46124,0.477,0.099,0.429,-0.08592177,3.171002,0.1509813,149.3935,0.7845072,0.0009473079,0.02925144,0.5322156,0.2613174,0.1649701,0.6041317,0.1649701,0.6041317,0.5445926,0.2761852,0.1044074,0.5408889,0.351,0.1829259,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text- saved  (2.19 mins)


In [11]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text[sue>quantile(sue,0.9), mean(car_0_30)]
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text[sue<quantile(sue,0.1), mean(car_0_30)]

## standardize

In [4]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text = as.data.table(read_feather('./data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text.feather'))

In [5]:
standardize <- function(x) {
    stopifnot(length(x) == sum(!is.na(x)))
    (x-mean(x))/sd(x)
}

cols_to_be_normalized = c('inflow', 'revision', 
                          'ret_0_10', 'ret_0_20', 'ret_0_30',
                          'car_0_10', 'car_0_20', 'car_0_30',
                          'ret_m1_m1', 'ret_m2_m2', 'ret_m30_m3', 
                          'car_m1_m1', 'car_m2_m2', 'car_m30_m3', 
                          'alpha', 'sest', 'sue', 'numest', 'sstdest', 'smedest', 
                          'mcap', 'roa', 'bm', 'debt_asset', 'volatility', 'volume', 
                          'similarity_bigram', 'qa_positive_sent')

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm = copy(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text)[, 
    (str_c(cols_to_be_normalized, '_norm')) := lapply(.SD, standardize), .SDcols=cols_to_be_normalized]

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm[1, -c('text_present', 'text_qa', 'text_ques', 'text_ans', 'text_all')]

write_feather(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm, './data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm.feather')

gvkey,ciq_release_date,docid,ciq_call_date,fyear,fqtr,release_keydevid,call_keydevid,transcriptid,ret_0_10,ret_0_20,ret_0_30,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,beta_mktrf,ret_m1_m1,ret_m2_m2,ret_m30_m3,car_m1_m1,car_m2_m2,car_m30_m3,sest,sest_lag1,sest_lead1,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,numest,smedest,sstdest,volume,mcap,bm,roa,debt_asset,alpha,volatility,revision,inflow,similarity_unigram,...,retail_tot_m3_0,retail_tot_0_3,retail_tot_0_10,retail_tot_0_30,retail_buy_m30_0,retail_buy_m10_0,retail_buy_m3_0,retail_buy_0_3,retail_buy_0_10,retail_buy_0_30,retail_sell_m30_0,retail_sell_m10_0,retail_sell_m3_0,retail_sell_0_3,retail_sell_0_10,retail_sell_0_30,retail_net_m30_0,retail_net_m10_0,retail_net_m3_0,retail_net_0_3,retail_net_0_10,retail_net_0_30,inflow_norm,revision_norm,ret_0_10_norm,ret_0_20_norm,ret_0_30_norm,car_0_10_norm,car_0_20_norm,car_0_30_norm,ret_m1_m1_norm,ret_m2_m2_norm,ret_m30_m3_norm,car_m1_m1_norm,car_m2_m2_norm,car_m30_m3_norm,alpha_norm,sest_norm,sue_norm,numest_norm,sstdest_norm,smedest_norm,mcap_norm,roa_norm,bm_norm,debt_asset_norm,volatility_norm,volume_norm,similarity_bigram_norm,qa_positive_sent_norm
<chr>,<date>,<chr>,<date>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1013,2008-03-05,001013-2008-03-05,2008-03-05,2008,1,5241424,5152678,320,-12.37037,-6.222223,-6.96296,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.9685945,0.2971765,-1.536211,0.2199395,0.7939703,-1.043903,-2.655725,0.01217039,,0.02175464,0.006761325,,0.006062767,0.01893171,,0.0278174,17,0.01217039,0.001547043,3.996961,7.46124,0.477,0.099,0.429,-0.08592177,3.171002,0.1509813,149.3935,0.7845072,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.08994682,0.1242988,-1.350438,-0.6100161,-0.6486038,-1.058595,-0.6354602,-0.5393092,0.08772626,-0.6854824,-0.1002964,0.4258693,-0.5914129,-0.2489332,-0.6515848,0.001757033,0.1874834,0.6504555,-0.04206152,0.001757033,-1.461533,-0.4330228,-0.1118559,-0.9057401,0.8622228,-0.1818851,0.1868722,0.3212065


In [None]:
# hist: inflow vs. revision
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_sentiment_text_norm %>%
    plot_ly(alpha=0.6) %>%
    add_histogram(x=~inflow_norm, name='inflow') %>%
    add_histogram(x=~car_0_30_norm, name='car') %>%
    add_histogram(x=~revision, name='revision') %>%
    plotly::layout(barmode='overlay', autosize=F)

## process outlier

`car_0_30_norm` in [-5,5]

winsorize 0.3%

In [None]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm = as.data.table(read_feather('data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm.feather'))

wsrz <- function(x, low, high) {
    x[x>=high] = high
    x[x<=low] = low
    x
}

# only cols in the following are winsorized.
# we only winsorize target variables
# the winsorization is carried out in place: I didn't create new variable
cols_to_be_winsorized = c('car_0_10_norm', 'car_0_20_norm', 'car_0_30_norm',
                          'ret_0_10_norm', 'ret_0_20_norm', 'ret_0_30_norm')

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz = copy(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm)[, 
     (cols_to_be_winsorized) := lapply(.SD, wsrz, -5, 5), .SDcols=cols_to_be_winsorized]

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz[1, -c('text_present', 'text_qa', 'text_ques', 'text_ans', 'text_all')]

write_feather(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz, './data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz.feather')

## add outlier flag

In [1]:
ld(gvkey_outlier) # created in 7.2.4

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz = read_feather('./data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz.feather') %>% as.data.table()

-gvkey_outlier- loaded  (0 secs)


In [2]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz_outlierflg = copy(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz)[,
      ':='(outlier_flag1 = ifelse(gvkey%in%gvkey_outlier, T, F))]

write_feather(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz_outlierflg, './data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz_outlierflg.feather')

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz_outlierflg[1, -c('text_present', 'text_qa', 'text_ques', 'text_ans', 'text_all')]

gvkey,ciq_release_date,docid,ciq_call_date,fyear,fqtr,release_keydevid,call_keydevid,transcriptid,ret_0_10,ret_0_20,ret_0_30,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,beta_mktrf,ret_m1_m1,ret_m2_m2,ret_m30_m3,car_m1_m1,car_m2_m2,car_m30_m3,sest,sest_lag1,sest_lead1,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,numest,smedest,sstdest,volume,mcap,bm,roa,debt_asset,alpha,volatility,revision,inflow,similarity_unigram,...,retail_tot_0_3,retail_tot_0_10,retail_tot_0_30,retail_buy_m30_0,retail_buy_m10_0,retail_buy_m3_0,retail_buy_0_3,retail_buy_0_10,retail_buy_0_30,retail_sell_m30_0,retail_sell_m10_0,retail_sell_m3_0,retail_sell_0_3,retail_sell_0_10,retail_sell_0_30,retail_net_m30_0,retail_net_m10_0,retail_net_m3_0,retail_net_0_3,retail_net_0_10,retail_net_0_30,inflow_norm,revision_norm,ret_0_10_norm,ret_0_20_norm,ret_0_30_norm,car_0_10_norm,car_0_20_norm,car_0_30_norm,ret_m1_m1_norm,ret_m2_m2_norm,ret_m30_m3_norm,car_m1_m1_norm,car_m2_m2_norm,car_m30_m3_norm,alpha_norm,sest_norm,sue_norm,numest_norm,sstdest_norm,smedest_norm,mcap_norm,roa_norm,bm_norm,debt_asset_norm,volatility_norm,volume_norm,similarity_bigram_norm,qa_positive_sent_norm,outlier_flag1
<chr>,<date>,<chr>,<date>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
1013,2008-03-05,001013-2008-03-05,2008-03-05,2008,1,5241424,5152678,320,-12.37037,-6.222223,-6.96296,-9.076894,,11.50231,-6.616572,,4.588225,-6.567197,,0.3902708,0.9685945,0.2971765,-1.536211,0.2199395,0.7939703,-1.043903,-2.655725,0.01217039,,0.02175464,0.006761325,,0.006062767,0.01893171,,0.0278174,17,0.01217039,0.001547043,3.996961,7.46124,0.477,0.099,0.429,-0.08592177,3.171002,0.1509813,149.3935,0.7845072,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.08994682,0.1242988,-1.350438,-0.6100161,-0.6486038,-1.058595,-0.6354602,-0.5393092,0.08772626,-0.6854824,-0.1002964,0.4258693,-0.5914129,-0.2489332,-0.6515848,0.001757033,0.1874834,0.6504555,-0.04206152,0.001757033,-1.461533,-0.4330228,-0.1118559,-0.9057401,0.8622228,-0.1818851,0.1868722,0.3212065,False


In [3]:
# how many instances are labeled as outlier?
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_text_norm_wsrz_outlierflg[, .(sum(outlier_flag1==T)/.N)]

V1
<dbl>
0.2298396


## plot (for test)

Task:
- Distribution of calls within a year

In [None]:
dt = read_feather('./data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_sentiment_text_norm_wsrz.feather') %>% as.data.table()

dt[, ':='(year=year(ciq_call_date))]

add_vline <- function(y) {
    list(type = "line", 
         y0 = y, y1 = y, xref = "paper", # i.e. y as a proportion of visible region
         x0 = 0, x1 = 1, 
         line = list(dash='dot', width=1))
}

copy(dt)[ciq_call_date %between% c(ymd('2017-10-01'), ymd('2018-12-31')), .(car_0_30_norm, ciq_call_date, date)] %>%
    plot_ly(x=~ciq_call_date, y=~car_0_30_norm, color=~as.factor(year(ciq_call_date)), type='scatter', mode='markers', alpha=0.3) %>%
    plotly::layout(autosize=F, xaxis=list(title='', type='date'),
                   yaxis=list(title='CAR'), shapes=list(add_vline(-2), add_vline(2)))

In [None]:
copy(dt)[, ':='(date=make_date(2000, month(ciq_call_date), day(ciq_call_date)))
    ][ciq_call_date %between% c(ymd('2008-01-01'), ymd('2018-12-31')), .(car_0_30_norm, ciq_call_date, date)] %>%
    plot_ly(x=~date, y=~car_0_30_norm, color='red', type='scatter', mode='markers', alpha=0.2) %>%
    plotly::layout(autosize=F, xaxis=list(title='', type='date', tickformat = "%b %d"),
                   yaxis=list(title='CAR'))

Only remove missing values here. For scaling, do it in Python.

In [209]:
targets_df = as.data.table(read_feather('data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text.feather'))

all_cols = names(targets_df)
text_cols = c('text_present', 'text_qa', 'text_ans', 'text_ques', 'text_all')
non_text_cols = all_cols[!all_cols %in% text_cols]

targets_df = targets_df[, ..non_text_cols]

index vs CAR

In [None]:
ld(idx_price, path=WRDS_DOWNLOAD_DIR)

spx_daily = idx_price[gvkeyx=='000003' & datadate %between% c(ymd('2008-01-01'), ymd('2018-10-01')), 
                    .(date=datadate, idx=prccd)] %>%
    plot_ly(x=~date, y=~idx, name='Index Return', type='scatter', mode='lines') %>%
    plotly::layout(autosize=F, legend=list(0.1, 0.9))

outlier = f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text[, .(ciq_release_date, car_0_30)] %>%
    plot_ly(x=~ciq_release_date, y=~car_0_30, type='scatter', mode='markers', marker=list(size=1, line=list(width=1), alpha=0.5), name='CAR') %>%
    plotly::layout(autosize=F, legend=list(0.1, 0.9))

subplot(spx_daily, outlier, nrows=2, shareX=T)

Extreme values of CAR

In [None]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text[abs(car_0_30)>=60] %>%
    plot_ly(x=~ciq_release_date, y=~car_0_30, type='scatter', mode='markers', showlegend=T) %>% 
    plotly::layout(autosize=F)

In [None]:
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text %>%
    plot_ly(x=~car_0_30, type='histogram', nbinsx=300) %>% 
    plotly::layout(autosize=F)

In [5]:
dt = read_feather('./data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text.feather')

In [7]:
setDT(dt)
dt[, summary(inflow)]

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-110.91022   -2.07961   -0.60965   -2.03208   -0.07435   43.82143 

## split train/val/test

Task: create rolling window
- start: `2008-01-01`
- end: `2018-12-31`
- training period: 3 years (12 quarters)
- predict period: next quarter

> `val` is randomly sampled from `train`

### `val` and `train` are of same period

In [90]:
get_rolling_split_dates <- function(train_start, train_end, test_start, test_end, roll_type) {
    rolling_split_dates = data.table(window='roll-01', train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end, roll_type=roll_type)
    
    i_window = 1
    while (test_end < ymd('2018-12-31')) {
        i_window = i_window + 1
        train_start = train_start %m+% months(3) 
        train_end = train_end %m+% months(4) %>% rollback()
        test_start = test_start %m+% months(3) 
        test_end = test_end %m+% months(4) %>% rollback()
        
        rolling_split_dates = rbindlist(
            list(rolling_split_dates, 
                 list(window=str_c('roll-', str_pad(i_window, 2, pad = "0")), 
                      train_start=train_start, train_end=train_end, 
                      test_start=test_start, test_end=test_end, roll_type=roll_type)))
    }
    return(rolling_split_dates)
}
train_start_1y = ymd('2008-01-01')
train_end_1y = ymd('2008-12-31')
test_start_1y = ymd('2009-01-01')
test_end_1y = ymd('2009-03-31')

train_start_2y = ymd('2008-01-01')
train_end_2y = ymd('2009-12-31')
test_start_2y = ymd('2010-01-01')
test_end_2y = ymd('2010-03-31')

train_start_3y = ymd('2008-01-01')
train_end_3y = ymd('2010-12-31')
test_start_3y = ymd('2011-01-01')
test_end_3y = ymd('2011-03-31')

train_start_4y = ymd('2008-01-01')
train_end_4y = ymd('2011-12-31')
test_start_4y = ymd('2012-01-01')
test_end_4y = ymd('2012-03-31')

train_start_5y = ymd('2008-01-01')
train_end_5y = ymd('2012-12-31')
test_start_5y = ymd('2013-01-01')
test_end_5y = ymd('2013-03-31')

train_start_6y = ymd('2008-01-01')
train_end_6y = ymd('2013-12-31')
test_start_6y = ymd('2014-01-01')
test_end_6y = ymd('2014-03-31')

rolling_split_dates = rbindlist(
    list(get_rolling_split_dates(train_start_1y, train_end_1y, test_start_1y, test_end_1y, roll_type='1y'),
         get_rolling_split_dates(train_start_2y, train_end_2y, test_start_2y, test_end_2y, roll_type='2y'),
         get_rolling_split_dates(train_start_3y, train_end_3y, test_start_3y, test_end_3y, roll_type='3y'),
         get_rolling_split_dates(train_start_4y, train_end_4y, test_start_4y, test_end_4y, roll_type='4y'),
         get_rolling_split_dates(train_start_5y, train_end_5y, test_start_5y, test_end_5y, roll_type='5y'),
         get_rolling_split_dates(train_start_6y, train_end_6y, test_start_6y, test_end_6y, roll_type='6y')),
    use=T)

nonrolling_split = list(window='non-roll-01', 
                        train_start=ymd('2008-01-01'),
                        train_end=ymd('2018-09-30'),
                        test_start=ymd('2018-10-01'),
                        test_end=ymd('2018-12-31'),
                        roll_type='non')

rolling_split_dates = rbindlist(list(rolling_split_dates, nonrolling_split), fill=T)[order(roll_type, window)
    ][, ':='(yqtr=fcase(month(test_start)==1, str_c(year(test_start)-1, '-q4'),
                        month(test_start)==4, str_c(year(test_start), '-q1'),
                        month(test_start)==7, str_c(year(test_start), '-q2'),
                        month(test_start)==10, str_c(year(test_start), '-q3')))
    ][test_end<ymd('2019-01-01')
    ][, ':='(is_test=ifelse(yqtr %in% c('2015-q3', '2015-q4', '2016-q2', '2018-q1', '2018-q2'), T, F))]

# print previe
rolling_split_dates[roll_type=='5y'][order(roll_type, window)]


# write to csv
sv(rolling_split_dates)
fwrite(rolling_split_dates, './data/split_dates.csv')

window,train_start,train_end,test_start,test_end,roll_type,yqtr,is_test
<chr>,<date>,<date>,<date>,<date>,<chr>,<chr>,<lgl>
roll-01,2008-01-01,2012-12-31,2013-01-01,2013-03-31,5y,2012-q4,False
roll-02,2008-04-01,2013-03-31,2013-04-01,2013-06-30,5y,2013-q1,False
roll-03,2008-07-01,2013-06-30,2013-07-01,2013-09-30,5y,2013-q2,False
roll-04,2008-10-01,2013-09-30,2013-10-01,2013-12-31,5y,2013-q3,False
roll-05,2009-01-01,2013-12-31,2014-01-01,2014-03-31,5y,2013-q4,False
roll-06,2009-04-01,2014-03-31,2014-04-01,2014-06-30,5y,2014-q1,False
roll-07,2009-07-01,2014-06-30,2014-07-01,2014-09-30,5y,2014-q2,False
roll-08,2009-10-01,2014-09-30,2014-10-01,2014-12-31,5y,2014-q3,False
roll-09,2010-01-01,2014-12-31,2015-01-01,2015-03-31,5y,2014-q4,False
roll-10,2010-04-01,2015-03-31,2015-04-01,2015-06-30,5y,2015-q1,False


-rolling_split_dates- saved  (0.01 secs)


In [3]:
rolling_split_dates[, .(.N), keyby=.(month(test_start), day(test_start))]

month,day,N
<dbl>,<int>,<int>
1,1,45
4,1,45
7,1,45
10,1,46


### `val` is *after* `train`

In [82]:
get_rolling_split_dates <- function(train_start, train_end, val_start, val_end, test_start, test_end, roll_type) {
    rolling_split_dates = data.table(window='roll-01', train_start=train_start, train_end=train_end, val_start=val_start,
                                     val_end=val_end, test_start=test_start, test_end=test_end, roll_type=roll_type)
    
    i_window = 1
    while (test_end < ymd('2018-12-31')) {
        i_window = i_window + 1
        train_start=train_start %m+% months(3) 
        train_end=train_end %m+% months(3)
        val_start = val_start %m+% months(3)
        val_end = val_end %m+% months(3)
        test_start=test_start %m+% months(3) 
        test_end=test_end %m+% months(3)
        
        rolling_split_dates = rbindlist(
            list(rolling_split_dates, 
                 list(window=str_c('roll-', str_pad(i_window, 2, pad = "0")), 
                      train_start=train_start, train_end=train_end, val_start = val_start, val_end = val_end,
                      test_start=test_start, test_end=test_end, roll_type=roll_type)))
    }
    return(rolling_split_dates)
}
train_start_1y = ymd('2008-01-01')
train_end_1y = ymd('2008-10-31')
val_start_1y = ymd('2008-11-01')
val_end_1y = ymd('2008-12-31')
test_start_1y = ymd('2009-01-01')
test_end_1y = ymd('2009-03-31')

train_start_2y = ymd('2008-01-01')
train_end_2y = ymd('2009-10-31')
val_start_2y = ymd('2009-11-01')
val_end_2y = ymd('2009-12-31')
test_start_2y = ymd('2010-01-01')
test_end_2y = ymd('2010-03-31')

train_start_3y = ymd('2008-01-01')
train_end_3y = ymd('2010-10-31')
val_start_3y = ymd('2010-11-01')
val_end_3y = ymd('2010-12-31')
test_start_3y = ymd('2011-01-01')
test_end_3y = ymd('2011-03-31')

train_start_4y = ymd('2008-01-01')
train_end_4y = ymd('2011-10-31')
val_start_4y = ymd('2011-11-01')
val_end_4y = ymd('2011-12-31')
test_start_4y = ymd('2012-01-01')
test_end_4y = ymd('2012-03-31')

train_start_5y = ymd('2008-01-01')
train_end_5y = ymd('2012-10-31')
val_start_5y = ymd('2012-11-01')
val_end_5y = ymd('2012-12-31')
test_start_5y = ymd('2013-01-01')
test_end_5y = ymd('2013-03-31')

rolling_split_dates = rbindlist(
    list(get_rolling_split_dates(train_start_1y, train_end_1y, val_start_1y, val_end_1y, test_start_1y, test_end_1y, roll_type='1y'),
         get_rolling_split_dates(train_start_2y, train_end_2y, val_start_2y, val_end_2y, test_start_2y, test_end_2y, roll_type='2y'),
         get_rolling_split_dates(train_start_3y, train_end_3y, val_start_3y, val_end_3y, test_start_3y, test_end_3y, roll_type='3y'),
         get_rolling_split_dates(train_start_4y, train_end_4y, val_start_4y, val_end_4y, test_start_4y, test_end_4y, roll_type='4y'),
         get_rolling_split_dates(train_start_5y, train_end_5y, val_start_5y, val_end_5y, test_start_5y, test_end_5y, roll_type='5y')),
    use=T)

nonrolling_split = list(window='non-roll-01', 
                        train_start=ymd('2008-01-01'),
                        train_end=ymd('2018-07-01'),
                        val_start = ymd('2018-07-01'),
                        val_end = ymd('2018-10-01'),
                        test_start=ymd('2018-10-01'),
                        test_end=ymd('2018-12-31'),
                        roll_type='non')

rolling_split_dates = rbindlist(list(rolling_split_dates, nonrolling_split), fill=T)[order(roll_type, window)][test_end<ymd('2019-01-01')]
rolling_split_dates

# write to csv
sv(rolling_split_dates)
fwrite(rolling_split_dates, './data/split_dates.csv')

window,train_start,train_end,val_start,val_end,test_start,test_end,roll_type
<chr>,<date>,<date>,<date>,<date>,<date>,<date>,<chr>
roll-01,2008-01-01,2008-10-31,2008-11-01,2008-12-31,2009-01-01,2009-03-31,1y
roll-02,2008-04-01,2009-01-31,2009-02-01,2009-03-31,2009-04-01,2009-06-30,1y
roll-03,2008-07-01,2009-04-30,2009-05-01,2009-06-30,2009-07-01,2009-09-30,1y
roll-04,2008-10-01,2009-07-30,2009-08-01,2009-09-30,2009-10-01,2009-12-30,1y
roll-05,2009-01-01,2009-10-30,2009-11-01,2009-12-30,2010-01-01,2010-03-30,1y
roll-06,2009-04-01,2010-01-30,2010-02-01,2010-03-30,2010-04-01,2010-06-30,1y
roll-07,2009-07-01,2010-04-30,2010-05-01,2010-06-30,2010-07-01,2010-09-30,1y
roll-08,2009-10-01,2010-07-30,2010-08-01,2010-09-30,2010-10-01,2010-12-30,1y
roll-09,2010-01-01,2010-10-30,2010-11-01,2010-12-30,2011-01-01,2011-03-30,1y
roll-10,2010-04-01,2011-01-30,2011-02-01,2011-03-30,2011-04-01,2011-06-30,1y


-rolling_split_dates- saved  (0 secs)


## `CAR` target

Task:
- for every call, find the car deciles of the previous month

In [None]:
targets_df = as.data.table(read_feather('data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text.feather'))

all_cols = names(targets_df)
text_cols = c('text_present', 'text_qa', 'text_ans', 'text_ques', 'text_all')
non_text_cols = all_cols[!all_cols %in% text_cols]

targets_df = targets_df[, ..non_text_cols]

car_target = targets_df[, .(docid, gvkey, ciq_call_date, car_target=car_0_30)
    ][order(ciq_call_date, docid)] %>% na.omit()

sv(car_target)
car_target[]

# Backup

In [7]:
sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim)) %>% cat()

f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow = inflow[, ':='(join_date=report_dt)
    ][f_sue_keydevid_car_finratio_vol_transcriptid_sim[, ':='(join_date=ciq_call_date-3)], 
      on=.(permno, join_date>=join_date), allow.cartesian=T
    ][is.na(inflow) | (report_dt-ciq_release_date>35), ':='(inflow=0)
    ][, ':='(inflow=sum(inflow)), keyby=.(docid)
    ][, ':='(join_date=NULL, mv=NULL, report_dt=NULL)
    ] %>% unique()

sprintf('N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: %s\n', nrow(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow)) %>% cat()
sv(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow)
f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow[1]

N f_sue_keydevid_car_finratio_vol_transcriptid_sim: 21822
N f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow: 21822
-f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow- saved  (0.35 secs)


permno,inflow,docid,similarity_unigram,similarity_bigram,similarity_allgram,gvkey,datadate,fyear,fqtr,ciq_release_date,ciq_call_date,rdq,rdq1,repdats,leadrdq1,release_keydevid,call_keydevid,transcriptid,mcap,sue,sue_lag1,sue_lead1,se,se_lag1,se_lead1,sest,sest_lag1,sest_lead1,numest,smedest,sstdest,volume,car_m1_m1,car_m2_m2,car_m30_m3,car_0_10,car_0_10_lag1,car_0_10_lead1,car_0_20,car_0_20_lag1,car_0_20_lead1,car_0_30,car_0_30_lag1,car_0_30_lead1,bm,roa,debt_asset,alpha,beta_mktrf,beta_smb,beta_hml,volatility
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<date>,<dbl>,<dbl>,<date>,<date>,<date>,<date>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
50906,0.1976949,001013-2008-03-05,0.7845072,0.0009473079,0.02925144,1013,2008-01-31,2008,1,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-03-05,2008-06-04,5241424,5152678,320,1.739304,0.006761325,,0.006062767,0.01893171,,0.0278174,0.01217039,,0.02175464,17,0.01217039,0.001547043,3.996961,0.7939703,-1.043903,-2.655725,-0.09076894,,0.1150231,-0.06616572,,0.04588225,-6.567197,,0.3902708,0.477,0.099,0.429,-0.08592177,0.9685945,-0.2939265,1.243559,3.171002


In [64]:
# get targets_df
dt = as.data.table(read_feather('data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text_norm.feather'))
all_cols = names(dt)
text_cols = c('text_present', 'text_qa', 'text_ans', 'text_ques', 'text_all')
non_text_cols = all_cols[!all_cols %in% text_cols]
dt = dt[, ..non_text_cols]

In [69]:
nrow(dt)
dt[, .(sd_car = sd(car_0_30[1:17410]),
       sd_car_norm = sd(car_0_30_norm[17411:.N]))]

sd_car,sd_car_norm
<dbl>,<dbl>
12.12621,1.03667


In [None]:
        similarity = targets.similarity_bigram
        sentiment = targets.qa_positive_sent
        sue = targets.sue
        sest = targets.sest        
        alpha = targets.alpha
        volatility = targets.volatility
        mcap = targets.mcap
        bm = targets.bm
        roa = targets.roa
        debt_asset = targets.debt_asset
        numest = targets.numest
        smedest = targets.smedest
        sstdest = targets.sstdest
        car_m1_m1 = targets.car_m1_m1
        car_m2_m2 = targets.car_m2_m2
        car_m30_m3 = targets.car_m30_m3
        volume = targets.volume