# Init

In [5]:
import datatable as dt
import multiprocessing as mp
import os
import time

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from datatable import f
from functools import partial
from multiprocessing import Process
from tqdm import tqdm

# set working directory
WORK_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = f'{WORK_DIR}/data'
WRDS_DOWNLOAD_DIR = f'{DATA_DIR}/WRDS-download'

os.chdir(WORK_DIR)

# initialize data.table
dt.init_styles()

# Build Doc in spaCy

## Register span.ext

In [2]:
# load spacy into GPU
import spacy
spacy.require_gpu(0)
nlp = spacy.load('en_core_web_lg')

from spacy.attrs import ORTH
from spacy.tokens import Doc, DocBin, Span

# Use the simpliest pipeline
# 'tok2vec', 'parser', 'lemmatizer', 'tagger', 'attribute_ruler'
# nlp = spacy.load("en_core_web_lg", disable=['ner', 'parser', 'tok2vec', 'tagger', 'lemmatizer', 'attribute_ruler'])
# nlp = spacy.load("en_core_web_lg")

# Add a simple sentencizer
# nlp.add_pipe('sentencizer')

# register extension for Doc
Doc.set_extension('transcriptid', default=None, force=True)

# register extension for Span
Span.set_extension('transcriptid', default=None, force=True)
Span.set_extension('componentid', default=None, force=True)
Span.set_extension('componentorder', default=None, force=True)
Span.set_extension('componenttypeid', default=None, force=True)
Span.set_extension('speakerid', default=None, force=True)
Span.set_extension('speakertypeid', default=None, force=True)
Span.set_extension('is_component', default=False, force=True)

## Load data

In [3]:
# Load components as a 2D table
ld('text_component_sp500', ldname='text_component', force=True)
text_component = dt.Frame(text_component)

# conver 2D table to tuples
text_component = text_component.to_tuples()
text_component = [(line[6], 
                   {'transcriptid': line[2],
                    'componentid': line[0],
                    'componenttypeid': line[4],
                    'componentorder': line[3],
                    'speakerid': int(line[5]) if line[5]!=None else None,
                    'speakertypeid': int(line[1]) if line[1]!=None else None
                   }) for line in text_component]

text_component_grouped = {}
for text, context in text_component:
    tid = context['transcriptid']
    if tid in text_component_grouped:
        text_component_grouped[tid].append((text, context))
    else:
        text_component_grouped[tid] = [(text, context)]

"text_component_sp500.feather" (978.0 MB) loaded as "text_component" (2s)


## Build Doc

> Only need to run this sectoin **ONCE**. It will hold all the ground truth and will never be altered.

In [4]:
def make_one_doc(line):
    '''Convert list of components into a single Doc
    '''
    # Output holder
    components = []

    # Within every transcriptid, iterature through every component
    for component, context in nlp.pipe(line, as_tuples=True):
        
        # Assign component-level attributes
        component[:]._.is_component = True
        component[:]._.transcriptid = context['transcriptid']
        component[:]._.componentid = context['componentid']
        component[:]._.componenttypeid = context['componenttypeid']
        component[:]._.componentorder = context['componentorder']
        component[:]._.speakerid = context['speakerid']
        component[:]._.speakertypeid = context['speakertypeid']

        # Assign sentence-level attributes
        for sent in component.sents:
            sent._.componentid = context['componentid']

        # return
        components.append(component)

    # join components into one Doc
    doc = Doc.from_docs(components)

    # Add Doc-level attribute: "transcriptid"
    doc._.transcriptid = context['transcriptid']

    # create SpanGroup "components" for doc
    spans_component = []
    for k, v in doc.user_data.items():
        if k[1]=='is_component':
            if v==True:
                spans_component.append(doc.char_span(k[2], k[3]))

    doc.spans['components'] = spans_component 

    # return     
    return doc

In [5]:
# ------------- With Chunks -------------------
# Because of memory limitation, we split the data into chunks and process/store one by one.

n_chunks = 10
chunk_size = len(text_component_grouped)//n_chunks+1

text_component_grouped_chunked = list(chunks(list(text_component_grouped.values()), chunk_size))

del text_component, text_component_grouped

# Parse
time_start = time.perf_counter()
for i in range(n_chunks):

    print(f'Processing chunks: {i+1}/{n_chunks}')
    
    data = text_component_grouped_chunked[i]

    # parse
    docs = []
    for comps in tqdm(data):
        # print len(nlp.vocab)
        time_gap = time.perf_counter()-time_start
        if time_gap >= 300: # every 5min
            print(f'len(vocab): {len(nlp.vocab)}')
            time_start = time.perf_counter()
            
        docs.append(make_one_doc(comps))
    
    # save Doc
    docbin = DocBin(store_user_data=True, docs=docs, attrs=['ORTH', 'LEMMA', 'MORPH', 'POS', 'TAG', 'HEAD', 'DEP', 'ENT_IOB', 'ENT_TYPE'])
    docbin.to_disk(f'data/doc_sp500_lg_{i}.spacy')

    del docs, docbin

  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 1/10


 22%|██▏       | 825/3764 [05:00<24:36,  1.99it/s]

len(vocab): 54029


 44%|████▍     | 1673/3764 [10:00<12:44,  2.73it/s]

len(vocab): 78277


 67%|██████▋   | 2519/3764 [15:00<06:57,  2.98it/s]

len(vocab): 96555


 90%|█████████ | 3392/3764 [20:00<02:41,  2.31it/s]

len(vocab): 110216


100%|██████████| 3764/3764 [22:13<00:00,  2.82it/s]
  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 2/10


  5%|▍         | 183/3764 [01:02<21:36,  2.76it/s]

len(vocab): 117913


 27%|██▋       | 1020/3764 [06:03<19:03,  2.40it/s]

len(vocab): 127268


 50%|████▉     | 1871/3764 [11:03<12:45,  2.47it/s]

len(vocab): 135640


 73%|███████▎  | 2766/3764 [16:03<04:46,  3.49it/s]

len(vocab): 145171


 97%|█████████▋| 3653/3764 [21:03<00:37,  3.00it/s]

len(vocab): 154834


100%|██████████| 3764/3764 [21:41<00:00,  2.89it/s]
  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 3/10


 13%|█▎        | 475/3764 [02:38<16:22,  3.35it/s]

len(vocab): 160858


 37%|███▋      | 1389/3764 [07:38<14:50,  2.67it/s]

len(vocab): 167734


 61%|██████    | 2304/3764 [12:38<08:25,  2.89it/s]

len(vocab): 174189


 85%|████████▌ | 3207/3764 [17:38<02:29,  3.73it/s]

len(vocab): 180070


100%|██████████| 3764/3764 [20:44<00:00,  3.02it/s]
  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 4/10


  1%|          | 42/3764 [00:12<19:27,  3.19it/s]

len(vocab): 183716


 25%|██▌       | 952/3764 [05:12<13:59,  3.35it/s]

len(vocab): 189066


 49%|████▉     | 1837/3764 [10:12<10:57,  2.93it/s]

len(vocab): 193706


 72%|███████▏  | 2723/3764 [15:12<06:00,  2.89it/s]

len(vocab): 198119


 97%|█████████▋| 3640/3764 [20:12<00:38,  3.20it/s]

len(vocab): 202495


100%|██████████| 3764/3764 [20:53<00:00,  3.00it/s]
  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 5/10


 12%|█▏        | 452/3764 [02:35<26:34,  2.08it/s]

len(vocab): 205183


 36%|███▌      | 1341/3764 [07:35<13:19,  3.03it/s]

len(vocab): 209231


 60%|██████    | 2268/3764 [12:35<09:07,  2.73it/s]

len(vocab): 212998


 84%|████████▍ | 3176/3764 [17:36<02:55,  3.36it/s]

len(vocab): 216804


100%|██████████| 3764/3764 [20:49<00:00,  3.01it/s]
  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 6/10


  0%|          | 7/3764 [00:02<18:31,  3.38it/s]

len(vocab): 219296


 24%|██▍       | 904/3764 [05:02<14:43,  3.24it/s]

len(vocab): 222964


 49%|████▊     | 1834/3764 [10:02<11:03,  2.91it/s]

len(vocab): 226667


 73%|███████▎  | 2761/3764 [15:02<05:15,  3.18it/s]

len(vocab): 230246


 97%|█████████▋| 3653/3764 [20:02<00:35,  3.16it/s]

len(vocab): 233465


100%|██████████| 3764/3764 [20:39<00:00,  3.04it/s]
  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 7/10


 13%|█▎        | 492/3764 [02:39<20:54,  2.61it/s]

len(vocab): 235593


 38%|███▊      | 1418/3764 [07:39<16:35,  2.36it/s]

len(vocab): 238865


 62%|██████▏   | 2322/3764 [12:40<09:48,  2.45it/s]

len(vocab): 242144


 86%|████████▋ | 3250/3764 [17:40<02:28,  3.45it/s]

len(vocab): 245269


100%|██████████| 3764/3764 [20:24<00:00,  3.07it/s]
  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 8/10


  3%|▎         | 95/3764 [00:33<25:21,  2.41it/s]

len(vocab): 247383


 27%|██▋       | 1021/3764 [05:33<16:22,  2.79it/s]

len(vocab): 250486


 52%|█████▏    | 1964/3764 [10:33<11:26,  2.62it/s]

len(vocab): 253615


 77%|███████▋  | 2902/3764 [15:34<04:11,  3.43it/s]

len(vocab): 256859


100%|██████████| 3764/3764 [20:12<00:00,  3.10it/s]
  0%|          | 0/3764 [00:00<?, ?it/s]

Processing chunks: 9/10
len(vocab): 259539


 25%|██▍       | 933/3764 [05:00<15:29,  3.04it/s]

len(vocab): 262544


 50%|████▉     | 1873/3764 [10:00<09:25,  3.34it/s]

len(vocab): 265439


 74%|███████▍  | 2799/3764 [15:00<04:05,  3.93it/s]

len(vocab): 268435


 99%|█████████▉| 3739/3764 [20:01<00:06,  3.94it/s]

len(vocab): 271507


100%|██████████| 3764/3764 [20:08<00:00,  3.11it/s]
  0%|          | 0/3754 [00:00<?, ?it/s]

Processing chunks: 10/10


 16%|█▌        | 604/3754 [03:10<20:27,  2.57it/s]

len(vocab): 273486


 41%|████▏     | 1555/3754 [08:10<11:43,  3.13it/s]

len(vocab): 276298


 65%|██████▍   | 2435/3754 [13:11<06:29,  3.38it/s]

len(vocab): 278799


 89%|████████▊ | 3328/3754 [18:11<01:49,  3.89it/s]

len(vocab): 281296


100%|██████████| 3754/3754 [20:27<00:00,  3.06it/s]


In [None]:
# save nlp pipeline
nlp_dir = 'data/nlp_lg_gpu'
if not os.path.exists(nlp_dir):
    os.makedirs(nlp_dir)

# save nlp meta data
lang = nlp.config["nlp"]["lang"]
pipeline = nlp.config["nlp"]["pipeline"]

with open(f'{nlp_dir}/nlp_metadata.txt', 'w+') as f:
    f.write(f'lang: {lang}\n')
    f.write(f'pipeline: {str(pipeline)}')
    
# save nlp
nlp.to_disk(nlp_dir)

# Sentencize

In [None]:
import spacy
# spacy.require_gpu(0)
nlp_dir = 'data/nlp_lg_gpu'
nlp = spacy.load(nlp_dir)

from spacy.tokens import Doc, DocBin, Span


# register extension for Doc
Doc.set_extension('transcriptid', default=None, force=True)

# Register extension for Span
Span.set_extension('transcriptid', default=None, force=True)
Span.set_extension('componentid', default=None, force=True)
Span.set_extension('componentorder', default=None, force=True)
Span.set_extension('componenttypeid', default=None, force=True)
Span.set_extension('speakerid', default=None, force=True)
Span.set_extension('speakertypeid', default=None, force=True)
Span.set_extension('is_component', default=False, force=True)

# results holder
sents = []

# fill results
for _ in tqdm(range(10)): # last 10%
    # load doc
    docs_chunk = list(DocBin(store_user_data=True)\
                      .from_disk(f'data/doc_sp500_lg_{_}.spacy')\
                      .get_docs(nlp.vocab))
    print(f'Chunk {_}: N_doc={len(docs_chunk)}')

    # get sents from doc
    for doc in tqdm(docs_chunk):
        for i_sent, sent in enumerate(doc.sents):
            sents.append((doc._.transcriptid, 
                          sent._.componentid, 
                          i_sent, sent.text))
            
# convert results into Frame
dt_sents = dt.Frame(sents, names=['transcriptid', 'componentid', 'sentenceid', 'text'])

# save feather
sv('dt_sents', svname='dt_sents_sp500')
sv('sents', svname='sents_sp500')

# Build (tid, cid) pair (R)

## Link: tid_cid

> `tid_cid_pair` has the form:
>
> `{tid:[cid1, cid2, ...]}`
>
> It's used to select text of interests

In [15]:
# load data
suppressMessages(suppressWarnings({
    library(utilr)
}))

WORK_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = str_c(WORK_DIR, '/data')
WRDS_DOWNLOAD_DIR = str_c(DATA_DIR, '/WRDS-download')
setwd(WORK_DIR)
cat(str_c('Current working directory: ', getwd(), '\n'))

ld(f_ciq_transcript_component_sp500)
ld(ciq_transcript_speaker, path=WRDS_DOWNLOAD_DIR)

Current working directory: /home/yu/OneDrive/CC
f_ciq_transcript_component_sp500 (974.5 MB) already loaded, will NOT load again! (0 secs) (2021-03-03 5:28 PM)
"ciq_transcript_speaker.feather" (7.8 GB) loaded (1.68 mins) (2021-03-03 5:30 PM)


`speakertype`
- 1: Operator
- 2: Exeutives
- 3: Analyst
- 4: Shareholders
- 5: Attendees

`transcriptcomponenttypeid`
- 1: Presentation Operator Message
- 2: Presenter Speech
- 3: Question
- 4: Answer
- 5: Presentation Section (NULL)
- 6: Question and Answer Section (NULL)
- 7: Question and Answer Operator Message
- 8: Unknown Question and Answer Message

### All

In [None]:
# text: all text
selected_componentid = ciq_transcript_speaker[transcriptcomponenttypeid %in% c(2,3,4) & speakertypeid %in% c(2,3), unique(transcriptcomponentid)]

tid_cid_pair_all = f_ciq_transcript_component_sp500[,
      .(transcriptid, componentid=transcriptcomponentid, componentorder)
    ][componentid %in% selected_componentid
    ][order(transcriptid, componentorder)
    ][, .(componentid=list(componentid)), keyby=transcriptid]
      
tid_cid_pair_all[1:2]
sv(tid_cid_pair_all)

### MD

In [18]:
# text: Only Management Discusson
selected_componentid = ciq_transcript_speaker[transcriptcomponenttypeid==2 & speakertypeid==2, unique(transcriptcomponentid)]

tid_cid_pair_md = f_ciq_transcript_component_sp500[,
      .(transcriptid, componentid=transcriptcomponentid, componentorder)
    ][componentid %in% selected_componentid
    ][order(transcriptid, componentorder)
    ][, .(componentid=list(componentid)), keyby=transcriptid]

tid_cid_pair_md[1]
sv(tid_cid_pair_md)

transcriptid,componentid
<int>,<list>
108,"30185, 30186, 30187, 30188"


"tid_cid_pair_md" saved as "tid_cid_pair_md.feather" (853.7 KB) (0.47 secs, 2021-03-03 17:39:24)


### QA

In [19]:
selected_componentid = ciq_transcript_speaker[transcriptcomponenttypeid %in% c(3,4) & speakertypeid %in% c(2,3), unique(transcriptcomponentid)]

tid_cid_pair_qa = f_ciq_transcript_component_sp500[,
      .(transcriptid, componentid=transcriptcomponentid, componentorder)
    ][componentid %in% selected_componentid
    ][order(transcriptid, componentorder)
    ][, .(componentid=list(componentid)), keyby=transcriptid]

tid_cid_pair_qa[1]      
sv(tid_cid_pair_qa)

transcriptid,componentid
<int>,<list>
108,"30190, 30191, 30192, 30193, 30194, 30195, 30196, 30197, 30198, 30199, 30200, 30202, 30203, 30204, 30205, 30206, 30207, 30208, 30209, 30210, 30211, 30212, 30214, 30215, 30216, 30217, 30218, 30220, 30221, 30222, 30224, 30225, 30226, 30227, 30229, 30231, 30233, 30235, 30237, 30239, 30241, 30243, 30244, 30245, 30246, 30247, 30248, 30249, 30250, 30251, 30252, 30253, 30255, 30256, 30257, 30258, 30259, 30261"


"tid_cid_pair_qa" saved as "tid_cid_pair_qa.feather" (8.9 MB) (0.38 secs, 2021-03-03 17:41:03)


### QA_analyst

In [20]:
selected_componentid = ciq_transcript_speaker[transcriptcomponenttypeid %in% c(3) & speakertypeid %in% c(3), unique(transcriptcomponentid)]

tid_cid_pair_qa_analyst = f_ciq_transcript_component_sp500[,
      .(transcriptid, componentid=transcriptcomponentid, componentorder)
    ][componentid %in% selected_componentid
    ][order(transcriptid, componentorder)
    ][, .(componentid=list(componentid)), keyby=transcriptid]

tid_cid_pair_qa_analyst[1]      
sv(tid_cid_pair_qa_analyst)

transcriptid,componentid
<int>,<list>
108,"30190, 30192, 30194, 30196, 30198, 30200, 30202, 30204, 30206, 30208, 30210, 30212, 30214, 30216, 30218, 30220, 30222, 30225, 30227, 30229, 30231, 30233, 30235, 30237, 30239, 30241, 30243, 30245, 30247, 30249, 30251, 30253, 30255, 30257, 30259"


"tid_cid_pair_qa_analyst" saved as "tid_cid_pair_qa_analyst.feather" (4 MB) (0.41 secs, 2021-03-03 17:41:45)


### QA_manager

In [22]:
selected_componentid = ciq_transcript_speaker[transcriptcomponenttypeid %in% c(4) & speakertypeid %in% c(2), unique(transcriptcomponentid)]

tid_cid_pair_qa_manager = f_ciq_transcript_component_sp500[,
      .(transcriptid, componentid=transcriptcomponentid, componentorder)
    ][componentid %in% selected_componentid
    ][order(transcriptid, componentorder)
    ][, .(componentid=list(componentid)), keyby=transcriptid]

tid_cid_pair_qa_manager[1]      
sv(tid_cid_pair_qa_manager)

transcriptid,componentid,componentorder
<int>,<int>,<int>
108,30191,8


"tid_cid_pair_qa_manager" saved as "tid_cid_pair_qa_manager_old.feather" (6.5 MB) (0.01 secs, 2021-03-03 17:49:57)


## Link: tid_tid

> Output links in the form of `{transcript_from: [transcript_to]}`

In [None]:
# text: link to previous n qtrs
ld(targets_final, 'targets_df')

make_tid_from_to_pair <- function(n_yqtr) {
    n_yqtr = n_yqtr - 1
    
    tid_from_to_pair = targets_df[order(gvkey, ciq_call_date)
        ][, {
          l = list()
          for (t in 1:.N) {
              if (t<=n_yqtr) l[[t]] = list('transcriptid_from'=transcriptid[t],
                                      'transcriptid_to'=list(transcriptid[t:1]))
              else l[[t]] = list('transcriptid_from'=transcriptid[t],
                                 'transcriptid_to'=list(transcriptid[t:(t-n_yqtr)]))
          }
          rbindlist(l)
          }, 
          keyby=.(gvkey)
        ]
}


tid_from_to_pair_1qtr = make_tid_from_to_pair(1); sv(tid_from_to_pair_1qtr)
tid_from_to_pair_2qtr = make_tid_from_to_pair(2); sv(tid_from_to_pair_2qtr)
tid_from_to_pair_4qtr = make_tid_from_to_pair(4); sv(tid_from_to_pair_4qtr)
tid_from_to_pair_8qtr = make_tid_from_to_pair(8); sv(tid_from_to_pair_8qtr)

tid_from_to_pair_8qtr

# Explore transcript-speaker

> Switch to R kernel now!!!

In [5]:
# library
suppressMessages(suppressWarnings({
    library(utilr)
}))

# set working directory
WORK_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = sprintf('%s/data', WORK_DIR)
WRDS_DOWNLOAD_DIR = sprintf('%s/WRDS-download', DATA_DIR)

Yu's data science toolbox loaded! 


In [91]:
ld(ciq_transcript_speaker, path=WRDS_DOWNLOAD_DIR)
ld(ciq_transcript_detail, path=WRDS_DOWNLOAD_DIR)
ld(ciq_keydev, path=WRDS_DOWNLOAD_DIR)
ld(sp500_cst)
ld(ciq_person, path=WRDS_DOWNLOAD_DIR)
ld(f_ciq_transcript_detail_sp500)
ld(ciq_wrds_professional, path=WRDS_DOWNLOAD_DIR)
ld(f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_norm_outlier, ldname=earnings, force=T)

ciq_transcript_speaker (7.8 GB) already loaded, will NOT load again! (0 secs)
ciq_transcript_detail (54.6 MB) already loaded, will NOT load again! (0 secs)
ciq_keydev (899.8 MB) already loaded, will NOT load again! (0 secs)
sp500_cst (24.8 KB) already loaded, will NOT load again! (0 secs)
ciq_person (167 MB) already loaded, will NOT load again! (0 secs)
f_ciq_transcript_detail_sp500 (2.4 MB) already loaded, will NOT load again! (0 secs)
ciq_wrds_professional (988.3 MB) already loaded, will NOT load again! (0 secs)
"f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_retail_sentiment_norm_outlier.feather" (15.9 MB) loaded as "earnings" (0.03 secs)


In [64]:
transcriptid_sp500 = unique(f_ciq_transcript_detail_sp500$transcriptid)

speaker = ciq_transcript_speaker[transcriptid %in% transcriptid_sp500
    ][!is.na(proid),
      .(speakertypename), keyby=.(transcriptid, proid)
    ][ciq_transcript_detail[, .(transcriptid, date=mostimportantdateutc)], on=.(transcriptid)
    ][!is.na(date)] %>% unique()

speaker[1]

transcriptid,proid,speakertypename,date
<dbl>,<dbl>,<chr>,<date>
93162,,,2010-11-11


> Q: How many executives speak in an earnings call?

In [12]:
speaker[speakertypename=='Executives', .(n=uniqueN(proid)), keyby=.(transcriptid)]$n %>% table()

.
    1     2     3     4     5     6     7     8     9    10    11    12    14 
  862  6198 12567  5918  3369  1441   518   162    41    13     1     1     1 

In [15]:
# this transriptid has 12 executives speaking!!!
tid = speaker[speakertypename=='Executives', .(n=uniqueN(proid)), keyby=.(transcriptid)
    ][n==12]$transcriptid

f_ciq_transcript_detail_sp500[transcriptid==tid]

keydevid,companyid,transcriptid,headline,mostimportantdateutc,keydeveventtypeid,keydeveventtypename,companyname,transcriptcollectiontypeid,transcriptcollectiontypename,transcriptpresentationtypeid,transcriptpresentationtypename,transcriptcreationdate_utc,transcriptcreationtime_utc,audiolengthsec,isdelayed_flag,delayreasontypeid,delayreasontypename,latest_transcript_version,gvkey
<dbl>,<dbl>,<dbl>,<chr>,<date>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>
654393367,313055,1936461,"Walmart Inc., Q4 2020 Earnings Call, Feb 18, 2020",2020-02-18,48,Earnings Calls,Walmart Inc.,8,Audited Copy,5,Final,2020-03-13,50559,23614,0,,,8,11259


> Who asks questions?

In [81]:
ciq_wrds_professional[1]

companyid,personid,proid,profunctionid,companyname,personname,profunctionname,yearborn,yearfounded,countryid,⋯,topkeyexecflag,advisorflag,graduateflag,dealmakerflag,sponsorflag,undergraduateflag,onlyoneflag,companyflag,hideflag,committeeid
<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
18493,66577,66578,10,DC Venture Partners,"Wakefield, Kevin",Senior Key Executive,,1998,213,⋯,0,1,0,0,1,0,0,1,0,


In [120]:
analysts = speaker[speakertypename=='Analysts'
    ][f_ciq_transcript_detail_sp500[,.(transcriptid, gvkey, calldate=mostimportantdateutc)], on=.(transcriptid), nomatch=NULL
    ][ciq_wrds_professional[, .(proid, companyid)], on=.(proid), nomatch=NULL
    ][, .(n_analyst=uniqueN(proid), n_company=uniqueN(companyid)), keyby=.(gvkey, transcriptid, calldate)
    ]


analysts = earnings[, .(transcriptid, numest)
    ][analysts, .(gvkey, calldate, transcriptid, numest, n_analyst, n_company), on=.(transcriptid)
    ][order(gvkey, calldate, transcriptid)]

analysts[1]

analysts %>% fwrite('data/analysts.csv')

gvkey,calldate,transcriptid,numest,n_analyst,n_company
<chr>,<date>,<int>,<dbl>,<int>,<int>
1013,2009-06-03,23454,,6,6


In [122]:
analysts[, summary(numest)]
analysts[, summary(n_analyst)]

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   2.00    7.00   11.00   12.62   17.00   49.00   12183 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   7.000   9.000   9.259  12.000  27.000 

In [107]:
dt = earnings[analysts, on=.(transcriptid)][!is.na(docid)]
dt[1]

docid,present_positive_chunk,present_positive_sent,present_neutral_chunk,present_neutral_sent,present_negative_chunk,present_negative_sent,qa_positive_chunk,qa_positive_sent,qa_neutral_chunk,⋯,volatility_norm,volume_norm,similarity_bigram_norm,qa_positive_sent_norm,outlier_flag1,i.gvkey,calldate,i.numest,n_analyst,n_company
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>,<date>,<dbl>,<int>,<int>
001013-2010-05-05,0.7199231,0.3747692,0.1030769,0.5261538,0.1030769,0.5261538,0.5890883,0.2158689,0.09418803,⋯,1.021818,-0.1758587,-0.1371144,-0.7544068,False,1013,2010-05-05,11,9,9


In [119]:
lm(car_0_30_norm ~ mcap_norm + sue + car_m1_m1 + car_m2_m2 + car_m30_m3 + bm + roa + debt_asset + 
    volatility + alpha + numest + sstdest + smedest + volume + inflow + revision + I(growth(n_analyst)), data=dt) %>% summary()


Call:
lm(formula = car_0_30_norm ~ mcap_norm + sue + car_m1_m1 + car_m2_m2 + 
    car_m30_m3 + bm + roa + debt_asset + volatility + alpha + 
    numest + sstdest + smedest + volume + inflow + revision + 
    I(growth(n_analyst)), data = dt)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.2185 -0.4066  0.0156  0.4200  7.9833 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           4.274e-02  2.960e-02   1.444  0.14880    
mcap_norm            -4.464e-02  7.559e-03  -5.906 3.58e-09 ***
sue                   3.860e+00  4.246e-01   9.091  < 2e-16 ***
car_m1_m1            -6.860e-03  3.984e-03  -1.722  0.08508 .  
car_m2_m2            -8.950e-03  4.084e-03  -2.192  0.02843 *  
car_m30_m3           -3.325e-03  7.916e-04  -4.200 2.68e-05 ***
bm                    1.347e-02  1.423e-02   0.947  0.34390    
roa                   9.599e-02  6.735e-02   1.425  0.15415    
debt_asset            2.091e-03  2.829e-02   0.074  0.94110    
volatilit