# Init

In [1]:
import datatable as dt
import multiprocessing as mp
import numpy as np
import re

# load trained nlp
import spacy
# spacy.require_gpu(0)
nlp_dir = 'data/nlp_lg_gpu'
nlp = spacy.load(nlp_dir)

import time

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from datatable import f
from functools import partial
from spacy.tokens import Doc, DocBin, Span
from tqdm import tqdm

dt.init_styles()

WORK_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = f'{WORK_DIR}/data'
WRDS_DOWNLOAD_DIR = f'{DATA_DIR}/WRDS-download'
os.chdir(WORK_DIR)

# Filter text

## Load Doc

In [5]:
# ----------------  Load DocBin from disk -----------------

# register extension for Doc
Doc.set_extension('transcriptid', default=None, force=True)

# Register extension for Span
Span.set_extension('transcriptid', default=None, force=True)
Span.set_extension('componentid', default=None, force=True)
Span.set_extension('componentorder', default=None, force=True)
Span.set_extension('componenttypeid', default=None, force=True)
Span.set_extension('speakerid', default=None, force=True)
Span.set_extension('speakertypeid', default=None, force=True)
Span.set_extension('is_component', default=False, force=True)


# --------------- Select 2015-2020 ----------------
# Frist, Let's select transcriptid from year 2015-2020
ld('transcriptid_2015_2020')
tid_2015_2020 = set(transcriptid_2015_2020['transcriptid'].to_list()[0])

# start loading
docs = []
for _ in tqdm(range(5,10)): # 2015 and after
    doc = list(DocBin(store_user_data=True).from_disk(f'data/doc_sp500_lg_{_}.spacy').get_docs(nlp.vocab))
    
    res = [d for d in doc if d._.transcriptid in tid_2015_2020]
    docs.extend(res)
    
    print(f'i={_}, N_doc={len(res)}')
    del res
    
print(f'N_docs:{len(docs)}')

# save results to doc_sp500_lg_2015_2020.spacy
DocBin(docs, store_user_data=True,).to_disk('data/doc_sp500_lg_2015_2020.spacy', attrs=['ORTH', 'LEMMA', 'MORPH', 'POS', 'TAG', 'HEAD', 'DEP', 'ENT_IOB', 'ENT_TYPE'])

 20%|██        | 1/5 [01:14<04:58, 74.72s/it]

i=5, N_doc=1038


 40%|████      | 2/5 [02:02<02:55, 58.65s/it]

i=6, N_doc=3764


 60%|██████    | 3/5 [02:49<01:47, 53.56s/it]

i=7, N_doc=3736


 80%|████████  | 4/5 [03:38<00:51, 51.74s/it]

i=8, N_doc=3600


100%|██████████| 5/5 [04:28<00:00, 53.64s/it]

i=9, N_doc=3683





# Convert Doc to "string tokens"

In [9]:
# Select componentid that belongs to MD and QA
ld('text_component_sp500', ldname='text_component')
text_component = dt.Frame(text_component)

# componentid: Management Discussion
componentids_md = set(text_component[(f.transcriptcomponenttypeid==2) & (f.speakertypeid==2), f.transcriptcomponentid].to_list()[0])

# componentid: Q & A
componentids_qa = set(text_component[((f.transcriptcomponenttypeid==3) | (f.transcriptcomponenttypeid==4)) & ((f.speakertypeid==2)|(f.speakertypeid==3)), f.transcriptcomponentid].to_list()[0])

# componentid: Q
componentids_q = set(text_component[(f.transcriptcomponenttypeid==3) & ((f.speakertypeid==2)|(f.speakertypeid==3)), f.transcriptcomponentid].to_list()[0])

# componentid: A
componentids_a = set(text_component[(f.transcriptcomponenttypeid==4) & ((f.speakertypeid==2)|(f.speakertypeid==3)), f.transcriptcomponentid].to_list()[0])

"text_component_sp500.feather" (978.0 MB) loaded as "text_component" (3s)


In [10]:
# ----------- Convert Doc to "text tokens" (NO parallel) -------------

# Filtering Rule:
# - only keep lemma
# - no stop words (stop words is informative while comparing)
# - no punctuation
# - no "like number"
# - no currency (e.g., $)
# - token can't contain space

def make_text_tokens(docs):
    
    # results holder
    texttoken = {'md':{}, 'q':{}, 'a':{}}

    # for every doc, generate its texttoken
    for doc in tqdm(docs):
        
        # we'll collect three types of speech
        txttok_md = []
        txttok_q = []
        txttok_a = []

        for span in doc.spans['components']:
            txttok_span = [t.lemma_ for t in span 
                if ((not t.is_punct) and (not t.like_num) and (not t.is_stop) \
                    and (not t.is_space) and (not t.is_currency))]

            # If no text found, add an empty str
            if len(txttok_span)==0:
                txttok_span = []

            # add text tokens
            if span._.componentid in componentids_md:
                txttok_md.extend(txttok_span)
            elif span._.componentid in componentids_q:
                txttok_q.extend(txttok_span)
            elif span._.componentid in componentids_a:
                txttok_a.extend(txttok_span)

        texttoken['md'][doc._.transcriptid] = txttok_md
        texttoken['q'][doc._.transcriptid] = txttok_q
        texttoken['a'][doc._.transcriptid] = txttok_a

    # return
    return texttoken

texttoken = make_text_tokens(docs)
print(f'N docs: {len(texttoken)}') # 2015-2020: 15821 (N_doc)

sv('texttoken')

100%|██████████| 15821/15821 [00:52<00:00, 298.89it/s]


# LDA

In [15]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.matutils import Sparse2Corpus
from gensim.corpora import Dictionary


# -------------- Use ALL text to learn vocabulary ------------------

ld('texttoken', force=True)

vectorizer = CountVectorizer(preprocessor=lambda x: x,
                             tokenizer=lambda x: x,
                             lowercase=False,
                             ngram_range=(1,1),
                             min_df=100,
                             max_df=0.8)

vectorizer.fit([t for t in texttoken['a'].values() if len(t)>0]);
vectorizer.fit([t for t in texttoken['q'].values() if len(t)>0]);
vectorizer.fit([t for t in texttoken['md'].values() if len(t)>0]);

# create idx-word map
id2word = {v:k for k, v in vectorizer.vocabulary_.items()};

print(f'Vocab size: {len(id2word)}');



"texttoken.pkl" (506.3 MB) loaded (5s)




Vocab size: 6287


In [18]:
# ------------------- train model -------------
from gensim.models import LdaModel, LdaMulticore

def train(texttoken):

    # convert to DTM
    dtm = vectorizer.transform(texttoken)
    print(f'N_doc:{dtm.shape[0]}, N_feature:{dtm.shape[1]}')

    # convert to gensim corpus
    corpus = Sparse2Corpus(dtm, documents_columns=False)

    # Train LDA model.
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,

        num_topics=25,
        # workers=8,
        passes=10,
        iterations=100,
        chunksize=30000,

        alpha='auto',
        eta='auto',
        eval_every=2, # slow down the traning, only for debugging,
        per_word_topics=True
    )
    # save model
    
    return model, corpus

model_md, corpus_md = train([t for t in texttoken['md'].values() if len(t)>0])
model_md.save('data/ldamodel/ldamodel_md')

# model_q = train([t for t in texttoken['q'].values() if len(t)>0])
# model_q.save('data/ldamodel/ldamodel_q')

# model_a = train([t for t in texttoken['a'].values() if len(t)>0])
# model_a.save('data/ldamodel/ldamodel_a')

2021-02-03 02:42:33,598 : INFO : using autotuned alpha, starting with [0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04]
2021-02-03 02:42:33,599 : INFO : using serial LDA version on this node
2021-02-03 02:42:33,609 : INFO : running online (multi-pass) LDA training, 25 topics, 10 passes over the supplied corpus of 15819 documents, updating model once every 15819 documents, evaluating perplexity every 15819 documents, iterating 100x with a convergence threshold of 0.001000


N_doc:15819, N_feature:6287


2021-02-03 02:43:33,074 : INFO : -9.366 per-word bound, 659.8 perplexity estimate based on a held-out corpus of 15819 documents with 17000046 words
2021-02-03 02:43:33,075 : INFO : PROGRESS: pass 0, at document #15819/15819
2021-02-03 02:43:56,381 : INFO : optimized alpha [0.050057646, 0.070677474, 0.06190853, 0.06426127, 0.06550222, 0.055606075, 0.06408838, 0.05983901, 0.06030108, 0.054721233, 0.05522122, 0.050722957, 0.0429625, 0.064972505, 0.06584807, 0.06579763, 0.05416649, 0.061528184, 0.07155537, 0.060614157, 0.060017332, 0.06867961, 0.052871656, 0.051722452, 0.048049398]
2021-02-03 02:43:56,388 : INFO : topic #12 (0.043): 0.006*"well" + 0.006*"day" + 0.006*"production" + 0.006*"oil" + 0.005*"price" + 0.004*"guidance" + 0.004*"compare" + 0.004*"activity" + 0.004*"rig" + 0.004*"project"
2021-02-03 02:43:56,389 : INFO : topic #24 (0.048): 0.010*"customer" + 0.005*"product" + 0.004*"U.S." + 0.004*"segment" + 0.004*"range" + 0.004*"project" + 0.004*"guidance" + 0.004*"service" + 0.00

In [20]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()


# config
corpus = corpus_md

# plot
dictionary = Dictionary.from_corpus(corpus, id2word)
pyLDAvis.gensim.prepare(model_md, corpus, dictionary)

2021-02-03 13:18:25,200 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-03 13:18:28,253 : INFO : adding document #10000 to Dictionary(0 unique tokens: [])
2021-02-03 13:18:30,170 : INFO : built Dictionary(6287 unique tokens: ['afternoon', 'welcome', 'fiscal', 'month', 'November']...) from 15819 documents (total 17000046 corpus positions)


In [10]:
# print topics (WITHOUT weight)
# topics = model.show_topics(20, 50, formatted=False)

for t in model_md.show_topics(100, 50, formatted=False):
    id_topic = t[0]
    words = ', '.join([w[0] for w in t[1]])
    print(f'TOPIC_ID: {id_topic}\nWORDS: {words}\n')

TOPIC_ID: 0
WORDS: sale, point, basis, product, margin, currency, operate, organic, customer, cost, volume, Tools, operating, deliver, offset, digit, gain, adjust, grow, acquisition, line, foreign, income, compare, segment, tool, improve, lead, U.S., unfavorable, flavor, like, work, innovation, relate, net, slide, company, brand, repair, expense, great, base, progress, decline, level, single, benefit, positive, improvement

TOPIC_ID: 1
WORDS: customer, investment, plan, New, service, gas, project, program, utility, support, infrastructure, distribution, generation, state, cost, slide, operate, term, system, electric, transmission, company, base, long, power, case, deliver, work, reliability, capacity, benefit, revenue, Virginia, morning, energy, month, operating, plant, Ohio, nuclear, approve, approximately, file, PJM, let, compare, FERC, guidance, early, regulatory

TOPIC_ID: 2
WORDS: sale, product, volume, segment, cost, U.S., America, demand, China, approximately, Brazil, water, Nor

In [None]:
# print topics (WITH weight)
for idx, topic in model.print_topics(20, 50):
    id_topic = idx
    print(f'id_topic:{id_topic}\nwords:{topic}\n')

# Convert to DTM

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert to DTM

# Setting:
# - keep ALL tokens
vectorizer = CountVectorizer(preprocessor=lambda x: x,
                             tokenizer=lambda x: x,
                             lowercase=False,
                             ngram_range=(1,2))

# Learn vocabulary 
vectorizer.fit(docs)



CountVectorizer(lowercase=False, ngram_range=(1, 2),
                preprocessor=<function <lambda> at 0x7f4afbd1bdc0>,
                tokenizer=<function <lambda> at 0x7f4afbce79d0>)

In [8]:
vectorizer.fit(texttoken_qa.values())

# Make DTM
dtm_md = vectorizer.transform(texttoken_md.values())
dtm_qa = vectorizer.transform(texttoken_qa.values())

# Compute similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = np.diag(cosine_similarity(dtm_md, dtm_qa))
similarity

'\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Convert to DTM\n\n# Setting:\n# - keep ALL tokens\nvectorizer = CountVectorizer(preprocessor=lambda x: x,\n                             tokenizer=lambda x: x,\n                             lowercase=False,\n                             ngram_range=(1,2))\n\n# Learn vocabulary \nvectorizer.fit(texttoken_md.values())\nvectorizer.fit(texttoken_qa.values())\n\n# Make DTM\ndtm_md = vectorizer.transform(texttoken_md.values())\ndtm_qa = vectorizer.transform(texttoken_qa.values())\n\n# Compute similarity\nfrom sklearn.metrics.pairwise import cosine_similarity\nsimilarity = np.diag(cosine_similarity(dtm_md, dtm_qa))\nsimilarity\n\n'