# Init

In [1]:
import datatable as dt
import multiprocessing as mp
import numpy as np
import re

# load trained nlp
import spacy
# spacy.require_gpu(0)
nlp_dir = 'data/nlp_lg_gpu'
nlp = spacy.load(nlp_dir)

import time

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from datatable import f
from functools import partial
from spacy.tokens import Doc, DocBin, Span
from tqdm import tqdm

dt.init_styles()

WORK_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = f'{WORK_DIR}/data'
WRDS_DOWNLOAD_DIR = f'{DATA_DIR}/WRDS-download'
os.chdir(WORK_DIR)

# Filter text

## Load Doc

In [5]:
# ----------------  Load DocBin from disk -----------------

# register extension for Doc
Doc.set_extension('transcriptid', default=None, force=True)

# Register extension for Span
Span.set_extension('transcriptid', default=None, force=True)
Span.set_extension('componentid', default=None, force=True)
Span.set_extension('componentorder', default=None, force=True)
Span.set_extension('componenttypeid', default=None, force=True)
Span.set_extension('speakerid', default=None, force=True)
Span.set_extension('speakertypeid', default=None, force=True)
Span.set_extension('is_component', default=False, force=True)


# --------------- Select 2015-2020 ----------------
# Frist, Let's select transcriptid from year 2015-2020
ld('transcriptid_2015_2020')
tid_2015_2020 = set(transcriptid_2015_2020['transcriptid'].to_list()[0])

# start loading
docs = []
for _ in tqdm(range(5,10)): # 2015 and after
    doc = list(DocBin(store_user_data=True).from_disk(f'data/doc_sp500_lg_{_}.spacy').get_docs(nlp.vocab))
    
    res = [d for d in doc if d._.transcriptid in tid_2015_2020]
    docs.extend(res)
    
    print(f'i={_}, N_doc={len(res)}')
    del res
    
print(f'N_docs:{len(docs)}')

# save results to doc_sp500_lg_2015_2020.spacy
DocBin(docs, store_user_data=True,).to_disk('data/doc_sp500_lg_2015_2020.spacy', attrs=['ORTH', 'LEMMA', 'MORPH', 'POS', 'TAG', 'HEAD', 'DEP', 'ENT_IOB', 'ENT_TYPE'])

 20%|██        | 1/5 [01:14<04:58, 74.72s/it]

i=5, N_doc=1038


 40%|████      | 2/5 [02:02<02:55, 58.65s/it]

i=6, N_doc=3764


 60%|██████    | 3/5 [02:49<01:47, 53.56s/it]

i=7, N_doc=3736


 80%|████████  | 4/5 [03:38<00:51, 51.74s/it]

i=8, N_doc=3600


100%|██████████| 5/5 [04:28<00:00, 53.64s/it]

i=9, N_doc=3683





# Convert Doc to "string tokens"

In [9]:
# Select componentid that belongs to MD and QA
ld('text_component_sp500', ldname='text_component')
text_component = dt.Frame(text_component)

# componentid: Management Discussion
componentids_md = set(text_component[(f.transcriptcomponenttypeid==2) & (f.speakertypeid==2), f.transcriptcomponentid].to_list()[0])

# componentid: Q & A
componentids_qa = set(text_component[((f.transcriptcomponenttypeid==3) | (f.transcriptcomponenttypeid==4)) & ((f.speakertypeid==2)|(f.speakertypeid==3)), f.transcriptcomponentid].to_list()[0])

# componentid: Q
componentids_q = set(text_component[(f.transcriptcomponenttypeid==3) & ((f.speakertypeid==2)|(f.speakertypeid==3)), f.transcriptcomponentid].to_list()[0])

# componentid: A
componentids_a = set(text_component[(f.transcriptcomponenttypeid==4) & ((f.speakertypeid==2)|(f.speakertypeid==3)), f.transcriptcomponentid].to_list()[0])

"text_component_sp500.feather" (978.0 MB) loaded as "text_component" (3s)


In [10]:
# ----------- Convert Doc to "text tokens" (NO parallel) -------------

# Filtering Rule:
# - only keep lemma
# - no stop words (stop words is informative while comparing)
# - no punctuation
# - no "like number"
# - no currency (e.g., $)
# - token can't contain space

def make_text_tokens(docs):
    
    # results holder
    texttoken = {'md':{}, 'q':{}, 'a':{}}

    # for every doc, generate its texttoken
    for doc in tqdm(docs):
        
        # we'll collect three types of speech
        txttok_md = []
        txttok_q = []
        txttok_a = []

        for span in doc.spans['components']:
            txttok_span = [t.lemma_ for t in span 
                if ((not t.is_punct) and (not t.like_num) and (not t.is_stop) \
                    and (not t.is_space) and (not t.is_currency))]

            # If no text found, add an empty str
            if len(txttok_span)==0:
                txttok_span = []

            # add text tokens
            if span._.componentid in componentids_md:
                txttok_md.extend(txttok_span)
            elif span._.componentid in componentids_q:
                txttok_q.extend(txttok_span)
            elif span._.componentid in componentids_a:
                txttok_a.extend(txttok_span)

        texttoken['md'][doc._.transcriptid] = txttok_md
        texttoken['q'][doc._.transcriptid] = txttok_q
        texttoken['a'][doc._.transcriptid] = txttok_a

    # return
    return texttoken

texttoken = make_text_tokens(docs)
print(f'N docs: {len(texttoken)}') # 2015-2020: 15821 (N_doc)

sv('texttoken')

100%|██████████| 15821/15821 [00:52<00:00, 298.89it/s]


# LDA

In [9]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.matutils import Sparse2Corpus
from gensim.corpora import Dictionary


# -------------- Use ALL text to learn vocabulary ------------------

ld('texttoken', force=True)

vectorizer = CountVectorizer(preprocessor=lambda x: x,
                             tokenizer=lambda x: x,
                             lowercase=False,
                             ngram_range=(1,1),
                             min_df=100,
                             max_df=0.8)

vectorizer.fit([t for t in texttoken['a'].values() if len(t)>0]);
vectorizer.fit([t for t in texttoken['q'].values() if len(t)>0]);
vectorizer.fit([t for t in texttoken['md'].values() if len(t)>0]);

# create idx-word map
id2word = {v:k for k, v in vectorizer.vocabulary_.items()};

print(f'Vocab size: {len(id2word)}');

"texttoken.pkl" (506.3 MB) loaded (4s)




Vocab size: 6287


In [5]:
# ------------------- train model -------------
from gensim.models import LdaModel, LdaMulticore

def train(texttoken):

    # convert to DTM
    dtm = vectorizer.transform(texttoken)
    print(f'N_doc:{dtm.shape[0]}, N_feature:{dtm.shape[1]}')

    # convert to gensim corpus
    corpus = Sparse2Corpus(dtm, documents_columns=False)

    # Train LDA model.
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,

        num_topics=50,
        # workers=8,
        passes=3,
        iterations=100,
        chunksize=30000,

        alpha='auto',
        eta='auto',
        eval_every=2, # slow down the traning, only for debugging,
        per_word_topics=True
    )
    # save model
    
    return model, corpus

model_md, corpus_md = train([t for t in texttoken['md'].values() if len(t)>0])
model_md.save('data/ldamodel/ldamodel_md')
sv('corpus_md', svpath=)

# model_q = train([t for t in texttoken['q'].values() if len(t)>0])
# model_q.save('data/ldamodel/ldamodel_q')

# model_a = train([t for t in texttoken['a'].values() if len(t)>0])
# model_a.save('data/ldamodel/ldamodel_a')

  and should_run_async(code)
2021-02-05 15:10:31,374 : INFO : using autotuned alpha, starting with [0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02]
2021-02-05 15:10:31,375 : INFO : using serial LDA version on this node
2021-02-05 15:10:31,395 : INFO : running online (multi-pass) LDA training, 50 topics, 3 passes over the supplied corpus of 15819 documents, updating model once every 15819 documents, evaluating perplexity every 15819 documents, iterating 100x with a convergence threshold of 0.001000


N_doc:15819, N_feature:6287


2021-02-05 15:11:36,390 : INFO : -9.425 per-word bound, 687.5 perplexity estimate based on a held-out corpus of 15819 documents with 17000046 words
2021-02-05 15:11:36,391 : INFO : PROGRESS: pass 0, at document #15819/15819
2021-02-05 15:12:04,299 : INFO : optimized alpha [0.02518888, 0.02837071, 0.023912279, 0.024512157, 0.028167007, 0.0287961, 0.019066678, 0.023021808, 0.02571688, 0.024016652, 0.03366176, 0.026943076, 0.024686389, 0.022644604, 0.024807014, 0.02908915, 0.023650091, 0.024685122, 0.023235358, 0.030830538, 0.02420509, 0.03384535, 0.023214718, 0.023938343, 0.024730938, 0.027563022, 0.028281577, 0.02180894, 0.028677389, 0.022029975, 0.025329137, 0.026269652, 0.023649499, 0.030200116, 0.03230304, 0.024500255, 0.02158535, 0.026418947, 0.026892304, 0.023745349, 0.022595119, 0.020362569, 0.02403772, 0.020592725, 0.023996856, 0.02908092, 0.025871465, 0.026919145, 0.028164044, 0.026811646]
2021-02-05 15:12:04,315 : INFO : topic #6 (0.019): 0.007*"product" + 0.007*"U.S." + 0.006*

In [6]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()


# config
corpus = corpus_md

# plot
dictionary = Dictionary.from_corpus(corpus, id2word)
pyLDAvis.gensim.prepare(model_md, corpus, dictionary)

  and should_run_async(code)
2021-02-05 15:15:10,274 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-05 15:15:13,278 : INFO : adding document #10000 to Dictionary(0 unique tokens: [])
2021-02-05 15:15:15,161 : INFO : built Dictionary(6287 unique tokens: ['afternoon', 'welcome', 'fiscal', 'month', 'November']...) from 15819 documents (total 17000046 corpus positions)


In [10]:
# print topics (WITHOUT weight)
# topics = model.show_topics(20, 50, formatted=False)

for t in model_md.show_topics(100, 50, formatted=False):
    id_topic = t[0]
    words = ', '.join([w[0] for w in t[1]])
    print(f'TOPIC_ID: {id_topic}\nWORDS: {words}\n')

TOPIC_ID: 0
WORDS: sale, point, basis, product, margin, currency, operate, organic, customer, cost, volume, Tools, operating, deliver, offset, digit, gain, adjust, grow, acquisition, line, foreign, income, compare, segment, tool, improve, lead, U.S., unfavorable, flavor, like, work, innovation, relate, net, slide, company, brand, repair, expense, great, base, progress, decline, level, single, benefit, positive, improvement

TOPIC_ID: 1
WORDS: customer, investment, plan, New, service, gas, project, program, utility, support, infrastructure, distribution, generation, state, cost, slide, operate, term, system, electric, transmission, company, base, long, power, case, deliver, work, reliability, capacity, benefit, revenue, Virginia, morning, energy, month, operating, plant, Ohio, nuclear, approve, approximately, file, PJM, let, compare, FERC, guidance, early, regulatory

TOPIC_ID: 2
WORDS: sale, product, volume, segment, cost, U.S., America, demand, China, approximately, Brazil, water, Nor

In [None]:
# print topics (WITH weight)
for idx, topic in model.print_topics(20, 50):
    id_topic = idx
    print(f'id_topic:{id_topic}\nwords:{topic}\n')

# Topic temporal variation

In [26]:
from gensim.models import LdaModel

model_type = 'md_ngram1'

# load model, corpus, id2word
model = LdaModel.load(f'data/ldamodel/{model_type}/{model_type}')

ld(f'corpus_{model_type}', ldname='corpus', 
   path=f'data/ldamodel/{model_type}',
   force=True)

with open(f'data/ldamodel/{model_type}/{model_type}.id2word', 'rb') as f:
    id2word = pickle.load(f)

"corpus_md_ngram1.pkl" (88.3 MB) loaded as "corpus" (<1s)


  and should_run_async(code)


In [42]:
model.get_document_topics(corpus, minimum_probability=0)[3]

  and should_run_async(code)


[(0, 9.946568e-06),
 (1, 7.2855446e-06),
 (2, 9.266503e-06),
 (3, 6.531851e-06),
 (4, 6.9328926e-06),
 (5, 9.481807e-06),
 (6, 7.138347e-06),
 (7, 7.619164e-06),
 (8, 7.501787e-06),
 (9, 0.594549),
 (10, 7.0287133e-06),
 (11, 6.2655668e-06),
 (12, 8.715623e-06),
 (13, 6.5409217e-06),
 (14, 7.768118e-06),
 (15, 8.620974e-06),
 (16, 6.705562e-06),
 (17, 1.0560851e-05),
 (18, 6.1567325e-06),
 (19, 7.890452e-06),
 (20, 7.704483e-06),
 (21, 0.027641375),
 (22, 6.8468344e-06),
 (23, 7.4116883e-06),
 (24, 0.15446077),
 (25, 6.936564e-06),
 (26, 7.035831e-06),
 (27, 6.944442e-06),
 (28, 8.275919e-06),
 (29, 7.94386e-06),
 (30, 7.3050833e-06),
 (31, 7.845521e-06),
 (32, 6.8591976e-06),
 (33, 7.0073256e-06),
 (34, 8.580774e-06),
 (35, 6.654884e-06),
 (36, 7.6431215e-06),
 (37, 8.729947e-06),
 (38, 6.7483943e-06),
 (39, 0.058631882),
 (40, 7.3731303e-06),
 (41, 6.427603e-06),
 (42, 7.3332194e-06),
 (43, 8.023269e-06),
 (44, 7.1884206e-06),
 (45, 9.924444e-06),
 (46, 7.901134e-06),
 (47, 6.5268787

In [None]:
import pyLDAvis
import pyLDAvis.gensim
from gensim.corpora import Dictionary

pyLDAvis.enable_notebook()

# plot
dictionary = Dictionary.from_corpus(corpus, id2word)
pyLDAvis.gensim.prepare(model, corpus, dictionary)