# Init

In [2]:
import datatable as dt
import spacy

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from datatable import f
from spacy.tokens import Doc, DocBin, Span
from tqdm.auto import tqdm

# Convert Doc to "text tokens"

In [3]:
# Collect the transcriptcomponentid values that belong to MD and to QA.
# NOTE: `ld` is not defined in this notebook; per its logged output it loads
# "text_component_sp500.feather" into the global name `text_component`.
ld('text_component_sp500', ldname='text_component')
text_component = dt.Frame(text_component)

# MD rows: component type 2 spoken by speaker type 2.
is_md = (f.transcriptcomponenttypeid == 2) & (f.speakertypeid == 2)
componentids_md = text_component[is_md, f.transcriptcomponentid].to_list()[0]

# QA rows: component type 3 or 4 spoken by speaker type 2 or 3.
is_qa_component = (f.transcriptcomponenttypeid == 3) | (f.transcriptcomponenttypeid == 4)
is_qa_speaker = (f.speakertypeid == 2) | (f.speakertypeid == 3)
componentids_qa = text_component[is_qa_component & is_qa_speaker, f.transcriptcomponentid].to_list()[0]

"text_component_sp500.feather" (978.0 MB) loaded as "text_component" (2s)


In [4]:
# Read the serialized docs back from disk, decoding them with the vocab of
# the loaded pipeline.
nlp = spacy.load('en_core_web_lg')
docs = list(
    DocBin(store_user_data=True)
    .from_disk('data/doc_sp500.spacy')
    .get_docs(nlp.vocab)
)

# Register the custom Span attributes. force=True overwrites an existing
# registration, so this cell can be re-run without raising.
for ext_name in ('transcriptid', 'componentid', 'componentorder',
                 'componenttypeid', 'speakerid', 'speakertypeid'):
    Span.set_extension(ext_name, default=None, force=True)
# `is_component` is the only extension whose default is False instead of None.
Span.set_extension('is_component', default=False, force=True)

In [5]:
# Convert each Doc into a flat list of str ("text tokens"), using only the
# component spans whose componentid is listed in componentids_md.
#
# Filtering rule for tokens:
# - only keep the lemma
# - no stop words
# - no punctuation
# - no "like number" tokens

# componentids_md is a plain list, so `in` would scan it once per span.
# Build a hash-based lookup once so each membership test is O(1).
md_component_ids = frozenset(componentids_md)

texttoken_md = []
for doc in docs:
    txttok = []
    for span in doc.spans['components']:
        if span._.componentid not in md_component_ids:
            continue
        txttok.extend(
            t.lemma_ for t in span
            if not (t.is_punct or t.is_stop or t.like_num)
        )
    # One entry per Doc, even when no span matched (empty list), so that
    # texttoken_md stays aligned with docs.
    texttoken_md.append(txttok)
    

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def _identity(tokens):
    """Return the input unchanged.

    Each document in texttoken_md is already a list of lemma strings, so both
    the preprocessing and the tokenizing step of CountVectorizer are no-ops.
    A named function (rather than a lambda) keeps the vectorizer picklable.
    """
    return tokens


# token_pattern is unused once a custom tokenizer is supplied; passing None
# makes that explicit and avoids scikit-learn's "token_pattern will not be
# used" UserWarning.
vectorizer = CountVectorizer(preprocessor=_identity,
                             tokenizer=_identity,
                             token_pattern=None)
# Document-term count matrix: one row per entry of texttoken_md.
x = vectorizer.fit_transform(texttoken_md)

array([[ 36,   0,   0, ...,   0,   0,   1],
       [ 48,   0,   0, ...,   0,   0,   0],
       [ 64,   0,   0, ...,   0,   0,   0],
       ...,
       [ 67,   0,   0, ...,   0,   0,   0],
       [467,   0,   2, ...,   0,   1,   0],
       [ 95,   0,   0, ...,   1,   0,   0]])