# Init

In [2]:
import datatable as dt
import numpy as np
import spacy

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from datatable import f
from spacy.tokens import Doc, DocBin, Span
from tqdm.auto import tqdm

dt.init_styles()

# Convert Doc to "text tokens"

In [None]:
# Load the serialized Docs from disk.
# The pipeline is loaded here so that its vocab can be handed to `get_docs`.
nlp = spacy.load('en_core_web_lg')

# The corpus is read from 4 DocBin shards: data/doc_sp500_sm_0.spacy ... _3.spacy
docs = []
for shard_idx in range(4):
    # Use a real name for the shard index: assigning to `_` would shadow
    # IPython's "last output" variable for the rest of the session.
    shard_path = f'data/doc_sp500_sm_{shard_idx}.spacy'
    docs.extend(DocBin(store_user_data=True).from_disk(shard_path).get_docs(nlp.vocab))

# Register the custom attributes read later through `doc._.<name>` / `span._.<name>`.
# `force=True` overwrites an existing registration, so this cell can be re-run.

# Doc-level attribute
Doc.set_extension('transcriptid', default=None, force=True)

# Span-level attributes
Span.set_extension('transcriptid', default=None, force=True)
Span.set_extension('componentid', default=None, force=True)
Span.set_extension('componentorder', default=None, force=True)
Span.set_extension('componenttypeid', default=None, force=True)
Span.set_extension('speakerid', default=None, force=True)
Span.set_extension('speakertypeid', default=None, force=True)
Span.set_extension('is_component', default=False, force=True)

In [4]:
# Pick out the component ids that make up the MD and QA sections.
# NOTE(review): `ld` is not defined in this notebook; it presumably loads the
# dataset into a global called `text_component` -- confirm where it comes from.
ld('text_component_sp500', ldname='text_component')
text_component = dt.Frame(text_component)

# Management Discussion: component type 2 with speaker type 2
md_filter = (f.transcriptcomponenttypeid == 2) & (f.speakertypeid == 2)
componentids_md = text_component[md_filter, f.transcriptcomponentid].to_list()[0]

# Q & A: component type 3 or 4 with speaker type 2 or 3
qa_component_filter = (f.transcriptcomponenttypeid == 3) | (f.transcriptcomponenttypeid == 4)
qa_speaker_filter = (f.speakertypeid == 2) | (f.speakertypeid == 3)
componentids_qa = text_component[qa_component_filter & qa_speaker_filter, f.transcriptcomponentid].to_list()[0]

"text_component_sp500.feather" (978.0 MB) loaded as "text_component" (2s)


In [5]:
# Convert Doc to "text tokens"

# Filtering rules:
# - keep only the lemma of each token
# - KEEP stop words (stop words are informative when comparing texts)
# - drop punctuation
# - drop number-like tokens (`like_num`)

def make_text_tokens(docs, componentids):
    """Collect the filtered lemmas of the selected components of every doc.

    Parameters
    ----------
    docs : iterable
        Docs exposing `doc.spans['components']` and `doc._.transcriptid`;
        every span must expose `span._.componentid`.
    componentids : iterable
        Component ids whose spans are kept. Any iterable of hashable ids
        is accepted (list, set, generator, ...).

    Returns
    -------
    dict
        Maps `doc._.transcriptid` to the list of lemmas of all tokens in the
        selected spans that are neither punctuation nor number-like. A doc
        without any such token maps to `['']`. If several docs share a
        transcriptid, the last one wins.
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # a long list rescans it for every span. This also makes one-shot
    # iterables safe to pass.
    wanted_ids = set(componentids)
    texttoken = {}

    # For every doc, flatten the required spans into one list of str
    for doc in docs:
        txttok = [
            t.lemma_
            for span in doc.spans['components']
            if span._.componentid in wanted_ids
            for t in span
            if not t.is_punct and not t.like_num
        ]

        # If no text was found, fall back to a single empty str so the
        # token list is never empty
        texttoken[doc._.transcriptid] = txttok or ['']

    return texttoken

# One token list per transcript, built separately for the MD and the QA components
texttoken_md = make_text_tokens(docs=docs, componentids=componentids_md)
texttoken_qa = make_text_tokens(docs=docs, componentids=componentids_qa)

# Convert to DTM

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity  # no longer used below; kept in case later cells rely on it
from sklearn.preprocessing import normalize

# Convert to DTM

# Setting:
# - keep ALL tokens
# - the input is already a list of tokens per document, so the preprocessor
#   and the tokenizer are identity functions
# - count unigrams and bigrams
vectorizer = CountVectorizer(preprocessor=lambda x: x,
                             tokenizer=lambda x: x,
                             lowercase=False,
                             ngram_range=(1,2))

# Learn ONE vocabulary covering both sections.
# fit() does not accumulate: a second call replaces the vocabulary learned by
# the first, which would silently drop every term that only occurs in MD.
vectorizer.fit([*texttoken_md.values(), *texttoken_qa.values()])

# Make DTM
dtm_md = vectorizer.transform(texttoken_md.values())
dtm_qa = vectorizer.transform(texttoken_qa.values())

# Compute similarity
# Row i of both matrices must describe the same transcript; both dicts are
# built from `docs` in the same order, so their keys must line up.
assert list(texttoken_md) == list(texttoken_qa), "MD and QA transcripts are not aligned"

# Cosine similarity between MD and QA of the SAME transcript only:
# L2-normalize the rows, then take the row-wise dot product. This equals the
# diagonal of the pairwise cosine matrix without building the n x n matrix.
similarity = np.asarray(
    normalize(dtm_md).multiply(normalize(dtm_qa)).sum(axis=1)
).ravel()
similarity



UsageError: Line magic function `%capture` not found (But cell magic `%%capture` exists, did you mean that instead?).
