# Init

In [1]:
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import datatable as dt
import spacy
from datatable import f
from spacy.attrs import ORTH
from spacy.tokens import Doc, DocBin, Span
from tqdm.auto import tqdm

# Project locations. DATA_DIR is derived from WORK_DIR so the two paths cannot
# drift apart if the project folder is moved.
WORK_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = os.path.join(WORK_DIR, 'data')

# Make WORK_DIR the current directory so that relative paths used later in the
# notebook resolve against the project root.
os.chdir(WORK_DIR)

# Initialize datatable's display styles for the notebook
dt.init_styles()

# Build Doc in spaCy

## Register Extension

In [2]:
# Use the simplest pipeline: load the large English model with only NER disabled.
# Components left enabled:
# 'tok2vec', 'parser', 'lemmatizer', 'tagger', 'attribute_ruler'
nlp = spacy.load("en_core_web_lg", disable=['ner'])

# Disabled alternative setup, kept for reference as real comments (a bare
# triple-quoted string is an expression and can leak into the cell output):
#
# # Add a simple sentencizer
# nlp.add_pipe('sentencizer')
#
# # add [EOC] as special case in tokenization
# special_case = [{ORTH: "[EOC]"}]
# nlp.tokenizer.add_special_case("[EOC]", special_case)

# Register the custom Span attributes that carry component metadata.
# force=True overwrites an existing registration, so re-running this cell in
# the same kernel does not raise.
for attr_name in ('transcriptid', 'componentid', 'componentorder',
                  'componenttypeid', 'speakerid', 'speakertypeid'):
    Span.set_extension(attr_name, default=None, force=True)

# Flag for spans that cover one whole component; defaults to False.
Span.set_extension('is_component', default=False, force=True)

'\n# Add a simple sentencizer\nnlp.add_pipe(\'sentencizer\')\n\n# add [EOC] as special case in tokenization\nspecial_case = [{ORTH: "[EOC]"}]\nnlp.tokenizer.add_special_case("[EOC]", special_case)\n'

## Load data

In [4]:
def _int_or_none(value):
    """Cast `value` to int, passing None through unchanged."""
    return None if value is None else int(value)


# Load components as a 2D table
# NOTE(review): `ld` is not defined in this notebook - presumably a project
# helper that binds the loaded table to the global name given in `ldname`;
# confirm where it comes from.
ld('text_component_sp500', ldname='text_component', force=True)
text_component = dt.Frame(text_component)

# Convert the 2D table to row tuples
# NOTE(review): only the first 100 rows are kept, which looks like a debugging
# subset - widen the slice before building the full corpus.
rows = text_component[:100, :].to_tuples()

# Pair each component text with a context dict of its metadata.
# The positional indices assume the column order of the stored table
# (text at position 6) - TODO confirm against the table schema.
text_component = [
    (row[6],
     {'transcriptid': row[2],
      'componentid': row[0],
      'componenttypeid': row[4],
      'componentorder': row[3],
      'speakerid': _int_or_none(row[5]),
      'speakertypeid': _int_or_none(row[1]),
      })
    for row in rows
]

# Group the (text, context) pairs by transcript, preserving row order
text_component_grouped = {}
for text, context in text_component:
    text_component_grouped.setdefault(context['transcriptid'], []).append((text, context))

"text_component_sp500.feather" (978.0 MB) loaded as "text_component" (2s)


## Build Doc

> This section only needs to be run **ONCE**. It will hold all the ground truth and will never be altered.

In [5]:
# Context keys copied onto the span that covers each whole component
component_attrs = ('transcriptid', 'componentid', 'componenttypeid',
                   'componentorder', 'speakerid', 'speakertypeid')

# Final output holder: one merged Doc per transcript
docs = []

# Iterate through every transcript; each value is the list of
# (text, context) pairs belonging to that transcript
for pairs in tqdm(text_component_grouped.values(), total=len(text_component_grouped)):

    # Parsed components of the current transcript
    components = []

    # Within every transcript, iterate through every component
    for component, context in nlp.pipe(pairs, as_tuples=True):

        # Assign component-level attributes to the span covering the whole component
        whole = component[:]
        whole._.is_component = True
        for attr in component_attrs:
            whole._.set(attr, context[attr])

        # Assign sentence-level attributes
        for sent in component.sents:
            sent._.componentid = context['componentid']

        components.append(component)

    # Join components into one Doc
    doc = Doc.from_docs(components)

    # Create SpanGroup "components" for doc by recovering every span that was
    # flagged `is_component`. The flags are read back from doc.user_data, whose
    # extension keys are indexed here as (prefix, name, start_char, end_char).
    spans_component = []
    for key, value in doc.user_data.items():
        # Ignore entries that are not 4-tuples or that store another attribute
        if not (isinstance(key, tuple) and len(key) == 4 and key[1] == 'is_component'):
            continue
        if value is not True:
            continue

        span = doc.char_span(key[2], key[3])
        if span is None:
            # Fail with context instead of an obscure error when None is
            # stored in the span group.
            raise ValueError(
                f"component offsets ({key[2]}, {key[3]}) do not map to a valid span "
                "in the merged Doc"
            )
        spans_component.append(span)

    doc.spans['components'] = spans_component

    docs.append(doc)

# Saving to disk is done in the "Save DocBin" section below.

100%|██████████| 2/2 [00:00<00:00,  2.73it/s]


In [9]:
# Sanity check: display the componentid stored on the first component span
# of the first transcript
first_component = docs[0].spans['components'][0]
first_component._.componentid

30184

## Save DocBin

> **Warnings**

> When saving a `DocBin`, you must also save the `nlp` object: it is needed to restore the docs when the `DocBin` is loaded back.

In [None]:
# Serialize all docs into one DocBin. store_user_data=True keeps doc.user_data,
# which is where the custom `._.` attribute values are read from above.
doc_bin = DocBin(store_user_data=True, docs=docs)

# Write into DATA_DIR explicitly rather than relying on the current working
# directory; with the notebook's settings this is the same file as
# 'data/doc_sp500.spacy' under WORK_DIR.
doc_bin.to_disk(os.path.join(DATA_DIR, 'doc_sp500.spacy'))