# Init

In [2]:
import datatable as dt
import numpy as np
import spacy
import torch
import torch.nn.functional as F

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from datatable import f, join
from spacy.tokens import Doc, DocBin, Span
from tqdm.auto import tqdm

# Initialise datatable's notebook display styling
# (presumably injects the CSS used to render Frames — confirm against the datatable docs).
dt.init_styles()

# Project locations; DATA_DIR is always derived from ROOT_DIR.
# NOTE(review): ROOT_DIR is an absolute, machine-specific path.
ROOT_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = f'{ROOT_DIR}/data'

# Echo both locations so the saved notebook records where data was read from.
print(f'ROOT_DIR: {ROOT_DIR}', f'DATA_DIR: {DATA_DIR}', sep='\n')

ROOT_DIR: /home/yu/OneDrive/CC
DATA_DIR: /home/yu/OneDrive/CC/data


# N-gram

> Computing similarity using bag-of-words

## Convert Doc to "text tokens"

In [3]:
# Load the serialized Docs from disk, one DocBin shard at a time.
nlp = spacy.load('en_core_web_lg')

docs = []
# The shard index is a real value, so give it a name; binding `_` in a notebook
# also shadows IPython's "last output" variable.
for shard in tqdm(range(10)):
    doc_bin = DocBin(store_user_data=True).from_disk(f'data/doc_sp500_lg_{shard}.spacy')
    # `extend` consumes the generator directly; no intermediate list is needed.
    docs.extend(doc_bin.get_docs(nlp.vocab))

# Register the custom attribute read from each Doc (`doc._.transcriptid`).
# `force=True` makes the cell safe to re-run.
Doc.set_extension('transcriptid', default=None, force=True)

# Register the custom attributes read from each Span (`span._.<name>`).
for span_attr in ('transcriptid', 'componentid', 'componentorder',
                  'componenttypeid', 'speakerid', 'speakertypeid'):
    Span.set_extension(span_attr, default=None, force=True)
Span.set_extension('is_component', default=False, force=True)

  0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
# Collect the componentids that belong to the MD and the Q&A sections.
# NOTE(review): `ld` is not defined in this notebook; judging by its printed
# output it loads the named feather file into the global `text_component`.
ld('text_component_sp500', ldname='text_component')
text_component = dt.Frame(text_component)

# componentid: Management Discussion
componentids_md = set(
    text_component[
        (f.transcriptcomponenttypeid==2) & (f.speakertypeid==2),
        f.transcriptcomponentid,
    ].to_list()[0]
)

# componentid: Q & A
componentids_qa = set(
    text_component[
        ((f.transcriptcomponenttypeid==3) | (f.transcriptcomponenttypeid==4))
        & ((f.speakertypeid==2) | (f.speakertypeid==3)),
        f.transcriptcomponentid,
    ].to_list()[0]
)

# The frame is large (see the load message below); release it once the ids are extracted.
del text_component

"text_component_sp500.feather" (978.0 MB) loaded as "text_component" (3s)


In [14]:
# Convert Doc to "text tokens"

# Filtering Rule:
# - only keep lemma
# - KEEP stop words (stop words are informative when comparing)
# - no punctuation
# - no "like number"

def make_text_tokens(docs, componentids):
    """Turn each Doc into a flat list of lemmas taken from the selected components.

    For every doc, the spans in ``doc.spans['components']`` whose
    ``span._.componentid`` is in ``componentids`` are visited in order, and the
    lemma of every token that is neither punctuation nor number-like is kept.
    Stop words are kept.

    Args:
        docs: iterable of Docs exposing ``spans['components']`` and
            ``_.transcriptid``.
        componentids: container of componentids to keep (membership is tested
            with ``in``).

    Returns:
        dict mapping ``doc._.transcriptid`` to its list of lemma strings. A doc
        with no surviving token maps to ``['']``.
    """
    tokens_by_transcript = {}

    for doc in tqdm(docs):
        lemmas = [
            token.lemma_
            for span in doc.spans['components']
            if span._.componentid in componentids
            for token in span
            if not (token.is_punct or token.like_num)
        ]

        # Keep a single empty string as a placeholder when nothing was found.
        tokens_by_transcript[doc._.transcriptid] = lemmas or ['']

    return tokens_by_transcript

# Build and persist the MD token lists.
# NOTE(review): `sv` is not defined in this notebook; judging by its printed
# output it saves the global variable with the given name, so the variable
# names here must match the strings passed to `sv`.
texttoken_md = make_text_tokens(docs, componentids_md)
sv('texttoken_md')

# Same for the Q&A components.
texttoken_qa = make_text_tokens(docs, componentids_qa)
sv('texttoken_qa')

  0%|          | 0/37630 [00:00<?, ?it/s]

"texttoken_qa" saved as "texttoken_qa.pkl" (1.2 GB) (45s)


In [15]:
# The parsed Docs are no longer needed once the token lists have been saved.
del docs

## Convert to DTM

In [1]:
# Reload the saved token lists.
# NOTE(review): `ld` is not defined in this notebook; judging by its printed
# output it loads "<name>.pkl" into a global of the same name — confirm.
ld('texttoken_qa')
ld('texttoken_md')

"texttoken_qa.pkl" (1.2 GB) loaded (14s)
"texttoken_md.pkl" (894.5 MB) loaded (9s)


In [31]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert to DTM

# Setting:
# - keep ALL tokens

def get_similarity(ngram_type:str):
    """Compute the per-transcript cosine similarity between MD and QA n-gram counts.

    Reads the module-level dicts ``texttoken_md`` and ``texttoken_qa``
    (transcriptid -> list of tokens), builds one document-term matrix per
    section over a shared vocabulary, and compares row i of MD with row i of QA.

    Args:
        ngram_type: 'unigram' (1-grams), 'bigram' (2-grams) or 'allgram'
            (1- and 2-grams).

    Returns:
        dt.Frame with columns ``transcriptid`` and ``similarity_<ngram_type>``.

    Raises:
        ValueError: if ``ngram_type`` is not one of the supported names.
        AssertionError: if MD and QA do not list the same transcriptids in the
            same order.
    """
    ngram_ranges = {'unigram': (1, 1), 'bigram': (2, 2), 'allgram': (1, 2)}
    if ngram_type not in ngram_ranges:
        # Fail here instead of printing and hitting an unrelated NameError later.
        raise ValueError(
            f'Wrong ngram_type! Got {ngram_type!r}, expected one of {sorted(ngram_ranges)}')
    ngram_range = ngram_ranges[ngram_type]

    # Rows are compared by position, so check alignment before the expensive work.
    transcriptids = list(texttoken_md.keys())
    assert transcriptids==list(texttoken_qa.keys()),\
           'transcriptids of MD and QA are different!'

    from itertools import chain
    from sklearn.preprocessing import normalize

    print('Tokenizing...')
    # Documents are already token lists, so preprocessing and tokenizing are identity.
    vectorizer = CountVectorizer(preprocessor=lambda x: x,
                                 tokenizer=lambda x: x,
                                 lowercase=False,
                                 ngram_range=ngram_range)

    # Learn ONE vocabulary over both sections. Calling fit() twice would keep
    # only the vocabulary of the second call and silently drop MD-only terms.
    vectorizer.fit(chain(texttoken_md.values(), texttoken_qa.values()))

    # Make DTM
    dtm_md = vectorizer.transform(texttoken_md.values())
    dtm_qa = vectorizer.transform(texttoken_qa.values())

    print('Computing similarity...')
    # Row-wise cosine: L2-normalise each row, then take the dot product of
    # matching rows. All-zero rows stay zero and therefore score 0.
    unit_md = normalize(dtm_md)
    unit_qa = normalize(dtm_qa)
    similarity = np.asarray(unit_md.multiply(unit_qa).sum(axis=1)).ravel().tolist()

    df = dt.Frame({'transcriptid':transcriptids,
                   f'similarity_{ngram_type}':similarity})

    return df

# NOTE(review): the unigram and bigram runs are commented out, but the
# "Join results" cell below loads `similarity_unigram` and `similarity_bigram`;
# on a fresh run those files must already exist (or these lines be re-enabled).
# similarity_unigram = get_similarity('unigram')
# sv('similarity_unigram')

# similarity_bigram = get_similarity('bigram')
# sv('similarity_bigram')

# `sv` is given the variable's name as a string, so the names must match.
similarity_allgram = get_similarity('allgram')
sv('similarity_allgram')


Tokenizing...
Computing similarity...


  0%|          | 0/37630 [00:00<?, ?it/s]

"similarity_allgram" saved as "similarity_allgram.feather" (439.6 KB) (<1s)


## Join results

In [37]:
# Load the three per-ngram similarity frames (each is loaded into a global of the same name).
ld('similarity_unigram')
ld('similarity_bigram')
ld('similarity_allgram')

# Key every frame on transcriptid so they can be joined on it in the next cell.
similarity_unigram.key = 'transcriptid'
similarity_bigram.key = 'transcriptid'
similarity_allgram.key = 'transcriptid'

similarity_unigram (439.5 KB) already loaded, will NOT load again!
similarity_bigram (439.4 KB) already loaded, will NOT load again!
similarity_allgram (439.6 KB) already loaded, will NOT load again!


In [38]:
# Attach the bigram and then the allgram columns to the unigram frame;
# each join matches on the joined frame's key (transcriptid).
similarity = similarity_unigram[:, :, join(similarity_bigram)]
similarity = similarity[:, :, join(similarity_allgram)]

In [40]:
# Persist the joined n-gram similarity frame (saved as "similarity.feather" per the output below).
sv('similarity')

"similarity" saved as "similarity.feather" (1022.2 KB) (<1s)


# Longformer

**Computing similarity using Longformer embeddings**

Steps:
- Load pre-computed sentence embeddings
- Pool embeddings for MD and QA (note: the code below takes an element-wise max, not a mean)
- Compute similarity

In [3]:
%%time

# Load both shards (rank 0 and rank 1) of the pre-computed Longformer embeddings.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
# Presumably each shard maps transcriptid -> {componentid: {'embedding': tensor}},
# judging by how later cells index it — confirm.
emb0 = torch.load(f'{DATA_DIR}/Embeddings/preembeddings_longformer_rank0.pt')
emb1 = torch.load(f'{DATA_DIR}/Embeddings/preembeddings_longformer_rank1.pt')

CPU times: user 41.5 s, sys: 7.14 s, total: 48.7 s
Wall time: 1min 7s


In [5]:
# Merge the rank-1 shard into emb0 in place, so emb0 holds every component
# embedding per transcript.
for tid, cid_emb in tqdm(emb1.items()):
    if not cid_emb:
        # Nothing to merge for this transcript.
        continue
    # setdefault covers transcripts that appear only in the rank-1 shard;
    # indexing emb0[tid] directly would raise KeyError for them.
    emb0.setdefault(tid, {}).update(cid_emb)

  0%|          | 0/37448 [00:00<?, ?it/s]

In [8]:
# create tid-cid pairs
def load_tid_cid_pair(tid_cid_pair_name):
    '''Read a tid_cid_pair feather file and return it as a dict.

    tid_cid_pair_name: str. Suffix of the file name, e.g., "md", "qa", "all";
        the file read is "data/tid_cid_pair_<tid_cid_pair_name>.feather".

    output: {tid: [cid1, cid2, ...]} built from the `transcriptid` and
        `componentid` columns. Each `componentid` cell is converted with
        `.tolist()`.

    NOTE(review): `feather` is not imported in this notebook; it is presumably
    provided by the same startup environment as `ld`/`sv` — confirm.
    '''
    frame = feather.read_feather(f'data/tid_cid_pair_{tid_cid_pair_name}.feather')

    return {
        tid: cid_array.tolist()
        for tid, cid_array in zip(frame.transcriptid.tolist(), frame.componentid)
    }

# transcriptid -> componentids for the MD section and for the manager side of Q&A
# (files "tid_cid_pair_md" and "tid_cid_pair_qa_manager").
tid_cid_pair_md, tid_cid_pair_qa = (
    load_tid_cid_pair(pair_name) for pair_name in ('md', 'qa_manager')
)

In [15]:
# If a call has embeddings for both MD and QA, compute their cosine similarity;
# otherwise the similarity is set to 1 and the call is counted as missing.
null_similarity = 0
outputs = {}


def _pool_components(components, cids):
    """Element-wise max over the embeddings of the listed components.

    components: {cid: {'embedding': tensor, ...}} for one transcript.
    cids: componentids to include.

    Returns a tensor with a leading dimension of 1, or None when none of the
    listed components has an embedding (torch.stack would raise on an empty list).
    """
    wanted = set(cids)  # set lookup instead of scanning a list per component
    selected = [emb['embedding'] for cid, emb in components.items()
                if cid in wanted]
    if not selected:
        return None
    return torch.stack(selected).max(dim=0).values.unsqueeze(dim=0)


for tid, components in tqdm(emb0.items()):
    emb_md = _pool_components(components, tid_cid_pair_md.get(tid, ()))
    emb_qa = _pool_components(components, tid_cid_pair_qa.get(tid, ()))

    if emb_md is not None and emb_qa is not None:
        outputs[tid] = F.cosine_similarity(emb_md, emb_qa).item()
    else:
        # MD or QA is missing for this call: fall back to 1 and count it.
        outputs[tid] = 1
        null_similarity += 1

print(f'N calls with MD/QA missing: {null_similarity}')

# NOTE: this rebinds `similarity`, replacing the n-gram frame built earlier.
# The name is kept because `sv` is given the variable's name as a string.
similarity = dt.Frame({'transcriptid':list(outputs.keys()),
                       'similarity_longformer':list(outputs.values())})
sv('similarity', 'similarity_longformer')

  0%|          | 0/37630 [00:00<?, ?it/s]

N calls with MD/QA missing: 904
"similarity" saved as "similarity_longformer.feather" (337.9 KB) (<1s) (2021-03-10 7:16 PM)


In [17]:
# Sanity check: expect one row per transcript and two columns (transcriptid, similarity_longformer).
similarity.shape

(37630, 2)