# Init

In [2]:
# import tensorflow as tf
import comet_ml
import datatable as dt
import gc
import json
import pandas as pd
import pytorch_lightning as pl
import pyarrow.feather as feather
import re
import spacy
# import sentence_transformers
import torch.nn.functional as F
import torch.optim as optim

import multiprocessing as mp
import numpy as np
import os
import torch

from argparse import Namespace
from collections import OrderedDict, defaultdict
from datatable import f, update, by
from datetime import datetime
from functools import partial
from itertools import chain
from operator import itemgetter
from spacy.lang.en import English
from scipy.sparse import coo_matrix
from torch import nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm.auto import tqdm

from transformers import BertModel, BertTokenizer, BertTokenizerFast, GPT2Model, GPT2Tokenizer, RobertaTokenizer, RobertaModel, LongformerModel, LongformerTokenizerFast
# from sentence_transformers import SentenceTransformer
# from sentence_transformers import SentenceTransformer

# working directory
# NOTE(review): ROOT_DIR is an absolute, machine-specific path. Several later cells use
# relative 'data/...' paths instead of DATA_DIR; those only point at the same files when
# the kernel's working directory is ROOT_DIR.
ROOT_DIR = '/home/yu/OneDrive/CC'
DATA_DIR = f'{ROOT_DIR}/data'
WRDS_DOWNLOAD_DIR = f'{DATA_DIR}/WRDS-download'
print(f'ROOT_DIR: {ROOT_DIR}')
print(f'DATA_DIR: {DATA_DIR}')

# set random seed (numpy + torch)
np.random.seed(42)
torch.manual_seed(42)
# cuDNN autotuning is switched on and determinism is switched off, so GPU results are
# not guaranteed to be bit-for-bit reproducible despite the seeds above.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

# number of usable CUDA devices; 0 when CUDA is not available
n_cuda = torch.cuda.device_count() if torch.cuda.is_available() else 0


def log_gpu_memory(verbose=False):
    """
    Print the memory usage of every visible CUDA device.

    Defined unconditionally so later cells can call it on CPU-only machines too
    (there it just reports that no GPU is enabled).

    Args:
        verbose: if True print `torch.cuda.memory_summary` per device, otherwise a
            one-line "allocated/total (GB)" summary per device.
    """
    if n_cuda == 0:
        print('GPU NOT enabled')
        return

    # release cached blocks first so the numbers reflect live allocations
    torch.cuda.empty_cache()
    for device_idx in range(n_cuda):
        if verbose:
            print(f'GPU {device_idx}:')
            print(f'{torch.cuda.memory_summary(device_idx, abbreviated=True)}')
        else:
            memory_total = torch.cuda.get_device_properties(device_idx).total_memory/(1024**3)
            memory_allocated = torch.cuda.memory_allocated(device_idx)/(1024**3)
            print(f'GPU {device_idx}: {memory_allocated: .2f}/{memory_total: .2f} (GB)')


# set device 'cuda' or 'cpu'
if n_cuda > 0:
    print(f'\n{n_cuda} GPUs found:')
    for device_idx in range(n_cuda):
        # expose each device as a notebook-level name: cuda0, cuda1, ...
        globals()[f'cuda{device_idx}'] = torch.device(f'cuda:{device_idx}')
        print(f'    {torch.cuda.get_device_name(device_idx)} (cuda{device_idx})')

    print('\nGPU memory:')
    log_gpu_memory()
else:
    print('GPU NOT enabled')

cpu = torch.device('cpu')
# half of the logical core count (assumed 2 threads per core), but never less than 1
n_cpu = max(1, mp.cpu_count() // 2)

print(f'\nCPU count (physical): {n_cpu}')

ROOT_DIR: /home/yu/OneDrive/CC
DATA_DIR: /home/yu/OneDrive/CC/data

2 GPUs found:
    GeForce RTX 2080 Ti (cuda0)
    GeForce RTX 2080 Ti (cuda1)

GPU memory:
GPU 0:  0.00/ 10.76 (GB)
GPU 1:  0.00/ 10.76 (GB)

CPU count (physical): 16


# Helpers

In [23]:
# helper: refresh cuda memory
def refresh_cuda_memory():
    """
    Drop the storage of every non-CPU tensor the garbage collector can see, then
    flush the CUDA caching allocator (intended to alleviate fragmentation).

    This is destructive: the `.data` of each non-CPU tensor (and the gradient of each
    non-CPU `Parameter` that has one) is replaced by an empty CPU tensor, so anything
    that lived on the GPU has to be re-created afterwards.
    """
    cpu_device = torch.device('cpu')

    # Collect garbage up front so unreachable tensors are already released
    gc.collect()

    for candidate in gc.get_objects():
        if not isinstance(candidate, torch.Tensor):
            continue
        if candidate.device == cpu_device:
            continue

        # swap the device storage for an empty placeholder
        candidate.data = torch.empty(0)
        if isinstance(candidate, torch.nn.Parameter) and candidate.grad is not None:
            candidate.grad.data = torch.empty(0)

    # Hand the now-unused cached blocks back to the driver
    torch.cuda.empty_cache()
    
def elapsed_time(start,end):
    """Print the time between `start` and `end` (both in seconds) as `<H>h <M>min <S.S>s`."""
    elapsed = end - start
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    message = '{}h {}min {:.1f}s'.format(int(hours), int(minutes), seconds)
    print(message)

# Longformer

> The encoding part is in "task-longformer-component-encode.py".

> After Feb 5, 2021, we no longer merge/filter the embeddings in "C-preEncode.ipynb". Instead, we'll put it in the Dataset module in "C-model".

> After Mar 1, 2021, the pre-embedding of every transcript will be saved into an independent file on disk, and will be read as needed in "C-model". (Like reading individual images)

Steps:

- Merge "rank0" and "rank1"
- Save the pre-embedding of every transcript into a separate file.

Note:
- Save embeddings as "numpy.ndarray" instead of "Tensor" because the ndarray is MUCH smaller.

In [3]:
import torch

def load_preembeddings(preembedding_name):
    """
    Load the two per-rank pre-embedding files and merge them into one mapping.

    Args:
        preembedding_name: file-name prefix; files in `{DATA_DIR}/Embeddings` whose
            name matches the regex `{preembedding_name}_rank` are loaded.

    Returns:
        The rank-0 mapping `{tid: {cid: emb}}`, updated in place with every entry of
        rank 1 (rank-1 values win when the same `(tid, cid)` exists in both).

    Raises:
        AssertionError: unless exactly two matching files are found.
    """
    emb_dir = f'{DATA_DIR}/Embeddings'

    # find the embedding files; search the same directory we load from so the
    # function does not depend on the kernel's working directory
    emb_paths = sorted(path for path in os.listdir(emb_dir)
                       if re.search(f'{preembedding_name}_rank', path))
    assert len(emb_paths)==2, "Expect two files: rank0 and rank1"

    # load the embedding files (sorted, so rank0 comes first)
    loaded = []
    for emb_path in emb_paths:
        print(f'{datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")}, Loading "{emb_path}"...')
        loaded.append(torch.load(f'{emb_dir}/{emb_path}'))
    emb0, emb1 = loaded

    # merge two ranks into one (update emb0 with emb1);
    # setdefault covers transcripts that only appear in rank 1
    for tid, cid_emb in emb1.items():
        emb0.setdefault(tid, {}).update(cid_emb)
    print('Merging completes!')

    return emb0

emb = load_preembeddings('preembeddings_longformer')

2021-03-03 04:18:30 PM, Loading "preembeddings_longformer_rank0.pt"...
2021-03-03 04:19:01 PM, Loading "preembeddings_longformer_rank1.pt"...
Merging completes!


In [35]:
# Write every transcript's components to its own file: {svdir}/{tid}.pt
# (same location as before, now derived from DATA_DIR instead of being hard-coded)
svdir = f'{DATA_DIR}/Embeddings/longformer'
os.makedirs(svdir, exist_ok=True)  # torch.save does not create missing directories

for tid, components in tqdm(emb.items()):

    for component in components.values():
        # store embeddings as numpy arrays rather than Tensors (see the note above
        # this cell); detach/cpu make the conversion safe for grad or CUDA tensors
        if not isinstance(component['embedding'], np.ndarray):
            component['embedding'] = component['embedding'].detach().cpu().numpy()

    torch.save(components, f'{svdir}/{tid}.pt')

  0%|          | 0/37630 [00:00<?, ?it/s]

# finBERT

## encode

> This section is just an **interactive, non-parallel, test** version for finBERT encoding. For the final version, please refer to `task-finBERT-encode.py`

In [18]:
# load raw text
# `ld` is a helper that is not defined in this notebook; per the cell output it loads
# "dt_sents_sp500.feather" and makes it available as `dt_sents`.
ld('dt_sents_sp500', ldname='dt_sents', force=True)
print(f'N sentences: {dt_sents.nrows}')
# number of distinct values in the `transcriptid` column
print(f'N transcriptid: {len(set(dt_sents[:,f.transcriptid].to_list()[0]))}')

"dt_sents_sp500.feather" (1.0 GB) loaded as "dt_sents" (8s)
N sentences: 19913851
N transcriptid: 37630


In [None]:
# helper: refresh cuda memory
def refresh_cuda_memory():
    """
    Drop the storage of every non-CPU tensor the garbage collector can see, then
    flush the CUDA caching allocator (intended to alleviate fragmentation).

    This is destructive: the `.data` of each non-CPU tensor (and the gradient of each
    non-CPU `Parameter` that has one) is replaced by an empty CPU tensor, so anything
    that lived on the GPU has to be re-created afterwards.

    NOTE(review): the "Helpers" section of this notebook defines a function with the
    same name; whichever cell runs last wins.
    """
    cpu_device = torch.device('cpu')

    # Collect garbage up front so unreachable tensors are already released
    gc.collect()

    for candidate in gc.get_objects():
        if not isinstance(candidate, torch.Tensor):
            continue
        if candidate.device == cpu_device:
            continue

        # swap the device storage for an empty placeholder
        candidate.data = torch.empty(0)
        if isinstance(candidate, torch.nn.Parameter) and candidate.grad is not None:
            candidate.grad.data = torch.empty(0)

    # Hand the now-unused cached blocks back to the driver
    torch.cuda.empty_cache()

# helper: print elapsed time (given start and end)
def elapsed_time(start, end):
    """Print the time between `start` and `end` (both in seconds) as `<H>h <M>min <S.S>s`."""
    elapsed = end - start
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    message = '{}h {}min {:.1f}s'.format(int(hours), int(minutes), seconds)
    print(message)

In [None]:
# Load the sentence data; `ld` is a helper that is not defined in this notebook
# (presumably it binds the loaded object to the global name `sents` — confirm).
ld('sents_sp500', ldname='sents', force=True)

# Tokenizer uses the stock 'bert-base-uncased' vocabulary; the weights come from the
# local finBERT checkpoint.
# NOTE(review): './data/finBERT' is relative to the kernel's working directory, whereas
# other cells build paths from DATA_DIR.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('./data/finBERT', return_dict=True)
# inference mode (disables dropout), then move the weights to the default CUDA device
model.eval()
model.to('cuda')

In [None]:
# --------------------------
# code WITHOUT batch
# --------------------------
# Encodes the first 200 rows of `sents` one sentence at a time and stores a pooled
# embedding per sentence under the key '{tid}_{sid}'.

output = {}

# inference only: disable autograd so no graph is kept alive on the GPU
with torch.no_grad():
    for tid, cid, sid, text in tqdm(sents[:200]):
        tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=384)
        tokens = tokens.to('cuda')

        # pooling weights over every position except the last one.
        # softmax of a constant vector is uniform, so for a single unpadded sentence
        # (attention mask all ones) this is a plain mean over those positions
        mask = tokens.attention_mask[:,:-1].float() # (B, S)
        mask = F.softmax(mask, dim=-1).unsqueeze(-1).float() # (B, S, 1)

        embedding = model(**tokens).last_hidden_state[:,:-1,:].transpose(-1, 1) # (B, E, S)

        # weighted sum over the sequence axis; squeeze() also drops the batch axis when B == 1
        embedding_pool = torch.bmm(embedding, mask).squeeze().detach() # (B, E)

        # note: `cid` is not part of the key
        output[f'{tid}_{sid}'] = embedding_pool

output['108_83']

In [None]:
# ----------------------------
# compute embedding WITH batch
# ----------------------------
# Mean-pools the last hidden state of finBERT over all valid tokens of every sentence
# and periodically checkpoints the results to disk.

sentence_ids = ciq_components_sentencized['sentence_id']
texts = ciq_components_sentencized['text']

batch = 1
output = {}
# ceil division: `int(len/batch)+1` produced one extra, empty batch whenever
# len(texts) was an exact multiple of `batch`
n_batches = (len(texts) + batch - 1) // batch
# checkpoint roughly every 1% of the batches; at least 1 so the modulo below can
# never divide by zero when there are fewer than 100 batches
save_every = max(n_batches // 100, 1)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained(f'{DATA_DIR}/finBERT', return_dict=True)
model.to('cuda')

# inference only: disable autograd so activations are freed right away
with torch.no_grad():
    for batch_idx in tqdm(range(n_batches)):
        # interactive test version: stop after the first two batches
        if batch_idx>=2:
            break

        # At every checkpoint (and right before the last batch):
        # - release cuda memory
        # - save results collected so far
        # - empty output
        if (batch_idx % save_every == 0) or (batch_idx == (n_batches-1)):

            # this empties every CUDA tensor, including the model weights,
            # so the model has to be re-created afterwards
            refresh_cuda_memory()

            # create model and tokenizer
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained(f'{DATA_DIR}/finBERT', return_dict=True)
            model = torch.nn.DataParallel(model)
            model.to('cuda')

            sv_name = f'preembeddings_finbert_nocls_{batch_idx}.pt'
            print(f'Saving to {sv_name}')
            torch.save(output, f'{DATA_DIR}/Embeddings/{sv_name}')
            output = {}

        sids = sentence_ids[batch_idx*batch:(batch_idx+1)*batch].to_list()
        text = texts[batch_idx*batch:(batch_idx+1)*batch].to_list()

        # tokenize
        tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        tokens = tokens.to('cuda')

        # averaging weights: all valid tokens (special tokens included) get 1/seq_len,
        # padding positions stay at 0
        mask = tokens.attention_mask.float() # (B, S)
        valid_seq_len = mask.sum(dim=1, keepdim=True) # (B, 1)
        mask = (mask / valid_seq_len).unsqueeze(-1) # (B, S, 1)

        # compute embedding
        embedding = model(**tokens).last_hidden_state.transpose(-1, 1) # (B, E, S)
        # squeeze only the trailing axis so the batch axis survives when B == 1
        embedding_pool = torch.bmm(embedding, mask).squeeze(-1).to('cpu') # (B, E)

        del embedding, mask, tokens

        for row, sid in enumerate(sids):
            output[sid] = embedding_pool[row]

print(output)

# NOTE(review): 'fibert' in the file name looks like a typo for 'finbert'; the name is
# kept unchanged so anything that already reads this file keeps working.
torch.save(output, f'{DATA_DIR}/Embeddings/preembeddings_fibert_no_cls.pt')

## merge embeddings

### Remove invalid sentences
(too short or too long)

In [3]:
# load sentenceids that are longer than 10 char
# (`ld` is a helper not defined in this notebook; it makes the table available under
# the same name, which is then replaced by a plain set for fast membership tests)
ld('sentenceid_longerThan10Char', force=True)
sentenceid_longerThan10Char = set(sentenceid_longerThan10Char.to_list()[0])

# find the embedding paths.
# Sorted so rank0 is always merged before rank1: os.listdir order is arbitrary, and the
# printed counts of the recorded run (8554583 + 8549346 kept vs 17103928 left) show the
# two ranks share a key, so the merge order decides which value survives.
embedding_paths = sorted(file for file in os.listdir(f'{DATA_DIR}/Embeddings')
                         if re.search('preembeddings_finbert_onlyCLS_rank', file))
assert len(embedding_paths)==2
print(f'Found embeddings: {embedding_paths}')

# load and filter
embeddings = {}
for embedding_path in tqdm(embedding_paths):
    embedding_raw = torch.load(f'{DATA_DIR}/Embeddings/{embedding_path}')

    # keep only the embedding vector of sentences that passed the length filter
    n_before = len(embedding_raw)
    embedding_kept = {k: v['embedding'] for k, v in tqdm(embedding_raw.items())
                      if k in sentenceid_longerThan10Char}
    n_after = len(embedding_kept)
    pct_kept = n_after/n_before*100 if n_before else 0.0  # empty file: avoid 0/0
    print(f'{n_before} -> {n_after} ({pct_kept:.1f}%)')

    embeddings.update(embedding_kept)

    # free the per-file dicts before loading the next rank
    del embedding_raw, embedding_kept

print(f'N sentences left: {len(embeddings)}')

"sentenceid_longerThan10Char.feather" (136.2 MB) loaded (5s)
Found embeddings: ['preembeddings_finbert_onlyCLS_rank0.pt', 'preembeddings_finbert_onlyCLS_rank1.pt']


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/9956926 [00:00<?, ?it/s]

9956926 -> 8554583 (85.9)


  0%|          | 0/9956926 [00:00<?, ?it/s]

9956926 -> 8549346 (85.9)
N sentences left: 17103928


In [4]:
# save next to the per-rank files it was built from (same DATA_DIR-based directory the
# previous cell reads), instead of a path relative to the working directory
torch.save(embeddings, f'{DATA_DIR}/Embeddings/preembeddings_finbert_onlyCLS.pt')

### Create different versions

Step 1:
- find the valid `sentence_id` and `transcriptid`, and use them to filter the original preembeddings

Step 2:
- Stack sentence embeddings with the same `transcriptid`, and save the results

In [3]:
# Sentence-level index table (ids and ordering columns, no text); see the preview below
ciq_components_sentencized_withouttext = feather.read_feather(f'{DATA_DIR}/ciq_components_sentencized_withouttext.feather')
# preview the first three rows
ciq_components_sentencized_withouttext.iloc[0:3]

Unnamed: 0,sentence_id,transcriptid,transcriptcomponentid,sentence_order_id,componentorder,transcriptcomponenttypeid,transcriptpersonid
0,30184-0,108,30184,0,1,1,1.0
1,30184-1,108,30184,1,1,1,1.0
2,30184-2,108,30184,2,1,1,1.0


In [4]:
%%time 
# load preembeddings
# the preembedding is in the form of {sentence_id: {seq_len: ..., embedding: ...}}
# NOTE(review): slow cell (the recorded run took ~10 minutes) and the path is relative
# to the kernel's working directory rather than built from DATA_DIR.

# preemb_name = 'preembeddings_finBERT_without_special_tokens'
preemb_name = 'preembeddings_finBERT_with_special_tokens'

preembeddings = torch.load(f'data/Embeddings/{preemb_name}.pt')

Wall time: 9min 42s


In [5]:
%%time
# --------------------------------------------------------
# Collect `valid_sentence_ids` and `valid_transcript_ids`
# --------------------------------------------------------

# Sort sentences
# NOTE: in-place sort, so the frame previewed in an earlier cell is reordered as well
ciq_components_sentencized_withouttext.sort_values(['transcriptid', 'componentorder', 'sentence_order_id'], inplace=True)

# (disabled) alternative: restrict to components with transcriptcomponenttypeid == 4
# valid_ids = ciq_components_sentencized_withouttext.loc[
#     ciq_components_sentencized_withouttext.transcriptcomponenttypeid.isin([4]), 
#     ['transcriptid', 'sentence_id']]

# keep only the two id columns; row order follows the sort above
valid_ids = ciq_components_sentencized_withouttext[['transcriptid', 'sentence_id']]

# the valid sentence_id must also be present in preembeddings
valid_ids = valid_ids.loc[valid_ids.sentence_id.isin(preembeddings.keys())]

Wall time: 17.4 s


In [6]:
# -------------------------------------------------------------------------------
# Select subset of preembeddings by `valid_transcriptids` and `valid_sentence_ids`
# -------------------------------------------------------------------------------
# One stacked tensor per transcript: the rows are the sentence embeddings of that
# transcript, in the order the sentence ids appear in `valid_ids`.
preembeddings_new = {
    tid: torch.stack([preembeddings[sid]['embedding']
                      for sid in group.sentence_id.tolist()])
    for tid, group in tqdm(valid_ids.groupby('transcriptid'))
}

HBox(children=(FloatProgress(value=0.0, max=35077.0), HTML(value='')))




In [7]:
# save subset of preembeddings
# NOTE(review): the target path is relative to the kernel's working directory
preemb_new_name = 'preembeddings_finBERT_with_special_tokens_test'
torch.save(preembeddings_new, f'data/Embeddings/{preemb_new_name}.pt')

In [None]:
# Layout sketch, not runnable code (the names are placeholders).
# Presumably illustrates a mapping from transcript id to its embeddings — confirm.
{transcript_id_1: [...],
 transcript_id_2: [...]}

In [None]:
# Layout sketch, not runnable code (the names are placeholders).
# Presumably an alternative per-sentence layout that carries metadata next to each
# embedding — confirm.
{sentence_id_1: {transcript_id:int,
                 sentence_type:str,
                 embedding:[...]},
 sentence_id_2: {transcript_id:int,
                 sentence_type:str,
                 embedding:[...]}}

# SBERT

## load model

In [11]:
# NOTE(review): machine-specific absolute path. It ends in ".zip" but is used below as a
# directory (modules.json is opened inside it) — confirm it points at an extracted folder.
model_path = "C:/Users/rossz/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_roberta-large-nli-stsb-mean-tokens.zip"

# modules.json lists the model's modules; the 'type', 'path' and 'name' of each entry
# are read below
with open(os.path.join(model_path, 'modules.json')) as fIn:
    contained_modules = json.load(fIn)
    
# Instantiate every listed module, keeping the order given in modules.json
sbert_modules = OrderedDict()
for module_config in contained_modules:
    # NOTE(review): `sentence_transformers` is only imported in a commented-out line of
    # the Init cell, so this raises NameError on a fresh kernel.
    module_class = sentence_transformers.util.import_from_string(module_config['type'])
    module = module_class.load(os.path.join(model_path, module_config['path']))
    sbert_modules[module_config['name']] = module
    
# pad token id handed to `Tokenize` in the encode cell
# NOTE(review): presumably the <pad> id of the RoBERTa vocabulary — confirm
sbert_pad_token_id = 1

# Chain the modules, wrap them for multi-GPU execution and move them to the first GPU
sbert_model = nn.Sequential(sbert_modules)
sbert_model = nn.DataParallel(sbert_model)
sbert_model.to(cuda0);
log_gpu_memory();

GPU 0:  1.32/ 11.00 (GB)
GPU 1:  0.00/ 11.00 (GB)


## define Dataset

In [3]:
class Tokenize():
    def __init__(self, modules, pad_token_id, max_seq_len):
        '''
        Transform that tokenizes the text of a `(transcriptid, sentenceid, text)` sample.

        modules: ordered mapping of model modules; the first one must provide
            `tokenize(text)`.
        pad_token_id: empty and over-long sentences are replaced by the length-1
            token list `[pad_token_id]`.
        max_seq_len: the calls still contain very long boilerplate (legal disclaimer)
            statements; every sentence with `max_seq_len` tokens or more is dropped,
            i.e. replaced by `[pad_token_id]`.
        '''
        self.max_seq_len = max_seq_len
        self.pad_token_id = pad_token_id
        self.modules = modules

    def __call__(self, sample):
        '''
        Returns `(transcriptid, sentenceid, tokens)` where `tokens` is the tokenized
        sentence, or `[pad_token_id]` when the sentence is empty or too long.
        '''
        transcriptid, sentenceid, sent = sample
        first_module = self.modules[next(iter(self.modules))]
        tokens = first_module.tokenize(sent)

        # The previous test `len == 0 or len < max_seq_len` let empty sentences through
        # unchanged, contradicting the documented contract for `pad_token_id`.
        if 0 < len(tokens) < self.max_seq_len:
            return transcriptid, sentenceid, tokens
        return transcriptid, sentenceid, [self.pad_token_id]


class CCDataset(Dataset):
    '''
    Sentence-level dataset that serves the rows of `df` from the shortest text to the
    longest (by character count of the `text` column).
    '''

    def __init__(self, df, transform=None):
        '''
        Args:
            df: DataFrame with a `text` column; rows are read as
                (transcriptid, sentenceid, text) samples
            transform: optional callable applied to each sample tuple
        '''
        self.transform = transform
        self.df = df
        # position i holds the row number of the i-th shortest text
        text_lengths = list(map(len, df['text'].tolist()))
        self.length_sorted_idx = np.argsort(text_lengths)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # accept tensor indices as well as plain ints
        idx = idx.tolist() if torch.is_tensor(idx) else idx

        # sample: (transcriptid, sentenceid, text)
        row_number = self.length_sorted_idx[idx]
        sample = tuple(self.df.iloc[row_number])

        return self.transform(sample) if self.transform else sample
    
# MAX_SENT_LEN = 256
# ds = CCDataset(text_present_sentencized, transform=Tokenize(sbert_tokenizer, modules, pad_token_id=0, max_seq_len=MAX_SENT_LEN))

## define DataLoader

In [4]:
# --------------------------- Create DataLoader--------------------------
def collate_fn(data: list, modules):
    '''
    Collate a list of (transcriptid, sentenceid, tokens) samples into one batch.

    Every sentence is padded to the longest sentence of the batch by the first module's
    `get_sentence_features`, and each returned feature is stacked into a tensor.

    Returns:
        dict with
            'features': {feature_name: tensor stacked over the batch}
            'meta': (transcriptids, sentenceids), two tuples aligned with the batch
    '''
    transcriptids, sentenceids, sents = list(zip(*data))

    # pad every sentence up to the longest one in this batch
    longest_seq_len = max(len(sent) for sent in sents)
    first_module = modules[next(iter(modules))]

    # collect the per-sentence values of every feature
    columns = {}
    for sent in sents:
        sentence_features = first_module.get_sentence_features(sent, longest_seq_len)
        for feature_name, value in sentence_features.items():
            columns.setdefault(feature_name, []).append(value)

    features = {feature_name: torch.tensor(np.asarray(values))
                for feature_name, values in columns.items()}

    return {'features': features, 'meta': (transcriptids, sentenceids)}

# dl = DataLoader(ds, batch_size=32,
#                 shuffle=False, num_workers=0,
#                 collate_fn=partial(collate_fn, modules=modules),
#                 drop_last=False,
#                 pin_memory=False)
# one_batch = next(iter(dl))
# one_batch

## encode

In [None]:
def pre_encode_sbert(dl, model, save_path, start):
    """
    Encode every batch of `dl` with `model` and save the results of the chunk.

    Args:
        dl: iterable of batches shaped like the output of `collate_fn`
            ({'features': ..., 'meta': (transcriptids, sentenceids)})
        model: callable whose output has a 'sentence_embedding' entry
        save_path: path prefix of the output file
        start: chunk offset, appended to `save_path` as `{save_path}_{start}.pt`

    Returns:
        list of (transcriptid, sentenceid, embedding) tuples, embedding as numpy array
    """
    encoded = []
    with torch.no_grad():
        for batch in tqdm(dl):
            transcriptids, sentenceids = batch['meta']
            sentence_vectors = model(batch['features'])['sentence_embedding'].to(cpu).numpy()

            # one record per sentence of the batch
            encoded.extend(zip(transcriptids, sentenceids, sentence_vectors))

        # one file per chunk
        torch.save(encoded, f'{save_path}_{start}.pt')

    return encoded


# Encode `text_present_sentencized` chunk by chunk; every chunk is written to its own
# file by `pre_encode_sbert` and also accumulated in `res`.
text_df = pd.read_feather(f'{DATA_DIR}/text_present_sentencized.feather')
# NOTE(review): starts at row 800000, not 0 — presumably resuming an interrupted run; confirm
start = 800000
stop = len(text_df)
chunksize = 400000 # 400000 for 1/10 of total
MAX_SENT_LEN = 256
PREENCODE_BATCH_SIZE = 512

res = []
for i in range(start, stop, chunksize):
    # `Now()` is a helper that is not defined in this notebook
    print(f'Processing {i}/{stop}...{i/stop*100: .1f}% {Now()}')
    
    try:
        df = text_df.iloc[i:min(i+chunksize, stop)]
        # drop the last row of a chunk whose end index is odd; the merge step later in
        # this notebook zero-fills sentences that are missing for this reason
        if min(i+chunksize, stop) % 2 != 0:
            df = df.iloc[:-1]

        ds = CCDataset(df, transform=Tokenize(sbert_modules, pad_token_id=sbert_pad_token_id, max_seq_len=MAX_SENT_LEN))
        dl = DataLoader(ds, batch_size=PREENCODE_BATCH_SIZE,
                        shuffle=False, num_workers=0,
                        collate_fn=partial(collate_fn, modules=sbert_modules),
                        drop_last=False,
                        pin_memory=True)

        res.extend(pre_encode_sbert(dl, model=sbert_model, save_path='./data/text_present_sbert_roberta_nlistsb_encoded', start=i))
    # best effort: a failing chunk is reported and skipped so the remaining chunks still run
    except Exception as e:
        print(f'Exception i={i}')
        print(f'   {e}')

## Merge embeddings

### merge pre-embedding

In [None]:
%%time
def merge_preembeddings(preembedding_type, text_type):
    """
    Merge the chunked sentence pre-embeddings into one stacked tensor per transcript.

    Args:
        preembedding_type: chunk files in `{DATA_DIR}/embeddings` whose name matches
            the regex `text_{preembedding_type}` are loaded
        text_type: selects the sentence table `text_{text_type}_sentencized`, used to
            check that every sentence has an embedding

    Returns:
        dict {transcriptid: tensor}, one row per sentence, rows ordered by sentenceid

    Raises:
        ValueError: if no embedding could be loaded
        AssertionError: if the embedding count differs from the sentence count
    """
    # load text_sentencized, (tid, sid, text), which is used for checking embedding number.
    # Reuse the table when it is already loaded under its global name; previously that
    # case left `text_sentencized` unassigned and raised UnboundLocalError.
    table_name = f'text_{text_type}_sentencized'
    if table_name in globals():
        text_sentencized = globals()[table_name]
    else:
        text_sentencized = pd.read_feather(f'{DATA_DIR}/{table_name}.feather')

    # load preembeddings
    emb_dir = f'{DATA_DIR}/embeddings'
    embedding_paths = [file for file in os.listdir(emb_dir) if re.search(f'text_{preembedding_type}', file)]
    for path in embedding_paths: print(path)

    print(f'Loading preembeddings...')
    preembeddings_tmp = []
    for embedding_path in tqdm(embedding_paths):
        preembeddings_tmp.extend(torch.load(f'{emb_dir}/{embedding_path}'))

    if not preembeddings_tmp:
        raise ValueError(f'no preembeddings matching "text_{preembedding_type}" found in {emb_dir}')

    # shape and dtype of the placeholder come from the first loaded embedding
    reference_emb = preembeddings_tmp[0][2]
    emb_dim = reference_emb.shape[0]

    # check if every sentence in `text_sentencized` has been preencoded;
    # every missing sentence gets an all-zero embedding.
    # The reason that some sentences have not been encoded is that the batch_size is
    # an even number while the total number of sentences may be odd, in which case
    # the last sentence of the dataset will be removed.
    tid_sid_from_text_sentencized = set(f'{tid}-{sid}' for tid, sid in zip(text_sentencized.transcriptid, text_sentencized.sentenceid))
    tid_sid_from_preembeddings = set(f'{tid}-{sid}' for tid, sid, _ in preembeddings_tmp)

    for tid_sid in tid_sid_from_text_sentencized - tid_sid_from_preembeddings:
        tid, sid = tid_sid.split('-')
        tid = int(tid)
        sid = int(sid)
        text = text_sentencized.loc[(text_sentencized.transcriptid==tid) & (text_sentencized.sentenceid==sid)]['text'].values[0]
        print('Not found:')
        print(f'  trascriptid: {tid}  sentenceid: {sid}')
        print(f'  text: {text}')
        print(f'-----------')

        # same dtype as the real embeddings; a default float64 placeholder would
        # upcast the whole stacked tensor of that transcript
        preembeddings_tmp.append((tid, sid, np.zeros(emb_dim, dtype=reference_emb.dtype)))

    assert len(preembeddings_tmp)==len(text_sentencized), 'preembedding # != sentence #'

    # sort by (transcriptid, sentenceid)
    print(f'sorting by (transcriptid, sentenceid)')
    preembeddings_tmp.sort(key=lambda record: (record[0], record[1]))

    # group by transcriptid (sentence order within a transcript is kept)
    preembeddings_bytid = defaultdict(list)
    for transcriptid, _, emb in preembeddings_tmp:
        preembeddings_bytid[transcriptid].append(emb)

    print('Stacking embeddings...')
    preembeddings_bytid_stacked = {k: torch.tensor(np.array(v))
                                   for k, v in tqdm(preembeddings_bytid.items())}
    print(f'N call event: {len(preembeddings_bytid_stacked)}')

    return preembeddings_bytid_stacked



# Merge and save the pre-embeddings of every requested text type (currently only 'all')
for text_type in ['all']:
    # merge preembeddings
    preembedding_type = f'{text_type}_sbert_roberta_nlistsb_encoded'
    preembedding_name = f'preembeddings_{preembedding_type}'
    # NOTE: rebinds the notebook-level name `preembeddings`, which earlier cells use for
    # the finBERT embeddings
    preembeddings = merge_preembeddings(preembedding_type, text_type)
    # save preembeddings
    print(f'saving preembeddings...')
    torch.save(preembeddings, f'{DATA_DIR}/embeddings/{preembedding_name}.pt')

Save valid preembeddings keys
> I found that some `transcriptid`s in `targets_df` are NOT in `preembeddings`. So I save all valid preembedding keys and use them to filter out the invalid observations in `targets_df`.

In [15]:
# helpers: load preembeddings
def load_preembeddings(preembedding_type):
    """
    Load `preembeddings_{preembedding_type}.pt` into the notebook-level name
    `preembeddings`, unless a global with that name already exists.

    NOTE(review): the existence check ignores `preembedding_type`, so a `preembeddings`
    object left over from another cell is silently reused. This definition also shadows
    the `load_preembeddings(preembedding_name)` defined in the Longformer section.
    """
    if 'preembeddings' in globals():
        return

    print(f'Loading preembeddings...@{Now()}')
    globals()['preembeddings'] = torch.load(f"{DATA_DIR}/embeddings/preembeddings_{preembedding_type}.pt")
    print(f'Loading finished. @{Now()}')
        
# make sure `preembeddings` is available (no-op when a global with that name already exists)
load_preembeddings('all_sbert_roberta_nlistsb_encoded')

# one-column table holding every key of `preembeddings`
valid_preembedding_keys_all = pd.DataFrame({'valid_keys_all':list(set(preembeddings.keys()))})
# NOTE(review): the output path is relative to the kernel's working directory
feather.write_feather(valid_preembedding_keys_all, 'data/valid_preembedding_keys_all.feather', compression='uncompressed')

### check `id-text` pair 

> Task: final check that id-text are correctly matched
>
> Check **Pass!**

In [None]:
# Reference encoder used to re-encode a sentence for the id-text check below.
# NOTE(review): the `SentenceTransformer` import is commented out in the Init cell, so
# this raises NameError on a fresh kernel.
cpu_model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [16]:
# sentence table; the check below filters it by `transcriptid` and `sentenceid` and reads `text`
text_sentencized = pd.read_feather(f'{DATA_DIR}/text_all_sentencized.feather')
# target variables per call
targets_df = pd.read_feather(f'{DATA_DIR}/f_sue_keydevid_car_finratio_transcriptid_text.feather')

In [33]:
# Spot check: stored embedding at row 123 of transcript 1441428.
# Row 123 equals sentenceid 123 only if the sentence ids of a transcript are contiguous
# from 0 — TODO confirm.
preembeddings[1441428][123]

tensor([0.5312, 1.3299, 0.4254,  ..., 0.1801, 0.9385, 0.5650])

In [34]:
# Re-encode the text of the same (transcriptid, sentenceid) pair; the result should
# match the stored embedding shown above
text = text_sentencized[(text_sentencized.transcriptid==1441428) & (text_sentencized.sentenceid==123)]
text = text['text'].tolist()

cpu_model.encode(text)

[array([0.531192  , 1.3299239 , 0.42536   , ..., 0.18007116, 0.9384972 ,
        0.56497455], dtype=float32)]

In [52]:
class CCDataset(Dataset):
    '''
    Call-level dataset: one item per transcript, served from the transcript with the
    fewest sentences to the one with the most.
    '''

    def __init__(self, preembeddings: dict, targets_df, split_window, split_type, transcriptids=None, transform=None, split_df=None):
        '''
        Args:
            preembeddings: dict {transcriptid: tensor (S, E)} where S is number of sentences in a call
            targets_df: DataFrame of targets variables (one row per transcriptid is used)
            split_window: str. e.g., "roll-09"
            split_type: str. 'train' or 'test'. Any other value disables the date
                filter, in which case `transcriptids` must be given.
            transcriptids: list. If provided, only the given transcripts will be used in generating the Dataset. `transcriptids` is applied **on top of** `split_window` and `split_type`
            transform: stored on the instance; not applied by `__getitem__`
            split_df: DataFrame whose columns are, in this order, (window, train_start,
                train_end, test_start, test_end) with dates as 'YYYY-MM-DD' strings.
                Defaults to the notebook-level `split_df`.

        Raises:
            NameError: if `split_df` is neither passed nor defined at notebook level
            ValueError: if `split_type` is not 'train'/'test' and no `transcriptids` is given
        '''
        if split_df is None:
            # backward compatible default: the notebook-level table
            try:
                split_df = globals()['split_df']
            except KeyError:
                raise NameError("name 'split_df' is not defined; pass `split_df=...`") from None

        # get split dates from `split_df`
        _, train_start, train_end, test_start, test_end = tuple(split_df.loc[split_df.window==split_window].iloc[0])
        train_start, train_end, test_start, test_end = (
            datetime.strptime(day, '%Y-%m-%d').date()
            for day in (train_start, train_end, test_start, test_end))

        # transcriptids whose call date falls into the requested window.
        # Kept in its own variable: the previous code overwrote the `transcriptids`
        # argument here, so the caller's filter was silently ignored.
        if split_type=='train':
            window_ids = targets_df[targets_df.ciq_call_date.between(train_start, train_end)].transcriptid.tolist()
        elif split_type=='test':
            window_ids = targets_df[targets_df.ciq_call_date.between(test_start, test_end)].transcriptid.tolist()
        elif transcriptids is not None:
            window_ids = None
        else:
            raise ValueError(f"split_type must be 'train' or 'test' (got {split_type!r}) when `transcriptids` is not given")

        # valid keys: have an embedding, are inside the window, and pass the caller's filter
        valid_keys = set(preembeddings.keys())
        if window_ids is not None:
            valid_keys &= set(window_ids)
        if transcriptids is not None:
            valid_keys &= set(transcriptids)
        self.valid_preemb_keys = valid_keys

        # self attributes
        self.targets_df = targets_df
        self.preembeddings = preembeddings
        self.transform = transform
        # (transcriptid, n_sentences) pairs, shortest call first
        self.sent_len = sorted(((k, preembeddings[k].shape[0]) for k in self.valid_preemb_keys),
                               key=lambda pair: pair[1])
        self.train_start = train_start
        self.train_end = train_end
        self.test_start = test_start
        self.test_end = test_end
        self.n_samples = len(self.sent_len)
        self.split_window = split_window
        self.split_type = split_type

    def __len__(self):
        return (len(self.valid_preemb_keys))

    def __getitem__(self, idx):
        '''
        Returns:
            (transcriptid, embeddings, mcap, sue_lag1, sue, car_0_30_lag1, car_0_30,
             se, se_lead1, sest_lag1, sest)
        '''
        if torch.is_tensor(idx):
            idx = idx.tolist()

        transcriptid = self.sent_len[idx][0]
        # first matching row of the targets table
        targets = self.targets_df[self.targets_df.transcriptid==transcriptid].iloc[0]

        # inputs: preembeddings
        embeddings = self.preembeddings[transcriptid]

        # targets, read from the selected row
        mcap = targets.mcap
        sue = targets.sue
        suelag = targets.sue_lag1
        se = targets.se
        selead = targets.se_lead1
        sestlag = targets.sest_lag1
        sest = targets.sest
        car_0_30 = targets.car_0_30
        car_0_30_lag = targets.car_0_30_lag1

        return transcriptid, embeddings, mcap, suelag, sue, car_0_30_lag, car_0_30, se, selead, sestlag, sest


# # test DataSet...
# targets_df_path = f'{DATA_DIR}/f_sue_keydevid_car_finratio_transcriptid_text.feather'
# preembedding_type = 'ques_sbert_roberta_nlistsb_encoded'

# # load preembeddings
# if 'preembeddings' not in globals():
#     print(f'Loading preembeddings...{Now()}')
#     preembeddings = torch.load(f'{DATA_DIR}/embeddings/preembeddings_{preembedding_type}.pt')
#     print(f'Loading finished. {Now()}')
    
# # load targets_df
# if 'targets_df' not in globals():
#     targets_df = pd.read_feather(targets_df_path)

# # choose train/val split
# split_df = pd.read_csv(f'{DATA_DIR}/split_dates.csv')

# # create Dataset
# test_ds = CCDataset(preembeddings, targets_df, split_window='roll-19', split_type='train')

# test_ds[876]