# Init

In [4]:
# import tensorflow as tf
import math
import time
import torch
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import logging
import pynvml
from spacy.lang.en import English
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer

# working directory
ROOT_DIR = 'C:/Users/rossz/OneDrive/CC'
DATA_DIR = f'{ROOT_DIR}/data'
print(f'ROOT_DIR: {ROOT_DIR}')
print(f'DATA_DIR: {DATA_DIR}')

# set random seed
np.random.seed(42)
torch.manual_seed(42);
torch.backends.cudnn.deterministic = True;
torch.backends.cudnn.benchmark = False;

# helper to show GPU memory
def log_gpu_memory():
    torch.cuda.empty_cache()
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print(f'GPU Memory allocated/cached (GB): {torch.cuda.memory_allocated()/1024**3:.4}/{torch.cuda.memory_cached()/1024**3:.4}')
    print(f'GPU Memory used/total (GB): {meminfo.used/1024**3: .4}/{meminfo.total/1024**3: .4}')
    pynvml.nvmlShutdown()
    
# set device name: 'cuda' or 'cpu'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Show if GPU enabled
if device.type == 'cuda':
    print(f'\nGPU enabled: {torch.cuda.get_device_name(0)}\n');
    log_gpu_memory()
else: 
    print('GPU NOT enabled')
pass;

ROOT_DIR: C:/Users/rossz/OneDrive/CC
DATA_DIR: C:/Users/rossz/OneDrive/CC/data

GPU enabled: GeForce GTX 1080 Ti

GPU Memory allocated/cached (GB): 0.0/0.0
GPU Memory used/total (GB):  0.2848/ 11.0


# Doc_emb

## read-in-`df`

In [3]:
# cc_5y: past 5 years (2014-2018), 70,114 obs
cc = pd.read_feather(f'C:/Users/rossz/OneDrive/CC/data/cc_5y.feather')
print(f'num of cc: {len(cc)}')
md = cc['md'].to_list() # (n_cc,) -> str
qa = cc['qa'].to_list() # (n_cc,) -> str

UsageError: Cell magic `%%lprun` not found.


## load `spaCy` and `Transformer`

In [6]:
%%time
# spacy model
nlp = English()  
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

Wall time: 254 ms


In [7]:
%%time
'''
# BERT model
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking')
bert_model = BertModel.from_pretrained('bert-large-cased-whole-word-masking')
'''

# GPT-2
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2Model.from_pretrained('gpt2')
gpt_model.eval();
gpt_model.to('cuda');
pass;
log_gpu_memory()

GPU Memory allocated/cached (GB): 0.5144/0.5371
GPU Memory used/total (GB):  1.182/ 11.0
Wall time: 12.3 s


## `doc_to_tokens`

In [8]:
def doc_to_tokens_v1(doc:str, nlp, tokenizer, max_seq_len):
    '''Split doc to sequences, then to tokens
    v1: a sequence is defined as a natural "setence" given by spaCy SBD
    
    Args:
        doc: a document string
        nlp: spaCy model to split sentences
        
    input: a document as a string
    
    output: a list of tokens: 
        [[tokens_of_seq_1], [tokens_of_seq_2],...]
    '''
    
    # split into sentences
    # sents: a list,
    #     [sent_1_as_string, sent_2_as_string,...]
    sents = [sent.text for sent in nlp(doc).sents] 
    
    # tokenize sentence
    # sents_tokenized: a list,
    #     [[tokens_of_seq_1], [tokens_of_seq_2],...]
    sents_tokenized = [tokenizer.encode(sent) for sent in sents]   
    
    # if seq_len exceeds `max_seq_len`, 
    # split them into multiple sequences
    sents_tokenized = [torch.tensor(subsent) for sent in sents_tokenized for subsent in list(chunks(sent, max_seq_len))]
    
    return sents_tokenized



def doc_to_tokens_v2(doc:str, tokenizer, max_seq_len):
    '''Split doc to sequences, then to tokens
    
    v2: a sequence's length is fixed to `max_seq_len`
    
    input: a document as a string
    
    output: a list of tokens: 
        [[tokens_of_seq_1], [tokens_of_seq_2],...]
    '''
    
    # split into sentences
    
    # tokenize sentence
    # doc_tokenized: a list,
    #     [token_1, token_2,..., token_end_of_doc]
    doc_tokenized = tokenizer.encode(doc)
    
    # chunk doc_tokenized into multiple sequences
#     sents_tokenized = [torch.tensor(subsent) for sent in sents_tokenized for subsent in list(chunks(sent, max_seq_len))]
    sents_tokenized = [torch.tensor(sent) 
                       for sent in list(chunks(doc_tokenized, max_seq_len))]
    # print(len(sents_tokenized))
    
    
    return sents_tokenized

# sents_tokenized = doc_to_tokens_v2(md[0], gpt_tokenizer, 25)
# for _ in range(3):
#     print(sents_tokenized[_])

# sents_tokenized = doc_to_tokens_v1(md[0], nlp, gpt_tokenizer, 25)
# for _ in range(3):
#     print(sents_tokenized[_])

## `tokens_to_emb`

In [9]:
def tokens_to_emb(sents_tokenized, max_n_seq_per_batch, model):
    '''
    Args:
        sent_tokenized: a list of sentence tokens. 
            e.g. [[0, 3], 
                  [3], 
                  [3, 13, 134]]
            
        max_n_seq_per_batch: e.g., for the above example, if 
            n_seq_per_batch is 2, then the output will be: 
            [[[0, 3], [3]],
             [[3, 13, 134]]]
    '''
    
    # split into batchs
    # print(f'inp n_seq: {len(sents_tokenized)}')
    sents_tokenized_as_batch = list(
        chunks(sents_tokenized, max_n_seq_per_batch))
    # print(f'n_batch_in_doc: {len(sents_tokenized_as_batch)}')
    
    # output place holder
    emb_avgpool = []
    emb_maxpool = []
    
    # for every batch, generate emb
    for batch in sents_tokenized_as_batch:
        # batch: list [seq_1_in_batch, seq_2_in_batch,... seq_n_in_batch]
        
        # n_seq: current n_seq in the batch, most of them is
        # max_n_seq_per_batch, only the last will the less
        n_seq = len(batch)
        
        # valid_seq_len: (n_seq, 1)
        valid_seq_len = torch.tensor([sent.shape[0] for sent in batch]).to(device)
        # print(valid_seq_len)

        # pad setences to max length
        # (n_seq_per_batch, max_seq_len)
        sents_padded_tokenized = pad_sequence(
            batch, batch_first=True).to(device)
        # print(sents_padded_tokenized)

        # create mask for padded sentences
        # mask: (n_seq_per_batch, max_seq_len)
        mask = (sents_padded_tokenized != 0).int().float().to(device)
        # print(mask)

        # get sentence emb
        # (n_seq_in_doc, max_seq_len, d_model)
        sents_emb = model(sents_padded_tokenized,
                          attention_mask = mask)[0]
        d_model = sents_emb.shape[-1]
        # print(sents_emb[:,:,:4])

        # create a placeholder to store the output
        # MUST explicitly newly created tensor to its device!
        avgpool = torch.zeros(n_seq, d_model).to(device)
        maxpool = torch.zeros(n_seq, d_model).to(device)
        
        # for every doc in a batch, do mask average pooling
        for i, end in enumerate(valid_seq_len):
            avgpool[i] = torch.mean(sents_emb[i, :end], 0)
            maxpool[i] = torch.max(sents_emb[i, :end], 0)[0]
        # print(avgpool[:,:4])
        # print(maxpool[:,:4])
        
        # add to output
        emb_avgpool.append(avgpool)
        emb_maxpool.append(maxpool)
        
    # final output
    emb_avgpool = torch.cat(emb_avgpool, 0)
    emb_maxpool = torch.cat(emb_maxpool, 0)
    
    return emb_avgpool, emb_maxpool
    
# with torch.no_grad():
#     tokens_to_emb(
#         sents_tokenized[:5],
#         max_n_seq_per_batch=2,
#         model=gpt_model)

## `doc_to_emb`

In [10]:
def doc_to_emb(
    docs, save_name, 
    tokenizer, model, 
    max_seq_len, max_n_seq_per_batch, 
    start_i=0,
    nlp=None):
    
    log_gpu_memory()
    
    # final: contain ALL doc
    # temp: a bunch of doc, for intermediate saving
    doc_emb_avg_final = []
    doc_emb_avg_temp = []

    doc_emb_max_final = []
    doc_emb_max_temp = []
    
    with torch.no_grad():
        for i, doc in enumerate(tqdm(docs)):
#         for i in tqdm(range(start_i, len(docs))):
#             doc = docs[i]
            if (doc is None) or (doc == ''):
                doc_emb_avg_final.append(None)
                doc_emb_avg_temp.append(None)
                doc_emb_max_final.append(None)
                doc_emb_max_temp.append(None)
            else:
                '''
                tokens = doc_to_tokens_v1(
                    doc, nlp, tokenizer, max_seq_len)
                '''
                tokens = doc_to_tokens_v2(
                    doc, tokenizer, max_seq_len)
                
                emb_avg, emb_max = tokens_to_emb(
                    tokens, max_n_seq_per_batch, model)

                # must send result to 'cpu', otherwise
                # they'll accumulate in GPU memory!
                emb_avg_cpu = emb_avg.to('cpu')
                emb_max_cpu = emb_max.to('cpu')

                doc_emb_avg_final.append(emb_avg_cpu)
                doc_emb_max_final.append(emb_max_cpu)

                doc_emb_avg_temp.append(emb_avg_cpu)
                doc_emb_max_temp.append(emb_max_cpu)

                # delete intermediate result to save GPU memory
                del emb_avg, emb_max

            # save every n doc
            if (i % 5000 == 4999) or (i == len(docs)):
                print(f'Save at i = {i}')
                sv_dir_avg = f'{DATA_DIR}/{save_name}_avg_{str(i).zfill(6)}.pt'
                sv_dir_max = f'{DATA_DIR}/{save_name}_max_{str(i).zfill(6)}.pt'
                torch.save(doc_emb_avg_temp, sv_dir_avg)
                torch.save(doc_emb_max_temp, sv_dir_max)
                doc_emb_avg_temp = []
                doc_emb_max_temp = []                
                
        # when job is done, save again
        print('Saving final results...')
        sv_dir_avg = f'{DATA_DIR}/{save_name}_avg_final.pt'
        sv_dir_max = f'{DATA_DIR}/{save_name}_max_final.pt'
        torch.save(doc_emb_avg_final, sv_dir_avg)
        torch.save(doc_emb_max_final, sv_dir_max)
        print('Saving done!')
        return doc_emb_avg_final, doc_emb_max_final
                
    log_gpu_memory()

In [8]:
%%time
# suppress "seq too long warning"
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

md_emb_avg, md_emb_max = doc_to_emb(
    md, 'md_emb',
    gpt_tokenizer, gpt_model, 
    max_seq_len=64, max_n_seq_per_batch=1024)

logging.getLogger("transformers.tokenization_utils").setLevel(logging.INFO)

GPU Memory allocated/cached (GB): 0.5144/0.5371
GPU Memory used/total (GB):  1.415/ 11.0


  7%|██▍                                | 4999/70114 [12:53<2:34:59,  7.00it/s]

Save at i = 4999


 14%|████▉                              | 9999/70114 [25:34<2:50:23,  5.88it/s]

Save at i = 9999


 21%|███████▎                          | 14999/70114 [39:00<2:34:44,  5.94it/s]

Save at i = 14999


 29%|█████████▋                        | 19999/70114 [51:57<2:16:40,  6.11it/s]

Save at i = 19999


 36%|███████████▍                    | 24998/70114 [1:04:44<2:25:40,  5.16it/s]

Save at i = 24999


 43%|█████████████▋                  | 29999/70114 [1:17:55<2:27:49,  4.52it/s]

Save at i = 29999


 50%|███████████████▉                | 34999/70114 [1:31:00<1:15:51,  7.72it/s]

Save at i = 34999


 57%|██████████████████▎             | 39998/70114 [1:43:48<1:42:31,  4.90it/s]

Save at i = 39999


 64%|████████████████████▌           | 44999/70114 [1:57:29<1:14:52,  5.59it/s]

Save at i = 44999


 71%|████████████████████████▏         | 49999/70114 [2:10:32<55:11,  6.07it/s]

Save at i = 49999


 78%|██████████████████████████▋       | 54999/70114 [2:23:25<47:06,  5.35it/s]

Save at i = 54999


 86%|█████████████████████████████     | 59999/70114 [2:37:05<30:59,  5.44it/s]

Save at i = 59999


 93%|███████████████████████████████▌  | 64999/70114 [2:50:01<14:01,  6.08it/s]

Save at i = 64999


100%|█████████████████████████████████▉| 69999/70114 [3:03:51<00:21,  5.38it/s]

Save at i = 69999


100%|██████████████████████████████████| 70114/70114 [3:04:10<00:00,  6.01it/s]


Saving final results...
Saving done!
Wall time: 3h 4min 31s


In [11]:
%%time
# suppress "seq too long warning"
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

qa_emb_avg, qa_emb_max = doc_to_emb(
    qa, 'qa_emb',
    gpt_tokenizer, gpt_model, 
    max_seq_len=64, max_n_seq_per_batch=1024)

logging.getLogger("transformers.tokenization_utils").setLevel(logging.INFO)

GPU Memory allocated/cached (GB): 0.5144/0.5371
GPU Memory used/total (GB):  1.182/ 11.0


  7%|██▍                                | 4999/70114 [16:54<3:36:04,  5.02it/s]

Save at i = 4999


 14%|████▉                              | 9999/70114 [33:57<3:05:26,  5.40it/s]

Save at i = 9999


 21%|███████▎                          | 14999/70114 [51:02<3:35:09,  4.27it/s]

Save at i = 14999


 29%|█████████▏                      | 19999/70114 [1:09:13<3:36:21,  3.86it/s]

Save at i = 19999


 36%|███████████▍                    | 24998/70114 [1:26:26<3:02:04,  4.13it/s]

Save at i = 24999


 43%|█████████████▋                  | 29999/70114 [1:43:17<3:27:12,  3.23it/s]

Save at i = 29999


 50%|███████████████▉                | 34999/70114 [2:01:52<2:05:41,  4.66it/s]

Save at i = 34999


 57%|██████████████████▎             | 39999/70114 [2:19:10<1:22:58,  6.05it/s]

Save at i = 39999


 64%|████████████████████▌           | 44999/70114 [2:37:45<1:32:55,  4.50it/s]

Save at i = 44999


 71%|██████████████████████▊         | 49999/70114 [2:55:27<1:35:00,  3.53it/s]

Save at i = 49999


 78%|██████████████████████████▋       | 54999/70114 [3:13:33<45:09,  5.58it/s]

Save at i = 54999


 86%|█████████████████████████████     | 59997/70114 [3:32:27<46:14,  3.65it/s]

Save at i = 59999


 93%|███████████████████████████████▌  | 64999/70114 [3:49:51<15:11,  5.61it/s]

Save at i = 64999


100%|█████████████████████████████████▉| 69999/70114 [4:09:01<00:36,  3.17it/s]

Save at i = 69999


100%|██████████████████████████████████| 70114/70114 [4:09:24<00:00,  4.76it/s]


Saving final results...
Saving done!
Wall time: 4h 10min 57s


In [13]:
# len of qa must match md
# torch.load(f'{DATA_DIR}/qa_emb_avg_final.pt')
# md_emb_avg = torch.load(f'{DATA_DIR}/md_emb_avg_final.pt')

assert len(md_emb_avg) == len(qa_emb_avg)

# `Dataset`

## prepare a toy

In [None]:
%%time
# laod parsed MD and QA
qa_avgpool = torch.load(f'{DATA_DIR}/md_emb_avgpool.pt')
qa_maxpool = torch.load(f'{DATA_DIR}/md_emb_maxpool.pt')
md_avgpool = torch.load(f'{DATA_DIR}/qa_emb_avgpool.pt')
md_maxpool = torch.load(f'{DATA_DIR}/qa_emb_maxpool.pt')

expert = torch.randn((100,))
non_expert = torch.randn((100,))
# expert = torch.stack([torch.randn((100,)), torch.zeros(100)], 1)
# non_expert = torch.stack([torch.zeros(100), torch.randn((100,))], 1)

data = {'qa_avgpool': qa_avgpool,
        'qa_maxpool': qa_maxpool,
        'md_avgpool': md_avgpool,
        'md_maxpool': md_maxpool,
        'expert': expert,
        'non_expert': non_expert}

In [138]:
len(qa_avgpool)
len(qa_maxpool)

100

100

## `Dataset`

In [141]:
class CCDataset(Dataset):
    '''Conference Call Dataset'''
    
    def __init__(self, data, 
                 output,
                 transform=None):
        '''
        Args:
            data: a dict containing all the input
                Note: we didn't use DataFrame because it's a bad idea
                to store tensors in a DataFrame
            output: which text we'll use. Could be one of the following:
                - md_avgpool
                - md_maxpool
                - qa_avgpool
                - qa_maxpool
                - all: (not implemented)
                
        Returns:
            text
            expert
            non_expert
        '''
        self.data = data
        self.output = output
        self.transform = transform
        
        # delete any None in the output field
        for i in range(len(data['expert'])):
            if data[output][i] is None:
                del self.data[output][i]
                del self.data['expert'][i]
                del self.data['non_expert'][i]
        
    def __len__(self):
        return (len(data['expert']))
    
    def __getitem__(self, idx):
        '''parse cc on the fly'''
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        text = self.data[self.output][idx]
        expert = self.data['expert'][idx]
        non_expert = self.data['non_expert'][idx]
        
        if self.transform:
            text = self.transform(text)
            
        return text, expert, non_expert

In [142]:
ccds = CCDataset(data, output='qa_avgpool')
print(f'text: {ccds[50][0].shape}')
print(f'expert: {ccds[50][1].shape}')
print(f'non_expert: {ccds[50][2].shape}')
print('------------')

text: torch.Size([190, 768])
expert: torch.Size([])
non_expert: torch.Size([])
------------


## split train/valid

In [143]:
ccds_train_len = int(len(ccds) * 0.8)
ccds_valid_len = int(len(ccds) * 0.1)
ccds_test_len = len(ccds) - ccds_train_len - ccds_valid_len

torch.manual_seed(42); # must reset random seed!
ccds_train, ccds_valid, ccds_test = random_split(ccds, [ccds_train_len, ccds_valid_len, ccds_test_len])
pass;

## `DataLoader`

You must create a NEW `DataLoader` in EVERY epoch!

In [144]:
def generate_batch(data):
    '''create mini-batch
    
    Retures:
        text: tensor, (batch_size, max_seq_len, d_model)
        expert: tensor, (batch_size,)
        non_expert: tensor, (batch_size,)
        length: list, valide length for each padded seq
    '''
    
    # sort a data list by seq_len (descending)
    data.sort(key=lambda x: x[0].shape[0], reverse=True)
    text, expert, non_expert = zip(*data)
    
    # expert/non_expert: (batch_size, 2)
    expert_tgt = torch.zeros((BATCH_SIZE, N_TARGET))
    expert_tgt[:,0] = torch.stack(expert).squeeze()
    
    non_expert_tgt = torch.zeros((BATCH_SIZE, N_TARGET))
    non_expert_tgt[:, 1] = torch.stack(non_expert).squeeze()
    
    assert (expert_tgt * non_expert_tgt).sum() == 0
    
    # collate text
    lengths = [t.shape[-2] for t in text]
    text = pad_sequence(text, batch_first=True) # (n_batch, max_seq_len, d)
    text = torch.transpose(text, 0, 1) # (max_seq_len, n_batch, d)
    
    # create mask: (n_batch, max_seq_len)
    mask = torch.ones((text.shape[1], text.shape[0]))
    for i, length in enumerate(lengths):
        mask[i, :length] = 0
    mask = mask == 1
    
    return text, expert_tgt, non_expert_tgt, mask

In [146]:
ccdl = DataLoader(ccds, batch_size=3, shuffle=False, 
                  num_workers=0,
                  collate_fn=generate_batch)

for i, batch in enumerate(ccdl):
#     print(batch[0])
    print(f'batch.text.is_pinned()={batch[0].is_pinned()}')
    print(f'text: {batch[0].shape}')
    print(f'expert: {batch[1].shape}')
    print(f'non_expert: {batch[2].shape}')
    print(f'mask: {batch[3].shape}')
    print('------------')
    
    if i == 0:
        break

batch.text.is_pinned()=False
text: torch.Size([298, 3, 768])
expert: torch.Size([3, 1])
non_expert: torch.Size([3, 1])
mask: torch.Size([3, 298])
------------


# Model

## `PositionEnc`

In [152]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=10000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        # pe: (max_len, 1, d_model)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # output x: (seq_len_of_x, batch_size, d_model)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

## MaskPool

In [155]:
class MaskAvgPool(nn.Module):
    '''
    Given a (max_seq_len, batch_size, d_model) `input` tensor and a
            (batch, max_seq_len) `mask`,
    pool into (batch_size, d_model)
    '''
    def __init__(self):
        super(MaskAvgPool, self).__init__()
    
    def forward(self, x, mask):
        
        # inp x: (max_seq_len, batch_size, d_model)
        # inp mask: (batch_size, max_seq_len)
        _, batch_size, d_model = x.shape
        
        
        # compute valid length of each sequence
        # valid_seq_len: (batch_size,)
        valid_seq_len = torch.sum(mask==False, -1)

        
        # create a placeholder to store the output
        # MUST explicitly newly created tensor to its device!
        pooled = torch.zeros(batch_size, d_model).to(device)
        
        # for every doc in a batch, do mask average pooling
        for i, end in enumerate(valid_seq_len):
            pooled[i] = torch.mean(x[:end, i], 0)
            
        return pooled

    
class MaskMaxPool(nn.Module):
    '''
    Given a (max_seq_len, batch_size, d_model) `input` tensor and a
            (batch, max_seq_len) `mask`,
    pool into (batch_size, d_model)
    '''
    def __init__(self):
        super(MaskMaxPool, self).__init__()
        
        
    def forward(self, x, mask):
        # inp x: (max_seq_len, batch_size, d_model)
        # inp mask: (batch_size, max_seq_len)
        _, batch_size, d_model = x.shape
        
        # compute valid length of each sequence
        # valid_seq_len: (batch_size,)
        valid_seq_len = torch.sum(mask==False, -1).to(device)
        
        # create a placeholder to store the output
        pooled = torch.zeros(batch_size, d_model)
        
        # for every doc in a batch, do mask average pooling
        for i, end in enumerate(valid_seq_len):
            pooled[i] = torch.max(text[:end, i], 0)[0]
            
        return pooled    

## `module`

In [None]:
class CC(nn.Module):
    def __init__(
        self, d_model, 
        n_layers_inp, n_layers_y1, n_layers_y2,
        n_head_inp=8, n_head_y1=8, n_head_y2=8, 
        dff=2048, attn_dropout=0.1, dropout=0.5):
        
        super(CC, self).__init__()
        
        # positional encoding
        self.encoder_pos = PositionalEncoding(d_model, attn_dropout)
        
        # encoder layers for input, y1, y2
        inp_encoder_layers = nn.TransformerEncoderLayer(
            d_model, n_head_inp, dff, attn_dropout)
        y1_encoder_layers = nn.TransformerEncoderLayer(
            d_model, n_head_y1, dff, attn_dropout)
        y2_encoder_layers = nn.TransformerEncoderLayer(
            d_model, n_head_y1, dff, attn_dropout)
        
        self.encoder_inp = nn.TransformerEncoder(
            inp_encoder_layers, n_layers_inp)
        self.encoder_y1 = nn.TransformerEncoder(
            y1_encoder_layers, n_layers_y1)
        self.encoder_y2 = nn.TransformerEncoder(
            y2_encoder_layers, n_layers_y2)
        
        # pooling layers
        self.mask_avgpool = MaskAvgPool().to(device)
        self.mask_maxpool = MaskMaxPool().to(device)
        
        # dropout layers
        self.dropout_x = nn.Dropout(dropout)
        self.dropout_y1 = nn.Dropout(dropout)
        self.dropout_y1_2 = nn.Dropout(dropout)
        self.dropout_y2 = nn.Dropout(dropout)
        self.dropout_y2_2 = nn.Dropout(dropout)
        
        # final layer
        self.final_y1 = nn.Linear(d_model, 2)
        self.final_y2 = nn.Linear(d_model, 2)
    
    def forward(self, x, mask):
        # inp mask: (batch_size, seq_len)
        # inp x: (seq_len, batch_size, d_model)
        
        # positional encoding
        x = self.encoder_pos(x)
        
        # encode input
        # x: (seq_len, batch_size, d_model)
        x = self.encoder_inp(
            x, src_key_padding_mask=mask)
        x = self.dropout_x(x)
        
        # split heads
        # final y1/y2: (seq_len, batch_size, d_model)
        y1 = self.encoder_y1(
            x, src_key_padding_mask=mask)
        y2 = self.encoder_y2(
            x, src_key_padding_mask=mask)
        y1 = self.dropout_y1(y1)
        y2 = self.dropout_y2(y2)
        
        # pool outcome
        # y1/y2: (batch_size, d_model)
        y1 = self.mask_avgpool(y1, mask)
        y2 = self.mask_avgpool(y2, mask)
        y1 = self.dropout_y1_2(y1)
        y2 = self.dropout_y2_2(y2)
        
        # regularizer
        dist = torch.dist(y1, y2)
        
        # final layer
        # (batch_size, 2)
        y1 = self.final_y1(y1)
        y2 = self.final_y2(y2)
        
        return y1, y2, dist
    
# cc = CC(
#     d_model=768, 
#     n_layers_inp=2, n_layers_y1=2, n_layers_y2=2).to(device)

In [12]:
x = torch.tensor([[1, 2],
                  [5, 8.]])
y = torch.tensor([[8, 8],
                  [1, 2.]])
torch.dist(x, y)

RuntimeError: _th_dist not supported on CPUType for Long

## test forward

In [157]:
for i, (text, expert, non_expert, mask) in enumerate(ccdl):
    text = text.to('cuda')
    mask = mask.to('cuda')
    
    y1, y2 = cc(text, mask)
    print(f'y1.shape, y2.shape, dist.shape:')
    print(y1.shape, y2.shape)
    print('---------------')
    
    if i == 1:
        break

torch.Size([3, 1]) torch.Size([3, 1])
---------------
torch.Size([3, 1]) torch.Size([3, 1])
---------------


## loss

In [11]:
# every task has one criterion
criterion = [nn.MSELoss(), nn.MSELoss()]
optimizer = torch.optim.Adam(cc.parameters())
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [None]:
y = torch.random((3, 2))
t = torch.random((3, 2))
nn.MSELoss(y, t)

## train

In [17]:
def train_one_epoch(ccds_train):
    # For each epoch, create a new DataLoader
    n_batch_in_one_epoch = len(ccds_train)//BATCH_SIZE
    train_loss = 0.0
    ccdl_train = DataLoader(
        ccds_train,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=0,
        collate_fn=generate_batch)
        
    # train one epoch
    for i, (text, expert, non_expert, mask) in enumerate(
        tqdm(ccdl_train, total=n_batch_in_one_epoch)):
        # send to GPU
        text = text.to(device)
        mask = mask.to(device)
        expert = expert.to(device)
        non_expert = non_expert.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        y1, y2, dist = cc(text, mask)
        loss_y1 = criterion[0](y1, expert)
        loss_y2 = criterion[1](y2, non_expert)
        loss = loss_y1 + loss_y2 - LAMBDA * dist
        loss.backward()
        optimizer.step()

        # print statistics
        train_loss += loss.item()
        if i % 5 == 0:
            # print(f'\t{i+1: 5d} loss: {train_loss/100:.3f}')
            train_loss = 0.0
          
    # adjust learning rate
    scheduler.step()
    
    # return mean loss of the epoch
    return train_loss / ccds_train_len

def evaluate(ccds_eval):
    loss = 0.0
    
    # create DataLoader
    ccdl_eval = DataLoader(
        ccds_eval,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=0,
        collate_fn=generate_batch)
    
    # evaluate
    for i, (text, expert, non_expert, mask) in enumerate(ccdl_eval):
        text = text.to(device)
        mask = mask.to(device)
        expert = expert.to(device)
        non_expert = non_expert.to(device)

        with torch.no_grad():
            y1, y2, dist = cc(text, mask)
            loss_y1 = criterion[0](y1, expert)
            loss_y2 = criterion[1](y2, non_expert)
            loss = loss_y1 + loss_y2 - LAMBDA * dist
            loss += loss.item()
            
    # return mean loss
    return loss / len(ccds_eval)


# Let's train!
N_EPOCHS = 2
N_TARGET = 2
BATCH_SIZE = 10
LAMBDA = 1


cc.train();
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_one_epoch(ccds_train)
    valid_loss = evaluate(ccds_valid)
    
    secs = int(time.time() - start_time)
    mins = secs // 60
    secs = secs % 60

    print(f'Epoch: {epoch+1}  |  time in {mins} mins, {secs} secs')
    print(f'  Loss: {train_loss:.3f} (train)')
    print(f'  Loss: {valid_loss:.3f} (valid)')
cc.eval();

100%|████████████████████████████████████████████| 8/8 [00:01<00:00,  6.81it/s]


Epoch: 1  |  time in 0 mins, 1 secs
  Loss: 0.049 (train)
  Loss: 0.397 (valid)


100%|████████████████████████████████████████████| 8/8 [00:01<00:00,  7.07it/s]


Epoch: 2  |  time in 0 mins, 1 secs
  Loss: 0.039 (train)
  Loss: 0.365 (valid)


## save/load

In [34]:
# save
torch.save(cc, f'{DATA_DIR}/cc.pt')

In [35]:
# load
# set to eval mode to disable dropout/normalization
cc = torch.load(f'{DATA_DIR}/cc.pt')
cc.eval();

# Deprecated

## `gpu-memory-log`

In [7]:
import gc
import datetime
import pynvml

import torch
import numpy as np
import sys

def _get_tensors():
    for obj in gc.get_objects():
        if torch.is_tensor(obj):
            tensor = obj
        else:
            continue
        if tensor.is_cuda:
            yield tensor

def _write_log(f, write_str):
    print(write_str)
    f.write("%s\n" % write_str)

def gpu_memory_log(gpu_log_file="gpu_mem.log", device=0):
    stack_layer = 1
    func_name = sys._getframe(stack_layer).f_code.co_name
    file_name = sys._getframe(stack_layer).f_code.co_filename
    line = sys._getframe(stack_layer).f_lineno
    now_time = datetime.datetime.now()
    log_format = 'LINE:%s, FUNC:%s, FILE:%s, TIME:%s, CONTENT:%s'

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)

    with open(gpu_log_file, 'a+') as f:
        write_str = log_format % (line, func_name, file_name, now_time, "")
        _write_log(f, write_str)

        ts_list = [tensor.size() for tensor in _get_tensors()]
        new_tensor_sizes = {(type(x), 
                             tuple(x.size()), 
                             ts_list.count(x.size()), 
                             np.prod(np.array(x.size()))*4/1024**2)
                             for x in _get_tensors()}
        for t, s, n, m in new_tensor_sizes:
            write_str = '[tensor: %s * Size:%s | Memory: %s M | %s]' %(str(n), str(s), str(m*n)[:6], str(t))
            _write_log(f, write_str)

        write_str = "memory_allocated:%f Mb" % float(torch.cuda.memory_allocated()/1024**2)
        _write_log(f, write_str)
        write_str = "max_memory_allocated:%f Mb" % float(torch.cuda.max_memory_allocated()/1024**2)
        _write_log(f, write_str)
        write_str = "memory_cached:%f Mb" % float(torch.cuda.memory_cached()/1024**2)
        _write_log(f, write_str)
        write_str = "max_memory_cached:%f Mb" % float(torch.cuda.max_memory_cached()/1024**2)
        _write_log(f, write_str)
        write_str = "Used Memory:%f Mb" % float(meminfo.used/1024**2)
        _write_log(f, write_str)
        write_str = "Free Memory:%f Mb" % float(meminfo.free/1024**2)
        _write_log(f, write_str)
        write_str = "Total Memory:%f Mb" % float(meminfo.total/1024**2)
        _write_log(f, write_str)

    pynvml.nvmlShutdown()