In [2]:
import pandas as pd 
import torch
import json 
from torch.utils.data import DataLoader

from gptonly import GPT
from gptonly.utils import plot_trp
from data import GenerationDM

In [3]:
device = "cuda:0"

In [4]:
model = GPT(
        pretrained_model_name="gpt2",
        finetune=True,
        device=device,
        speaker_tokens=True,
        projection_labels=False,
    )

model.to(device)

Tokenizer initialization:
	We added 4 tokens -> Special token map
	bos_token: <|endoftext|>
	eos_token: <ts>
	unk_token: <|endoftext|>
	pad_token: <|endoftext|>
	additional_special_tokens: ['<speaker1>', '<speaker2>']

Initalized <ts> -> avg(['!', '?', '.'])


GPT(
  (dropout): Dropout(p=0.1, inplace=False)
  (gpt): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50260, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50260, bias=False)
  )
  (

In [5]:
criterion = torch.nn.CrossEntropyLoss().to(device)
# , weight_decay=config.weight_decay)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [6]:
test_ds = GenerationDM(
            split="test",
            tokenizer=model.get_tokenizer(),
            overwrite=False,
            max_length=200,
            keep_length=64,
            overlap_length=10,
            datasets=["switchboard"],
        )
test_ds.prepare_data()
test_dl = DataLoader(
    test_ds,
    batch_size=4,
    collate_fn=test_ds.collate_fn,
    num_workers=8,
    shuffle=True
)

In [7]:
def generate_labels(input_ids, mask=None, pad_id=-100):
        labels = input_ids.clone()
        labels[torch.logical_not(mask)] = pad_id

        return labels

def generate_projection_labels(labels):
    batch_size, num_labels = labels.size()

    mask = (labels == model.tokenizer.eos_token_id)
    distances = torch.full((batch_size, num_labels),
                           num_labels, device=labels.device)
    distances[mask] = 0

    for i in range(num_labels - 2, -1, -1):
        distances[:, i] = torch.minimum(
            distances[:, i], distances[:, i+1] + 1)

    return distances

In [8]:
def is_not_trp_example(logits, labels):
        probs = logits.softmax(dim=-1)
        trp_prob = probs[..., model.tokenizer.eos_token_id]
        trp_prob = trp_prob[..., :-1]

        labels = labels[..., 1:]
        is_trp = labels == model.tokenizer.eos_token_id
        not_trp = labels != model.tokenizer.eos_token_id

        return torch.max(trp_prob - is_trp.long()).item() > 0.5

In [11]:
batches = []
BATCHES_LENGTH = 20
model.eval()

outputs = []
with torch.no_grad():
    for step, batch in enumerate(test_dl):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        labels = generate_labels(input_ids, mask=attention_mask)
        projection_labels = generate_projection_labels(labels)
        out = model.forward(
            input_ids, labels=labels, projection_labels=projection_labels, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        logits = out.logits 

        probs = logits.softmax(dim=-1)
        trp_prob = probs[..., model.tokenizer.eos_token_id]
        trp_prob = trp_prob[..., :-1]
        
        labels = labels[..., 1:]
        is_trp = labels == model.tokenizer.eos_token_id
        not_trp = labels != model.tokenizer.eos_token_id

        write = {'trp_prob': trp_prob, 'labels': labels}
        outputs.append(write) 

        if len(outputs) == 1000:
            with open('output.txt', 'a') as convert_file: 
                 convert_file.write(json.dumps(outputs))
                 outputs = []

        for j in range(len(trp_prob)):
            max_val = torch.max(trp_prob[j] - is_trp[j].long())
            max_idx = torch.argmax(trp_prob[j] - is_trp[j].long())
            if len(batches) >= BATCHES_LENGTH and batches[0][2] <= max_val:
                batches.pop(0)
            elif len(batches) >= BATCHES_LENGTH:
                continue
                
            i = 0 
            while i < len(batches) and batches[i][2] < max_val:
                i+=1
    
            batches.insert(i, (trp_prob[j], labels[j], max_val, max_idx))

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

OutOfMemoryError: CUDA out of memory. Tried to allocate 154.00 MiB. GPU 0 has a total capacty of 23.69 GiB of which 138.06 MiB is free. Process 845633 has 23.55 GiB memory in use. Of the allocated memory 23.08 GiB is allocated by PyTorch, and 179.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [10]:
[key.item() for _,_,key in batches]

ValueError: too many values to unpack (expected 3)

In [None]:
figs = []
for b in range(len(batches)):
    idx = batches[b][3]
    text = model.tokenizer.convert_ids_to_tokens(batches[b][1][idx-25:idx+25])
    # print(batches[b][0], (batches[b][0].cpu()).shape, text)
    fig, _ = plot_trp(
        trp=batches[b][0][idx-25:idx+25].cpu(),
        text=text,
        eos_token='<ts>'
    )
    figs.append(fig)

In [None]:
trp_prob.shape