In [1]:
from torch.utils.data.dataloader import DataLoader
from transformers import DataCollatorWithPadding
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
import transformers
import numpy as np
import torch
import re

In [2]:
# FEED_FORWARD_DIM = 32
# BATCH_SIZE       = 2
# EMBED_DIM        = 300
# NUM_HEADS        = 3
# NUM_LAYERS       = 2
# EPOCHS           = 3
# NUM_TOKENS_TO_GENERATE = 80

In [3]:
# Tokenizer used for the whole notebook. Despite the `_gpt` suffix in the
# variable name, the checkpoint loaded here is DistilBERT (uncased).
tokenizer_gpt = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2")

# Corpus: the SetFit BBC news dataset, with its train and test splits merged
# into one dataset.
setfit = load_dataset("SetFit/bbc-news")
dataset = concatenate_datasets([setfit[split] for split in ('train', 'test')])

In [4]:
# Display the tokenizer's repr to check its configuration
# (vocabulary size, model_max_length, special tokens).
tokenizer_gpt

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
# Sequence length used for every training window; equal to the tokenizer's
# model_max_length reported in its repr.
SEQ_LEN = 512
# Vocabulary size, read from the tokenizer so it follows whichever
# checkpoint is loaded.
VOCAB_SIZE = tokenizer_gpt.vocab_size

In [6]:
# def tokenize_function(example):
#     return tokenizer_gpt(example["text"], padding=True, truncation=True, max_length=SEQ_LEN)

# tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [7]:
# Show the id -> special-token mapping. [PAD] is id 0, which is the value
# padding_and_attn pads with by default.
tokenizer_gpt.added_tokens_decoder

{0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)}

In [8]:
def sliding_window_inputs(text, max_length = 512, tokenizer = None):
    """Tokenize ``text`` and cut it into next-token-prediction windows.

    Each window is an ``[input_ids, target_ids]`` pair in which the target is
    the input shifted one token to the left.

    Args:
        text: Raw string to tokenize.
        max_length: Maximum number of tokens per window. Must be >= 1.
        tokenizer: Callable following the Hugging Face tokenizer interface
            (``tokenizer(text, add_special_tokens=False)["input_ids"]``).
            Defaults to the notebook-level ``tokenizer_gpt``.

    Returns:
        A list of ``[input_ids, target_ids]`` pairs.

        * If the text has at most ``max_length`` tokens, a single pair
          ``[ids[:-1], ids[1:]]`` is returned (both lists are empty when the
          text has fewer than two tokens).
        * Otherwise one pair per start offset is returned, with a stride of
          one token, so a text of ``n`` tokens yields ``n - max_length``
          heavily overlapping windows of exactly ``max_length`` tokens.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length}")

    if tokenizer is None:
        tokenizer = tokenizer_gpt

    # Ask the tokenizer not to add special tokens instead of popping the first
    # and last ids afterwards: blind pops silently drop real tokens when the
    # tokenizer adds no [CLS]/[SEP] (e.g. GPT-2) and fail on an empty id list.
    input_ids = list(tokenizer(text, add_special_tokens=False)['input_ids'])

    if len(input_ids) <= max_length:
        return [[input_ids[:-1], input_ids[1:]]]

    # The last start offset is len - max_length - 1, so the final target
    # window ends exactly on the last token and every window is full length.
    return [
        [
            input_ids[start:start + max_length],
            input_ids[start + 1:start + 1 + max_length],
        ]
        for start in range(len(input_ids) - max_length)
    ]
    

In [9]:
def padding_and_attn(arr_tup, max_length=512, pad_id=0):
    """Pad (or truncate) a token-id sequence and build its attention mask.

    Args:
        arr_tup: Sequence of token ids.
        max_length: Length of both returned lists.
        pad_id: Id appended as padding. Defaults to 0.

    Returns:
        A ``(token_ids, attention_mask)`` tuple of two lists, each of length
        ``max_length``. The mask holds 1 for real tokens and 0 for padding.
        The input sequence is not modified.
    """
    # Truncate first so the ids and the mask always have the same length; an
    # over-long input previously came back untruncated next to a mask of
    # max_length ones.
    token_ids = list(arr_tup[:max_length])
    n_real = len(token_ids)
    n_pad = max_length - n_real

    padded_ids = token_ids + [pad_id] * n_pad
    attention_mask = [1] * n_real + [0] * n_pad
    return padded_ids, attention_mask

In [10]:
# Column-oriented accumulator: one (initially empty) list per dataset column.
gpt_data = {column: [] for column in ('label', 'input_ids', 'attention_mask')}

In [11]:
# One entry per article, in dataset order: the list of [input, target]
# windows that sliding_window_inputs produces for that article's text.
final = [
    sliding_window_inputs(dataset[row_idx]['text'])
    for row_idx in range(len(dataset))
]

Token indices sequence length is longer than the specified maximum sequence length for this model (706 > 512). Running this sequence through the model will result in indexing errors


In [21]:
# Column-oriented table of training rows built from the windows in `final`.
gpt_template = {
    'label': [],
    'input_ids' : [],
    'attention_mask' : []
}

# Reuse the configured sequence length instead of repeating the literal 512.
max_len = SEQ_LEN

for windows in final:
    for window_ids, target_ids in windows:
        # The attention mask describes `input_ids`, so take it from the input
        # padding call; the label padding call's mask is discarded rather than
        # overwriting it.
        padded_ids, ids_mask = padding_and_attn(window_ids, max_len)
        padded_labels = padding_and_attn(target_ids, max_len)[0]

        # NOTE(review): labels are padded with the same id as the inputs;
        # confirm the training loss ignores padded positions.
        gpt_template['label'].append(padded_labels)
        gpt_template['attention_mask'].append(ids_mask)
        gpt_template['input_ids'].append(padded_ids)

ds = Dataset.from_dict(gpt_template)

In [22]:
# Display the dataset summary (column names and number of rows).
ds

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 144892
})

In [None]:
# Rebuild the training columns directly from the raw articles.
# Start from empty columns: `gpt_template` is also filled by an earlier cell,
# so appending here would duplicate every row (and again on each re-run).
gpt_template = {
    'label': [],
    'input_ids' : [],
    'attention_mask' : []
}

for record in dataset:
    # sliding_window_inputs always returns a list of [input, target] pairs
    # (one pair for short texts, many for long ones), so every result is
    # handled the same way. The former `len(temp) == 2` branch mistook a list
    # of two windows for a single pair and would have padded a nested list.
    for window_ids, target_ids in sliding_window_inputs(record['text']):
        padded_ids, ids_mask = padding_and_attn(window_ids, max_len)
        padded_labels = padding_and_attn(target_ids, max_len)[0]

        gpt_template['label'].append(padded_labels)
        gpt_template['attention_mask'].append(ids_mask)
        gpt_template['input_ids'].append(padded_ids)

In [None]:
# Build the dataset from `gpt_template`, the dict the preprocessing loops
# fill. `gpt_data` is only ever initialised with empty lists, so converting it
# produced a zero-row dataset and an empty DataLoader downstream.
ds = Dataset.from_dict(gpt_template)
# Left disabled: the batch-inspection cell below stacks the collated columns
# itself, which presumably relies on the default (non-torch) format.
# ds.set_format(type='torch', columns=['label', 'input_ids', 'attention_mask'])

In [None]:
# Batch the dataset with a PyTorch DataLoader, 30 rows per batch, using the
# default sampling order and collate function.
dataloader = DataLoader(ds, batch_size=30)

In [None]:
# Pull the first batch out of the loader for inspection.
batch_iterator = iter(dataloader)
test = next(batch_iterator)

In [None]:
# With no torch format set on the dataset, the default collate function
# returns each column as a list of per-position tensors. Stacking gives
# (seq_len, batch); transposing gives the expected (batch, seq_len) layout.
input_ids = torch.stack(test['input_ids']).T
# Convert the labels the same way as the other two columns; they were
# previously left as the raw collated list.
label = torch.stack(test['label']).T
attn = torch.stack(test['attention_mask']).T