In [1]:
from transformers import BartTokenizerFast
import polars as pl
import torch
import random

In [2]:
# Read the whole pre-chunked, pre-tokenized training parquet into memory.
# Later cells take a row subset from it and read its `labels` column.
df_full = pl.read_parquet('/kaggle/input/nepbart-chunked-tokenized-dataset/nepbart_chunked_train.parquet')

In [3]:
# Work on an independent copy of the first one million rows only.
df = df_full.head(1_000_000).clone()

In [4]:
# Drop the reference to the full frame; only the `df` subset is used from here on.
del df_full

In [5]:
# Load the tokenizer that supplies the mask / special token ids used below.
# NOTE(review): the captured output of this cell warns that the checkpoint was
# saved as 'PreTrainedTokenizerFast' while it is loaded through
# 'BartTokenizerFast', which "may result in unexpected tokenization" — confirm
# the ids match the ones used to build the dataset.
tokenizer = BartTokenizerFast.from_pretrained("/kaggle/input/nepbart-tokenizer/nepbart_tokenizer")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'BartTokenizerFast'.


In [6]:
# Final padded length of every model sequence.
SEQUENCE_LENGTH = 512
# Token budget of a raw chunk: two slots short of the full sequence
# (presumably reserved for the BOS and EOS ids that `add_padding` wraps around it).
CHUNK_LENGTH = SEQUENCE_LENGTH - 2

In [7]:
# Longest `labels` entry in the subset. Uses the native list-length expression
# instead of a per-row Python lambda followed by the builtin max().
df['labels'].list.len().max()

510

In [8]:
def token_infilling(
    tokenized_sequence: torch.Tensor,
    mask_token_id: int,
    mask_probability: float = 0.15,
    list_special_tokens: "list[int] | None" = None,
) -> list[int]:
    """Corrupt a token sequence by collapsing random spans into a single mask token.

    One span length is drawn per pass from Poisson(3); a draw of 0 is replaced
    by a random choice of 1, 2 or 3. The sequence is walked in consecutive
    windows of that length. Each window is independently selected with
    probability ``mask_probability``; a selected window is replaced by ONE
    ``mask_token_id``. A window that contains any special token is never
    masked and is copied through unchanged.

    Args:
        tokenized_sequence: 1-D sequence of token ids (a tensor, or any
            iterable of ints).
        mask_token_id: Id written in place of every masked span.
        mask_probability: Per-span probability of being masked.
        list_special_tokens: Ids that must never be masked or removed.
            ``None`` (the default) means no token is protected.

    Returns:
        The corrupted sequence as a plain list of ints. It contains at least
        one ``mask_token_id`` whenever the input has at least one non-special
        token. An empty or all-special input is returned unchanged, because
        nothing in it may be masked.
    """
    # Upper bound on random passes before a mask is forced, so the function
    # always terminates (also for mask_probability <= 0).
    max_attempts = 100

    if isinstance(tokenized_sequence, torch.Tensor):
        token_ids = tokenized_sequence.tolist()
    else:
        token_ids = [int(token) for token in tokenized_sequence]

    if list_special_tokens is None:
        special_ids = set()
    else:
        special_ids = {int(token) for token in list_special_tokens}

    # Positions that are allowed to be masked at all.
    maskable = [
        position for position, token in enumerate(token_ids) if token not in special_ids
    ]
    if not maskable:
        return token_ids

    for _ in range(max_attempts):
        span_length = int(torch.poisson(torch.tensor([3.0])).item())
        # A zero-length span would make no progress; fall back to 1, 2 or 3.
        if span_length == 0:
            span_length = random.choice([1, 2, 3])

        span_starts = range(0, len(token_ids), span_length)
        # One uniform draw per span, taken in a single call.
        draws = torch.rand(len(span_starts)).tolist()

        perturbed = []
        for start, draw in zip(span_starts, draws):
            span = token_ids[start : start + span_length]
            if draw < mask_probability and special_ids.isdisjoint(span):
                perturbed.append(mask_token_id)
            else:
                # Not selected, or protected by a special token: keep the
                # span verbatim so no token is ever silently dropped.
                perturbed.extend(span)

        ## We ensure the input always has at least 1 masked token
        if mask_token_id in perturbed:
            return perturbed

    # Random passes never produced a mask: force one on a single eligible token.
    forced = random.choice(maskable)
    return token_ids[:forced] + [mask_token_id] + token_ids[forced + 1 :]


In [9]:
def validate(df, tokenizer, seq_len):
    """Sanity-check the tokenization / infilling / padding pipeline on `df`.

    Builds `labels` and `input_ids` from the `text` column, pads both to
    `seq_len`, builds an attention mask, and asserts that:
      * every attention mask, label row and input row has length `seq_len`;
      * the number of non-pad input ids equals the number of 1s in the mask.

    Args:
        df: polars DataFrame with a `text` column.
        tokenizer: object exposing `bos_token_id`, `eos_token_id`,
            `pad_token_id`, `mask_token_id` and callable on a string.
        seq_len: target padded sequence length.

    Returns:
        None. The function only asserts.

    Raises:
        AssertionError: if any of the invariants above is violated.
    """
    def add_padding(x: pl.Series):
        # Wrap in BOS/EOS, then right-pad with the pad id up to seq_len.
        # NOTE(review): if tokenizer(x) already adds BOS/EOS, this adds a
        # second pair — confirm against the tokenizer's configuration.
        sequence = [tokenizer.bos_token_id] + x.to_list() + [tokenizer.eos_token_id]
        final_sequence = sequence + [tokenizer.pad_token_id] * (seq_len - len(sequence))
        return final_sequence

    df = df.with_columns(pl.col('text').map_elements(lambda x: tokenizer(x, return_tensors='pt')['input_ids'][0].tolist(), return_dtype=list[int]).alias('labels'))
    # token_infilling already returns a plain list, so its result is used as-is
    # (calling `.tolist()` on it raised AttributeError).
    df = df.with_columns(pl.col('text').map_elements(lambda x: token_infilling(tokenizer(x, return_tensors='pt')['input_ids'][0], tokenizer.mask_token_id), return_dtype=list[int]).alias('input_ids'))

    # Mask is 1 for the (not yet padded) input ids plus BOS/EOS, 0 for padding.
    df = df.with_columns(pl.col('input_ids').map_elements(lambda x: [1] * (len(x)+2) + [0] * (seq_len - len(x) - 2), return_dtype=list[int]).alias('attention_mask'))
    df = df.with_columns(pl.col('attention_mask').map_elements(lambda x: len(x), return_dtype=int).alias('attention_mask_len'))
    assert set(df['attention_mask_len'].to_list()) == {seq_len}

    df = df.with_columns(pl.col('labels').map_elements(lambda x: add_padding(x), return_dtype=list[int]).alias('labels'))
    df = df.with_columns(pl.col('labels').map_elements(lambda x: len(x), return_dtype=int).alias('labels_len'))
    assert set(df['labels_len'].to_list()) == {seq_len}

    df = df.with_columns(pl.col('input_ids').map_elements(lambda x: add_padding(x), return_dtype=list[int]).alias('input_ids'))
    df = df.with_columns(pl.col('input_ids').map_elements(lambda x: len(x), return_dtype=int).alias('input_ids_len'))
    assert set(df['input_ids_len'].to_list()) == {seq_len}

    # Count non-pad ids using the tokenizer's own pad id (the same id used for
    # padding above) rather than a hard-coded 1.
    pad_id = tokenizer.pad_token_id
    df = df.with_columns(pl.col('input_ids').map_elements(lambda x: len([1 for i in x if i != pad_id]), return_dtype=int).alias('input_ids_pad_len'))
    df = df.with_columns(pl.col('attention_mask').map_elements(lambda x: len([1 for i in x if i == 1]), return_dtype=int).alias('attn_mask_1_len'))
    assert df['input_ids_pad_len'].to_list() == df['attn_mask_1_len'].to_list()
    

In [10]:
# validate(df[:10], tokenizer, SEQUENCE_LENGTH)

In [11]:
def add_padding(x: list[int], tokenizer, seq_len):
    """Wrap token ids in BOS/EOS and right-pad them to exactly `seq_len`.

    Args:
        x: token ids without BOS/EOS (any iterable of ints).
        tokenizer: object exposing `bos_token_id`, `eos_token_id` and
            `pad_token_id`.
        seq_len: target length of the returned sequence.

    Returns:
        A list of length `seq_len`: BOS, the ids of `x`, EOS, then pad ids.

    Raises:
        AssertionError: if `x` plus BOS/EOS does not fit into `seq_len`.
    """
    # list(x) lets non-list iterables be concatenated as well.
    sequence = [tokenizer.bos_token_id] + list(x) + [tokenizer.eos_token_id]

    # Check against the requested length, not a module-level constant.
    assert len(sequence) <= seq_len, (
        f"sequence of length {len(sequence)} does not fit into seq_len={seq_len}"
    )

    # The pad count is zero when the sequence already fills seq_len.
    return sequence + [tokenizer.pad_token_id] * (seq_len - len(sequence))

In [12]:
# input_ids = []
# # attention_mask = []

# labels = df['labels'].to_list()

# del df

# for idx, label in enumerate(labels):

#     if (idx + 1) % 100_000 == 0:
#         print(idx+1, 'completed')
#     # pass
#     # labels.append(label)
#     input_ids.append(token_infilling(torch.tensor(label), tokenizer.mask_token_id).tolist())
#     # # attention_mask.append([1] * (len(input_id)+2) + [0] * (SEQUENCE_LENGTH - len(input_id) - 2))
#     # # labels.append(add_padding(label, tokenizer, SEQUENCE_LENGTH))
#     # input_ids.append(add_padding(input_id, tokenizer, SEQUENCE_LENGTH))

In [13]:
# token_infilling(torch.tensor([3772, 131, 332, 231]), tokenizer.mask_token_id, mask_probability=0.15, list_special_tokens=tokenizer.all_special_ids)

In [14]:
# Resolve the tokenizer-derived constants once instead of once per row.
mask_token_id = tokenizer.mask_token_id
special_token_ids = tokenizer.all_special_ids

# Add an `input_ids` column holding the text-infilled (masked) version of `labels`.
# NOTE(review): UInt16 can only hold ids up to 65535 — confirm the vocabulary fits.
df = df.with_columns(
    pl.col('labels')
    .map_elements(
        lambda x: token_infilling(
            torch.tensor(x),
            mask_token_id,
            mask_probability=0.15,
            list_special_tokens=special_token_ids,
        ),
        return_dtype=pl.List(pl.UInt16),
    )
    .alias('input_ids')
)

In [15]:
# pl.DataFrame(data = {
#     # 'labels': labels,
#     'input_ids': input_ids,
#     # 'attention_mask': attention_mask
#     },
#     schema={
#         # 'labels': list[int],
#         'input_ids': pl.List(pl.UInt16),
#         # 'attention_mask': list[int]
#     }).write_parquet('/kaggle/working/nepbart_tokenized_test.parquet')

In [16]:
# pl.read_parquet('/kaggle/working/nepbart_tokenized_train.parquet')

In [17]:
# df

In [18]:
# df = df.with_columns(pl.col('text').map_elements(lambda x: tokenizer(x, return_tensors='pt')['input_ids'][0].tolist(), return_dtype=list[int]).alias('labels'))
# df = df.with_columns(pl.col('text').map_elements(lambda x: token_infilling(tokenizer(x, return_tensors='pt')['input_ids'][0], tokenizer.mask_token_id).tolist(), return_dtype=list[int]).alias('input_ids'))

In [19]:
# df = df.drop('text')
# df = df.with_columns(pl.col('input_ids').map_elements(lambda x: [1] * (len(x)+2) + [0] * (SEQUENCE_LENGTH - len(x) - 2), return_dtype=list[int]).alias('attention_mask'))
# df = df.with_columns(pl.col('labels').map_elements(lambda x: add_padding(x, tokenizer, SEQUENCE_LENGTH), return_dtype=list[int]).alias('labels'))
# df = df.with_columns(pl.col('input_ids').map_elements(lambda x: add_padding(x, tokenizer, SEQUENCE_LENGTH), return_dtype=list[int]).alias('input_ids'))
# df.write_parquet('/kaggle/working/nepbart_tokenized_train.parquet')
# df