---
description: Tokenizer training, tokenization helpers, and the Lightning data module for the Completeformer datasets.
output-file: data.html
title: Data
---


In [None]:
#| default_exp data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#| include: false
import os

# Set before any tokenizer is created so the `tokenizers` library does not use
# its own thread pool. NOTE(review): presumably this avoids the fork-related
# warnings when `Dataset.map`/`DataLoader` spawn worker processes - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
#| include: false
from nbdev.showdoc import *

In [4]:
#| export
import random
import torch

import numpy as np
import pytorch_lightning as pl

from dataclasses import dataclass
from datasets import load_dataset
from itertools import chain
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, default_data_collator
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
from torch.utils.data import DataLoader
from torch.utils.data._utils.collate import default_collate
from tqdm.auto import tqdm
from typing import Dict, List, Optional

2023-03-22 13:17:35.577991: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-22 13:17:36.048085: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64:
2023-03-22 13:17:36.048133: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64:


In [5]:
#| export
# Token budgets used when truncating encoder inputs and decoder targets.
ENC_MAX_LEN, DEC_MAX_LEN = 512, 128

# Number of examples per batch in the notebook's dataloader checks.
BATCH_SIZE = 32

# Target vocabulary size for tokenizer training (32768 entries).
VOCAB_SIZE = 1 << 15

# Fix the stdlib RNG so anything drawing from `random` is reproducible.
random.seed(115)

In [6]:
#| export
def train_tokenizer(
    train_dataset,
    vocab_size=2**15,
    save_dir="completeformer_tokenizer_java",
    text_column="original_method",
    batch_size=1000,
):
    """Train a new tokenizer on `train_dataset` starting from the GPT-2 tokenizer.

    The GPT-2 tokenizer is used as the base, retrained on the text found in
    `text_column`, extended with the mask/pad/bos/eos special tokens plus a
    `<NEW_LINE>` token, and finally written to `save_dir`.

    Args:
        train_dataset: Sliceable dataset; `train_dataset[i:j][text_column]`
            must yield a batch of strings.
        vocab_size: Vocabulary size requested from the trainer. The special
            tokens and `<NEW_LINE>` are added after training.
        save_dir: Directory passed to `save_pretrained`. The default keeps the
            historical name, which is used regardless of the data's language.
        text_column: Name of the column holding the training text.
        batch_size: Number of rows handed to the trainer per iteration step.

    Returns:
        The newly trained tokenizer.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    # Code modified from: https://huggingface.co/blog/codeparrot
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}.")

    # Base tokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    base_vocab = list(bytes_to_unicode().values())

    # Lazily stream the corpus in slices so the whole column is never
    # materialized at once.
    training_corpus = (
        train_dataset[start : start + batch_size][text_column]
        for start in range(0, len(train_dataset), batch_size)
    )

    # Training and saving
    tokenizer = tokenizer.train_new_from_iterator(
        training_corpus,
        vocab_size=vocab_size,
        initial_alphabet=base_vocab,
    )
    tokenizer.add_special_tokens(
        {
            "mask_token": "<MASK>",
            "pad_token": "<PAD>",
            "bos_token": "<BOS>",
            "eos_token": "<EOS>",
        }
    )
    tokenizer.add_tokens(["<NEW_LINE>"])
    tokenizer.save_pretrained(save_dir)

    return tokenizer

In [7]:
#| export
def tokenize(examples, tokenizer, enc_max_len, dec_max_len):
    """Tokenize a batch of input/target pairs for sequence-to-sequence training.

    Args:
        examples: Batch mapping with an "input" and a "target" entry, each a
            list of strings. The mapping is not modified.
        tokenizer: Callable tokenizer; called with a list of strings and the
            `padding`, `truncation` and `max_length` keyword arguments.
        enc_max_len: Truncation length applied to the inputs.
        dec_max_len: Truncation length applied to the wrapped targets.

    Returns:
        The tokenizer output for the inputs with an extra "labels" entry that
        holds the token ids of the "<BOS>"/"<EOS>"-wrapped targets.
    """
    model_inputs = tokenizer(
        examples["input"],
        padding=False,
        truncation=True,
        max_length=enc_max_len,
    )

    # Build the wrapped targets in a local list instead of writing them back
    # into `examples`, so the caller's batch is left untouched.
    wrapped_targets = ["<BOS>" + target + "<EOS>" for target in examples["target"]]

    # NOTE(review): truncation is applied after "<EOS>" is appended, so with
    # right-side truncation an over-long target loses its "<EOS>" - confirm
    # this is intended.
    encoded_targets = tokenizer(
        wrapped_targets,
        padding=False,
        truncation=True,
        max_length=dec_max_len,
    )

    model_inputs["labels"] = encoded_targets["input_ids"]
    return model_inputs

In [8]:
#| export
class CompleteformerDataset(pl.LightningDataModule):
    """Lightning data module wrapping the Completeformer train/validation/test splits.

    The constructor loads the dataset for the requested language, obtains a
    tokenizer (trained on the train split when `tokenizer_name` is None,
    otherwise loaded by name) and builds a `DataCollatorForSeq2Seq`.
    Tokenization happens lazily in `prepare_data` / `setup`.
    """

    # Hub dataset used for each supported language.
    _HUB_DATASETS = {
        "java": "semeru/completeformer_java_data",
        "python": "semeru/completeformer",
    }
    # Columns produced by `tokenize` that are exposed as torch tensors.
    _TENSOR_COLUMNS = ("input_ids", "attention_mask", "labels")

    def __init__(
        self,
        length,
        language,
        tokenizer_name=None,
        batch_size=8,
        enc_max_len=512,
        dec_max_len=128,
        num_workers=4,
        vocab_size=2**15
    ):
        """Load the dataset and tokenizer and build the batch collator.

        Args:
            length: Dataset configuration name passed to `load_dataset`
                (the notebook uses "short").
            language: Either "java" or "python".
            tokenizer_name: Name/path given to `AutoTokenizer.from_pretrained`;
                when None a tokenizer is trained on the train split.
            batch_size: Batch size of every dataloader.
            enc_max_len: Truncation length for encoder inputs.
            dec_max_len: Truncation length for decoder targets.
            num_workers: Worker count for the dataloaders and for `Dataset.map`.
            vocab_size: Vocabulary size used when a tokenizer is trained.

        Raises:
            ValueError: If `language` is neither "java" nor "python".
        """
        super().__init__()
        self.tokenizer_name = tokenizer_name
        self.batch_size = batch_size
        self.enc_max_len = enc_max_len
        self.dec_max_len = dec_max_len
        self.num_workers = num_workers
        self.vocab_size = vocab_size

        if language not in self._HUB_DATASETS:
            raise ValueError(f"Language {language} not supported. Please choose from java or python.")
        self.dataset = load_dataset(self._HUB_DATASETS[language], length)

        self.train_dataset = self.dataset["train"]
        self.valid_dataset = self.dataset["validation"]
        self.test_dataset = self.dataset["test"]

        if self.tokenizer_name is None:
            self.tokenizer = train_tokenizer(self.train_dataset, self.vocab_size)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)

        # NOTE(review): labels are padded with the tokenizer's pad id, so the
        # loss has to ignore that id - confirm against the model code.
        self.data_collator = DataCollatorForSeq2Seq(
            self.tokenizer,
            label_pad_token_id=self.tokenizer.pad_token_id,
            pad_to_multiple_of=8,
        )

    def _tokenize_split(self, split):
        """Return `split` tokenized with `tokenize` and formatted as torch tensors."""
        # Bind plain locals so the mapped function does not capture `self`.
        tokenizer = self.tokenizer
        enc_max_len = self.enc_max_len
        dec_max_len = self.dec_max_len

        tokenized = split.map(
            lambda batch: tokenize(batch, tokenizer, enc_max_len, dec_max_len),
            batched=True,
            # `DataLoader` accepts num_workers=0, but `Dataset.map` requires
            # num_proc to be None or positive.
            num_proc=self.num_workers or None,
            remove_columns=split.column_names,
            load_from_cache_file=False,
        )
        # Set everything to torch tensors
        tokenized.set_format(type="torch", columns=list(self._TENSOR_COLUMNS))
        return tokenized

    def _tokenize_splits(self):
        """Tokenize all three splits once; later calls are no-ops."""
        if "input_ids" in self.train_dataset.column_names:
            return
        self.train_dataset = self._tokenize_split(self.train_dataset)
        self.valid_dataset = self._tokenize_split(self.valid_dataset)
        self.test_dataset = self._tokenize_split(self.test_dataset)

    def prepare_data(self):
        """Tokenize the splits (kept for callers that invoke it directly)."""
        self._tokenize_splits()

    def setup(self, stage=None):
        """Ensure the splits are tokenized on the current process.

        Lightning does not run `prepare_data` on every process, so state
        assigned there can be missing elsewhere; `setup` runs on each process
        and repeats the (idempotent) tokenization when needed.
        """
        self._tokenize_splits()

    def _dataloader(self, dataset, shuffle):
        """Build a `DataLoader` over `dataset` with the shared collator and settings."""
        return DataLoader(
            dataset,
            collate_fn=self.data_collator,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        """Return the shuffled dataloader over the train split."""
        return self._dataloader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled dataloader over the validation split."""
        return self._dataloader(self.valid_dataset, shuffle=False)

    def test_dataloader(self):
        """Return the unshuffled dataloader over the test split."""
        return self._dataloader(self.test_dataset, shuffle=False)

In [10]:
# Tokens expected on top of the trained vocabulary; `train_tokenizer` adds
# four special tokens plus "<NEW_LINE>".
NUM_SPECIAL_TOKENS = 5

short_ds = CompleteformerDataset(
    "short",
    "python",
    tokenizer_name="semeru/completeformer_tokenizer",
    batch_size=BATCH_SIZE,
    enc_max_len=ENC_MAX_LEN,
    dec_max_len=DEC_MAX_LEN,
    num_workers=4,
)
expected_tokenizer_size = short_ds.vocab_size + NUM_SPECIAL_TOKENS
assert len(short_ds.tokenizer) == expected_tokenizer_size

# After tokenization only the model columns remain, each within its budget.
short_ds.prepare_data()
assert short_ds.train_dataset.column_names == ["input_ids", "attention_mask", "labels"]
column_limits = (
    ("input_ids", ENC_MAX_LEN),
    ("attention_mask", ENC_MAX_LEN),
    ("labels", DEC_MAX_LEN),
)
for row in short_ds.train_dataset:
    for column, limit in column_limits:
        assert len(row[column]) <= limit

Found cached dataset completeformer (/work/.cache/huggingface/datasets/semeru___completeformer/short/1.1.0/ce1b15cfe83a260c6798ed5a300527b0d71a5e3ef384a75341d675694c83bad2)


  0%|          | 0/3 [00:00<?, ?it/s]

     

#0:   0%|          | 0/70 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/70 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/70 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/70 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/9 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/9 [00:00<?, ?ba/s]

In [11]:
# Pull a single collated batch and check its batch dimension and padded length.
train_dl = short_ds.train_dataloader()
batch = next(iter(train_dl))
for key, max_len in (
    ("input_ids", ENC_MAX_LEN),
    ("attention_mask", ENC_MAX_LEN),
    ("labels", DEC_MAX_LEN),
):
    shape = batch[key].shape
    assert shape[0] == BATCH_SIZE and shape[1] <= max_len

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
#| include: false
# Run nbdev's export step so the `#| export` cells above are written to the
# library module (this notebook sets `default_exp data`).
from nbdev import nbdev_export; nbdev_export()