# Train a tokenizer

## Assemble a corpus of Python source code

In [1]:
from datasets import load_dataset

# `code_search_net` is loaded through a dataset builder script, so opt in to
# running it explicitly: `datasets` warns that `trust_remote_code=True` will be
# mandatory for this dataset from its next major release.
raw_datasets = load_dataset("code_search_net", "python", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 8.44k/8.44k [00:00<00:00, 22.2MB/s]
Downloading readme: 100%|██████████| 12.9k/12.9k [00:00<00:00, 31.9MB/s]
Downloading data: 100%|██████████| 941M/941M [00:23<00:00, 39.7MB/s] 
Generating train split: 100%|██████████| 412178/412178 [01:42<00:00, 4010.45 examples/s]
Generating test split: 100%|██████████| 22176/22176 [00:05<00:00, 4113.70 examples/s]
Generating validation split: 100%|██████████| 23107/23107 [00:05<00:00, 3874.45 examples/s]


In [2]:
# Inspect the training split: displays its feature (column) names and row count.
raw_datasets["train"]

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [3]:
# Print one arbitrary training example. `whole_func_string` is the column that
# holds the function's source text (signature, docstring and body), as the
# printed output shows; `print` keeps the line breaks readable.
print(raw_datasets["train"][123456]["whole_func_string"])

def init(self, args=None):
        """Fills `settings` with values from `settings.py` and env."""
        from .logs import exception

        self._setup_user_dir()
        self._init_settings_file()

        try:
            self.update(self._settings_from_file())
        except Exception:
            exception("Can't load settings from file", sys.exc_info())

        try:
            self.update(self._settings_from_env())
        except Exception:
            exception("Can't load settings from env", sys.exc_info())

        self.update(self._settings_from_args(args))


In [8]:
def get_training_corpus(batch_size=1000):
    """Return a lazy generator over batches of training-set function sources.

    Each item is the `whole_func_string` column of a slice of `batch_size`
    consecutive rows of `raw_datasets["train"]` (the final slice may be
    shorter). No rows are sliced out of the dataset until the generator is
    iterated, and the generator can be consumed only once.

    Args:
        batch_size: Number of rows per batch. Defaults to 1000.

    Returns:
        A generator yielding one batch of source strings per step.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    # A negative step would make `range` empty and silently produce an empty
    # corpus, so reject bad sizes up front instead.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    train_split = raw_datasets["train"]
    return (
        train_split[start : start + batch_size]["whole_func_string"]
        for start in range(0, len(train_split), batch_size)
    )


# Build the batch generator that is later passed to tokenizer training. A
# generator can be iterated only once, so re-run this line to get a fresh one
# before training again.
training_corpus = get_training_corpus()

In [5]:
# Don't uncomment the following line unless your dataset is small!
# training_corpus = [raw_datasets["train"][i: i + 1000]["whole_func_string"] for i in range(0, len(raw_datasets["train"]), 1000)]
# Using a Python generator, we can avoid Python loading anything into memory until it’s actually necessary. To create such a generator,
# you just need to replace the brackets with parentheses:
# training_corpus = (
#     raw_datasets["train"][i : i + 1000]["whole_func_string"]
#     for i in range(0, len(raw_datasets["train"]), 1000)
# )
# You can also define your generator inside a `for` loop by using the `yield` statement:
def get_training_corpus(batch_size=1000):
    """Yield the training-set function sources one batch at a time.

    Each yielded item is the `whole_func_string` column of a slice of
    `batch_size` consecutive rows of `raw_datasets["train"]` (the final slice
    may be shorter). Because this is a generator function, nothing runs until
    iteration starts, and each returned generator can be consumed only once.

    Args:
        batch_size: Number of rows per batch. Defaults to 1000.

    Yields:
        One batch of source strings per step.

    Raises:
        ValueError: If `batch_size` is not positive. As with everything else
            in a generator function, this is raised when iteration starts,
            not when the function is called.
    """
    # A negative step would make `range` empty and silently produce an empty
    # corpus, so reject bad sizes explicitly.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx : start_idx + batch_size]
        yield samples["whole_func_string"]

In [6]:
#Training a new tokenizer
from transformers import AutoTokenizer

# Load the pretrained GPT-2 tokenizer; the new tokenizer is trained from it
# further down.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")



In [7]:
# A small Python function used as a fixed probe for comparing tokenizers.
# Its newlines and four-space indentation are spelled out explicitly, since
# whitespace handling is one of the places where the tokenizations differ.
example = (
    "def add_numbers(a, b):\n"
    '    """Add the two numbers `a` and `b`."""\n'
    "    return a + b"
)

# Tokenize the probe with the pretrained GPT-2 tokenizer and display the result.
tokens = old_tokenizer.tokenize(example)
tokens

['def',
 'Ġadd',
 '_',
 'n',
 'umbers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`',
 '."',
 '""',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [9]:
# Train a brand-new tokenizer on the code corpus, starting from the GPT-2
# tokenizer and targeting a vocabulary of 52,000 tokens. Iterating over
# `training_corpus` here uses it up.
tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=52000,
)






In [10]:
# Tokenize the same example with the newly trained tokenizer so the result can
# be compared with the GPT-2 tokenization shown earlier.
tokens = tokenizer.tokenize(example)
tokens

['def',
 'Ġadd',
 '_',
 'numbers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'ĊĠĠĠ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`."""',
 'ĊĠĠĠ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']