In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import AutoTokenizer

In [None]:
# Load the 'python' configuration of the CodeSearchNet dataset.
data = load_dataset(path='code_search_net', name='python')

In [None]:
# Inspect the training split: its repr lists the column names and the row count.
data['train']

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [None]:
# Print the `whole_func_string` field of the training example at index 123456.
print(data['train'][123456]["whole_func_string"])

def _from_json(json_data):
        """
        Creates a Coordinate from json data.
        
        :param json_data: The raw json data to parse
        :type json_data: dict
        :returns: Coordinate
        """
        if len(json_data) >= 3:
            return Coordinate(_parse_float(json_data[0]),
                        _parse_float(json_data[1]),
                        _parse_float(json_data[2]))
        else:
            raise USGSException("The given coordinate information was incomplete.")


In [None]:
def get_training_corpus(batch_size=1000):
    """Return a generator over batches of function source strings.

    Each item is the ``'whole_func_string'`` column of one contiguous slice of
    ``data['train']``; the final batch may be shorter than ``batch_size``.

    Args:
        batch_size: Number of rows per batch. Defaults to 1000.

    Returns:
        A single-use generator. Call this function again to obtain a fresh
        one after it has been consumed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Look the split up once instead of on every batch.
    train_split = data['train']
    return (
        train_split[start:start + batch_size]['whole_func_string']
        for start in range(0, len(train_split), batch_size)
    )

# Create the batch generator. A generator is single-use: once it has been
# iterated, call get_training_corpus() again to obtain a fresh one.
training_corpus = get_training_corpus()

In [None]:
# Shows only the generator's repr; nothing is iterated by displaying it.
training_corpus

<generator object get_training_corpus.<locals>.<genexpr> at 0x7da2ad3852a0>

# Model Initialization and Training

In [None]:
# Load the pretrained GPT-2 tokenizer; the new tokenizer is trained from it below.
old_tokenizer = AutoTokenizer.from_pretrained('gpt2')


In [None]:
# Small Python snippet used to compare the tokenizers.
demo = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

# Baseline: tokenize the snippet with the original GPT-2 tokenizer.
tokens = old_tokenizer.tokenize(demo)
tokens

['def',
 'Ġadd',
 '_',
 'n',
 'umbers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`',
 '."',
 '""',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [None]:
# Train a new tokenizer with the same algorithm/settings as `old_tokenizer`.
# A fresh batch generator is created here because generators are single-use:
# re-running this cell with an already-consumed `training_corpus` would yield
# no batches at all.
new_tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=52000,
)

In [None]:
# Tokenize the same snippet with the newly trained tokenizer for comparison.
tokens = new_tokenizer.tokenize(demo)
tokens

['def',
 'Ġadd',
 '_',
 'numbers',
 '(',
 'a',
 ',',
 'Ġb',
 '):',
 'ĊĠĠĠ',
 'Ġ"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 'Ġ`',
 'a',
 '`',
 'Ġand',
 'Ġ`',
 'b',
 '`."""',
 'ĊĠĠĠ',
 'Ġreturn',
 'Ġa',
 'Ġ+',
 'Ġb']

In [None]:
# Token counts for the same snippet: new tokenizer first, then the original.
# Tokenize explicitly instead of reading the global `tokens`, which several
# other cells reassign, so the result does not depend on execution order.
print(len(new_tokenizer.tokenize(demo)))
print(len(old_tokenizer.tokenize(demo)))

27
36


In [None]:
# Second sample snippet. It is kept as a plain string and only tokenized below;
# the code inside it is never executed.
demo_2="""
tokenizer = AutoTokenizer.train_new_from_iterator(
    texts,
    vocab_size=100,  # Define the size of the vocabulary
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]  # Add special tokens
)

# Test the trained tokenizer
sample_text = "How are tokenizers trained?"
tokens = tokenizer.tokenize(sample_text)
print("Tokenized output:", tokens)

# Convert tokens to IDs
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)
"""

In [None]:
# Tokenize the second sample snippet with the newly trained tokenizer.
tokens = new_tokenizer.tokenize(demo_2)
tokens

['Ġ',
 'Ċ',
 'tokenizer',
 'Ġ=',
 'ĠAuto',
 'Tokenizer',
 '.',
 'train',
 '_',
 'new',
 '_',
 'from',
 '_',
 'iterator',
 '(',
 'ĊĠĠĠ',
 'Ġtexts',
 ',',
 'ĠĊĠĠĠ',
 'Ġvocab',
 '_',
 'size',
 '=',
 '100',
 ',',
 'Ġ',
 'Ġ#',
 'ĠDefine',
 'Ġthe',
 'Ġsize',
 'Ġof',
 'Ġthe',
 'Ġvocabulary',
 'ĊĠĠĠ',
 'Ġspecial',
 '_',
 'tokens',
 '=["',
 '[',
 'PAD',
 ']",',
 'Ġ"[',
 'UNK',
 ']",',
 'Ġ"[',
 'CLS',
 ']",',
 'Ġ"[',
 'SEP',
 ']",',
 'Ġ"[',
 'MASK',
 ']"]',
 'Ġ',
 'Ġ#',
 'ĠAdd',
 'Ġspecial',
 'Ġtokens',
 'Ċ',
 ')',
 'Ċ',
 'Ċ',
 '#',
 'ĠTest',
 'Ġthe',
 'Ġtrained',
 'Ġtokenizer',
 'Ċ',
 'sample',
 '_',
 'text',
 'Ġ=',
 'Ġ"',
 'How',
 'Ġare',
 'Ġtoken',
 'izers',
 'Ġtrained',
 '?"',
 'Ċ',
 'tokens',
 'Ġ=',
 'Ġtokenizer',
 '.',
 'tokenize',
 '(',
 'sample',
 '_',
 'text',
 ')',
 'Ċ',
 'print',
 '("',
 'Token',
 'ized',
 'Ġoutput',
 ':",',
 'Ġtokens',
 ')',
 'Ċ',
 'Ċ',
 '#',
 'ĠConvert',
 'Ġtokens',
 'Ġto',
 'ĠIDs',
 'Ċ',
 'token',
 '_',
 'ids',
 'Ġ=',
 'Ġtokenizer',
 '.',
 'convert',
 '_',
 'tokens

In [None]:
# Write the tokenizer files into the local 'code_search_tokenizer' directory;
# the cell output lists the paths that were saved.
new_tokenizer.save_pretrained('code_search_tokenizer')

('code_search_tokenizer/tokenizer_config.json',
 'code_search_tokenizer/special_tokens_map.json',
 'code_search_tokenizer/vocab.json',
 'code_search_tokenizer/merges.txt',
 'code_search_tokenizer/added_tokens.json',
 'code_search_tokenizer/tokenizer.json')

In [None]:
# Authenticate with the Hugging Face Hub through the interactive widget, so no
# access token is hardcoded in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the tokenizer to a Hub repository named 'code_search_tokenizer';
# the cell output is the resulting commit information.
new_tokenizer.push_to_hub('code_search_tokenizer')

CommitInfo(commit_url='https://huggingface.co/PavansaiGundaram/code_search_tokenizer/commit/4d6d4e74626daec81812d31e5732c824ad68aee8', commit_message='Upload tokenizer', commit_description='', oid='4d6d4e74626daec81812d31e5732c824ad68aee8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/PavansaiGundaram/code_search_tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='PavansaiGundaram/code_search_tokenizer'), pr_revision=None, pr_num=None)

In [None]:
# Load the uploaded tokenizer back from the Hub repository.
tokenizer = AutoTokenizer.from_pretrained('PavansaiGundaram/code_search_tokenizer')

tokenizer_config.json:   0%|          | 0.00/471 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/822k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/467k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
# Third sample snippet (a plain string, never executed), used to try out the
# tokenizer that was loaded from the Hub.
demo_3 = """
encoded = tokenizer.encode(sample_text, add_special_tokens=True)
print("Encoded IDs with special tokens:", encoded)

decoded = tokenizer.decode(encoded)
print("Decoded text:", decoded)
"""

tokens = tokenizer.tokenize(demo_3)
tokens

['Ċ',
 'encoded',
 'Ġ=',
 'Ġtokenizer',
 '.',
 'encode',
 '(',
 'sample',
 '_',
 'text',
 ',',
 'Ġadd',
 '_',
 'special',
 '_',
 'tokens',
 '=',
 'True',
 ')',
 'Ċ',
 'print',
 '("',
 'Encoded',
 'ĠIDs',
 'Ġwith',
 'Ġspecial',
 'Ġtokens',
 ':",',
 'Ġencoded',
 ')',
 'Ċ',
 'Ċ',
 'decoded',
 'Ġ=',
 'Ġtokenizer',
 '.',
 'decode',
 '(',
 'encoded',
 ')',
 'Ċ',
 'print',
 '("',
 'De',
 'coded',
 'Ġtext',
 ':",',
 'Ġdecoded',
 ')',
 'Ċ']