In [131]:
from transformers import GPT2Tokenizer,GPT2LMHeadModel,BertTokenizer
import torch

In [2]:
# Load the tokenizer that ships with the gpt2-large checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path='gpt2-large',
)

In [63]:
# Choose the best accelerator that is actually present instead of assuming
# Apple-silicon MPS: prefer MPS, then CUDA, and fall back to the CPU so the
# notebook still runs on machines without a GPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [64]:
# Load the pretrained gpt2-large language-model head, then move its weights
# onto the selected device.
model = GPT2LMHeadModel.from_pretrained('gpt2-large')
model = model.to(device)

In [122]:
# NOTE: `input` shadows the built-in input(); the name is kept because later
# cells in this notebook read it.
input = "Give me a python code for generating the first 5 fibonacci numbers"

# GPT-2 ships without a padding token, so register one. The call is a no-op
# if '[PAD]' has already been added, which keeps this cell safe to re-run.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# The new token receives an id one past the pretrained vocabulary, so the
# model's embedding matrix must grow to match; otherwise feeding a padded
# batch to the model would index outside the embedding table.
model.resize_token_embeddings(len(tokenizer))

# Encode the prompt into token ids plus an attention mask.
train_encoding = tokenizer(input, padding=True, truncation=True)

In [123]:
# Inspect the encoding: the prompt's token ids and the matching attention mask.
train_encoding

{'input_ids': [23318, 502, 257, 21015, 2438, 329, 15453, 262, 717, 642, 12900, 261, 44456, 3146], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [124]:
# Sampling is stochastic; fix the seed so re-running this cell reproduces
# the same continuation.
torch.manual_seed(0)

# Call the tokenizer directly (rather than .encode) so the attention mask is
# returned alongside the ids, and move both tensors to the model's device.
prompt_encoding = tokenizer(input, return_tensors="pt").to(device)
input_ids = prompt_encoding["input_ids"]

output = model.generate(
    input_ids,
    # pad_token_id is aliased to EOS below, so generate() cannot infer which
    # positions are padding on its own; pass the mask explicitly.
    attention_mask=prompt_encoding["attention_mask"],
    max_length=50,  # total length: prompt tokens + generated tokens
    temperature=0.7,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
)

In [125]:
# Show the generated batch of token ids (prompt ids followed by the sampled continuation).
output

tensor([[23318,   502,   257, 21015,  2438,   329, 15453,   262,   717,   642,
         12900,   261, 44456,  3146,   357,    72,    13,    68,    13,   352,
            11,   362,    11,   513,    11,   642,   737,   198,   198,    12,
            32, 21015,  2438,   329,  5270,   286,   262,   717,   642, 41566,
           261, 44456,  3146,    13,   198,   198,    12,    32, 21015,  2438]],
       device='mps:0')

In [126]:
# Index away the batch dimension to get the single generated sequence.
output[0]

tensor([23318,   502,   257, 21015,  2438,   329, 15453,   262,   717,   642,
        12900,   261, 44456,  3146,   357,    72,    13,    68,    13,   352,
           11,   362,    11,   513,    11,   642,   737,   198,   198,    12,
           32, 21015,  2438,   329,  5270,   286,   262,   717,   642, 41566,
          261, 44456,  3146,    13,   198,   198,    12,    32, 21015,  2438],
       device='mps:0')

In [127]:
# Convert the generated ids back into text; the result starts with the prompt itself.
tokenizer.decode(output[0])

'Give me a python code for generating the first 5 fibonacci numbers (i.e. 1, 2, 3, 5).\n\n-A python code for generation of the first 5 Fibonacci numbers.\n\n-A python code'

In [128]:
# encode() with return_tensors='pt' returns the ids as a 2-D tensor with a leading batch dimension.
tokenizer.encode(input,return_tensors='pt')

tensor([[23318,   502,   257, 21015,  2438,   329, 15453,   262,   717,   642,
         12900,   261, 44456,  3146]])

In [129]:
# Without return_tensors, encode() returns the same ids as a plain Python list.
tokenizer.encode(input)

[23318,
 502,
 257,
 21015,
 2438,
 329,
 15453,
 262,
 717,
 642,
 12900,
 261,
 44456,
 3146]

In [130]:
# Show the sub-word strings behind the ids; the 'Ġ' prefix appears on tokens
# that were preceded by a space in the prompt.
tokenizer.tokenize(input)

['Give',
 'Ġme',
 'Ġa',
 'Ġpython',
 'Ġcode',
 'Ġfor',
 'Ġgenerating',
 'Ġthe',
 'Ġfirst',
 'Ġ5',
 'Ġfib',
 'on',
 'acci',
 'Ġnumbers']

In [132]:
# Load the bert-base-uncased tokenizer to compare its splits with GPT-2's.
bert_tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [133]:
# Inspect the BERT tokenizer's configuration and its built-in special tokens.
bert_tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [134]:
# Inspect the GPT-2 tokenizer; the '[PAD]' token registered earlier shows up
# as an added token after the original vocabulary.
tokenizer

GPT2Tokenizer(name_or_path='gpt2-large', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [135]:
# Tokenize the same prompt with BERT: text is lower-cased and '##' marks a
# piece that continues the previous token.
bert_tokenizer.tokenize(input)

['give',
 'me',
 'a',
 'python',
 'code',
 'for',
 'generating',
 'the',
 'first',
 '5',
 'fi',
 '##bon',
 '##ac',
 '##ci',
 'numbers']

In [136]:
# Size of the BERT tokenizer's vocabulary.
bert_tokenizer.vocab_size

30522

In [138]:
# First token produced by the BERT tokenizer for the prompt.
bert_tokenizer.tokenize(input)[0]

'give'