## BERT Tokenizer

In [3]:
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the tokenizer that ships with the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode a single sentence and ask for PyTorch tensors ('pt') instead of plain lists.
token = tokenizer(
    "Kolkata is the Capital of West Bengal",
    return_tensors='pt',
)
# Bare last expression: the notebook displays the encoding (input_ids, token_type_ids, attention_mask).
token

{'input_ids': tensor([[  101, 13522,  2003,  1996,  3007,  1997,  2225,  8191,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
# Encode two sentences of different length in one batch.
# padding=True pads the shorter row so both rows end up the same length.
token = tokenizer(
    ["Kolkata is the Capital of West Bengal", " Good Morning"],
    padding=True,
    return_tensors='pt',
)
token

{'input_ids': tensor([[  101, 13522,  2003,  1996,  3007,  1997,  2225,  8191,   102],
        [  101,  2204,  2851,   102,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [6]:
# Same batch, but pad every row up to a fixed length of 10 ids
# instead of only up to the longest row in the batch.
token = tokenizer(
    ["Kolkata is the Capital of West Bengal", " Good Morning"],
    padding='max_length',
    max_length=10,
    return_tensors='pt',
)
token

{'input_ids': tensor([[  101, 13522,  2003,  1996,  3007,  1997,  2225,  8191,   102,     0],
        [  101,  2204,  2851,   102,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}

In [7]:
# Deliberately left commented out: running this call raises an error.
# padding='max_length' with max_length=7 pads the short sentence to 7 ids, but
# without truncation=True the first sentence presumably keeps all 9 of its ids, so the
# two rows would differ in length and could not be combined into one 'pt' tensor.
#token = tokenizer(["Kolkata is the Capital of West Bengal", " Good Morning"], padding='max_length', max_length=7, return_tensors='pt')
#token

In [8]:
# Pad AND truncate to a fixed length: with truncation=True, rows longer than
# max_length=7 are cut down to 7 ids, while shorter rows are still padded up to 7.
token = tokenizer(
    ["Kolkata is the Capital of West Bengal", " Good Morning"],
    padding='max_length',
    max_length=7,
    truncation=True,
    return_tensors='pt',
)
# Row 0 now only covers "[CLS] kolkata is the capital of [SEP]";
# row 1 is "Good Morning" followed by padding.
# Bare last expression (instead of print) so this cell displays like the others.
token

{'input_ids': tensor([[  101, 13522,  2003,  1996,  3007,  1997,   102],
        [  101,  2204,  2851,   102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0]])}


## BERT Model

In [9]:
# Re-encode the single example sentence (no padding needed for a batch of one).
token = tokenizer(
    "Kolkata is the Capital of West Bengal",
    return_tensors='pt',
)
token

{'input_ids': tensor([[  101, 13522,  2003,  1996,  3007,  1997,  2225,  8191,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
# Input phrase to embed.
Text = "Data Science"

# Load the pretrained encoder that matches the tokenizer's checkpoint.
model = BertModel.from_pretrained('bert-base-uncased')

# Encode the phrase and feed every field of the encoding to the model as keyword arguments.
token = tokenizer(Text, return_tensors='pt')
# NOTE(review): gradients are tracked during this forward pass (the hidden state shown
# in the next cell carries a grad_fn); for inference-only use, consider torch.no_grad().
output = model(**token)

In [11]:
# Per-token output vectors of the final encoder layer.
last_hidden = output['last_hidden_state']
print(last_hidden)
# Shape displayed below; here it is (1, 4, 768) for the single 4-token input.
last_hidden.shape


tensor([[[ 0.1700,  0.0984, -0.5506,  ..., -0.5209,  0.1481,  0.6785],
         [-0.0902, -0.5260, -0.1495,  ..., -0.2665,  0.4729,  0.1830],
         [-0.1192, -0.3216, -0.5444,  ..., -0.5233,  0.0967, -0.3171],
         [ 0.9494, -0.0994, -0.3951,  ...,  0.1451, -0.7749, -0.1792]]],
       grad_fn=<NativeLayerNormBackward0>)


torch.Size([1, 4, 768])

In [12]:
# Pooled output: a single vector for the whole input (only its shape is displayed;
# the full tensor is too long to be worth printing).
pooled = output['pooler_output']
pooled.shape

torch.Size([1, 768])

In [17]:
# Map the ids of the first (and only) row of the current encoding back to
# their vocabulary tokens, to see what the model actually received.
first_row_ids = token['input_ids'][0]
tokenizer.convert_ids_to_tokens(first_row_ids)

['[CLS]', 'data', 'science', '[SEP]']