In [1]:
from transformers import AutoTokenizer

In [2]:
# Load the pretrained tokenizer for the "distilbert-base-uncased" checkpoint
# (the same checkpoint name is used when the model is loaded further down).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [3]:
# Inspect the vocabulary size reported by the tokenizer.
tokenizer.vocab_size

30522

In [4]:
# Show the special tokens this tokenizer defines and the role each one plays.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [5]:
# Inspect the maximum sequence length the tokenizer reports for this checkpoint.
tokenizer.model_max_length

512

In [6]:
# Encode a sentence ending in a made-up compound ("complicatedtest") to see how
# the tokenizer handles a word it presumably has no single vocabulary entry for.
# NOTE: despite its name, `encoded_str` is displayed below as a list of integer ids.
encoded_str = tokenizer.encode("this is a complicatedtest")

In [7]:
# Display the ids produced by the encode call above.
encoded_str

[101, 2023, 2003, 1037, 8552, 22199, 102]

In [8]:
# Print every id next to the text the tokenizer decodes it back to,
# one "<id> <text>" pair per line.
for token in encoded_str:
    print(f"{token} {tokenizer.decode([token])}")

101 [CLS]
2023 this
2003 is
1037 a
8552 complicated
22199 ##test
102 [SEP]


In [9]:
from transformers import AutoModel
import torch

In [10]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [11]:
# Load the pretrained model weights for the same checkpoint as the tokenizer
# and move the model onto the device selected earlier.
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

In [14]:
# Tokenize a short sample sentence, asking for PyTorch tensors ("pt"), and move
# the result to the same device as the model.
# NOTE: `text_tensor` is later unpacked with `**` into the model call, so it is a
# mapping of named inputs rather than a single tensor.
text = "this is a test"
text_tensor = tokenizer(text, return_tensors="pt").to(device)

In [17]:
import numpy as np

In [37]:
def aggregate_embeddings(chunk_embeddings, method='mean', dim=1):
    """Collapse one dimension of an embedding tensor by averaging or summing.

    Args:
        chunk_embeddings: ``torch.Tensor`` holding the embeddings to combine.
            It must have at least ``dim + 1`` dimensions.
        method: ``'mean'`` to average along ``dim`` or ``'sum'`` to add up
            along ``dim``.
        dim: Dimension to reduce. Defaults to ``1``, which is the axis this
            function has always reduced, so existing callers are unaffected.

    Returns:
        torch.Tensor: ``chunk_embeddings`` with dimension ``dim`` reduced away
        (e.g. a ``(2, 3, 4)`` input reduced over ``dim=1`` yields ``(2, 4)``).

    Raises:
        ValueError: If ``method`` is neither ``'mean'`` nor ``'sum'``.
    """
    if method == 'mean':
        reduce_fn = torch.mean
    elif method == 'sum':
        reduce_fn = torch.sum
    else:
        raise ValueError("Unsupported aggregation method")
    # Use torch's documented `dim=` keyword rather than the NumPy-style
    # `axis=` alias.
    return reduce_fn(chunk_embeddings, dim=dim)

In [40]:
# This cell only inspects embeddings, so run the forward pass with autograd
# disabled: no computation graph is built and less memory is held.
with torch.no_grad():
    model_output = model(**text_tensor)
# Keep `output` bound to the final-layer hidden states, as before, but avoid
# rebinding one name from the model-output object to a tensor.
output = model_output.last_hidden_state
# Reduce over dim 1 (aggregate_embeddings' default) and show the resulting shape.
aggregate_embeddings(output).shape

torch.Size([1, 768])