## Way 1

In [1]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'
data = [sample1, sample2]

# Define the max vocabulary size and sequence length
vocab_size = 8
sequence_length = 5

In [2]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')
print(tokenizer(sample1))
print(tokenizer(sample2))

['we', 'are', 'learning', 'ai']
['ai', 'is', 'a', 'cs', 'topic']


In [3]:
# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# Create vocabulary
vocab = build_vocab_from_iterator(yield_tokens(data),
                                  max_tokens=vocab_size,
                                  specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'<unk>': 0,
 '<pad>': 1,
 'ai': 2,
 'a': 3,
 'is': 6,
 'are': 4,
 'learning': 7,
 'cs': 5}

In [4]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length):
    tokens = tokenizer(text)
    token_ids = [vocab[token] for token in tokens][:sequence_length]
    token_ids = token_ids + [vocab["<pad>"]] * (sequence_length - len(tokens))
    return torch.tensor(token_ids, dtype=torch.long)

# Vectorize the samples
vectorized_sample1 = vectorize(sample1, vocab, sequence_length)
vectorized_sample2 = vectorize(sample2, vocab, sequence_length)

print("Vectorized Sample 1:", vectorized_sample1)
print("Vectorized Sample 2:", vectorized_sample2)

Vectorized Sample 1: tensor([0, 4, 7, 2, 1])
Vectorized Sample 2: tensor([2, 6, 3, 5, 0])


## Way 2

In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequence_length = 5

sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'
data = [sample1, sample2]

In [25]:
tokens = tokenizer('we are learning ai')
print(tokens) 

{'input_ids': [101, 2057, 2024, 4083, 9932, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [2]:
tokens =  sample1.lower().split()
tokenizer.convert_tokens_to_ids(tokens)

[2057, 2024, 4083, 9932]

In [27]:
len(tokenizer)

30522

In [2]:
tokenizer.pad_token_id

0

## Dataset

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequence_length = 5

sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'
sentences = [sample1, sample2]
labels = [0, 1]
print(sentences)

['We are learning AI', 'AI is a CS topic']


In [6]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        super().__init__()
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        sentence_label = self.labels[idx]

        tokens = sentence.split()
        tokens = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens = self.pad_and_truncate(tokens)

        return tokens, sentence_label
    
    def pad_and_truncate(self, tokens): 
        pad_id = self.tokenizer.pad_token_id
        if len(tokens) < self.max_len:
            padded_tokens = tokens + [pad_id] * (self.max_len - len(tokens))            
        else:
            padded_tokens = tokens[:self.max_len]
            
        return torch.tensor(padded_tokens)

In [9]:
from torch.utils.data import DataLoader

train_set = MyDataset(sentences, labels, tokenizer, sequence_length)

batch_size = 1
train_loader = DataLoader(train_set, batch_size)

In [10]:
for x,y in train_loader:
    print(x)
    print(y)

tensor([[ 100, 2024, 4083,  100,    0]])
tensor([0])
tensor([[ 100, 2003, 1037,  100, 8476]])
tensor([1])


## Embedding

In [None]:
import torch.nn as nn

embedding = nn.Embedding(vocab_size, 4)
print(embedding)

In [None]:
print(embedding.weight)

In [None]:
input1 = torch.tensor([0, 3, 6, 1, 0], dtype=torch.long)
output1 = embedding(input1)
print(output1)