## Vectorization

In [None]:
from torchtext.data.utils import get_tokenizer

# Two example sentences to tokenize.
sample1, sample2 = 'We are learning AI', 'AI is a CS topic'

# Build the 'basic_english' tokenizer and split each sentence into tokens.
tokenizer = get_tokenizer('basic_english')
sample1_tokens, sample2_tokens = map(tokenizer, (sample1, sample2))

# Show one token list per line.
print(sample1_tokens, sample2_tokens, sep='\n')

In [None]:
# Problem: this cell does not build a word-level vocabulary.
#  - `data` holds raw strings and no tokenizer is applied (get_tokenizer is
#    imported but never called). Iterating a Python string yields single
#    characters, so the vocabulary would presumably be built from characters
#    rather than words.
#  - The literal `...` puts an Ellipsis object into the list; it looks like a
#    placeholder for further samples -- TODO confirm and remove before running.
#  - `vocab_size` is assigned but never passed to build_vocab_from_iterator.
#  - No special tokens are registered, yet "<unk>" is looked up on the last line.
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'
data = [sample1, sample2, ...]

# Create vocabulary
vocab_size = 8
vocab = build_vocab_from_iterator(data)
vocab.set_default_index(vocab["<unk>"])

In [None]:
# Problem: `max_tokens` and `specials` are now supplied, so "<unk>" and "<pad>"
# are requested as vocabulary entries and the size limit is passed along.
# However, `data` still holds raw, untokenized strings (get_tokenizer is
# imported but never called), and the literal `...` still places an Ellipsis
# object in the list -- presumably a placeholder for more samples; TODO confirm.
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'
data = [sample1, sample2, ...]

# Create vocabulary
vocab_size = 8
vocab = build_vocab_from_iterator(data,
                                  max_tokens=vocab_size,
                                  specials=["<unk>", 
                                            "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Example sentences, collected into the corpus the vocabulary is built from.
sample1, sample2 = 'We are learning AI', 'AI is a CS topic'
data = [sample1, sample2]

# Tokenizer used by yield_tokens to split each text into a list of tokens.
tokenizer = get_tokenizer('basic_english')
def yield_tokens(examples):
    """Lazily yield the token list of each text in `examples`.

    Every text is passed to the module-level `tokenizer`; one result is
    produced per input text, in input order.
    """
    yield from map(tokenizer, examples)

# Build the vocabulary from the tokenized corpus, limited to vocab_size
# entries, with "<unk>" and "<pad>" registered as special tokens.
vocab_size = 8
vocab = build_vocab_from_iterator(
    yield_tokens(data),
    specials=["<unk>", "<pad>"],
    max_tokens=vocab_size,
)

# Look-ups of tokens that are not in the vocabulary fall back to "<unk>".
vocab.set_default_index(vocab["<unk>"])

In [3]:
# Display the vocabulary's token -> index mapping.
vocab.get_stoi()

{'<unk>': 0,
 '<pad>': 1,
 'ai': 2,
 'a': 3,
 'is': 6,
 'are': 4,
 'learning': 7,
 'cs': 5}

In [4]:
# Split the first sample into tokens and show them.
tokens = tokenizer(sample1)
print(tokens)

# Numericalize: replace every token by its vocabulary index.
sample1_tokens = [vocab[tok] for tok in tokens]
print(sample1_tokens)

['we', 'are', 'learning', 'ai']
[0, 4, 7, 2]


In [5]:
# Split the second sample into tokens and show them.
tokens = tokenizer(sample2)
print(tokens)

# Numericalize: replace every token by its vocabulary index.
sample2_tokens = [vocab[tok] for tok in tokens]
print(sample2_tokens)

['ai', 'is', 'a', 'cs', 'topic']
[2, 6, 3, 5, 0]


In [16]:
# Problem: this version pads short inputs but never truncates long ones, so a
# text with more than `sequence_length` tokens produces a longer tensor.
import torch

# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length):
    """Convert `text` into a 1-D tensor of vocabulary indices.

    The text is split with the module-level `tokenizer` and each token is
    mapped through `vocab[token]`. Inputs shorter than `sequence_length` are
    right-padded with the "<pad>" index; longer inputs are returned at their
    full length (no truncation happens here).

    Args:
        text: String to vectorize.
        vocab: Object supporting `vocab[token]`; must contain "<pad>".
        sequence_length: Minimum length of the returned tensor.

    Returns:
        torch.Tensor of dtype torch.long.
    """
    token_ids = [vocab[tok] for tok in tokenizer(text)]
    # A non-positive repeat count yields an empty list, i.e. no padding.
    padding = [vocab["<pad>"]] * (sequence_length - len(token_ids))
    return torch.tensor(token_ids + padding, dtype=torch.long)

# Turn both samples into index tensors using a shared target length.
sequence_length = 5
vectorized_sample1, vectorized_sample2 = (
    vectorize(sample, vocab, sequence_length)
    for sample in (sample1, sample2)
)

print("Vectorized Sample 1:", vectorized_sample1)
print("Vectorized Sample 2:", vectorized_sample2)

Vectorized Sample 1: tensor([0, 4, 7, 2, 1])
Vectorized Sample 2: tensor([2, 6, 3, 5, 0])


In [17]:
# A sentence with more tokens than sequence_length, to show what vectorize
# returns for over-long input.
sample3 = 'AI topic in CS is difficult'
vectorized_sample3 = vectorize(sample3, vocab, sequence_length)
print(vectorized_sample3)

tensor([2, 0, 0, 5, 6, 0])


In [18]:
import torch

def vectorize(text, vocab, seq_len):
    """Convert `text` into a fixed-length 1-D tensor of vocabulary indices.

    The text is split with the module-level `tokenizer`, each token is mapped
    through `vocab[token]`, and the result is truncated or right-padded with
    the "<pad>" index so that it has exactly `seq_len` entries.

    Args:
        text: String to vectorize.
        vocab: Object supporting `vocab[token]`; must contain "<pad>".
        seq_len: Length of the returned tensor; must be >= 0.

    Returns:
        torch.Tensor of dtype torch.long and length `seq_len`.

    Raises:
        ValueError: If `seq_len` is negative.
    """
    if seq_len < 0:
        # A negative slice bound would silently drop tokens from the end.
        raise ValueError(f"seq_len must be non-negative, got {seq_len}")

    # Use the seq_len argument (not a notebook-level global) so the function
    # gives the same result regardless of which cells ran before it.
    token_ids = [vocab[token] for token in tokenizer(text)][:seq_len]

    # After truncation the pad count is never negative; zero means no padding.
    num_pads = seq_len - len(token_ids)
    token_ids = token_ids + [vocab["<pad>"]] * num_pads

    return torch.tensor(token_ids, dtype=torch.long)

# Turn both samples into index tensors using a shared target length.
sequence_length = 5
vectorized_sample1, vectorized_sample2 = (
    vectorize(sample, vocab, sequence_length)
    for sample in (sample1, sample2)
)

print("Vectorized Sample 1:", vectorized_sample1)
print("Vectorized Sample 2:", vectorized_sample2)

Vectorized Sample 1: tensor([0, 4, 7, 2, 1])
Vectorized Sample 2: tensor([2, 6, 3, 5, 0])


In [19]:
# A sentence with more tokens than sequence_length, to check how vectorize
# handles over-long input.
sample3 = 'AI topic in CS is difficult'
vectorized_sample3 = vectorize(sample3, vocab, sequence_length)
print(vectorized_sample3)

tensor([2, 0, 0, 5, 6])


## Embedding

In [None]:
import torch
import torch.nn as nn

# Embedding table with one embed_dim-sized row per vocabulary index.
vocab_size, embed_dim = 8, 4
embedding = nn.Embedding(vocab_size, embed_dim)

# Fixed weights (vocab_size x embed_dim) that replace the layer's initial
# values, so the printed numbers are the same on every run.
custom_weights = torch.tensor(
    [
        [-0.1882, 0.5530, 1.6267, 0.7013],
        [1.7840, -0.8278, -0.2701, 1.3586],
        [1.0281, -1.9094, 0.3182, 0.4211],
        [-1.3083, -0.0987, 0.7647, -0.3680],
        [0.2293, 1.3255, 0.1318, 2.0501],
        [0.4058, -0.6624, -0.8745, 0.7203],
        [0.5582, 0.0786, -0.6817, 0.6902],
        [0.4309, -1.3067, -0.8823, 1.5977],
    ],
    dtype=torch.float32,
)
embedding.weight = nn.Parameter(custom_weights)
print(embedding.weight)

## Update

In [3]:
import torch
import torch.nn as nn

# Embedding table with one embed_dim-sized row per vocabulary index.
vocab_size, embed_dim = 8, 4
embedding = nn.Embedding(vocab_size, embed_dim)

# Fixed starting weights (vocab_size x embed_dim); they are printed here so
# they can be compared with the values shown after the optimizer step.
custom_weights = torch.tensor(
    [
        [-0.1882, 0.5530, 1.6267, 0.7013],
        [1.7840, -0.8278, -0.2701, 1.3586],
        [1.0281, -1.9094, 0.3182, 0.4211],
        [-1.3083, -0.0987, 0.7647, -0.3680],
        [0.2293, 1.3255, 0.1318, 2.0501],
        [0.4058, -0.6624, -0.8745, 0.7203],
        [0.5582, 0.0786, -0.6817, 0.6902],
        [0.4309, -1.3067, -0.8823, 1.5977],
    ],
    dtype=torch.float32,
)
embedding.weight = nn.Parameter(custom_weights)
print(embedding.weight)

Parameter containing:
tensor([[-0.1882,  0.5530,  1.6267,  0.7013],
        [ 1.7840, -0.8278, -0.2701,  1.3586],
        [ 1.0281, -1.9094,  0.3182,  0.4211],
        [-1.3083, -0.0987,  0.7647, -0.3680],
        [ 0.2293,  1.3255,  0.1318,  2.0501],
        [ 0.4058, -0.6624, -0.8745,  0.7203],
        [ 0.5582,  0.0786, -0.6817,  0.6902],
        [ 0.4309, -1.3067, -0.8823,  1.5977]], requires_grad=True)


In [4]:
# One training example: five token indices and a binary target.
data = torch.tensor([0, 4, 7, 2, 1], dtype=torch.long)
label = torch.tensor([1], dtype=torch.float)

# Forward pass: embed the five indices, flatten the result into a single
# vector of 20 values, and project it to one value with a newly created
# linear layer.
x = nn.Linear(20, 1)(
    nn.Flatten(0)(embedding(data))
)

# Squash the linear output into (0, 1) so it can be fed to BCELoss.
output = nn.Sigmoid()(x)

In [6]:
# Binary cross-entropy between the sigmoid output and the target label.
loss = nn.BCELoss()(output, label)

# Only the embedding's parameters are handed to the optimizer, so the
# embedding table is the only thing this step updates.
optimizer = torch.optim.Adam(embedding.parameters())

# Clear any gradient left on the embedding by an earlier backward pass (for
# example when the forward and update cells are run again). Without this,
# backward() would add the new gradient to the stale one and distort the step.
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Rows whose indices appeared in the input have moved; the rest are unchanged.
print(embedding.weight)

Parameter containing:
tensor([[-0.1892,  0.5540,  1.6257,  0.7023],
        [ 1.7850, -0.8288, -0.2691,  1.3596],
        [ 1.0271, -1.9104,  0.3192,  0.4201],
        [-1.3083, -0.0987,  0.7647, -0.3680],
        [ 0.2283,  1.3265,  0.1308,  2.0491],
        [ 0.4058, -0.6624, -0.8745,  0.7203],
        [ 0.5582,  0.0786, -0.6817,  0.6902],
        [ 0.4319, -1.3057, -0.8833,  1.5987]], requires_grad=True)
