## Dataset

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequence_length = 5

sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'
sentences = [sample1, sample2]
labels = [0, 1]
print(sentences)

['We are learning AI', 'AI is a CS topic']


In [6]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        super().__init__()
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        sentence_label = self.labels[idx]

        tokens = sentence.split()
        tokens = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens = self.pad_and_truncate(tokens)

        return tokens, sentence_label
    
    def pad_and_truncate(self, tokens): 
        pad_id = self.tokenizer.pad_token_id
        if len(tokens) < self.max_len:
            padded_tokens = tokens + [pad_id] * (self.max_len - len(tokens))            
        else:
            padded_tokens = tokens[:self.max_len]
            
        return torch.tensor(padded_tokens)

In [9]:
from torch.utils.data import DataLoader

train_set = MyDataset(sentences, labels, tokenizer, sequence_length)

batch_size = 1
train_loader = DataLoader(train_set, batch_size)

In [10]:
for x,y in train_loader:
    print(x)
    print(y)

tensor([[ 100, 2024, 4083,  100,    0]])
tensor([0])
tensor([[ 100, 2003, 1037,  100, 8476]])
tensor([1])


## Embedding

In [None]:
import torch.nn as nn

embedding = nn.Embedding(vocab_size, 4)
print(embedding)

In [None]:
print(embedding.weight)

In [None]:
input1 = torch.tensor([0, 3, 6, 1, 0], dtype=torch.long)
output1 = embedding(input1)
print(output1)