In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

# Math
import math

# Huggingface libraries
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# Pathlib
from pathlib import Path

# typing
from typing import Any

# Tqdm
from tqdm import tqdm

# Importing library of warnings
import warnings

In [104]:
# nn.Embedding takes two arguments: num_embeddings and embedding_dim.
# Think of num_embeddings as the vocabulary size and embedding_dim as the length of each word vector.
# The layer behaves like a dictionary: give it an index and it returns a word vector.
#
# Example: for the sentence "I love you" the vocabulary size is 3 and the words can be encoded as [0, 1, 2].
# The Embedding layer looks up a vector for every index: "I" has index 0, so row 0 of the layer's
# weight represents "I". Doing this for all three words yields three word vectors, which together
# form the embedded representation of the whole sentence.
#
# The layer's parameter is a 2-D matrix that acts as this dictionary of word vectors:
# it has num_embeddings rows (the vocabulary size) and embedding_dim columns.
#
# Because the layer simply replaces each element of the input x with its word vector,
# x may have any number of dimensions.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
embedding(torch.tensor([[[1, 2]]])), embedding.weight

(tensor([[[[-0.4378, -0.0305,  0.5566],
           [ 1.6604,  0.7059, -1.0342]]]], grad_fn=<EmbeddingBackward0>),
 Parameter containing:
 tensor([[ 0.9677, -2.3553,  1.9556],
         [-0.4378, -0.0305,  0.5566],
         [ 1.6604,  0.7059, -1.0342],
         [-0.2311, -1.0571, -1.2353],
         [ 0.9896, -0.3503, -0.2724],
         [ 1.2129, -0.7979,  0.4722],
         [ 2.0810,  0.9564,  0.1199],
         [-0.6105, -0.2161,  0.8669],
         [ 0.6462, -0.7145,  0.0053],
         [-1.9800, -0.0670, -0.9011]], requires_grad=True))

In [None]:
# Creating Input Embeddings
# Transform the input into a sequence of embeddings.
# Input example: vocabulary: [I, love, you]; input: "I love you" -> [I, love, you] -> [0, 1, 2] -> [[1, 0, 0], [0, 1, 0], [0, 0, 1]] (one-hot), dimension: (n, vocab_size)
# The input embedding maps each token from the vocabulary dimension to the desired model dimension: vocab_size (num_embeddings) -> d_model (embedding_dim)
class InputEmbeddings(nn.Module):
    """Token-id to vector lookup whose output is scaled by sqrt(d_model).

    Attributes:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table.
        embedding: The underlying ``nn.Embedding(vocab_size, d_model)`` layer.
    """

    def __init__(self, d_model: int, vocab_size: int):
        """Store the sizes and create the embedding table.

        Args:
            d_model: Dimension of every embedding vector.
            vocab_size: Number of distinct token ids the table can look up.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed ``x`` and multiply the result by ``sqrt(d_model)``.

        Args:
            x: Tensor of token indices accepted by ``nn.Embedding``.

        Returns:
            The looked-up embeddings (shape of ``x`` plus a trailing
            ``d_model`` axis), multiplied by ``math.sqrt(self.d_model)``.
        """
        # NOTE(review): the sqrt(d_model) factor presumably follows the
        # Transformer paper's embedding scaling — confirm against the model spec.
        scale = math.sqrt(self.d_model)
        embedded = self.embedding(x)
        return embedded * scale