In [1]:
import torch
import torch.nn as nn

class TokenEmbedding(nn.Module):
    """Lookup table that turns integer token ids into dense vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Length of each embedding vector.
    """

    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)

    def forward(self, x):
        """Return the table rows selected by the indices in ``x``."""
        vectors = self.embedding(x)
        return vectors

# Demo: embed a random batch of token ids
vocab_size = 10000  # how many distinct tokens the table holds
embed_size = 512    # width of every embedding vector
embedding_layer = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
# 2 sequences of 5 ids each, drawn from [0, vocab_size)
sample_input = torch.randint(low=0, high=vocab_size, size=(2, 5))
print(embedding_layer(sample_input))

tensor([[[ 1.2756e+00, -2.8853e-01, -1.6341e-02,  ..., -1.3297e+00,
           3.1749e-01,  1.3849e-01],
         [-9.1959e-01,  4.3836e-01, -6.2098e-01,  ..., -3.0692e-01,
           1.2537e+00,  5.2725e-04],
         [ 1.7660e-01,  2.3661e+00, -2.1267e-01,  ..., -1.5117e+00,
          -6.5065e-01, -1.4829e-01],
         [ 6.8536e-01,  8.3798e-02, -4.5726e-01,  ...,  7.4870e-01,
           1.2463e+00,  1.0455e+00],
         [-2.7685e-01,  1.2896e+00, -1.4539e+00,  ..., -1.2494e+00,
          -1.1990e-01,  9.8863e-01]],

        [[-1.2678e+00, -3.3537e-01,  1.0391e+00,  ...,  6.9800e-01,
          -7.8852e-01, -5.4905e-01],
         [ 1.0651e+00,  9.1889e-01,  1.5173e+00,  ...,  1.5635e+00,
          -3.1303e-01,  6.6971e-04],
         [-3.5550e-02, -3.0058e-01, -5.2690e-01,  ...,  4.9994e-01,
           6.1160e-01,  5.0224e-01],
         [-5.8075e-01,  6.5676e-01, -1.4737e-01,  ...,  2.6394e-01,
           9.6445e-01, -5.8891e-01],
         [ 1.6721e-01, -3.4282e-01, -1.0198e+00,  ...

In [2]:
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_len, embed_size)`` is precomputed once. Even
    columns hold ``sin(pos * freq)`` and odd columns hold ``cos(pos * freq)``,
    where ``freq = exp(-ln(10000) * 2i / embed_size)`` for column pair ``i``.

    Args:
        embed_size: Size of the last dimension of the tensors passed to
            ``forward``. Odd values are supported.
        max_len: Number of positions stored in the table.
    """

    def __init__(self, embed_size, max_len=5000):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(
            torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size)
        )
        angles = position * div_term  # one column per (sin, cos) pair

        positional_encoding = torch.zeros(max_len, embed_size)
        positional_encoding[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer odd column than even
        # columns, so the cosine term is trimmed to fit.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : embed_size // 2])

        # Registered as a buffer so the table follows .to()/.cuda()/.half()
        # with the module; non-persistent so state_dict() keys stay unchanged.
        self.register_buffer(
            "positional_encoding", positional_encoding.unsqueeze(0), persistent=False
        )

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, seq_len, embed_size)``; ``seq_len`` must
        not exceed ``max_len``.
        """
        return x + self.positional_encoding[:, : x.size(1), :]

# Demo: add position information to random embeddings
pos_encoding = PositionalEncoding(embed_size=embed_size)
sample_embed = torch.randn((2, 5, embed_size))  # 2 sequences, 5 positions each
print(pos_encoding(sample_embed))

tensor([[[-1.2256, -0.3302, -1.1003,  ...,  1.1093, -1.5659,  1.1703],
         [ 0.7098,  0.3457,  0.2656,  ...,  1.7193,  0.3305,  1.4212],
         [ 1.6793, -1.4432,  1.2259,  ...,  0.6950, -0.5887,  1.2472],
         [ 0.5869, -1.2814,  0.8982,  ...,  1.9058,  0.3580, -0.2717],
         [-1.5831, -1.2846, -0.5073,  ...,  1.3527,  0.7306,  1.6261]],

        [[-0.3170,  2.2296,  0.2574,  ...,  2.4787, -0.2724,  0.6551],
         [ 0.0692,  1.2288,  1.0194,  ...,  0.3034, -1.3004, -0.4429],
         [ 0.1045, -0.5705,  0.9179,  ...,  1.5701, -1.6499,  1.2531],
         [ 0.5472, -1.5761, -1.8515,  ...,  0.2490,  0.4323,  0.5445],
         [-0.6237,  0.0706,  0.0886,  ...,  2.1376,  0.1846,  1.4615]]])


In [3]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Inputs are projected by learned linear layers, split into ``heads``
    chunks of size ``head_dim = embed_size // heads``, attended per head,
    concatenated, and passed through a final linear layer.

    Args:
        embed_size: Feature size of the inputs and of the output.
        heads: Number of attention heads; must divide ``embed_size``.

    Raises:
        AssertionError: If ``embed_size`` is not divisible by ``heads``.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert self.head_dim * heads == embed_size, "Embed size must be divisible by heads"

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, queries, mask):
        """Attend from ``queries`` to ``keys`` and mix ``values`` accordingly.

        Args:
            values: Tensor indexed as ``(N, value_len, embed_size)``.
            keys: Tensor indexed as ``(N, key_len, embed_size)``;
                ``key_len`` must equal ``value_len``.
            queries: Tensor indexed as ``(N, query_len, embed_size)``.
            mask: ``None``, or a tensor broadcastable to
                ``(N, heads, query_len, key_len)``. Positions where
                ``mask == 0`` receive (numerically) zero attention weight.

        Returns:
            Tensor of shape ``(N, query_len, embed_size)``.
        """
        N = queries.shape[0]  # Batch size
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Split embeddings into multiple heads: (N, len, heads, head_dim).
        # The layout is deliberately NOT transposed: the einsum subscripts
        # below expect the sequence axis before the head axis.
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        # Per-head query/key dot products -> (N, heads, query_len, key_len)
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Scale by sqrt of the per-head dimension (the length of the vectors
        # that were actually dotted), then normalize over the key axis.
        attention = torch.softmax(energy / (self.head_dim ** 0.5), dim=3)

        # Weighted sum of values -> (N, query_len, heads, head_dim), then
        # concatenate the heads back into embed_size features.
        out = torch.einsum("nhql,nlhd->nqhd", attention, values)
        out = out.reshape(N, query_len, self.embed_size)
        return self.fc_out(out)

# Demo: run attention over independent random values / keys / queries
self_attention = SelfAttention(embed_size=embed_size, heads=8)
sample_values = torch.randn((2, 5, embed_size))
sample_keys = torch.randn((2, 5, embed_size))
sample_queries = torch.randn((2, 5, embed_size))
print(self_attention(values=sample_values, keys=sample_keys, queries=sample_queries, mask=None))

tensor([[[-0.2051, -0.0942, -0.0996,  ..., -0.0270,  0.0003,  0.0634],
         [-0.1861, -0.0685, -0.1287,  ...,  0.0260, -0.0940, -0.0952],
         [ 0.0826,  0.0721, -0.0073,  ..., -0.0900, -0.1284, -0.0448],
         [-0.0518,  0.1684,  0.1346,  ..., -0.1092,  0.0984,  0.0716],
         [ 0.2453,  0.0283,  0.0512,  ...,  0.0241, -0.0437, -0.1226]],

        [[ 0.0008,  0.0235, -0.1502,  ...,  0.0541,  0.0904,  0.1190],
         [ 0.0168,  0.2034, -0.1096,  ...,  0.1638, -0.1925, -0.1107],
         [ 0.0004,  0.0211, -0.1277,  ..., -0.2120,  0.0617, -0.0202],
         [ 0.2216, -0.0952, -0.0638,  ...,  0.0396, -0.1255, -0.2428],
         [-0.0822, -0.0051,  0.1309,  ...,  0.0268,  0.0958, -0.1995]]],
       grad_fn=<ViewBackward0>)


In [4]:
class FeedForward(nn.Module):
    """Two linear layers with a ReLU in between, applied to the last dimension.

    Args:
        embed_size: Input and output feature size.
        hidden_dim: Feature size between the two linear layers.
    """

    def __init__(self, embed_size, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=embed_size, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=embed_size)

    def forward(self, x):
        """Expand to ``hidden_dim``, apply ReLU, then project back to ``embed_size``."""
        hidden = self.fc1(x)
        activated = torch.relu(hidden)
        return self.fc2(activated)

# Demo: push random activations through the feed-forward block
ffn = FeedForward(embed_size=embed_size, hidden_dim=2048)
sample_input = torch.randn((2, 5, embed_size))
print(ffn(sample_input))

tensor([[[-0.1399, -0.1748,  0.2533,  ...,  0.1207, -0.1825, -0.3562],
         [ 0.0389, -0.1497,  0.1076,  ...,  0.0903, -0.1643, -0.1957],
         [-0.3438, -0.5250,  0.3384,  ..., -0.0641,  0.0429, -0.4472],
         [-0.1850, -0.1658,  0.0186,  ...,  0.2102,  0.1028, -0.2599],
         [-0.3367,  0.1386,  0.2086,  ..., -0.0724,  0.2466, -0.6518]],

        [[-0.0943, -0.1365,  0.1974,  ...,  0.2559, -0.0492, -0.1265],
         [-0.3728,  0.0049,  0.2150,  ..., -0.0993,  0.1318,  0.0046],
         [-0.1181, -0.2412, -0.1956,  ..., -0.0796, -0.0455, -0.1000],
         [ 0.0208,  0.0765,  0.3855,  ...,  0.0871,  0.1456,  0.1169],
         [-0.0728, -0.0939, -0.0690,  ...,  0.2313, -0.0984, -0.0998]]],
       grad_fn=<ViewBackward0>)


In [5]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Computes ``gamma * (x - mean) / sqrt(var + eps) + beta``, where ``mean``
    and ``var`` are taken over the last dimension of ``x`` and ``var`` is the
    population (biased) variance.

    Args:
        embed_size: Size of the last dimension of the inputs.
        eps: Constant added to the variance for numerical stability.
    """

    def __init__(self, embed_size, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(embed_size))
        self.beta = nn.Parameter(torch.zeros(embed_size))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # Biased variance (divide by n, not n - 1): this is the layer-norm
        # definition, and it stays finite when the last dimension has size 1,
        # where the Bessel-corrected estimate would be NaN.
        var = x.var(dim=-1, unbiased=False, keepdim=True)
        # eps goes inside the square root so the denominator is bounded below
        # by sqrt(eps) even for constant inputs.
        return self.gamma * (x - mean) / torch.sqrt(var + self.eps) + self.beta

# Demo: normalize random activations along the feature dimension
layer_norm = LayerNorm(embed_size=embed_size)
sample_input = torch.randn((2, 5, embed_size))
print(layer_norm(sample_input))

tensor([[[-1.5051,  0.6230,  0.2745,  ...,  1.9000, -0.4936,  1.0943],
         [-0.8244, -0.1761, -0.4022,  ...,  1.4115, -0.4973,  0.1845],
         [-0.8476,  0.0574,  1.6289,  ...,  1.1036,  0.8037, -0.0115],
         [ 1.4055, -2.2716, -0.7452,  ...,  2.5477, -0.0618,  0.9770],
         [ 2.9755, -0.2280, -0.0464,  ..., -0.6853,  0.3125,  0.9810]],

        [[-0.7845, -1.3037, -0.4398,  ..., -0.6197,  0.9313,  1.1225],
         [-0.9677,  0.6140,  0.5200,  ..., -0.5276,  0.9966,  0.4008],
         [ 0.7556,  1.0178, -1.4728,  ..., -1.1096,  0.9708,  0.3151],
         [-0.7562,  1.7414, -1.3928,  ..., -0.2798, -0.0870, -1.3427],
         [-2.6141, -2.7820, -0.4479,  ...,  0.4673,  0.9002, -0.4669]]],
       grad_fn=<AddBackward0>)


In [6]:
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    After each sub-layer the result is added to that sub-layer's input,
    normalized, and then passed through dropout.

    Args:
        embed_size: Feature size handed to every sub-module.
        heads: Number of attention heads.
        hidden_dim: Inner size of the feed-forward sub-layer.
        dropout: Drop probability used after each normalization.
    """

    def __init__(self, embed_size, heads, hidden_dim, dropout):
        super().__init__()
        self.self_attention = SelfAttention(embed_size, heads)
        self.norm1 = LayerNorm(embed_size)
        self.norm2 = LayerNorm(embed_size)
        self.feed_forward = FeedForward(embed_size, hidden_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, value, key, query, mask):
        """Run attention and feed-forward, each with a residual connection."""
        # Attention sub-layer; the residual is taken from the query input.
        attended = self.self_attention(value, key, query, mask)
        attn_residual = attended + query
        x = self.dropout(self.norm1(attn_residual))

        # Feed-forward sub-layer; the residual is the attention stage's output.
        ff_out = self.feed_forward(x)
        ff_residual = ff_out + x
        return self.dropout(self.norm2(ff_residual))

# Demo: self-attention block where value, key and query are the same tensor
transformer_block = TransformerBlock(embed_size=embed_size, heads=8, hidden_dim=2048, dropout=0.1)
sample_values = torch.randn((2, 5, embed_size))
print(transformer_block(value=sample_values, key=sample_values, query=sample_values, mask=None))

tensor([[[-1.9844,  0.9456,  0.3492,  ..., -1.3219, -1.4275, -0.5619],
         [ 1.3929, -0.6650, -2.1925,  ..., -0.3971,  0.1844, -0.1936],
         [ 0.8262,  1.0770,  0.0754,  ...,  0.3134, -0.8512,  0.4292],
         [-0.0914,  1.7182,  0.0000,  ..., -0.3007,  0.3823, -0.1301],
         [ 0.1395,  0.0112, -0.1073,  ...,  0.4543,  0.7772, -0.3475]],

        [[-0.8234,  0.4540, -1.1789,  ...,  1.5506,  1.1937, -0.8939],
         [ 0.0000, -0.6291,  0.6088,  ...,  1.3344, -0.6786, -3.1985],
         [-0.4257,  0.4937,  0.6166,  ..., -0.1715,  0.8155, -0.0806],
         [-0.9355, -0.8333,  0.0000,  ...,  0.0000, -2.8921,  1.9766],
         [ 0.1750,  0.5049,  2.5375,  ..., -0.6904, -0.2660,  1.2031]]],
       grad_fn=<MulBackward0>)


## Playing around with the `torch.nn` module

In [7]:
# A 3x4 matrix of standard-normal samples
x = torch.randn((3, 4))
print(x)

tensor([[-1.2724, -1.1546,  0.9854, -0.4189],
        [ 0.2406, -0.0393,  0.1152,  0.4656],
        [-0.2870,  0.7521,  0.0750,  0.0653]])


In [8]:
# Affine map from 4 input features to 2 output features
layer = nn.Linear(4, 2)
x = torch.randn((3, 4))  # 3 rows, 4 features per row
output = layer(x)
print("Linear Output:\n", output)

Linear Output:
 tensor([[-0.6080, -0.3494],
        [-0.6235, -0.5291],
        [-0.2171,  0.2979]], grad_fn=<AddmmBackward0>)


In [9]:
# Element-wise activations applied to a negative, a zero and a positive input
relu, sigmoid = nn.ReLU(), nn.Sigmoid()

x = torch.tensor([-1.0, 0.0, 1.0])
print("ReLU:", relu(x))
print("Sigmoid:", sigmoid(x))

ReLU: tensor([0., 0., 1.])
Sigmoid: tensor([0.2689, 0.5000, 0.7311])


In [10]:
# Mean squared error: ((0.8 - 1.0)**2 + (0.4 - 0.0)**2) / 2 = 0.1
criterion = nn.MSELoss()
pred = torch.tensor([0.8, 0.4], requires_grad=True)
target = torch.tensor([1.0, 0.0])
loss = criterion(input=pred, target=target)
print("MSE Loss:", loss.item())

MSE Loss: 0.10000000149011612


In [11]:
# Small MLP (4 -> 8 -> 2) assembled from stock layers
model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))

x = torch.randn((1, 4))  # a single 4-feature sample
output = model(x)
print("Sequential Model Output:", output)

Sequential Model Output: tensor([[ 0.3174, -0.5773]], grad_fn=<AddmmBackward0>)


In [12]:
class MyNet(nn.Module):
    """Small MLP: Linear(4 -> 8), ReLU, Linear(8 -> 2)."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=4, out_features=8)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=8, out_features=2)

    def forward(self, x):
        """Apply the first layer and ReLU, then the output layer."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Demo: forward a batch of 2 random samples through the custom module
net = MyNet()
x = torch.randn((2, 4))
print("Custom Net Output:\n", net(x))

Custom Net Output:
 tensor([[-0.1140,  0.3842],
        [ 0.1333,  0.5911]], grad_fn=<AddmmBackward0>)
