## Import Required Libraries

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
from torch.utils.data.dataset import random_split


import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import AG_NEWS
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer

import warnings
def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Silence warnings for the whole notebook: ignore everything that goes through
# the filter machinery and swap ``warnings.warn`` itself for the no-op above.
warnings.filterwarnings("ignore")
warnings.warn = warn

In [2]:
torch.__version__,torchtext.__version__


('2.2.2', '0.17.2')

In [4]:
train_iter, test_iter = AG_NEWS()

## Define Helper Functions

In [7]:
def plot_loss_acc(LOSS,ACC):
    """Plot a loss curve and an accuracy curve on one figure with twin y-axes.

    The loss is drawn against the left y-axis in red and the accuracy against
    the right y-axis in blue; both share the x-axis labeled 'epochs'.

    Args:
        LOSS: Sequence of loss values (presumably one per epoch).
        ACC: Sequence of accuracy values (presumably one per epoch).
    """
    fig, ax1 = plt.subplots()

    # Left axis: loss. The color is passed to plot() as well, otherwise the
    # curve uses the default color cycle and does not match its red labels.
    color = "tab:red"
    ax1.plot(LOSS, color=color)
    ax1.set_xlabel('epochs', color=color)
    ax1.set_ylabel('Cost', color=color)
    ax1.tick_params(axis='y', colors=color)

    # Right axis: accuracy, sharing the x-axis with the loss curve.
    ax2 = ax1.twinx()
    color = 'tab:blue'
    ax2.plot(ACC, color=color)
    ax2.set_ylabel('accuracy', color=color)
    ax2.tick_params(axis='y', colors=color)

    # One layout pass after everything is drawn is enough.
    fig.tight_layout()

    plt.show()

In [None]:
def plot_embeddings(my_embeddings,name,vocab):
    """Scatter-plot embedding vectors on 3-D axes.

    Args:
        my_embeddings: 2-D array-like indexed as ``[:, k]``. Columns 0, 1 and 2
            are used as the x, y and z coordinates; if only two columns are
            present the points are drawn at z = 0.
        name: Currently unused.
        vocab: Currently unused.

    NOTE(review): ``name`` and ``vocab`` are accepted but never read —
    presumably intended for labeling the points; confirm before relying on them.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Plot the data points. The axes are 3-D, so pass the third embedding
    # dimension too instead of silently flattening every point onto z = 0.
    zs = my_embeddings[:, 2] if my_embeddings.shape[1] > 2 else 0
    ax.scatter(my_embeddings[:, 0], my_embeddings[:, 1], zs)

    ax.set_xlabel('dim 0')
    ax.set_ylabel('dim 1')
    ax.set_zlabel('dim 2')

    plt.show()

## Toy Dataset

In [9]:
# Toy corpus of (label, text) pairs; the labels used here are 1, 2 and 3.
dataset = [
    (1,"Introduction to NLP"),
    (2,"Basics of PyTorch"),
    (1,"NLP Techniques for Text Classification"),
    (3,"Named Entity Recognition with PyTorch"),
    (3,"Sentiment Analysis using PyTorch"),
    (3,"Machine Translation with PyTorch"),
    (1," NLP Named Entity,Sentiment Analysis,Machine Translation "),
    (1," Machine Translation with NLP "),
    (1," Named Entity vs Sentiment Analysis  NLP "),
    (3,"he painted the car red"),
    (1,"he painted the red car")
    ]

# torchtext's 'basic_english' tokenizer, shared by the helpers below
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Yield the token list of each text in an iterable of (label, text) pairs."""
    yield from (tokenizer(text) for _, text in data_iter)


# Build the vocabulary from the toy corpus; tokens not in it map to "<unk>"
vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [10]:
def text_pipeline(text):
    """Tokenize ``text`` and map each token to its vocabulary index."""
    tokens = tokenizer(text)
    return vocab(tokens)

def label_pipeline(x):
    """Convert a 1-based label to a 0-based integer index."""
    label = int(x)
    return label - 1

## Zero Padding

In [17]:
# Eight integer sequences of increasing length: [1], [1, 2], ..., [1, ..., 8]
sequences = [torch.arange(1, stop) for stop in range(2, 10)]
sequences

[tensor([1]),
 tensor([1, 2]),
 tensor([1, 2, 3]),
 tensor([1, 2, 3, 4]),
 tensor([1, 2, 3, 4, 5]),
 tensor([1, 2, 3, 4, 5, 6]),
 tensor([1, 2, 3, 4, 5, 6, 7]),
 tensor([1, 2, 3, 4, 5, 6, 7, 8])]

In [18]:
# Right-pad every sequence with zeros so they stack into one (batch, max_len) tensor
from torch.nn.utils.rnn import pad_sequence

padded_sequence = pad_sequence(
    sequences,
    padding_value=0,
    batch_first=True,
)
print(padded_sequence)

tensor([[1, 0, 0, 0, 0, 0, 0, 0],
        [1, 2, 0, 0, 0, 0, 0, 0],
        [1, 2, 3, 0, 0, 0, 0, 0],
        [1, 2, 3, 4, 0, 0, 0, 0],
        [1, 2, 3, 4, 5, 0, 0, 0],
        [1, 2, 3, 4, 5, 6, 0, 0],
        [1, 2, 3, 4, 5, 6, 7, 0],
        [1, 2, 3, 4, 5, 6, 7, 8]])


In [35]:
my_tokens = "he painted the car red he painted the red car"

# Vocabulary indices of the sentence's tokens
my_index = text_pipeline(my_tokens)

# One learnable 3-dimensional vector per vocabulary entry
vocab_size = len(vocab)
embedding_dim = 3
print(vocab_size)

embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

27


In [36]:
embedding.weight

Parameter containing:
tensor([[ 1.3464e+00,  1.4493e-03, -2.5309e-01],
        [-6.0592e-01,  7.6708e-02,  1.0916e+00],
        [ 6.6008e-04,  1.1636e-01,  1.3907e+00],
        [ 7.8528e-01,  2.3689e-01,  7.0954e-01],
        [-1.7754e+00, -6.0924e-01,  6.7999e-02],
        [-1.3510e+00,  1.0454e-01, -1.5773e+00],
        [ 1.0209e+00, -1.8551e-01,  1.2583e+00],
        [ 4.1363e-01, -5.2709e-01,  9.9344e-01],
        [-1.2712e+00, -6.3085e-02,  1.1929e+00],
        [ 4.3014e-01, -5.7862e-01,  9.9738e-01],
        [ 1.9295e+00, -7.6916e-01,  8.3594e-01],
        [-1.0519e+00,  2.0030e-01, -1.5784e+00],
        [-3.0331e-01,  3.1823e-01, -1.5905e-02],
        [ 5.3973e-02, -1.1571e+00, -1.0002e+00],
        [ 5.4749e-01, -8.6171e-01, -1.8179e+00],
        [-5.4248e-01, -2.1521e+00, -1.3743e-01],
        [-8.7264e-01, -1.4662e+00,  3.9409e-01],
        [-3.5706e-01,  7.6975e-01, -8.6329e-01],
        [ 2.1589e+00,  1.7368e+00, -3.9517e-01],
        [-3.3711e-01,  1.9707e-01, -1.1146e+00]

In [37]:
my_index

[12, 13, 15, 11, 14, 12, 13, 15, 14, 11]

In [38]:
# Float column vector holding 0, 1, ..., vocab_size - 1 (one value per row).
# NOTE(review): these rows are later sliced by token position (pe[0:samples, :]),
# yet the range is sized by vocab_size rather than by sequence length — confirm intent.
position = torch.arange(vocab_size, dtype=torch.float32)[:, None]
position

tensor([[ 0.],
        [ 1.],
        [ 2.],
        [ 3.],
        [ 4.],
        [ 5.],
        [ 6.],
        [ 7.],
        [ 8.],
        [ 9.],
        [10.],
        [11.],
        [12.],
        [13.],
        [14.],
        [15.],
        [16.],
        [17.],
        [18.],
        [19.],
        [20.],
        [21.],
        [22.],
        [23.],
        [24.],
        [25.],
        [26.]])

In [39]:
vocab.get_itos()

['<unk>',
 'nlp',
 'pytorch',
 'analysis',
 'entity',
 'machine',
 'named',
 'sentiment',
 'translation',
 'with',
 ',',
 'car',
 'he',
 'painted',
 'red',
 'the',
 'basics',
 'classification',
 'for',
 'introduction',
 'of',
 'recognition',
 'techniques',
 'text',
 'to',
 'using',
 'vs']

In [40]:
# Zero-filled table with one row per `position` entry and d_model columns
d_model = 3
pe = torch.zeros((vocab_size, d_model))

In [41]:
pe

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

In [42]:
# Copy the position column across every feature column. Using d_model instead
# of three hard-coded copies keeps pe's width in step with the model dimension.
pe = position.repeat(1, d_model)
pe

tensor([[ 0.,  0.,  0.],
        [ 1.,  1.,  1.],
        [ 2.,  2.,  2.],
        [ 3.,  3.,  3.],
        [ 4.,  4.,  4.],
        [ 5.,  5.,  5.],
        [ 6.,  6.,  6.],
        [ 7.,  7.,  7.],
        [ 8.,  8.,  8.],
        [ 9.,  9.,  9.],
        [10., 10., 10.],
        [11., 11., 11.],
        [12., 12., 12.],
        [13., 13., 13.],
        [14., 14., 14.],
        [15., 15., 15.],
        [16., 16., 16.],
        [17., 17., 17.],
        [18., 18., 18.],
        [19., 19., 19.],
        [20., 20., 20.],
        [21., 21., 21.],
        [22., 22., 22.],
        [23., 23., 23.],
        [24., 24., 24.],
        [25., 25., 25.],
        [26., 26., 26.]])

In [43]:
# Look up the sentence's vectors outside autograd so they convert straight to NumPy
with torch.no_grad():
    my_embedings = embedding(torch.tensor(my_index)).numpy()


In [44]:
# Unpack the 2-D array shape: number of looked-up tokens and embedding width
samples, dim = my_embedings.shape
samples, dim

(10, 3)

In [45]:
my_embedings

array([[-0.30330598,  0.3182314 , -0.01590507],
       [ 0.05397258, -1.157111  , -1.0001758 ],
       [-0.5424799 , -2.1520617 , -0.13742737],
       [-1.0519241 ,  0.20030454, -1.5783936 ],
       [ 0.54749435, -0.8617089 , -1.8179109 ],
       [-0.30330598,  0.3182314 , -0.01590507],
       [ 0.05397258, -1.157111  , -1.0001758 ],
       [-0.5424799 , -2.1520617 , -0.13742737],
       [ 0.54749435, -0.8617089 , -1.8179109 ],
       [-1.0519241 ,  0.20030454, -1.5783936 ]], dtype=float32)

In [46]:
# Add the first `samples` rows of pe, row by row, to the token embeddings
positional_rows = pe[:samples].numpy()
pos_embding = np.add(my_embedings, positional_rows)
pos_embding

array([[-3.0330598e-01,  3.1823140e-01, -1.5905073e-02],
       [ 1.0539726e+00, -1.5711105e-01, -1.7583370e-04],
       [ 1.4575201e+00, -1.5206170e-01,  1.8625727e+00],
       [ 1.9480759e+00,  3.2003045e+00,  1.4216064e+00],
       [ 4.5474944e+00,  3.1382911e+00,  2.1820891e+00],
       [ 4.6966939e+00,  5.3182316e+00,  4.9840951e+00],
       [ 6.0539727e+00,  4.8428888e+00,  4.9998240e+00],
       [ 6.4575200e+00,  4.8479385e+00,  6.8625727e+00],
       [ 8.5474939e+00,  7.1382914e+00,  6.1820889e+00],
       [ 7.9480758e+00,  9.2003050e+00,  7.4216065e+00]], dtype=float32)

In [47]:
pos_embding[3]

array([1.9480759, 3.2003045, 1.4216064], dtype=float32)

## Encoder Layer

In [48]:
# Same lookup as before, but kept as a torch tensor (not detached / converted
# to NumPy) so it can be fed to the encoder layer below.
my_embdings = embedding(torch.tensor(my_index))

In [49]:
my_embdings

tensor([[-0.3033,  0.3182, -0.0159],
        [ 0.0540, -1.1571, -1.0002],
        [-0.5425, -2.1521, -0.1374],
        [-1.0519,  0.2003, -1.5784],
        [ 0.5475, -0.8617, -1.8179],
        [-0.3033,  0.3182, -0.0159],
        [ 0.0540, -1.1571, -1.0002],
        [-0.5425, -2.1521, -0.1374],
        [ 0.5475, -0.8617, -1.8179],
        [-1.0519,  0.2003, -1.5784]], grad_fn=<EmbeddingBackward0>)

In [51]:
torch.tensor(my_index)

tensor([12, 13, 15, 11, 14, 12, 13, 15, 14, 11])

In [52]:
embedding

Embedding(27, 3)

In [53]:
my_embdings.shape

torch.Size([10, 3])

In [54]:
# Deliberately tiny encoder layer for inspection: 3-d model, a single attention
# head, a 1-unit feed-forward layer, and dropout switched off.
encoder_layer = nn.TransformerEncoderLayer(d_model=3, nhead=1, dim_feedforward=1, dropout=0)

In [55]:
out = encoder_layer(my_embdings)
out

tensor([[-0.3975,  1.3741, -0.9766],
        [ 1.3787, -0.4165, -0.9622],
        [ 0.9005, -1.3946,  0.4942],
        [ 0.1794,  1.1251, -1.3046],
        [ 1.2589, -0.0714, -1.1875],
        [-0.3975,  1.3741, -0.9766],
        [ 1.3787, -0.4165, -0.9622],
        [ 0.9005, -1.3946,  0.4942],
        [ 1.2589, -0.0714, -1.1875],
        [ 0.1794,  1.1251, -1.3046]], grad_fn=<NativeLayerNormBackward0>)

In [58]:
out.mean(dim = 1)

tensor([ 1.9868e-08, -3.9736e-08,  4.9671e-08,  3.9736e-08,  0.0000e+00,
         1.9868e-08, -3.9736e-08,  4.9671e-08,  0.0000e+00,  0.0000e+00],
       grad_fn=<MeanBackward1>)

In [59]:
params_dict = encoder_layer.state_dict()

# List every entry of the encoder layer's state dict with its shape
for param_name in params_dict:
    print(param_name, params_dict[param_name].shape)

self_attn.in_proj_weight torch.Size([9, 3])
self_attn.in_proj_bias torch.Size([9])
self_attn.out_proj.weight torch.Size([3, 3])
self_attn.out_proj.bias torch.Size([3])
linear1.weight torch.Size([1, 3])
linear1.bias torch.Size([1])
linear2.weight torch.Size([3, 1])
linear2.bias torch.Size([3])
norm1.weight torch.Size([3])
norm1.bias torch.Size([3])
norm2.weight torch.Size([3])
norm2.bias torch.Size([3])


In [60]:
embed_dim = 3

# Fetch the stacked input-projection matrix once, then cut it into three
# consecutive blocks of embed_dim rows, used as the query, key and value
# weights. Each block is transposed so that `x @ weight` applies it.
in_proj_weight = encoder_layer.state_dict()['self_attn.in_proj_weight']
q_proj_weight, k_proj_weight, v_proj_weight = (
    in_proj_weight[start:start + embed_dim].t()
    for start in range(0, 3 * embed_dim, embed_dim)
)

In [62]:
q_proj_weight,k_proj_weight,v_proj_weight

(tensor([[-0.3882,  0.0439,  0.4021],
         [ 0.1002, -0.1283, -0.1279],
         [ 0.2260, -0.6599, -0.4037]]),
 tensor([[-0.6860, -0.5836,  0.0396],
         [-0.2887,  0.4315,  0.5349],
         [-0.2767,  0.1194, -0.6030]]),
 tensor([[-0.4280, -0.0283, -0.4744],
         [-0.3591, -0.6797, -0.0774],
         [-0.3855,  0.2779,  0.6169]]))

In [63]:
# Project the token embeddings with each weight matrix (no bias term is added here)
Q, K, V = (
    my_embdings @ weight
    for weight in (q_proj_weight, k_proj_weight, v_proj_weight)
)

In [64]:
Q

tensor([[ 0.1460, -0.0436, -0.1562],
        [-0.3630,  0.8108,  0.5736],
        [-0.0362,  0.3429,  0.1127],
        [ 0.0717,  0.9697,  0.1887],
        [-0.7097,  1.3343,  1.0644],
        [ 0.1460, -0.0436, -0.1562],
        [-0.3630,  0.8108,  0.5736],
        [-0.0362,  0.3429,  0.1127],
        [-0.7097,  1.3343,  1.0644],
        [ 0.0717,  0.9697,  0.1887]], grad_fn=<MmBackward0>)

In [65]:
# Scaled dot-product scores: every query against every key, divided by sqrt(embed_dim)
scale = math.sqrt(embed_dim)
scores = (Q @ K.t()) / scale
scores

tensor([[-0.0128,  0.0660,  0.2011, -0.0119, -0.0047, -0.0128,  0.0660,  0.2011,
         -0.0047, -0.0119],
        [ 0.1765, -0.4291, -0.8712,  0.3459, -0.2865,  0.1765, -0.4291, -0.8712,
         -0.2865,  0.3459],
        [ 0.0702, -0.1416, -0.2168,  0.1445, -0.1449,  0.0702, -0.1416, -0.2168,
         -0.1449,  0.1445],
        [ 0.1982, -0.3418, -0.4278,  0.4430, -0.4214,  0.1982, -0.3418, -0.4278,
         -0.4214,  0.4430],
        [ 0.2944, -0.7444, -1.5764,  0.5685, -0.4502,  0.2944, -0.7444, -1.5764,
         -0.4502,  0.5685],
        [-0.0128,  0.0660,  0.2011, -0.0119, -0.0047, -0.0128,  0.0660,  0.2011,
         -0.0047, -0.0119],
        [ 0.1765, -0.4291, -0.8712,  0.3459, -0.2865,  0.1765, -0.4291, -0.8712,
         -0.2865,  0.3459],
        [ 0.0702, -0.1416, -0.2168,  0.1445, -0.1449,  0.0702, -0.1416, -0.2168,
         -0.1449,  0.1445],
        [ 0.2944, -0.7444, -1.5764,  0.5685, -0.4502,  0.2944, -0.7444, -1.5764,
         -0.4502,  0.5685],
        [ 0.1982, -

In [66]:
# Normalize each row of scores into weights, then take the weighted sum of V
attention_weights = torch.softmax(scores, dim=1)
head = attention_weights @ V
head

tensor([[ 0.7399,  0.3077, -0.3537],
        [ 0.6669, -0.0094, -0.3993],
        [ 0.7053,  0.1675, -0.3762],
        [ 0.6913,  0.0489, -0.3471],
        [ 0.6447, -0.1506, -0.3951],
        [ 0.7399,  0.3077, -0.3537],
        [ 0.6669, -0.0094, -0.3993],
        [ 0.7053,  0.1675, -0.3762],
        [ 0.6447, -0.1506, -0.3951],
        [ 0.6913,  0.0489, -0.3471]], grad_fn=<MmBackward0>)

In [67]:
transformer_encoder = nn.TransformerEncoder(encoder_layer,num_layers =2)

In [68]:
params_dict = transformer_encoder.state_dict()

# List every entry of the stacked encoder's state dict with its shape
for param_name in params_dict:
    print(param_name, params_dict[param_name].shape)

layers.0.self_attn.in_proj_weight torch.Size([9, 3])
layers.0.self_attn.in_proj_bias torch.Size([9])
layers.0.self_attn.out_proj.weight torch.Size([3, 3])
layers.0.self_attn.out_proj.bias torch.Size([3])
layers.0.linear1.weight torch.Size([1, 3])
layers.0.linear1.bias torch.Size([1])
layers.0.linear2.weight torch.Size([3, 1])
layers.0.linear2.bias torch.Size([3])
layers.0.norm1.weight torch.Size([3])
layers.0.norm1.bias torch.Size([3])
layers.0.norm2.weight torch.Size([3])
layers.0.norm2.bias torch.Size([3])
layers.1.self_attn.in_proj_weight torch.Size([9, 3])
layers.1.self_attn.in_proj_bias torch.Size([9])
layers.1.self_attn.out_proj.weight torch.Size([3, 3])
layers.1.self_attn.out_proj.bias torch.Size([3])
layers.1.linear1.weight torch.Size([1, 3])
layers.1.linear1.bias torch.Size([1])
layers.1.linear2.weight torch.Size([3, 1])
layers.1.linear2.bias torch.Size([3])
layers.1.norm1.weight torch.Size([3])
layers.1.norm1.bias torch.Size([3])
layers.1.norm2.weight torch.Size([3])
layers.1

## Text Classification

In [69]:
# Load the training split of the AG_NEWS dataset
train_iter = AG_NEWS(split = 'train')

# AG_NEWS is an iterable dataset, so pull the first (label, text) example through an iterator
y,text = next(iter(train_iter))

y,text

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [70]:
# Human-readable class name for each AG_NEWS label (keys start at 1, not 0)
ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
ag_news_label[y]

'Business'

In [71]:
# Count the distinct labels; this walks the whole training iterator once
num_class = len({label for label, _ in train_iter})
num_class

4

In [None]:
# build the vocab
vocab = 