# Exercise 1: Training a Trigram Model

## Steps:
1. Create xs and ys array by mapping index to character
2. One hot encode the vectors
3. Generate weights matrix and multiply by it
4. Get probabilities by using softmax activation function
5. Calculate the loss with the negative average log-likelihood

### First Approach: Combining Embedding Vectors of Two Characters to get 54-Dimensional Vector

In [2]:
words = open('names.txt', 'r').read().splitlines()

In [115]:
len(words)

32033

In [5]:
chars = sorted(set(''.join(words) + '.'))
stoi = {}
for index, value in enumerate(chars):
    stoi[value] = index

stoi

{'.': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [117]:
import torch

xs = []
ys = []

for w in words[:5]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        print(ch1, ch2, ch3)
        ix1 = stoi[ch1]
        print(ix1)
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append([ix1, ix2])
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

. e m
0
e m m
5
m m a
13
m a .
13
. o l
0
o l i
15
l i v
12
i v i
9
v i a
22
i a .
9
. a v
0
a v a
1
v a .
22
. i s
0
i s a
9
s a b
19
a b e
1
b e l
2
e l l
5
l l a
12
l a .
12
. s o
0
s o p
19
o p h
15
p h i
16
h i a
8
i a .
9


In [58]:
xs

tensor([[ 0,  5],
        [ 5, 13],
        [13, 13],
        [13,  1],
        [ 0, 15],
        [15, 12],
        [12,  9],
        [ 9, 22],
        [22,  9],
        [ 9,  1],
        [ 0,  1],
        [ 1, 22],
        [22,  1],
        [ 0,  9],
        [ 9, 19],
        [19,  1],
        [ 1,  2],
        [ 2,  5],
        [ 5, 12],
        [12, 12],
        [12,  1],
        [ 0, 19],
        [19, 15],
        [15, 16],
        [16,  8],
        [ 8,  9],
        [ 9,  1]])

In [60]:

import torch.nn.functional as F

# One-hot encoding
xenc = F.one_hot(xs, num_classes=27).float()

# If you want to combine them and still keep the original shape, you might concatenate along a new dimension
xenc = torch.cat((xenc[:, 0, :], xenc[:, 1, :]), dim=-1)

print(xenc)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [61]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((54,27), generator=g, requires_grad=True)

In [62]:
torch.arange(4)

tensor([0, 1, 2, 3])

In [64]:
logits = xenc @ W
# counts = logits.exp()
probs = torch.softmax(logits, dim=1)

loss = -probs[len(ys)-1, ys].log().mean()
loss

tensor(3.6066, grad_fn=<NegBackward0>)

In [65]:
W.grad = None
loss.backward()

In [70]:
loss.item()

3.606621265411377

In [69]:
W.data += -5 * W.grad

#### Polished Version

In [114]:
import torch

xs = []
ys = []

for w in words:
    chs = ['.'] + list(w) + ['.']
    print(ch1, ch2, ch3)
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append([ix1, ix2])
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

. . emma


KeyError: '.'

In [73]:
import torch.nn.functional as F

# One-hot encoding
xenc = F.one_hot(xs, num_classes=27).float()

# If you want to combine them and still keep the original shape, you might concatenate along a new dimension
xenc = torch.cat((xenc[:, 0, :], xenc[:, 1, :]), dim=-1)


In [86]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((54,27), generator=g, requires_grad=True)

In [91]:
for k in range(500):
    
    # forward pass
    logits = xenc @ W
    probs = torch.softmax(logits, dim=1)
    loss = -probs[len(ys)-1, ys].log().mean() + 0.01*(W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None
    loss.backward()

    W.data += -20 * W.grad
    

2.7101964950561523
2.7101917266845703
2.7101891040802
2.7102184295654297
2.710810661315918
2.72115159034729
2.914403200149536
4.259065628051758
4.249783515930176
5.933859825134277
6.294427394866943
5.303628444671631
4.763333320617676
5.796388626098633
4.768635272979736
5.874476432800293
7.9547505378723145
7.4351911544799805
7.890408992767334
9.00831413269043
7.094325065612793
6.374057292938232
6.616848468780518
8.479443550109863
8.422089576721191
9.437820434570312
7.177141189575195
4.99993896484375
5.547086238861084
4.86515474319458
5.070521354675293
4.268878936767578
3.5950818061828613
6.825418472290039
8.565591812133789
6.760919094085693
4.663906097412109
4.961480140686035
6.643067359924316
6.984325408935547
4.651499271392822
5.2959818840026855
7.505491256713867
6.379263877868652
4.391354084014893
3.6751134395599365
4.775835037231445
6.8229169845581055
6.120018005371094
5.384059429168701
3.9447126388549805
5.3929762840271
7.6287641525268555
7.456218242645264
7.156004428863525
4.87608

### Second Approach: Counting Bigrams

In [1]:
words = open('names.txt', 'r').read().splitlines()

In [46]:
chars = sorted(set(''.join(words) + '.'))
b = [ch1+ch2 for ch1 in chars for ch2 in chars]
btoi = {}
stoi = {}
for index, value in enumerate(b):
    btoi[value] = index

for index, value in enumerate(chars):
    stoi[value] = index

print(stoi)

{'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [70]:
import torch

xs = []
ys = []

for w in words:
    chs = ['.'] + ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        b = ch1 + ch2
        ix1 = btoi[b]
        ix2 = stoi[ch3]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
num = xs.nelement()
ys = torch.tensor(ys)

In [71]:
xs.size()

torch.Size([228146])

In [47]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729,27), generator=g, requires_grad=True).float()

In [51]:
import torch.nn.functional as F

xenc = F.one_hot(xs, num_classes=729).float() # 729 because that's how many possible bigrams there are

In [96]:
for k in range(2000):
    logits = xenc @ W # 228146x729 * 729x27 # we want output dimension to be 27 to represent number of single characters, and input dimension to be 729 to represent number of bigrams
    probs = torch.softmax(logits, dim=1)
    loss = -probs[torch.arange(num), ys].log().mean()
    print(loss)

    W.grad = None
    loss.backward()

    W.data += -100 * W.grad

tensor(2.2451, grad_fn=<NegBackward0>)
tensor(2.2407, grad_fn=<NegBackward0>)
tensor(2.2123, grad_fn=<NegBackward0>)
tensor(2.2098, grad_fn=<NegBackward0>)
tensor(2.2097, grad_fn=<NegBackward0>)
tensor(2.2094, grad_fn=<NegBackward0>)
tensor(2.2093, grad_fn=<NegBackward0>)
tensor(2.2093, grad_fn=<NegBackward0>)
tensor(2.2092, grad_fn=<NegBackward0>)
tensor(2.2092, grad_fn=<NegBackward0>)
tensor(2.2091, grad_fn=<NegBackward0>)
tensor(2.2091, grad_fn=<NegBackward0>)
tensor(2.2091, grad_fn=<NegBackward0>)
tensor(2.2091, grad_fn=<NegBackward0>)
tensor(2.2091, grad_fn=<NegBackward0>)
tensor(2.2090, grad_fn=<NegBackward0>)
tensor(2.2090, grad_fn=<NegBackward0>)
tensor(2.2090, grad_fn=<NegBackward0>)
tensor(2.2090, grad_fn=<NegBackward0>)
tensor(2.2090, grad_fn=<NegBackward0>)
tensor(2.2090, grad_fn=<NegBackward0>)
tensor(2.2089, grad_fn=<NegBackward0>)
tensor(2.2089, grad_fn=<NegBackward0>)
tensor(2.2089, grad_fn=<NegBackward0>)
tensor(2.2089, grad_fn=<NegBackward0>)
tensor(2.2089, grad_fn=<N

# Exercises 2 & 3: Using Train, Dev (Eval), Test Splits & Evaluating Regularization

## Steps
1. Split dataset into 80% train, 10% eval, 10% test
2. Train bigram model on it
3. Train trigram model on it

In [6]:
train = words[:int(0.8*len(words))]
dev = words[int(0.8*len(words)):int(0.9*len(words))]
test = words[int(0.9*len(words)):int(len(words))]
print(len(train), len(dev), len(test))
print(len(train+dev+test))

25626 3203 3204
32033


### Bigram Model

#### Train

In [19]:
import torch
import torch.nn.functional as F

xs, ys = [], []
for w in train:
    chs = ['.'] + list(w) + ['.'] 
    for ch1, ch2, in zip(chs, chs[1:]): 
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

number of examples:  182778


In [20]:
# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True) 

In [21]:
# training loop
for k in range(1500):
    
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() 
    logits = xenc @ W # operator for dot product in pytorch
    counts = logits.exp() 
    probs = counts/counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean() # second part is regularization
    print(loss.item())

    W.grad = None
    loss.backward()

    W.data += -50 * W.grad

3.7777111530303955
3.3582370281219482
3.1306982040405273
2.9942102432250977
2.901160478591919
2.834472179412842
2.7845919132232666
2.745758056640625
2.714503049850464
2.6886656284332275
2.6668546199798584
2.6481521129608154
2.6319284439086914
2.61773419380188
2.6052331924438477
2.594165563583374
2.5843210220336914
2.575528383255005
2.567643642425537
2.560544967651367
2.554129123687744
2.548307180404663
2.543004035949707
2.5381553173065186
2.533705949783325
2.529609203338623
2.5258262157440186
2.522322177886963
2.519068956375122
2.5160415172576904
2.5132179260253906
2.510580539703369
2.5081124305725098
2.5057997703552246
2.503629207611084
2.50158953666687
2.4996705055236816
2.4978630542755127
2.4961585998535156
2.494549512863159
2.493028402328491
2.4915897846221924
2.490226984024048
2.4889352321624756
2.4877090454101562
2.486544609069824
2.4854371547698975
2.4843835830688477
2.483379602432251
2.4824228286743164
2.4815096855163574
2.480637788772583
2.479804277420044
2.4790070056915283
2.

#### Eval

In [37]:
import torch
import torch.nn.functional as F

xs, ys = [], []
for w in dev:
    chs = ['.'] + list(w) + ['.'] 
    for ch1, ch2, in zip(chs, chs[1:]): 
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

number of examples:  22633


In [38]:
with torch.no_grad():
    xenc = F.one_hot(xs, num_classes=27).float() 
    logits = xenc @ W # operator for dot product in pytorch
    counts = logits.exp() 
    probs = counts/counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.001*(W**2).mean() # second part is regularization
    print(loss.item())

2.5956411361694336


#### Test

In [39]:
import torch
import torch.nn.functional as F

xs, ys = [], []
for w in test:
    chs = ['.'] + list(w) + ['.'] 
    for ch1, ch2, in zip(chs, chs[1:]): 
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

number of examples:  22735


In [40]:
with torch.no_grad():
    xenc = F.one_hot(xs, num_classes=27).float() 
    logits = xenc @ W # operator for dot product in pytorch
    counts = logits.exp() 
    probs = counts/counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.001*(W**2).mean() # second part is regularization
    print(loss.item())

2.5999526977539062


In [44]:
itos = {i:s for s,i in stoi.items()}

g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    out = []
    ix = 0
    while True:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts/counts.sum(1, keepdims=True)
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

cexze.
momalurailezitynn.
minimittain.
llayn.
ka.
da.
staiyaubrtthrigotai.
moliellavo.
ke.
teda.
ka.
eyla.
sade.
eniaviyny.
ftlspehinivenvtahlasu.
jsor.
bre.
gl.
penwaisan.
ja.


### Trigram Model

#### Train

In [124]:
import torch

xs = []
ys = []

for w in train:
    chs = ['.'] + ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        b = ch1 + ch2
        ix1 = btoi[b]
        ix2 = stoi[ch3]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
num = xs.nelement()
ys = torch.tensor(ys)
print(num)

182778


In [125]:
# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True) 

In [146]:
xenc = F.one_hot(xs, num_classes=729).float()
xenc

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [147]:
for k in range(1000):
    logits = xenc @ W # 228146x729 * 729x27 # we want output dimension to be 27 to represent number of single characters, and input dimension to be 729 to represent number of bigrams
    print(logits)
    probs = torch.softmax(logits, dim=1)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.001*(W**2).mean()
    print(loss)

    W.grad = None
    loss.backward()

    W.data += -100 * W.grad

tensor([[ 1.5674, -0.2373, -0.0274,  ..., -0.0707,  2.4968,  2.4448],
        [ 0.4724,  1.4830,  0.3175,  ..., -0.4275, -2.1259,  0.9604],
        [ 0.1275,  1.7862,  0.9084,  ..., -0.0410,  0.4848, -0.9423],
        ...,
        [-1.5601, -0.1108,  0.3535,  ...,  0.3452, -0.4784,  1.4288],
        [ 0.3432, -0.7622,  0.3055,  ...,  0.0052,  0.6708, -1.8072],
        [ 0.3094,  1.9103,  0.3989,  ...,  2.9526, -0.4998,  0.7987]],
       grad_fn=<MmBackward0>)
tensor(3.7912, grad_fn=<AddBackward0>)
tensor([[ 0.7167,  1.6471,  0.3515,  ..., -0.1802,  0.5685,  0.7959],
        [ 0.4521,  1.4366,  0.3056,  ..., -0.4347, -2.1208,  0.9460],
        [ 0.1595,  1.8121,  0.9054,  ..., -0.0473,  0.4869, -0.9449],
        ...,
        [-1.5050, -0.0697,  0.3415,  ...,  0.3333, -0.4443,  1.3951],
        [ 0.4040, -0.6392,  0.2951,  ..., -0.0034,  0.6668, -1.8075],
        [ 0.3091,  1.9333,  0.3950,  ...,  2.9031, -0.4839,  0.7929]],
       grad_fn=<MmBackward0>)
tensor(3.5230, grad_fn=<AddBackwa

#### Dev

In [128]:
import torch

xs = []
ys = []

for w in dev:
    chs = ['.'] + ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        b = ch1 + ch2
        ix1 = btoi[b]
        ix2 = stoi[ch3]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
num = xs.nelement()
ys = torch.tensor(ys)
print(num)

22633


In [129]:
xenc = F.one_hot(xs, num_classes=729).float()

In [130]:
with torch.no_grad():
    logits = xenc @ W # 228146x729 * 729x27 # we want output dimension to be 27 to represent number of single characters, and input dimension to be 729 to represent number of bigrams
    probs = torch.softmax(logits, dim=1)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.001*(W**2).mean()
    print(loss)

tensor(2.4281)


#### Test

In [131]:
import torch

xs = []
ys = []

for w in test:
    chs = ['.'] + ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        b = ch1 + ch2
        ix1 = btoi[b]
        ix2 = stoi[ch3]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
num = xs.nelement()
ys = torch.tensor(ys)
print(num)

22735


In [173]:
xenc = F.one_hot(xs, num_classes=729).float()

In [133]:
with torch.no_grad():
    logits = xenc @ W # 228146x729 * 729x27 # we want output dimension to be 27 to represent number of single characters, and input dimension to be 729 to represent number of bigrams
    probs = torch.softmax(logits, dim=1)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.001*(W**2).mean()
    print(loss)

tensor(2.4490)


In [135]:
itos = {i:s for s,i in stoi.items()}


# sampling
g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    out = ['.', '.']
    ix = btoi['..']
    while True:
        ix = ''.join(out[-2:])
        ix = btoi[''.join(out[-2:])]
        xenc = F.one_hot(torch.tensor([ix]), num_classes=729).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(1, keepdims=True)
        
        
        ix = torch.multinomial(p, num_samples=1, replacement = True, generator = g).item()
        out.append(itos[ix])
        if ix == 0:
            break
        

    print(''.join(out))


..ce.
..bra.
..jalius.
..ila.
..kaydnevonimilea.
..noluwan.
..ka.
..da.
..samiyah.
..javer.
..gotai.
..moriellavorie.
..teda.
..ka.
..emilysidel.
..niaviyah.
..fobspehlynne.
..vtallas.
..kashrxdleenlen.
..alaisana.


# Exercise 4: Replacing One-Hot Encoding

### Description: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [137]:
import torch

xs = []
ys = []

for w in train:
    chs = ['.'] + ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        b = ch1 + ch2
        ix1 = btoi[b]
        ix2 = stoi[ch3]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
num = xs.nelement()
ys = torch.tensor(ys)
print(num)

182778


In [163]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True) 

In [168]:
for k in range(50):
    logits = []
    for index in xs:
        logits.append(W[index.item()])
    
    logits = torch.stack(logits)
    # takes a list of tensors and converts to a tensor of lists
    
       
    probs = torch.softmax(logits, dim=1)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.001*(W**2).mean()
    print(loss)

    W.grad = None
    loss.backward()

    W.data += -50 * W.grad

tensor(3.7912, grad_fn=<AddBackward0>)
tensor(3.6299, grad_fn=<AddBackward0>)
tensor(3.5322, grad_fn=<AddBackward0>)
tensor(3.4598, grad_fn=<AddBackward0>)
tensor(3.3998, grad_fn=<AddBackward0>)
tensor(3.3471, grad_fn=<AddBackward0>)
tensor(3.2999, grad_fn=<AddBackward0>)
tensor(3.2571, grad_fn=<AddBackward0>)
tensor(3.2182, grad_fn=<AddBackward0>)
tensor(3.1825, grad_fn=<AddBackward0>)
tensor(3.1497, grad_fn=<AddBackward0>)
tensor(3.1193, grad_fn=<AddBackward0>)
tensor(3.0910, grad_fn=<AddBackward0>)
tensor(3.0646, grad_fn=<AddBackward0>)
tensor(3.0399, grad_fn=<AddBackward0>)
tensor(3.0168, grad_fn=<AddBackward0>)
tensor(2.9949, grad_fn=<AddBackward0>)
tensor(2.9744, grad_fn=<AddBackward0>)
tensor(2.9550, grad_fn=<AddBackward0>)
tensor(2.9366, grad_fn=<AddBackward0>)
tensor(2.9192, grad_fn=<AddBackward0>)
tensor(2.9026, grad_fn=<AddBackward0>)
tensor(2.8869, grad_fn=<AddBackward0>)
tensor(2.8719, grad_fn=<AddBackward0>)
tensor(2.8576, grad_fn=<AddBackward0>)
tensor(2.8440, grad_fn=<A

# Exercise 5: Replacing NLL with Cross-Entropy

### Description: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [181]:
# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True) 

In [183]:
for k in range(200):
    logits = xenc @ W # 228146x729 * 729x27 # we want output dimension to be 27 to represent number of single characters, and input dimension to be 729 to represent number of bigrams
    probs = torch.softmax(logits, dim=1)
    loss = F.cross_entropy(logits, ys)
    print(loss)

    W.grad = None
    loss.backward()

    W.data += -100 * W.grad
    

tensor(2.2564, grad_fn=<NllLossBackward0>)
tensor(2.2559, grad_fn=<NllLossBackward0>)
tensor(2.2555, grad_fn=<NllLossBackward0>)
tensor(2.2550, grad_fn=<NllLossBackward0>)
tensor(2.2546, grad_fn=<NllLossBackward0>)
tensor(2.2541, grad_fn=<NllLossBackward0>)
tensor(2.2537, grad_fn=<NllLossBackward0>)
tensor(2.2532, grad_fn=<NllLossBackward0>)
tensor(2.2528, grad_fn=<NllLossBackward0>)
tensor(2.2524, grad_fn=<NllLossBackward0>)
tensor(2.2519, grad_fn=<NllLossBackward0>)
tensor(2.2515, grad_fn=<NllLossBackward0>)
tensor(2.2511, grad_fn=<NllLossBackward0>)
tensor(2.2507, grad_fn=<NllLossBackward0>)
tensor(2.2502, grad_fn=<NllLossBackward0>)
tensor(2.2498, grad_fn=<NllLossBackward0>)
tensor(2.2494, grad_fn=<NllLossBackward0>)
tensor(2.2490, grad_fn=<NllLossBackward0>)
tensor(2.2486, grad_fn=<NllLossBackward0>)
tensor(2.2482, grad_fn=<NllLossBackward0>)
tensor(2.2478, grad_fn=<NllLossBackward0>)
tensor(2.2474, grad_fn=<NllLossBackward0>)
tensor(2.2470, grad_fn=<NllLossBackward0>)
tensor(2.24

# Re-Implmentation of MLP