### Ex- 1 Train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [2]:
words = open('names.txt', 'r').read().splitlines()

In [3]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [4]:
len(words)

32033

In [20]:
import torch

In [21]:
chars = sorted(list((set(''.join(words)))))

In [22]:
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0;
itos = {i:s for s,i in stoi.items()}

In [23]:
for w in words:
    chs = ['.'] + list(w)+['.']
    for ch1,ch2,ch3 in zip(chs,chs[1:],chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        N[ix1,ix2,ix3] += 1

In [30]:
P = (N+2).float()
P /= P.sum(2,keepdim=True)

In [31]:
g = torch.Generator().manual_seed(2147483647)

for i in range(20):  # Generate 20 sequences
    out = []  # Store the output sequence
    ix, jx = 0, 0  # Start with some initial characters, e.g., '.' as a start token if itos[0] = '.'
    
    while True:
        # Get the probability distribution for the next character based on the previous two (ix, jx)
        p = P[ix][jx]  # This gives the probabilities for the third character given ix and jx
        # Sample the next character based on the current distribution
        next_char_idx = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        
        # Add the next character to the output
        out.append(itos[next_char_idx])

        # If the next character is the "end" character (e.g., represented by index 0), stop the sequence
        if next_char_idx == 0:
            break
        
        # Update the indices for the next iteration:
        ix, jx = jx, next_char_idx  # Shift to the next two-character context
    
    # Print the generated sequence
    print(''.join(out))


junide.
ilyasid.
prelay.
ocin.
fairritoper.
sathen.
dannaaryanileniassibduinrwin.
lessiyanayla.
te.
farmumthyfortumj.
ponn.
zena.
jaylicore.
ya.
zoffra.
jamilyn.
fmouis.
yah.
wanaasnhavi.
honszxhddion.


In [32]:
log_likelihood = 0.0
n = 0
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2,ch3 in zip(chs,chs[1:],chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        prob = P[ix1,ix2,ix3]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        # print(f'{ch1}{ch2} : {prob:.4f} {logprob: .4f}')

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n=}')

log_likelihood=tensor(-410414.9688)
nll=tensor(410414.9688)
nll/n=tensor(2.0927)


# using neural network

In [34]:
uninitialized_tensor = torch.empty(3, 2)
uninitialized_tensor

tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])

In [103]:
# Create the training set of trigrams (x, y)
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # The index for the first character
        ix2 = stoi[ch2]  # The index for the second character
        ix3 = stoi[ch3]  # The index for the third character
  # For debugging: Print the trigrams
        xs.append((ix1, ix2))  # Append the bigram (ix1, ix2) as input
        ys.append(ix3)  # Append the third character as output
    
# Convert lists to tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)
xs.shape

torch.Size([196113, 2])

In [89]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(xs, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(xs)), ys].log().mean()
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.6169
10: 2.6165
20: 2.4743
30: 2.4242
40: 2.4002
50: 2.3870
60: 2.3790
70: 2.3739
80: 2.3707
90: 2.3685
100: 2.3670
110: 2.3660
120: 2.3654
130: 2.3649
140: 2.3646
150: 2.3643
160: 2.3642
170: 2.3640
180: 2.3640
190: 2.3639


In [91]:
names = []
for i in range(10):
    out = []
    ix1, ix2 = 0, 0
    while True:
        # previosly we used P[ix]
        # p = P[ix]

        # now we use the softmax of the logits
        xenc = F.one_hot(torch.tensor([ix1, ix2]).to(device), num_classes = 27).float().to(device)
        xenc = xenc.view(-1, 27*2)
        
        logits = xenc @ W
        counts = torch.exp(logits)
        p = counts / counts.sum(dim = 1, keepdim = True)

        ix1 = ix2
        ix2 = torch.multinomial(p.to(device), num_samples = 1 , replacement = True).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break

    names.append("".join(out))
    
for name in names:
    print(name)

odigdary.
bramilirpmiy.
amanstretsoladels.
on.
amua.
ymcekeynasin.
mmin.
ea.
ia.
hamqkelen.


### EX-2 E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [143]:
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # The index for the first character
        ix2 = stoi[ch2]  # The index for the second character
        ix3 = stoi[ch3]  # The index for the third character
  # For debugging: Print the trigrams
        xs.append((ix1, ix2))  # Append the bigram (ix1, ix2) as input
        ys.append(ix3)  # Append the third character as output
    
# Convert lists to tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)




from sklearn.model_selection import train_test_split

X_train, X_test_val, y_train, y_test_val = train_test_split(xs, ys, test_size=0.2, random_state=42)


X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42)



In [144]:
X_train.dtype

torch.int64

In [110]:
X_test.shape

torch.Size([19611, 2])

In [111]:
X_val.shape

torch.Size([19612, 2])

In [121]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(X_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(X_train)), y_train].log().mean()
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.5938
10: 2.5890
20: 2.4609
30: 2.4168
40: 2.3957
50: 2.3841
60: 2.3773
70: 2.3730
80: 2.3703
90: 2.3685
100: 2.3673
110: 2.3665
120: 2.3659
130: 2.3655
140: 2.3652
150: 2.3650
160: 2.3649
170: 2.3648
180: 2.3647
190: 2.3647


In [122]:
def MLP_loss(x, y, W):
    xenc = F.one_hot(x, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)

    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)

    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(x)), y].log().mean()
    
    return loss.item()

In [116]:
print(f"Train Loss: {MLP_loss(X_train, y_train, W):.4f}")
print(f"Dev Loss: {MLP_loss(X_val, y_val, W):.4f}")
print(f"Test Loss: {MLP_loss(X_test, y_test, W):.4f}")

Train Loss: 2.2876
Dev Loss: 2.2881
Test Loss: 2.2830


### E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [126]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(X_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(X_train)), y_train].log().mean()
    # add regularization
    # loss += 0.05 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(X_val, y_val, W):.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: Train Loss: 4.2798 | Dev Loss 4.2945
10: Train Loss: 2.4828 | Dev Loss 2.4814
20: Train Loss: 2.3714 | Dev Loss 2.3704
30: Train Loss: 2.3285 | Dev Loss 2.3279
40: Train Loss: 2.3056 | Dev Loss 2.3054
50: Train Loss: 2.2916 | Dev Loss 2.2919
60: Train Loss: 2.2822 | Dev Loss 2.2829
70: Train Loss: 2.2755 | Dev Loss 2.2766
80: Train Loss: 2.2705 | Dev Loss 2.2720
90: Train Loss: 2.2666 | Dev Loss 2.2684
100: Train Loss: 2.2635 | Dev Loss 2.2656
110: Train Loss: 2.2610 | Dev Loss 2.2633
120: Train Loss: 2.2589 | Dev Loss 2.2614
130: Train Loss: 2.2571 | Dev Loss 2.2598
140: Train Loss: 2.2555 | Dev Loss 2.2584
150: Train Loss: 2.2542 | Dev Loss 2.2572
160: Train Loss: 2.2530 | Dev Loss 2.2561
170: Train Loss: 2.2519 | Dev Loss 2.2552
180: Train Loss: 2.2510 | Dev Loss 2.2543
190: Train Loss: 2.2502 | Dev Loss 2.2536


### E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [132]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(X_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim = 1, keepdim = True)
    
    # loss (normalized negative log likelihood)
    loss = - probs[torch.arange(len(X_train)), y_train].log().mean()
    # add regularization
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 15.6788
10: nan
20: nan
30: nan
40: nan
50: nan
60: nan
70: nan
80: nan
90: nan
100: nan
110: nan
120: nan
130: nan
140: nan
150: nan
160: nan
170: nan
180: nan
190: nan


### Ex-04 look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [145]:
W = torch.randn((27*2,27), requires_grad = True, device = device)
for k in range(200):
    # forward pass
    xenc = F.one_hot(X_train, num_classes = 27).float().to(device)
    xenc = xenc.view(-1, 27*2)
    
    # probs is softmax
    logits = xenc @ W
    
    # loss (normalized negative log likelihood)
    loss = torch.nn.functional.cross_entropy(logits, y_train)
    loss += 0.2 * W.pow(2).mean()

    if k % 10 == 0:
        print(f"{k}: {loss.item():.4f}")

    # backward pass
    W.grad = None
    loss.backward()

    # update weights
    with torch.no_grad():
        W -= 50 * W.grad

0: 4.4856
10: 2.5856
20: 2.4639
30: 2.4197
40: 2.3979
50: 2.3857
60: 2.3784
70: 2.3738
80: 2.3708
90: 2.3689
100: 2.3675
110: 2.3666
120: 2.3660
130: 2.3656
140: 2.3653
150: 2.3651
160: 2.3649
170: 2.3648
180: 2.3647
190: 2.3647
