In [12]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from collections import Counter
import itertools
import torch.nn.functional as f
from torch.utils.data import SubsetRandomSampler

In [13]:
with open(r"names.txt",'r') as names_list:
    words = [i.strip('\n') for i in names_list.readlines()]

Building a TriGram Language Model

In [14]:
#get all characters 
chars = sorted(list(set(''.join(words))))
bigram_starts = [''.join([i,j]) for i,j in list(itertools.permutations(chars,2))]
bigram_with_dot = [''.join(['.',i]) for i in chars]
bigram_with_itself = [''.join([i,i]) for i in chars]
final_bigram_list = bigram_with_dot + bigram_starts + bigram_with_itself
final_bigram_list = sorted(final_bigram_list)

In [15]:
# bigram_starts.extend(bigram_with_itself)
# bigram_starts = sorted(bigram_starts)
bsti = {j:i for i,j in enumerate(final_bigram_list)}
itbs = {i:j for i,j in enumerate(final_bigram_list)}

Main counts tensor will be of 702 x 27 dimensions

In [16]:
chars.extend(['.'])
itoc = {i:j for i,j in enumerate(chars)}
ctoi = {j:i for i,j in enumerate(chars)}

In [17]:
#Main counts tensor
N = torch.zeros(size = (len(final_bigram_list),27),dtype = torch.int32)

In [18]:
for word in words:
    chs = '.' + word + '.'
    for i,j in enumerate(chs):
        condition = chs[i:i+2]
        result = chs[i+2]
        idx_row = bsti[condition]
        idx_col = ctoi[result]
        N[idx_row,idx_col] += 1
        if result == '.':
            break

In [19]:
#add fake counts for smoothing
P = N + 1
#normalizing P
log_likelihood = 0.0
n = 0
P = P/P.sum(1,keepdims = True)
for word in words:
    chs = '.' + word + '.'
    for i,j in enumerate(chs):
        #e.g: .emma.
        #P(m|.e), P(m|em), P(a|mm),P(.|ma)
        condition = chs[i:i+2]
        result = chs[i + 2]
        idx_row = bsti[condition]
        idx_col = ctoi[result]
        prob = P[idx_row,idx_col]
        logprob = torch.log(prob)
        log_likelihood += logprob.item()
        n += 1
        if result == '.':
            break
print(f'Negative log likelihood: {-log_likelihood/n: .4f}')


Negative log likelihood:  2.0931


Neural Net Model

Create tensor for one hot encoding. 

The one hot vector will be of length 702.

In [20]:
xs = []
ys = []
for word in words[:50]:
    chs = '.' + word + '.'
    for i in range(len(chs)):
        input = chs[i:i+2]
        result = chs[i + 2]
        idx_input = bsti[input]
        idx_output = ctoi[result]
        xs.append(idx_input)
        ys.append(idx_output)
        if result == '.':
            break

In [21]:
f

<module 'torch.nn.functional' from 'c:\\Users\\ASUS\\AppData\\Local\\Programs\\Python\\Python38\\lib\\site-packages\\torch\\nn\\functional.py'>

In [22]:
lr = 50
iter = 50
xenc = f.one_hot(torch.tensor(xs).long(),num_classes = len(bsti)).float()
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((xenc.shape[1],len(ctoi)),generator=g,requires_grad=True).float()

for i in range(iter):
    logits = xenc @ W
    counts = logits.exp()
    probs = counts/counts.sum(1,keepdims = True)

    #loss
    class_probs = probs[torch.arange(len(ys)),ys]
    loss = -1 * class_probs.log().mean()
    print(f"Loss : {loss.item(): .4f}")

    W.grad = None
    loss.backward()

    #update
    W.data += - lr*W.grad


Loss :  3.6238
Loss :  3.3120
Loss :  3.0412
Loss :  2.8108
Loss :  2.6103
Loss :  2.4321
Loss :  2.2728
Loss :  2.1306
Loss :  2.0038
Loss :  1.8906
Loss :  1.7894
Loss :  1.6987
Loss :  1.6172
Loss :  1.5438
Loss :  1.4774
Loss :  1.4172
Loss :  1.3626
Loss :  1.3129
Loss :  1.2677
Loss :  1.2266
Loss :  1.1891
Loss :  1.1550
Loss :  1.1239
Loss :  1.0956
Loss :  1.0699
Loss :  1.0465
Loss :  1.0251
Loss :  1.0057
Loss :  0.9880
Loss :  0.9718
Loss :  0.9570
Loss :  0.9435
Loss :  0.9311
Loss :  0.9198
Loss :  0.9093
Loss :  0.8997
Loss :  0.8908
Loss :  0.8825
Loss :  0.8749
Loss :  0.8678
Loss :  0.8612
Loss :  0.8550
Loss :  0.8493
Loss :  0.8439
Loss :  0.8388
Loss :  0.8341
Loss :  0.8296
Loss :  0.8254
Loss :  0.8214
Loss :  0.8177


Split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [38]:
def train_val_test_splits(xs,ys,train_split):
    import math
    assert train_split <= 0.8,"Too high train split"
    total_length = len(xs)
    g = torch.Generator().manual_seed(42)
    indices = list(SubsetRandomSampler(range(total_length),generator=g))
    train_idx = math.ceil(train_split*total_length)
    remaining = total_length - train_idx
    dev_idx = math.ceil(0.5*remaining)
    train_set = indices[:train_idx]
    val_set = indices[train_idx:train_idx + dev_idx]
    test_set = indices[train_idx + dev_idx:]
    return train_set,val_set,test_set

In [40]:
train,val,test = train_val_test_splits(xs,ys,0.8)

In [58]:
xs_train = np.array(xs)[train].tolist()
ys_train = np.array(ys)[train].tolist()
xs_val = np.array(xs)[val].tolist()
ys_val = np.array(ys)[val].tolist()
xs_test = np.array(xs)[test].tolist()
ys_test = np.array(ys)[test].tolist()

Train on trigram model

In [60]:
lr = 50
iter = 50
xenc = f.one_hot(torch.tensor(xs_train).long(),num_classes = len(bsti)).float()
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((xenc.shape[1],len(ctoi)),generator=g,requires_grad=True).float()

for i in range(iter):
    logits = xenc @ W
    counts = logits.exp()
    probs = counts/counts.sum(1,keepdims = True)

    #loss
    class_probs = probs[torch.arange(len(ys_train)),ys_train]
    loss = -1 * class_probs.log().mean()
    print(f"Loss : {loss.item(): .4f}")

    W.grad = None
    loss.backward()

    #update
    W.data += - lr*W.grad
print(f"Final Loss")

Loss :  3.5871
Loss :  3.2281
Loss :  2.9199
Loss :  2.6608
Loss :  2.4370
Loss :  2.2402
Loss :  2.0668
Loss :  1.9144
Loss :  1.7803
Loss :  1.6623
Loss :  1.5581
Loss :  1.4660
Loss :  1.3844
Loss :  1.3118
Loss :  1.2471
Loss :  1.1895
Loss :  1.1381
Loss :  1.0922
Loss :  1.0514
Loss :  1.0150
Loss :  0.9826
Loss :  0.9537
Loss :  0.9281
Loss :  0.9052
Loss :  0.8848
Loss :  0.8665
Loss :  0.8501
Loss :  0.8354
Loss :  0.8221
Loss :  0.8101
Loss :  0.7992
Loss :  0.7893
Loss :  0.7803
Loss :  0.7720
Loss :  0.7644
Loss :  0.7574
Loss :  0.7510
Loss :  0.7450
Loss :  0.7395
Loss :  0.7343
Loss :  0.7295
Loss :  0.7250
Loss :  0.7208
Loss :  0.7169
Loss :  0.7132
Loss :  0.7097
Loss :  0.7065
Loss :  0.7034
Loss :  0.7005
Loss :  0.6977


In [48]:
xs[train]

TypeError: list indices must be integers or slices, not list

In [None]:
train_ds = xs[]