In [1]:
import torch
import torch.nn.functional as F


words = open('names.txt', 'r').read().splitlines()

chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}


In [2]:
def build_dataset(words, context):
    xs= []
    ys = []
    for word in words:

        new_word =  word + '.'

        new_added = ['.'] * (context)

        contextt = "".join(new_added)
        new_word = contextt + new_word

        for i in range(len(new_word)):

            slided_word = new_word[i: (i +context)%len(new_word)]
            if not (i+context) == len(new_word):
                xs.append([stoi[x] for x in list(slided_word)])
                ys.append(stoi[new_word[(i+context)%len(new_word)]])
            else:
                break

    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    return xs, ys  

In [20]:
@torch.no_grad()
def test_forward_pass(X_dataset, Y_dataset, feature_vector_C, H_weights, H_bias, G_weights, G_bias):
    for i in range(1):
 
        minibatch_ints = torch.randint(0, X_dataset.shape[0], (32,))
        minibatch_construct = X_dataset[minibatch_ints]

        feature_activation_layer = feature_vector_C[minibatch_construct] 
        merged_feature_vector = feature_activation_layer.view((minibatch_construct.shape[0], -1)) 
        first_activation = torch.tanh(merged_feature_vector @ H_weights + H_bias)
        second_activation = first_activation @ G_weights + G_bias
        

        normalized_probabilities = F.log_softmax(second_activation, dim=1)
        negative_likelihood_loss = F.nll_loss(normalized_probabilities, Y_dataset[minibatch_ints])

        return negative_likelihood_loss

In [6]:
import random
random.seed(42)
random.shuffle(words)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

context=3
Xtr, Ytr = build_dataset(words[:n1],context)
Xdev, Ydev = build_dataset(words[n1:n2],context)
Xte, Yte = build_dataset(words[n2:],context)


# META_VARIABLES
H_weights_length = 200
Context_length = 3
Feature_dimensions = 10


# Parameters
g = torch.Generator().manual_seed(2147483647)

feature_vector_C = torch.rand((27, Feature_dimensions), generator=g)
H_weights = torch.rand((Context_length*Feature_dimensions, H_weights_length),generator=g)
H_bias = torch.rand((H_weights_length) ,generator=g)
G_weights = torch.rand((H_weights_length, 27) ,generator=g)
G_bias = torch.rand((27,), generator=g)

parameters = [feature_vector_C, H_weights, H_bias, G_weights, G_bias]
for p in parameters:
    p.requires_grad = True


lowest_loss = 0
for i in range(100000):
# minibatch contruct
    minibatch_ints = torch.randint(0, Xtr.shape[0], (32,))
    minibatch_construct = Xtr[minibatch_ints]
# forward pass
    feature_activation_layer = feature_vector_C[minibatch_construct] 
    merged_feature_vector = feature_activation_layer.view((minibatch_construct.shape[0], -1)) 
    first_activation = torch.tanh(merged_feature_vector @ H_weights + H_bias)
    second_activation = first_activation @ G_weights + G_bias
    
# loss
    normalized_probabilities = F.log_softmax(second_activation, dim=1)
    negative_loss_likelihood = F.nll_loss(normalized_probabilities, Ytr[minibatch_ints])
# backward pass
    for p in parameters:
        p.grad = None

    negative_loss_likelihood.backward()

#gradient descent

    lr = 0.01 if i < 100000 else 0.001
    for p in parameters:
        p.data += -lr*p.grad
    
    loss_item = negative_loss_likelihood.item()
    if i == 1:
        lowest_loss = loss_item
    if lowest_loss > loss_item:
        lowest_loss = loss_item

    if i % 10000 == 0:
        print(loss_item)


print(lowest_loss)

9.31351089477539
2.847027063369751
2.814969778060913
2.946777582168579
2.980334758758545
2.906338930130005
2.7683167457580566
3.0382931232452393
2.920518159866333
2.725039005279541
2.197007417678833


In [17]:
testing = test_forward_pass(Xdev, Ydev, feature_vector_C, H_weights, H_bias, G_weights, G_bias)
print(testing, 'testing loss on validation set')

tensor(7.2812) testing loss on validation set
