In [19]:
import torch
import torch.nn.functional as F
import pandas as pd
import re
import matplotlib.pyplot as plt

%matplotlib inline

In [7]:
# Get list of names from data
data = pd.read_csv('../data/companies_sorted.csv')
names = data['name'].tolist()

In [10]:
# Cut data to first million names to reduce training time - not looking for the most accurate anyway.
names = names[:1000000]

In [13]:
len(names)

1000000

In [20]:
# Build the vocabulary of characters and index mapping - to and from integers
# Involves making sure all the names are strings (as some are floats) and removing any names with non-alphanumeric characters
# (as some of these companies are from other countries and have non-English characters in their names).
# Maybe we should limit to US and UK companies only as the names are likely to be more similar and contain English words - extension.
names = [str(name) for name in names]
pattern = re.compile(r'^[a-zA-Z\d]+$')
filtered_names = [name for name in names if pattern.match(name)]
chars = sorted(list(set(''.join(filtered_names))))

In [21]:
len(chars)

36

In [24]:
stoi = {char: i+1 for i, char in enumerate(chars)}
stoi['.'] = 0
itos = {i: char for char, i in stoi.items()}

In [28]:
assert len(stoi) == len(chars) + 1, "Error: Vocabulary size does not match character set size"
assert len(stoi) == len(itos), "Error: Indexes to characters and characters to indexes do not match"

In [163]:
# Create x and y datasets from list of company names.
# I want two lists of lists, one for x and one for y
# x contain the first x letters of each name, each letter will be its own element of a list
# y contains the next letter
block_size = 3
X, Y = [], []
for name in filtered_names:
    # print(name)
    context = [0] * block_size
    for char in name + '.':
        ix = stoi[char]
        X.append(context)
        Y.append(ix)
        # print(''.join([itos[i] for i in context]), '-->', char)
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)


In [85]:
# Create the embeddings for the characters.
# The embeddings will be a feature vector of length 10 for each character.
# The table of embeddings will be a matrix of size (vocab_size, embedding_size).

embedding_size = 10
vocab_size = len(stoi)

# Createa randn tensor of size (vocab_size, embedding_size) as the embeddings lookup table.
C = torch.randn(vocab_size, embedding_size)
# Apply these embeddings to the input data by using pytorch indexing.
embeddings = C[X]
embeddings.shape

torch.Size([17, 3, 10])

In [106]:
# Creating the first hidden layer.
# The input of this layer will be each of the characters in the context, input as their feature vectors (embeddings),
# the block_size (3) * embedding_size (10) = 30.
# The bias will be a vector of size 50, as the hidden layer has 50 outputs.
# In the paper, the hidden layer either has 0, 50, or 100 outputs.
# To get the activations of the hidden layer, we matrix multiply the inputs by the weights and add the bias, and 
# then apply the tanh activation function.

hidden_size = 50

W1 = torch.randn(block_size * embedding_size, hidden_size)
b1 = torch.randn(hidden_size)

h = torch.tanh(embeddings.view(-1, block_size * embedding_size) @ W1 + b1)

In [107]:
h.shape

torch.Size([17, 50])

In [123]:
# Create the output layer, which will take an input size of the output from the hidden layer (50)
# and output a vector of size vocab_size (37).
# With these numbers, we then need to normalise them, so we exponentiate them and divide by the sum of the exponentiated values.
# This gives us the probabilities of each character being the next character in the sequence.

W2 = torch.randn(hidden_size, vocab_size)
b2 = torch.randn(vocab_size)

logits = h @ W2 + b2
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True)

In [135]:
# Calculate the loss of the model.
# The loss is the negative log likelihood of the correct character.
# We need to find the next character (Y), and look at the probability the model gives of the actual next character being
# the predicted next character by the model.

loss = -prob[torch.arange(len(Y)), Y].log().mean()
loss

tensor(12.9650)

## Refactored

In [173]:
X.shape, Y.shape

(torch.Size([1116580, 3]), torch.Size([1116580]))

In [177]:
g = torch.Generator().manual_seed(42)
C = torch.randn((vocab_size, embedding_size), generator=g, requires_grad=True)
W1 = torch.randn((block_size * embedding_size, hidden_size), generator=g, requires_grad=True)
b1 = torch.randn(hidden_size, generator=g, requires_grad=True)
W2 = torch.randn((hidden_size, vocab_size), generator=g, requires_grad=True)
b2 = torch.randn(vocab_size, generator=g, requires_grad=True)
parameters = [C, W1, b1, W2, b2]

In [175]:
sum(p.nelement() for p in parameters)

3807

In [176]:
embeddings = C[X]
h = torch.tanh(embeddings.view(-1, block_size * embedding_size) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)

In [168]:
loss

tensor(13.8153, grad_fn=<NllLossBackward0>)

In [169]:
# Backward pass
for p in parameters:
    p.grad = None
loss.backward()

# Update parameters
for p in parameters:
    p.data += -0.01 * p.grad

## Training Loop

In [183]:
# Forward pass
for _ in range(100000):

    # Mini batch
    ix = torch.randint(0, X.shape[0], (32,))

    # Forward pass
    embeddings = C[X[ix]]
    h = torch.tanh(embeddings.view(-1, block_size * embedding_size) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    if _ % 1000 == 0:
        print(loss.item())

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update parameters
    for p in parameters:
        p.data += -0.01 * p.grad

2.939155101776123
2.625471830368042
2.694211959838867
2.7962472438812256
3.0440847873687744
3.0402674674987793
2.792820453643799
2.6928231716156006
2.659545421600342
2.5892138481140137
2.4613988399505615
2.4509239196777344
2.9675216674804688
2.6929869651794434
2.4946107864379883
2.445042133331299
2.759065866470337
3.015272378921509
2.745004892349243
2.7951488494873047
2.6111488342285156
2.535982131958008
2.6902029514312744
2.431286573410034
2.8781919479370117
2.922363758087158
2.7624025344848633
2.805875539779663
2.9225165843963623
2.4314820766448975
2.99627685546875
2.5717971324920654
2.9652109146118164
2.589905261993408
2.920271635055542
2.7060375213623047
2.6206116676330566
2.917795181274414
2.933218002319336
2.361433982849121
2.8394644260406494
2.673220157623291
2.8193938732147217
2.693074941635132
2.2659153938293457
2.9032742977142334
2.8549792766571045
2.4161155223846436
2.4600281715393066
2.75777268409729
2.773993968963623
2.506575107574463
2.690840482711792
2.6051206588745117
2

KeyboardInterrupt: 

In [None]:
loss

tensor(0.2196, grad_fn=<NllLossBackward0>)