In [19]:
import torch
import torch.nn.functional as F
import pandas as pd
import re
import matplotlib.pyplot as plt

%matplotlib inline

In [7]:
# Load the company dataset and pull the `name` column out as a plain Python list.
# Missing names come through as non-string values and are dealt with when the
# vocabulary is built.
data = pd.read_csv('../data/companies_sorted.csv')
names = data.loc[:, 'name'].to_list()

In [10]:
# Keep only the first million names to reduce training time;
# top accuracy is not the goal here.
names = names[:1_000_000]

In [13]:
# Sanity check: number of names kept after truncating the list.
len(names)

1000000

In [20]:
# Build the vocabulary of characters used by the model.
#
# 1. Keep only genuine strings. Missing names are not strings (pandas loads them
#    as float NaN), and converting them with str() would produce the literal
#    name 'nan', which passes the alphanumeric filter below and would be trained
#    on as if it were a company name. Dropping them is idempotent, so re-running
#    this cell is safe.
# 2. Keep only names made entirely of ASCII letters and digits, since some
#    companies are from other countries and have non-English characters.
#    - re.ASCII restricts \d to 0-9; without it \d also matches non-ASCII
#      Unicode digits, which would silently grow the vocabulary.
#    - fullmatch is used because `$` with match() also accepts a trailing
#      newline, which would later fail the stoi lookup.
# Extension idea: limit to US and UK companies only, as those names are more
# likely to be similar and contain English words.
names = [name for name in names if isinstance(name, str)]
pattern = re.compile(r'^[a-zA-Z\d]+$', re.ASCII)
filtered_names = [name for name in names if pattern.fullmatch(name)]
# Every distinct character across the kept names, in sorted order.
chars = sorted(set(''.join(filtered_names)))

In [21]:
# Number of distinct characters found in the filtered names.
len(chars)

36

In [24]:
# Character -> integer lookup. Real characters are numbered from 1 so that
# index 0 stays free for '.', which is used both as the padding symbol and as
# the end-of-name marker.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Integer -> character lookup: the inverse of stoi.
itos = {index: symbol for symbol, index in stoi.items()}

In [28]:
# The vocabulary must contain every character plus the single '.' token ...
assert len(chars) + 1 == len(stoi), "Error: Vocabulary size does not match character set size"
# ... and the inverse mapping must not have lost any entry.
assert len(itos) == len(stoi), "Error: Indexes to characters and characters to indexes do not match"

In [37]:
# Build the training examples from the company names.
# X holds one context window per example: the indices of the `block_size`
# characters that precede the target, left-padded with 0 (the '.' token).
# Y holds the index of the character that follows each window; the last target
# of every name is '.', marking the end of the name.
block_size = 3
X, Y = [], []
# Only the first three names are used here so that the printed windows stay readable.
for name in filtered_names[:3]:
    print(name)
    # Every name starts from an all-padding context.
    context = [0] * block_size
    for char in name + '.':
        ix = stoi[char]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), char, sep=' --> ')
        # Slide the window: drop the oldest index, append the newest.
        # A new list is built, so windows already stored in X are untouched.
        context = [*context[1:], ix]

X = torch.tensor(X)
Y = torch.tensor(Y)


ibm
... --> i
..i --> b
.ib --> m
ibm --> .
accenture
... --> a
..a --> c
.ac --> c
acc --> e
cce --> n
cen --> t
ent --> u
ntu --> r
tur --> e
ure --> .
ey
... --> e
..e --> y
.ey --> .


In [85]:
# Create the embeddings for the characters.
# Each character gets a feature vector of length `embedding_size` (10), so the
# lookup table is a matrix of size (vocab_size, embedding_size).

embedding_size = 10
vocab_size = len(stoi)

# Seed torch's global random number generator before the first random draw so
# that the randomly initialised tensors are the same on every
# Restart Kernel -> Run All.
torch.manual_seed(42)

# Create a randn tensor of size (vocab_size, embedding_size) as the embeddings lookup table.
C = torch.randn(vocab_size, embedding_size)
# Apply these embeddings to the input data by using pytorch indexing:
# indexing C with the integer tensor X looks up one row of C per context character.
embeddings = C[X]
embeddings.shape

torch.Size([17, 3, 10])

In [86]:
# Parameters of the first hidden layer.
# Its input is the whole context with every character replaced by its embedding,
# i.e. block_size (3) * embedding_size (10) = 30 values per example.
# The layer has 50 outputs, so the bias is a vector of size 50.
# In the paper, the hidden layer has either 0, 50, or 100 outputs.

hidden_size = 50

W1 = torch.randn((block_size * embedding_size, hidden_size))
b1 = torch.randn((hidden_size,))

In [87]:
# Embeddings of the first training example, one row per context position.
# The first context is all padding (index 0), so the rows are identical.
embeddings[0]

tensor([[-0.0953, -1.1443,  0.6619, -1.3265, -0.7251,  1.8243, -0.9412, -1.1199,
         -1.5182,  0.7003],
        [-0.0953, -1.1443,  0.6619, -1.3265, -0.7251,  1.8243, -0.9412, -1.1199,
         -1.5182,  0.7003],
        [-0.0953, -1.1443,  0.6619, -1.3265, -0.7251,  1.8243, -0.9412, -1.1199,
         -1.5182,  0.7003]])

In [88]:
# Shape of the hidden-layer weights: (block_size * embedding_size, hidden_size).
W1.shape

torch.Size([30, 50])

In [89]:
# One example is a (block_size, embedding_size) matrix, so it has to be
# flattened before it can be multiplied by W1.
embeddings[0].shape

torch.Size([3, 10])

In [94]:
# Flatten the first example's (block_size, embedding_size) embeddings into one
# vector and multiply by W1. Only the shape is displayed, as a check that the
# product has one value per hidden unit.
(embeddings[0].flatten() @ W1).shape

torch.Size([50])