### Basing the model off of the MLP paper Bengio et al. 2003:

https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

(P.S: The paper uses 17000 words and applies word embedding. Here, the data is 'names.txt' and we apply character embedding.)

In [1]:
# In the previous code (part 1), we imlemented 'bigram'. 
# Here in part 2, we are implementing 'mlp'. 

import torch
import torch.nn.functional as F 
import matplotlib.pyplot as plt 
%matplotlib inline 

In [7]:
# read in all the words: 
words = open("names.txt", 'r').read().splitlines()
print(words[0:8])
print(len(words))

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
32033


In [40]:
# build the vocabulary of characters to/from integers 

chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)} 
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

print(stoi)
print(itos)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [48]:
ch

'e'

In [69]:
# building the dataset: 

block_size = 3 # = context length = how many characters do we take to predict the next one? 
X, Y = [], [] 
for w in words[:3]: 
    
    print(w)
    context = [0]*block_size # provide padding before the first character 
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context) 
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', ch) # ch = itos[ix]
        context = context[1:] + [ix] #update 'context' 
X = torch.tensor(X).float()
Y = torch.Tensor(Y).float()

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .


In [75]:
print(X.shape, X.dtype, Y.shape, Y.dtype)
print(X)
print(Y)

torch.Size([16, 3]) torch.float32 torch.Size([16]) torch.float32
tensor([[ 0.,  0.,  0.],
        [ 0.,  0.,  5.],
        [ 0.,  5., 13.],
        [ 5., 13., 13.],
        [13., 13.,  1.],
        [ 0.,  0.,  0.],
        [ 0.,  0., 15.],
        [ 0., 15., 12.],
        [15., 12.,  9.],
        [12.,  9., 22.],
        [ 9., 22.,  9.],
        [22.,  9.,  1.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  1.],
        [ 0.,  1., 22.],
        [ 1., 22.,  1.]])
tensor([ 5., 13., 13.,  1.,  0., 15., 12.,  9., 22.,  9.,  1.,  0.,  1., 22.,
         1.,  0.])


In [76]:
# Embedding matrix/look-up table (table for embedding the 27 alphabetical characters)

C = torch.randn(27, 2)

In [77]:
C

tensor([[ 1.3399, -2.4392],
        [ 0.2491,  0.7287],
        [ 0.9423,  2.5457],
        [ 1.8983, -0.7876],
        [-1.0674, -0.1109],
        [-0.9283, -1.9208],
        [-0.1401,  1.1358],
        [ 0.7916,  1.2085],
        [ 0.5050,  0.1314],
        [ 1.1475,  1.0135],
        [ 0.1826,  1.5369],
        [ 1.0209, -1.1005],
        [-0.1744, -0.0509],
        [-1.0586,  0.6450],
        [ 1.4109, -1.0600],
        [-0.9549,  1.7785],
        [ 1.5576,  0.6574],
        [-0.0051,  1.8975],
        [-1.1549, -1.3384],
        [-0.2702,  1.7332],
        [-0.0734, -0.6737],
        [-0.6820, -2.9194],
        [-0.5700,  1.5197],
        [-0.3822, -0.1921],
        [-0.2074, -0.7838],
        [-1.3177, -0.6732],
        [ 0.3523,  0.8825]])