<a href="https://www.kaggle.com/code/evelynartoria/mlp-pytorch-nlp?scriptVersionId=187416571" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-baby-names/StateNames.csv
/kaggle/input/us-baby-names/NationalReadMe.pdf
/kaggle/input/us-baby-names/hashes.txt
/kaggle/input/us-baby-names/NationalNames.csv
/kaggle/input/us-baby-names/StateReadMe.pdf
/kaggle/input/us-baby-names/database.sqlite


# Introduction
- This is a notebook presesnting how to build a MLP (Multilayer perceptron) for character level machine learning model
- this notebook follows the Building makemore Part 2: MLP by Andrej Karpathy (https://www.youtube.com/watch?v=TCH_1BHY58I&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=4)

# Import needed libraries

In [2]:
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split


# Device agnostic code

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
generator = torch.Generator(device=device)
print(f"default devcie set to {device}")

default devcie set to cpu


# Read the names csv file

In [4]:
names_csv = pd.read_csv("/kaggle/input/us-baby-names/NationalNames.csv")
names_csv

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825428,1825429,Zykeem,2014,M,5
1825429,1825430,Zymeer,2014,M,5
1825430,1825431,Zymiere,2014,M,5
1825431,1825432,Zyran,2014,M,5


# Prepare the data

In [5]:
names = names_csv["Name"].values
vocab = set("".join(names))
print(vocab)

{'f', 'b', 'd', 'g', 'U', 'R', 'Z', 'J', 'u', 'h', 't', 'N', 's', 'E', 'K', 'o', 'T', 'z', 'q', 'v', 'k', 'S', 'F', 'H', 'D', 'm', 'n', 'a', 'Q', 'e', 'x', 'j', 'c', 'M', 'L', 'w', 'X', 'G', 'I', 'C', 'i', 'A', 'B', 'r', 'W', 'p', 'y', 'O', 'V', 'Y', 'l', 'P'}


In [6]:
# string to id
stoi = {c:v+1 for v, c in enumerate(vocab)}
stoi["."] = 0

# id to string
itos = {v:c for c, v in stoi.items()}

print(stoi["g"])
print(itos[21])

vocab_size = len(stoi)
print(f"vocab size --> {vocab_size}")

4
k
vocab size --> 53


In [7]:
def make_dataset(context_size, names, log: bool):

    inputs = []
    labels = []

    for name in names:
        chs = name + "."
        context = ["."] * context_size

        for ch in chs:
            idx = stoi[ch]
            inputs.append([stoi[ch_context] for ch_context in context])
            labels.append(idx)
            
            if log:
                print(f"for context {context}, expect --> {itos[idx]}")
            
            context = context[1:] + list(ch)
    
    if log:
        print(inputs)
    
    inputs = torch.tensor(inputs)
    labels = torch.tensor(labels)
    dataset = TensorDataset(inputs, labels)
    
    if log:
        print(inputs.shape)
        print(labels.shape)
    return dataset

In [8]:
context_size = 6
names_dataset = make_dataset(context_size=context_size, names=names, log=False)

In [9]:
train_split = int(len(names_dataset) * 0.8)
test_split = int(len(names_dataset) - train_split)
train_dataset, test_dataset = random_split(dataset=names_dataset, lengths=[train_split, test_split])

In [10]:
batch_size = 64
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, generator=generator)

# Sample from the dataloader

In [11]:
batch_sample_inputs, batch_sample_labels = next(iter(test_dataloader))
print(batch_sample_inputs.shape, batch_sample_labels.shape)

torch.Size([64, 6]) torch.Size([64])


# Model

In [12]:
# n_embd is the number of values used to represent each token
n_embd = 5
lookup_table = torch.randn(size=(vocab_size, n_embd))
print(lookup_table)

tensor([[-0.2380,  1.2840, -0.3180, -0.7439, -2.3563],
        [ 0.0449, -0.6238, -0.3228, -1.2490,  0.2209],
        [ 0.4687,  0.3194,  0.9461,  1.2307, -0.6808],
        [ 0.0339, -1.0664,  0.7568, -0.0748, -1.3176],
        [-0.9521, -0.2372,  0.4800,  0.9182, -0.8509],
        [ 1.2009, -1.5661, -0.4351,  0.5841,  0.7373],
        [ 0.0804,  0.8419,  0.4605, -0.4901, -0.8799],
        [-1.1578, -0.7584, -0.3020, -0.5826,  0.1305],
        [-1.3423,  0.7354, -0.1074,  1.1297,  0.6460],
        [-1.6259, -1.6127, -0.2083, -0.5965, -0.1800],
        [ 1.5672,  0.4673,  0.6280, -0.5546, -1.5302],
        [-0.3957, -0.4435,  0.2360,  1.0293,  0.2274],
        [ 0.9550,  0.2033, -0.4835,  0.0068, -1.2867],
        [ 0.6163, -0.6753, -1.0018, -0.6938, -1.3934],
        [ 1.5164,  0.5080,  0.5824,  0.0425, -1.8638],
        [ 1.0930,  0.5154,  0.8251, -0.7702, -1.6851],
        [ 0.1972,  0.3574,  0.1928, -1.9264,  0.7967],
        [ 0.1445,  0.9669,  2.2789, -0.4513, -1.3026],
        [ 

In [13]:
random_char_idx = torch.randint(0, vocab_size, (1,)).item()
random_char = itos[random_char_idx]
print(random_char)
print(lookup_table[random_char_idx]) # the random char is going to get n_embd values related to itself

R
tensor([ 0.0804,  0.8419,  0.4605, -0.4901, -0.8799])


In [14]:
random_char_idx_tensor = torch.randint(0, vocab_size, (8, 4))
stacked = torch.stack([idx for idx in random_char_idx_tensor])

for stack in stacked:
    input_chs = ""
    for v in stack:
        input_chs += itos[v.item()]
        
    print(f"for input {input_chs} --> {stack}")

print(random_char)
print(lookup_table[stacked]) # the random char is going to get n_embd values related to itself
print(lookup_table[stacked].shape) # shape of B x T x C --> batches by inputs by channels

for input TiAU --> tensor([17, 41, 42,  5])
for input geRY --> tensor([ 4, 30,  6, 50])
for input fDLz --> tensor([ 1, 25, 35, 18])
for input nJdr --> tensor([27,  8,  3, 44])
for input DnOg --> tensor([25, 27, 48,  4])
for input OUGC --> tensor([48,  5, 38, 40])
for input Wzsd --> tensor([45, 18, 13,  3])
for input qEyk --> tensor([19, 14, 47, 21])
R
tensor([[[ 0.1445,  0.9669,  2.2789, -0.4513, -1.3026],
         [-0.5481,  0.9890, -0.7496, -0.4148, -0.5032],
         [ 1.6976,  0.0184, -0.3929, -1.3877, -1.6941],
         [ 1.2009, -1.5661, -0.4351,  0.5841,  0.7373]],

        [[-0.9521, -0.2372,  0.4800,  0.9182, -0.8509],
         [-0.8794, -0.3183, -0.4027, -1.5787,  0.6611],
         [ 0.0804,  0.8419,  0.4605, -0.4901, -0.8799],
         [-1.2868,  0.2308, -0.3827,  0.5407, -1.0174]],

        [[ 0.0449, -0.6238, -0.3228, -1.2490,  0.2209],
         [ 1.1240, -1.0678,  1.9256,  0.6974,  0.5373],
         [-0.6082, -1.5248,  0.9232, -0.9301, -1.0904],
         [ 1.5436, -0.7183

In [15]:
# another way is to use nn.Embedding()
emb_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
random_char_idx_tensor = torch.randint(0, vocab_size, (8, 4), dtype=torch.long)


embedded = emb_table(random_char_idx_tensor)

print(random_char)
print(embedded.shape) # B x T x C (batches by time by channels)
# time is the number of inputs, channels is the number of values for each input
print(embedded)

R
torch.Size([8, 4, 5])
tensor([[[-0.2688,  0.4070,  0.7157, -0.1697, -0.4871],
         [-2.1723,  0.8674, -0.6886,  0.6854,  0.3214],
         [-1.3155, -0.0350,  1.0970, -0.6316, -0.4365],
         [ 1.3189, -0.8938,  0.8776, -1.0404,  0.1400]],

        [[ 1.3189, -0.8938,  0.8776, -1.0404,  0.1400],
         [ 0.1609,  1.3086,  0.1983,  1.2213, -0.5938],
         [ 0.6904,  0.5175,  0.6599,  0.3598, -1.0716],
         [-1.2888,  0.5457, -0.4568, -0.4194, -0.9574]],

        [[-0.9450,  0.0470,  0.4992, -0.0203, -0.0195],
         [-0.9450,  0.0470,  0.4992, -0.0203, -0.0195],
         [-0.1771, -1.4323,  0.2912, -2.4547, -0.8935],
         [-0.7647, -0.3024,  0.3577,  1.7043,  0.7692]],

        [[ 0.6506,  0.3200, -0.2648,  0.1434,  2.1544],
         [-0.1891,  0.5103, -1.0046,  0.8475,  0.7190],
         [-1.3416,  0.2404,  0.1805, -0.5998,  1.5986],
         [-0.2688,  0.4070,  0.7157, -0.1697, -0.4871]],

        [[-0.7647, -0.3024,  0.3577,  1.7043,  0.7692],
         [ 0.970

In [16]:
embedded = emb_table(random_char_idx_tensor)

# Batches, Time and Channels
B, T, C = embedded.shape
#embedded = embedded.reshape(B, T*C) # proper shape to multiply the embedded information by weights
embedded = embedded.view(B, T*C)
print(embedded.shape)
print(embedded)

torch.Size([8, 20])
tensor([[-0.2688,  0.4070,  0.7157, -0.1697, -0.4871, -2.1723,  0.8674, -0.6886,
          0.6854,  0.3214, -1.3155, -0.0350,  1.0970, -0.6316, -0.4365,  1.3189,
         -0.8938,  0.8776, -1.0404,  0.1400],
        [ 1.3189, -0.8938,  0.8776, -1.0404,  0.1400,  0.1609,  1.3086,  0.1983,
          1.2213, -0.5938,  0.6904,  0.5175,  0.6599,  0.3598, -1.0716, -1.2888,
          0.5457, -0.4568, -0.4194, -0.9574],
        [-0.9450,  0.0470,  0.4992, -0.0203, -0.0195, -0.9450,  0.0470,  0.4992,
         -0.0203, -0.0195, -0.1771, -1.4323,  0.2912, -2.4547, -0.8935, -0.7647,
         -0.3024,  0.3577,  1.7043,  0.7692],
        [ 0.6506,  0.3200, -0.2648,  0.1434,  2.1544, -0.1891,  0.5103, -1.0046,
          0.8475,  0.7190, -1.3416,  0.2404,  0.1805, -0.5998,  1.5986, -0.2688,
          0.4070,  0.7157, -0.1697, -0.4871],
        [-0.7647, -0.3024,  0.3577,  1.7043,  0.7692,  0.9707,  1.6231, -1.0015,
          0.1926, -0.4187, -0.2243, -0.5688,  1.5243, -0.3886,  0.9

In [17]:
hidden_units = 100
w1 = torch.randn(size=(T*C, hidden_units))
b1 = torch.randn(size=(hidden_units,))
result = torch.tanh(embedded @ w1 + b1)
print(result.shape)

torch.Size([8, 100])


In [18]:
w2 = torch.randn(size=(100, vocab_size))
b2 = torch.randn(size=(vocab_size,))
logits = result @ w2 + b2
print(logits.shape) # B by vocab_size, each batch has 1 output and there are vocab_size possible characters as an output

torch.Size([8, 53])


In [19]:
percents = torch.softmax(logits, dim=1)
preds = torch.argmax(percents, dim=1)
print(percents.shape)
print(percents[0].sum())
print(preds.shape, preds)

torch.Size([8, 53])
tensor(1., grad_fn=<SumBackward0>)
torch.Size([8]) tensor([ 7, 18, 43, 15,  6, 18, 33, 12])


# Turn all this mess into a model class

In [20]:
# vocab_size --> number of possible characters
# n_embd --> number of values associated with each token
class MLP(nn.Module):
    def __init__(self, context_size, vocab_size, n_embd):
        super().__init__()
        
        self.context_size = context_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        
        self.token_emb = nn.Embedding(vocab_size, n_embd) # B x T x C (B=batches; T=context_size, C=n_embd)
        self.linear1 = nn.Linear(in_features=context_size*n_embd, out_features=8*8)
        self.linear2 = nn.Linear(in_features=8*8, out_features=vocab_size)
        self.act_fn = nn.Tanh()
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.token_emb(x)
        
        B, T, C = x.shape
        x = x.view(B, T*C)
        x = self.act_fn(self.linear1(x))
        x = self.linear2(x)
        
        return x
    
    def generate_name(self, starting_char, max_length, randomize: bool):
        name = ""
        last_char = starting_char
        i = 0
        while last_char != "." and i < max_length:
            context = [stoi["."]] * (self.context_size - 1) + [stoi[last_char]]
            context = torch.tensor(context, dtype=torch.long).view(1, len(context))

            logits = self(context)
            percents = torch.softmax(logits, dim=1)
            
            if randomize:
                pred = torch.multinomial(percents, num_samples=1)
            else:
                pred = torch.argmax(percents, dim=1)
            
            i += 1
            name += itos[pred.item()]
            last_char = itos[pred.item()]
        return name

# Define the model, optimizer and loss function

In [21]:
model = MLP(vocab_size=vocab_size, context_size=context_size, n_embd=8)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [22]:
def list_to_characters(array):
    strings = []
    for v in array:
        string = [itos[idx] for idx in v]
        strings.append(string)
        
    return strings

# Model infernce

In [23]:
def model_inference(model, dataloader):
    model.eval()
    with torch.inference_mode():
        X, y = next(iter(dataloader))
        logits = model(X)
        percents = torch.softmax(logits, dim=1)
        preds = torch.argmax(percents, dim=1)
        preds_array = [idx_array.item() for idx_array in preds]
        labels_array = [label_array.item() for label_array in y]
        print(preds)
        print("\n")
        batches = [batch for batch in X]
        separated_inputs = [batch.tolist() for batch in batches]
        print(f"the batches are {list_to_characters(array=separated_inputs)}\n")
        print(f"model predicted {[itos[idx] for idx in preds_array]}\n")
        print(f"expected --> {[itos[label] for label in labels_array]}\n")

In [24]:
model_inference(model=model, dataloader=train_dataloader)

tensor([51, 52, 47,  7, 29, 10, 47, 25,  7,  7, 47, 17,  7, 20,  2, 47, 47, 51,
         7, 51,  7,  7, 25, 47,  2,  7,  7,  2, 34,  3,  7, 47, 47,  7, 35, 47,
         7, 47, 47,  7, 34,  7, 34, 47,  7, 26, 25,  0, 10, 26,  7, 47,  7, 47,
        25, 42, 47,  7,  7, 47,  7,  2, 34, 34])


the batches are [['e', 'p', 't', 'e', 'm', 'b'], ['.', 'T', 'y', 's', 'h', 'i'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', 'K', 'e', 'e'], ['.', 'A', 'b', 'd', 'i', 'f'], ['.', 'T', 'a', 'k', 'o', 't'], ['.', 'C', 'a', 'r', 'i', 's'], ['.', 'L', 'a', 'y', 'c', 'i'], ['.', '.', '.', 'K', 'a', 'n'], ['.', '.', '.', '.', 'D', 'a'], ['.', '.', 'N', 'i', 'c', 'k'], ['h', 'e', 'r', 'r', 'y', 'e'], ['.', '.', 'K', 'e', 'i', 'k'], ['V', 'e', 'r', 'o', 'n', 'e'], ['.', '.', 'S', 'a', 'n', 'j'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', '.', '.', 'S'], ['D', 'o', 'r', 't', 'h', 'e'], ['.', '.', 'L', 'u', 'd', 'e'], ['D', 'o', 'r', 'e', 'a', 't'], ['.', '.', '.', '.', 'S', 'a'], ['.', '.', 'G', 'a',

# Training loop

In [25]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for batch, (X, y) in enumerate(dataloader):
            logits = model(X)
            loss = loss_fn(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if batch % 5000 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")
    print(f"loss for the very last batch --> {loss}")

In [26]:
train_model(model=model, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=2)

loss for batch 0 --> 3.9514310359954834 at epoch 0
loss for batch 5000 --> 2.0795602798461914 at epoch 0
loss for batch 10000 --> 2.2222611904144287 at epoch 0
loss for batch 15000 --> 2.2724478244781494 at epoch 0
loss for batch 20000 --> 1.7696046829223633 at epoch 0
loss for batch 25000 --> 2.1284308433532715 at epoch 0
loss for batch 30000 --> 2.009101629257202 at epoch 0
loss for batch 35000 --> 2.150378704071045 at epoch 0
loss for batch 40000 --> 1.7829046249389648 at epoch 0
loss for batch 45000 --> 1.9582194089889526 at epoch 0
loss for batch 50000 --> 2.5422234535217285 at epoch 0
loss for batch 55000 --> 2.050772190093994 at epoch 0
loss for batch 60000 --> 1.704452395439148 at epoch 0
loss for batch 65000 --> 2.1727938652038574 at epoch 0
loss for batch 70000 --> 2.088308334350586 at epoch 0
loss for batch 75000 --> 2.336027145385742 at epoch 0
loss for batch 80000 --> 2.318988800048828 at epoch 0
loss for batch 85000 --> 1.9064821004867554 at epoch 0
loss for batch 90000 -

# Test the model

In [27]:
model_inference(model=model, dataloader=train_dataloader)

tensor([51, 28, 27, 41, 34, 28, 44, 44, 28, 41, 28, 30, 13, 51, 27, 27, 51, 51,
        28,  0,  0, 28, 30, 34, 30,  0, 34, 28, 28, 30, 28,  0, 51,  0, 44, 41,
        30, 34, 51, 27,  0,  0, 28, 28, 34, 27, 51, 51, 44, 28, 51, 34,  0,  0,
        41,  0, 51,  0, 30, 34, 27, 41, 51, 51])


the batches are [['.', '.', '.', '.', 'D', 'e'], ['.', '.', '.', 'K', 'l', 'e'], ['.', '.', '.', '.', 'A', 'n'], ['.', '.', 'T', 'o', 'd', 'd'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', '.', 'T', 'h'], ['.', '.', '.', '.', 'L', 'o'], ['.', '.', '.', 'S', 't', 'a'], ['.', '.', '.', '.', '.', 'L'], ['.', '.', '.', '.', 'E', 'l'], ['.', '.', 'H', 'a', 'v', 'i'], ['.', '.', '.', '.', '.', 'D'], ['.', '.', 'S', 'a', 'm', 'u'], ['.', '.', 'S', 'e', 'n', 'e'], ['.', '.', 'T', 'y', 'l', 'i'], ['.', '.', '.', '.', 'T', 'u'], ['.', '.', '.', 'A', 'm', 'i'], ['.', 'K', 'a', 'h', 'r', 'e'], ['.', '.', 'N', 'a', 'y', 'l'], ['.', 'K', 'a', 'r', 'l', 'y'], ['l', 'i', 'e', 'a', 'n', 'a'], ['.', '.', '.', '.',

# Generate name

In [28]:
model.eval()
with torch.inference_mode():
    name_from_distribution = model.generate_name(starting_char="L", max_length=5, randomize=True)
    name = model.generate_name(starting_char="L", max_length=5, randomize=False)

    print(name_from_distribution)
    print(name)

ynado
alala
