<a href="https://www.kaggle.com/code/evelynartoria/mlp-pytorch-nlp?scriptVersionId=187285587" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-baby-names/StateNames.csv
/kaggle/input/us-baby-names/NationalReadMe.pdf
/kaggle/input/us-baby-names/hashes.txt
/kaggle/input/us-baby-names/NationalNames.csv
/kaggle/input/us-baby-names/StateReadMe.pdf
/kaggle/input/us-baby-names/database.sqlite


# Introduction
- This is a notebook presesnting how to build a MLP (Multilayer perceptron) for character level machine learning model
- this notebook follows the Building makemore Part 2: MLP by Andrej Karpathy (https://www.youtube.com/watch?v=TCH_1BHY58I&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=4)

# Import needed libraries

In [2]:
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split


# Device agnostic code

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
generator = torch.Generator(device=device)
print(f"default devcie set to {device}")

default devcie set to cpu


# Read the names csv file

In [4]:
names_csv = pd.read_csv("/kaggle/input/us-baby-names/NationalNames.csv")
names_csv

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825428,1825429,Zykeem,2014,M,5
1825429,1825430,Zymeer,2014,M,5
1825430,1825431,Zymiere,2014,M,5
1825431,1825432,Zyran,2014,M,5


# Prepare the data

In [5]:
names = names_csv["Name"].values
vocab = set("".join(names))
print(vocab)

{'M', 'T', 'i', 'z', 'B', 'u', 'N', 'b', 'R', 'K', 'a', 'U', 'c', 'Y', 'V', 'x', 'm', 'v', 'W', 'o', 'G', 'l', 'e', 'f', 'F', 'A', 'p', 'E', 'I', 'C', 'L', 'n', 'k', 'h', 'w', 'X', 'Z', 'd', 'q', 'D', 'j', 's', 'r', 'g', 'O', 'H', 'J', 'P', 'Q', 'S', 'y', 't'}


In [6]:
# string to id
stoi = {c:v+1 for v, c in enumerate(vocab)}
stoi["."] = 0

# id to string
itos = {v:c for c, v in stoi.items()}

print(stoi["g"])
print(itos[21])

vocab_size = len(stoi)
print(f"vocab size --> {vocab_size}")

44
G
vocab size --> 53


In [7]:
def make_dataset(context_size, names, log: bool):

    inputs = []
    labels = []

    for name in names[:1000000]:
        chs = name + "."
        context = ["."] * context_size

        for ch in chs:
            idx = stoi[ch]
            inputs.append([stoi[ch_context] for ch_context in context])
            labels.append(idx)
            
            if log:
                print(f"for context {context}, expect --> {itos[idx]}")
            
            context = context[1:] + list(ch)
    
    if log:
        print(inputs)
    
    inputs = torch.tensor(inputs)
    labels = torch.tensor(labels)
    dataset = TensorDataset(inputs, labels)
    
    if log:
        print(inputs.shape)
        print(labels.shape)
    return dataset

In [8]:
context_size = 4
names_dataset = make_dataset(context_size=context_size, names=names, log=False)

In [9]:
train_split = int(len(names_dataset) * 0.8)
test_split = int(len(names_dataset) - train_split)
train_dataset, test_dataset = random_split(dataset=names_dataset, lengths=[train_split, test_split])

In [10]:
batch_size = 64
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, generator=generator)

# Sample from the dataloader

In [11]:
batch_sample_inputs, batch_sample_labels = next(iter(test_dataloader))
print(batch_sample_inputs.shape, batch_sample_labels.shape)

torch.Size([64, 4]) torch.Size([64])


# Model

In [12]:
# n_embd is the number of values used to represent each token
n_embd = 5
lookup_table = torch.randn(size=(vocab_size, n_embd))
print(lookup_table)

tensor([[ 7.6655e-01,  1.8211e-01, -1.5845e+00, -1.4882e-01,  3.3624e-01],
        [ 2.8833e-01, -1.9351e-01,  2.9328e-01,  3.3140e-01, -1.8512e-01],
        [ 1.0245e+00,  7.8906e-01,  1.9254e-01,  2.2420e-02, -1.2740e+00],
        [-6.5269e-01, -8.8686e-02,  8.0516e-02, -5.9009e-01,  2.1242e+00],
        [ 1.8560e+00, -5.0954e-01, -3.8792e-01,  1.6668e+00,  5.0784e-01],
        [ 7.7819e-01, -3.2264e-01, -1.2609e+00,  9.9239e-01,  4.5334e-01],
        [ 6.5151e-01,  2.5334e-01,  2.2904e-01, -5.2549e-01,  6.6740e-02],
        [-3.0718e-01, -7.0428e-01,  8.1581e-02, -2.4455e-01, -5.6492e-01],
        [-6.3053e-01,  1.9804e+00,  2.9209e-01,  1.1195e+00, -7.8012e-01],
        [ 9.6694e-01,  1.2035e+00, -9.2423e-01,  1.1622e+00,  1.1018e+00],
        [ 7.3404e-01, -1.7948e+00, -1.6293e+00, -5.4670e-01, -3.2266e-01],
        [ 1.1690e+00,  1.8100e-01,  6.1795e-01,  1.0565e+00,  1.6915e-01],
        [ 2.6565e-01, -5.3859e-01,  6.3124e-01, -2.2384e-01,  8.2131e-01],
        [ 1.4778e+00,  2.

In [13]:
random_char_idx = torch.randint(0, vocab_size, (1,)).item()
random_char = itos[random_char_idx]
print(random_char)
print(lookup_table[random_char_idx]) # the random char is going to get n_embd values related to itself

M
tensor([ 0.2883, -0.1935,  0.2933,  0.3314, -0.1851])


In [14]:
random_char_idx_tensor = torch.randint(0, vocab_size, (8, 4))
stacked = torch.stack([idx for idx in random_char_idx_tensor])

for stack in stacked:
    input_chs = ""
    for v in stack:
        input_chs += itos[v.item()]
        
    print(f"for input {input_chs} --> {stack}")

print(random_char)
print(lookup_table[stacked]) # the random char is going to get n_embd values related to itself
print(lookup_table[stacked].shape) # shape of B x T x C --> batches by inputs by channels

for input jCrV --> tensor([41, 30, 43, 15])
for input cUA. --> tensor([13, 12, 26,  0])
for input LBnI --> tensor([31,  5, 32, 29])
for input cYgi --> tensor([13, 14, 44,  3])
for input yPyI --> tensor([51, 48, 51, 29])
for input aJbf --> tensor([11, 47,  8, 24])
for input PZMl --> tensor([48, 37,  1, 22])
for input DiEa --> tensor([40,  3, 28, 11])
M
tensor([[[ 0.2421,  0.9041,  0.2628, -0.0104, -0.4948],
         [ 0.2546, -1.5567,  0.0560,  1.0560,  0.6307],
         [-1.1932,  1.7947, -0.3605,  0.0300, -0.2976],
         [ 0.1336, -1.0272,  1.0315, -0.9480,  0.2437]],

        [[ 1.4778,  0.2338,  1.3707,  0.9181, -0.4576],
         [ 0.2656, -0.5386,  0.6312, -0.2238,  0.8213],
         [ 0.5390,  0.7827,  1.8141,  0.8592, -0.5785],
         [ 0.7665,  0.1821, -1.5845, -0.1488,  0.3362]],

        [[-1.1578,  0.0657,  2.0796, -0.4968,  0.0229],
         [ 0.7782, -0.3226, -1.2609,  0.9924,  0.4533],
         [-0.2988, -1.0289, -0.1524, -2.0522,  0.5557],
         [ 0.3681,  1.0544

In [15]:
# another way is to use nn.Embedding()
emb_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
random_char_idx_tensor = torch.randint(0, vocab_size, (8, 4), dtype=torch.long)


embedded = emb_table(random_char_idx_tensor)

print(random_char)
print(embedded.shape) # B x T x C (batches by time by channels)
# time is the number of inputs, channels is the number of values for each input
print(embedded)

M
torch.Size([8, 4, 5])
tensor([[[-1.5672, -0.4502,  0.1516, -1.2840, -0.5506],
         [-0.3016,  0.5590,  0.7414, -0.7917,  1.1247],
         [-0.0613, -2.0487, -1.2371, -0.8480, -0.5769],
         [-0.2281,  0.0809, -0.7191,  0.3620,  0.5775]],

        [[ 1.5874, -0.3137, -0.7653,  1.6016, -0.2799],
         [ 0.1332,  0.2517, -0.6478,  1.4070, -0.2854],
         [-1.0686,  0.0370,  1.1281,  0.6733,  0.0116],
         [-0.0613, -2.0487, -1.2371, -0.8480, -0.5769]],

        [[ 2.4157,  1.2876,  0.7653,  0.3756, -1.4121],
         [-0.2281,  0.0809, -0.7191,  0.3620,  0.5775],
         [-1.3477,  0.1816,  1.6845,  0.0211, -2.5997],
         [-1.3470, -0.6315,  0.3968,  0.9489, -0.0694]],

        [[ 0.4430, -0.6886, -0.4542, -0.8186,  0.4243],
         [-1.1782,  0.9478, -0.5819,  1.5804, -0.7754],
         [-0.2281,  0.0809, -0.7191,  0.3620,  0.5775],
         [ 1.0905,  0.9305, -0.6677, -0.4777, -0.4832]],

        [[-0.5870, -0.6108, -0.2293,  1.0131, -1.0357],
         [-0.554

In [16]:
embedded = emb_table(random_char_idx_tensor)

# Batches, Time and Channels
B, T, C = embedded.shape
#embedded = embedded.reshape(B, T*C) # proper shape to multiply the embedded information by weights
embedded = embedded.view(B, T*C)
print(embedded.shape)
print(embedded)

torch.Size([8, 20])
tensor([[-1.5672, -0.4502,  0.1516, -1.2840, -0.5506, -0.3016,  0.5590,  0.7414,
         -0.7917,  1.1247, -0.0613, -2.0487, -1.2371, -0.8480, -0.5769, -0.2281,
          0.0809, -0.7191,  0.3620,  0.5775],
        [ 1.5874, -0.3137, -0.7653,  1.6016, -0.2799,  0.1332,  0.2517, -0.6478,
          1.4070, -0.2854, -1.0686,  0.0370,  1.1281,  0.6733,  0.0116, -0.0613,
         -2.0487, -1.2371, -0.8480, -0.5769],
        [ 2.4157,  1.2876,  0.7653,  0.3756, -1.4121, -0.2281,  0.0809, -0.7191,
          0.3620,  0.5775, -1.3477,  0.1816,  1.6845,  0.0211, -2.5997, -1.3470,
         -0.6315,  0.3968,  0.9489, -0.0694],
        [ 0.4430, -0.6886, -0.4542, -0.8186,  0.4243, -1.1782,  0.9478, -0.5819,
          1.5804, -0.7754, -0.2281,  0.0809, -0.7191,  0.3620,  0.5775,  1.0905,
          0.9305, -0.6677, -0.4777, -0.4832],
        [-0.5870, -0.6108, -0.2293,  1.0131, -1.0357, -0.5543,  2.1708,  0.3754,
         -1.0827, -0.6425,  1.0905,  0.9305, -0.6677, -0.4777, -0.4

In [17]:
hidden_units = 100
w1 = torch.randn(size=(T*C, hidden_units))
b1 = torch.randn(size=(hidden_units,))
result = torch.tanh(embedded @ w1 + b1)
print(result.shape)

torch.Size([8, 100])


In [18]:
w2 = torch.randn(size=(100, vocab_size))
b2 = torch.randn(size=(vocab_size,))
logits = result @ w2 + b2
print(logits.shape) # B by vocab_size, each batch has 1 output and there are vocab_size possible characters as an output

torch.Size([8, 53])


In [19]:
percents = torch.softmax(logits, dim=1)
preds = torch.argmax(percents, dim=1)
print(percents.shape)
print(percents[0].sum())
print(preds.shape, preds)

torch.Size([8, 53])
tensor(1.0000, grad_fn=<SumBackward0>)
torch.Size([8]) tensor([31, 29, 51, 52, 42,  3, 51, 14])


# Turn all this mess into a model class

In [20]:
# vocab_size --> number of possible characters
# n_embd --> number of values associated with each token
class MLP(nn.Module):
    def __init__(self, vocab_size, context_size, n_embd):
        super().__init__()
        
        self.emb_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        self.linear1 = nn.Linear(in_features=context_size*n_embd, out_features=8*8, bias=True)
        self.linear2 = nn.Linear(in_features=8*8, out_features=8*8*8, bias=True)
        self.linear3 = nn.Linear(in_features=8*8*8, out_features=vocab_size, bias=True)
        
        self.act_fn = nn.Tanh()
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.emb_table(x)
        B, T, C = x.shape
        x = x.view(B, T*C)
        x = self.act_fn(self.linear1(x))
        x = self.act_fn(self.linear2(x))
        x = self.linear3(x)
        
        return x

# Define the model, optimizer and loss function

In [21]:
model = MLP(vocab_size=vocab_size, context_size=context_size, n_embd=5)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-2)
loss_fn = nn.CrossEntropyLoss()

In [22]:
def list_to_characters(array):
    strings = []
    for v in array:
        string = [itos[idx] for idx in v]
        strings.append(string)
        
    return strings

# Model infernce

In [23]:
def model_inference(model, dataloader):
    model.eval()
    with torch.inference_mode():
        X, y = next(iter(dataloader))
        logits = model(X)
        percents = torch.softmax(logits, dim=1)
        preds = torch.argmax(percents, dim=1)
        preds_array = [idx_array.item() for idx_array in preds]
        labels_array = [label_array.item() for label_array in y]
        print(preds)
        print("\n")
        batches = [batch for batch in X]
        separated_inputs = [batch.tolist() for batch in batches]
        print(f"the batches are {list_to_characters(array=separated_inputs)}\n")
        print(f"model predicted {[itos[idx] for idx in preds_array]}\n")
        print(f"expected --> {[itos[label] for label in labels_array]}\n")

In [24]:
model_inference(model=model, dataloader=train_dataloader)

tensor([39, 24, 39, 39,  6, 24,  9, 41, 26, 19, 39, 29, 39, 44, 39,  9, 39, 39,
        18, 22, 39, 52, 45, 20, 39, 45, 39,  6, 29, 38, 39, 39, 39, 18, 22, 39,
        39,  2, 52, 38, 39, 39, 36, 13, 21, 38, 10, 22, 39, 39, 22, 52, 22, 21,
        39, 22, 32, 39, 52, 39, 18, 39, 41, 39])


the batches are [['.', '.', '.', 'E'], ['.', '.', '.', 'S'], ['.', '.', '.', 'K'], ['.', '.', '.', 'P'], ['N', 'o', 'e', 'l'], ['.', '.', '.', 'S'], ['.', '.', 'Q', 'u'], ['.', '.', 'L', 'e'], ['a', 'r', 'g', 'a'], ['.', 'V', 'e', 'r'], ['.', '.', '.', 'W'], ['i', 't', 'z', 'i'], ['.', '.', '.', '.'], ['C', 'h', 'e', 'r'], ['.', '.', '.', 'I'], ['.', '.', 'M', 'i'], ['.', '.', '.', '.'], ['.', '.', 'P', 'r'], ['.', 'I', 'v', 'a'], ['a', 'r', 'y', 'n'], ['.', '.', '.', 'I'], ['o', 'n', 'i', 'e'], ['e', 's', 't', 'i'], ['m', 'o', 'n', 'a'], ['.', '.', '.', 'F'], ['e', 'r', 'm', 'i'], ['.', '.', 'M', 'a'], ['.', 'D', 'e', 'l'], ['.', 'S', 'i', 'm'], ['l', 'v', 'i', 's'], ['M', 'a', 'r', 'g'], ['.', '.',

# Training loop

In [25]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for batch, (X, y) in enumerate(dataloader):
            logits = model(X)
            loss = loss_fn(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if batch % 5000 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")
    print(f"loss for the very last batch --> {loss}")

In [26]:
train_model(model=model, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=10)

loss for batch 0 --> 3.9813597202301025 at epoch 0
loss for batch 5000 --> 2.2622315883636475 at epoch 0
loss for batch 10000 --> 2.479404926300049 at epoch 0
loss for batch 15000 --> 2.166165351867676 at epoch 0
loss for batch 20000 --> 2.243795156478882 at epoch 0
loss for batch 25000 --> 2.5603160858154297 at epoch 0
loss for batch 30000 --> 2.4674558639526367 at epoch 0
loss for batch 35000 --> 2.5853469371795654 at epoch 0
loss for batch 40000 --> 2.2452845573425293 at epoch 0
loss for batch 45000 --> 2.141946792602539 at epoch 0
loss for batch 50000 --> 2.365069627761841 at epoch 0
loss for batch 55000 --> 2.202312469482422 at epoch 0
loss for batch 60000 --> 2.0946226119995117 at epoch 0
loss for batch 65000 --> 2.005415916442871 at epoch 0
loss for batch 70000 --> 2.092287063598633 at epoch 0
loss for batch 75000 --> 2.029737710952759 at epoch 0
loss for batch 80000 --> 2.0311062335968018 at epoch 0
loss for batch 85000 --> 2.1530752182006836 at epoch 0
loss for batch 0 --> 2.0

# Test the model

In [27]:
model_inference(model=model, dataloader=train_dataloader)

tensor([ 9,  0, 23,  9, 34, 22, 43,  0, 38, 22, 32, 23, 23, 22, 22, 52,  9,  0,
        22,  9,  0, 43,  9, 34, 11, 23, 32, 23,  9,  0, 11,  9,  0,  0,  0, 22,
        11, 43, 23,  0, 38,  0, 43, 22, 23, 23, 11, 22, 34, 38, 22,  9,  0,  0,
        23, 33,  0, 43, 43, 11, 11, 43,  9, 22])


the batches are [['.', '.', '.', '.'], ['i', 'k', 'k', 'a'], ['.', 'F', 'a', 'n'], ['.', '.', '.', '.'], ['.', '.', '.', 'C'], ['r', 'c', 'e', 'l'], ['.', 'C', 'l', 'a'], ['r', 'r', 'a', 'l'], ['C', 'h', 'a', 'n'], ['.', '.', '.', 'E'], ['J', 'e', 'a', 'n'], ['V', 'e', 'r', 'n'], ['u', 'a', 'n', 'n'], ['.', '.', 'K', 'o'], ['.', '.', '.', 'A'], ['h', 'r', 'i', 's'], ['.', '.', '.', '.'], ['n', 'e', 't', 'a'], ['a', 'n', 'e', 'l'], ['.', '.', '.', '.'], ['u', 't', 'h', 'a'], ['.', '.', 'P', 'a'], ['.', '.', '.', '.'], ['.', '.', '.', 'S'], ['.', '.', '.', 'L'], ['K', 'a', 'l', 'l'], ['n', 'e', 's', 'i'], ['e', 'l', 'y', 'n'], ['.', '.', '.', '.'], ['g', 'g', 'i', 'e'], ['.', '.', '.', 'H'], ['.', '.',