<a href="https://www.kaggle.com/code/evelynartoria/mlp-pytorch-nlp?scriptVersionId=187413733" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-baby-names/StateNames.csv
/kaggle/input/us-baby-names/NationalReadMe.pdf
/kaggle/input/us-baby-names/hashes.txt
/kaggle/input/us-baby-names/NationalNames.csv
/kaggle/input/us-baby-names/StateReadMe.pdf
/kaggle/input/us-baby-names/database.sqlite


# Introduction
- This is a notebook presesnting how to build a MLP (Multilayer perceptron) for character level machine learning model
- this notebook follows the Building makemore Part 2: MLP by Andrej Karpathy (https://www.youtube.com/watch?v=TCH_1BHY58I&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=4)

# Import needed libraries

In [2]:
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split


# Device agnostic code

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
generator = torch.Generator(device=device)
print(f"default devcie set to {device}")

default devcie set to cpu


# Read the names csv file

In [4]:
names_csv = pd.read_csv("/kaggle/input/us-baby-names/NationalNames.csv")
names_csv

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825428,1825429,Zykeem,2014,M,5
1825429,1825430,Zymeer,2014,M,5
1825430,1825431,Zymiere,2014,M,5
1825431,1825432,Zyran,2014,M,5


# Prepare the data

In [5]:
names = names_csv["Name"].values
vocab = set("".join(names))
print(vocab)

{'Q', 'x', 't', 'S', 'M', 'n', 'R', 'Y', 'y', 'B', 'W', 'O', 'h', 'I', 'V', 'i', 'X', 'F', 'K', 'u', 'b', 'c', 'U', 'z', 's', 'A', 'f', 'D', 'T', 'C', 'd', 'v', 'E', 'p', 'l', 'G', 'a', 'r', 'o', 'H', 'w', 'j', 'm', 'k', 'J', 'Z', 'e', 'q', 'g', 'N', 'L', 'P'}


In [6]:
# string to id
stoi = {c:v+1 for v, c in enumerate(vocab)}
stoi["."] = 0

# id to string
itos = {v:c for c, v in stoi.items()}

print(stoi["g"])
print(itos[21])

vocab_size = len(stoi)
print(f"vocab size --> {vocab_size}")

49
b
vocab size --> 53


In [7]:
def make_dataset(context_size, names, log: bool):

    inputs = []
    labels = []

    for name in names:
        chs = name + "."
        context = ["."] * context_size

        for ch in chs:
            idx = stoi[ch]
            inputs.append([stoi[ch_context] for ch_context in context])
            labels.append(idx)
            
            if log:
                print(f"for context {context}, expect --> {itos[idx]}")
            
            context = context[1:] + list(ch)
    
    if log:
        print(inputs)
    
    inputs = torch.tensor(inputs)
    labels = torch.tensor(labels)
    dataset = TensorDataset(inputs, labels)
    
    if log:
        print(inputs.shape)
        print(labels.shape)
    return dataset

In [8]:
context_size = 6
names_dataset = make_dataset(context_size=context_size, names=names, log=False)

In [9]:
train_split = int(len(names_dataset) * 0.8)
test_split = int(len(names_dataset) - train_split)
train_dataset, test_dataset = random_split(dataset=names_dataset, lengths=[train_split, test_split])

In [10]:
batch_size = 64
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, generator=generator)

# Sample from the dataloader

In [11]:
batch_sample_inputs, batch_sample_labels = next(iter(test_dataloader))
print(batch_sample_inputs.shape, batch_sample_labels.shape)

torch.Size([64, 6]) torch.Size([64])


# Model

In [12]:
# n_embd is the number of values used to represent each token
n_embd = 5
lookup_table = torch.randn(size=(vocab_size, n_embd))
print(lookup_table)

tensor([[-1.4869, -0.3235,  1.2784, -0.3999, -0.4920],
        [ 1.1827, -1.2273,  0.5267, -1.8182, -0.9312],
        [ 2.1601,  0.9220, -0.1843, -0.3023,  0.1793],
        [-0.3970,  1.4365,  0.4366,  0.6143,  0.5874],
        [-1.1577,  0.4677,  1.2219,  0.2994,  1.4609],
        [ 1.6026,  0.2643, -0.5547,  1.6054,  0.0086],
        [-0.3132,  1.1740,  0.0394,  0.0038, -0.2761],
        [ 0.4117,  0.5724,  1.8255, -0.6417, -0.3732],
        [ 0.4354,  0.1918,  1.7422,  0.3736,  0.1066],
        [ 0.2577,  1.2712,  0.0446,  1.5371,  0.0563],
        [ 0.9605,  1.5735,  2.2687,  0.1062, -0.6861],
        [ 0.0551, -0.3009, -1.1085, -0.7830, -1.9565],
        [-0.1811,  0.2476,  1.3800, -0.9123,  0.3325],
        [ 0.0937, -0.2027,  2.1665, -1.1368,  0.9071],
        [-0.1031, -0.6882,  0.9750, -0.5984,  1.9706],
        [ 0.0314,  0.3113, -0.6589,  0.3337, -1.5489],
        [-1.5637,  0.8031,  0.9257, -1.6233,  0.0383],
        [ 1.0538,  0.4708,  0.6737, -1.5698,  0.7666],
        [ 

In [13]:
random_char_idx = torch.randint(0, vocab_size, (1,)).item()
random_char = itos[random_char_idx]
print(random_char)
print(lookup_table[random_char_idx]) # the random char is going to get n_embd values related to itself

O
tensor([-0.1811,  0.2476,  1.3800, -0.9123,  0.3325])


In [14]:
random_char_idx_tensor = torch.randint(0, vocab_size, (8, 4))
stacked = torch.stack([idx for idx in random_char_idx_tensor])

for stack in stacked:
    input_chs = ""
    for v in stack:
        input_chs += itos[v.item()]
        
    print(f"for input {input_chs} --> {stack}")

print(random_char)
print(lookup_table[stacked]) # the random char is going to get n_embd values related to itself
print(lookup_table[stacked].shape) # shape of B x T x C --> batches by inputs by channels

for input ozvp --> tensor([39, 24, 32, 34])
for input bwzw --> tensor([21, 41, 24, 41])
for input miQB --> tensor([43, 16,  1, 10])
for input xSXz --> tensor([ 2,  4, 17, 24])
for input qXUp --> tensor([48, 17, 23, 34])
for input q.rP --> tensor([48,  0, 38, 52])
for input jSJS --> tensor([42,  4, 45,  4])
for input NRMc --> tensor([50,  7,  5, 22])
O
tensor([[[ 1.9440, -0.1757, -1.1439, -1.2610,  1.2359],
         [ 1.2999, -1.3299,  0.7816, -0.1127, -0.2495],
         [ 0.1256, -0.4440, -2.0239,  0.7251,  0.5738],
         [-0.4340, -2.3415,  1.4259,  0.1838,  0.4829]],

        [[ 0.1973,  0.2920,  1.1155, -2.1830, -0.1908],
         [-0.0823,  0.7344,  0.1401,  1.4766,  1.3831],
         [ 1.2999, -1.3299,  0.7816, -0.1127, -0.2495],
         [-0.0823,  0.7344,  0.1401,  1.4766,  1.3831]],

        [[-0.0992, -0.1694,  1.1950,  0.1887,  1.9428],
         [-1.5637,  0.8031,  0.9257, -1.6233,  0.0383],
         [ 1.1827, -1.2273,  0.5267, -1.8182, -0.9312],
         [ 0.9605,  1.5735

In [15]:
# another way is to use nn.Embedding()
emb_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
random_char_idx_tensor = torch.randint(0, vocab_size, (8, 4), dtype=torch.long)


embedded = emb_table(random_char_idx_tensor)

print(random_char)
print(embedded.shape) # B x T x C (batches by time by channels)
# time is the number of inputs, channels is the number of values for each input
print(embedded)

O
torch.Size([8, 4, 5])
tensor([[[-0.4063,  1.1518,  0.4449, -1.1142,  0.5560],
         [-0.3924,  0.5616,  0.7872, -1.6846, -0.1319],
         [-0.3070, -0.0624, -0.9929,  0.1439,  0.8113],
         [ 0.6039,  0.3659, -1.6956, -1.0783, -0.7340]],

        [[-0.4430,  0.1270,  0.9508,  0.1614, -1.8309],
         [-0.3887, -0.7361,  0.5584, -1.5667,  0.9043],
         [ 0.5428,  0.9093, -0.7722, -0.1605,  1.8612],
         [ 0.2011,  0.0950, -0.0681, -1.0228,  0.7966]],

        [[ 0.3206, -2.8351, -1.0463, -2.0307, -1.5411],
         [ 0.5428,  0.9093, -0.7722, -0.1605,  1.8612],
         [-0.8781,  1.0908, -2.4032,  0.3324,  0.1659],
         [-0.1619, -1.1855, -0.0716, -1.2106, -0.7732]],

        [[-0.5573, -0.4651,  0.7077, -0.8789, -0.5335],
         [ 1.2120,  1.0643,  0.3840, -1.2617,  0.7788],
         [ 0.3206, -2.8351, -1.0463, -2.0307, -1.5411],
         [-0.5798, -2.4853, -0.1025,  1.3449,  0.3872]],

        [[ 1.2120,  1.0643,  0.3840, -1.2617,  0.7788],
         [ 0.679

In [16]:
embedded = emb_table(random_char_idx_tensor)

# Batches, Time and Channels
B, T, C = embedded.shape
#embedded = embedded.reshape(B, T*C) # proper shape to multiply the embedded information by weights
embedded = embedded.view(B, T*C)
print(embedded.shape)
print(embedded)

torch.Size([8, 20])
tensor([[-0.4063,  1.1518,  0.4449, -1.1142,  0.5560, -0.3924,  0.5616,  0.7872,
         -1.6846, -0.1319, -0.3070, -0.0624, -0.9929,  0.1439,  0.8113,  0.6039,
          0.3659, -1.6956, -1.0783, -0.7340],
        [-0.4430,  0.1270,  0.9508,  0.1614, -1.8309, -0.3887, -0.7361,  0.5584,
         -1.5667,  0.9043,  0.5428,  0.9093, -0.7722, -0.1605,  1.8612,  0.2011,
          0.0950, -0.0681, -1.0228,  0.7966],
        [ 0.3206, -2.8351, -1.0463, -2.0307, -1.5411,  0.5428,  0.9093, -0.7722,
         -0.1605,  1.8612, -0.8781,  1.0908, -2.4032,  0.3324,  0.1659, -0.1619,
         -1.1855, -0.0716, -1.2106, -0.7732],
        [-0.5573, -0.4651,  0.7077, -0.8789, -0.5335,  1.2120,  1.0643,  0.3840,
         -1.2617,  0.7788,  0.3206, -2.8351, -1.0463, -2.0307, -1.5411, -0.5798,
         -2.4853, -0.1025,  1.3449,  0.3872],
        [ 1.2120,  1.0643,  0.3840, -1.2617,  0.7788,  0.6795,  1.2414, -0.5720,
         -0.7167,  1.0474,  0.0239,  0.6004,  0.5630,  1.4730, -0.2

In [17]:
hidden_units = 100
w1 = torch.randn(size=(T*C, hidden_units))
b1 = torch.randn(size=(hidden_units,))
result = torch.tanh(embedded @ w1 + b1)
print(result.shape)

torch.Size([8, 100])


In [18]:
w2 = torch.randn(size=(100, vocab_size))
b2 = torch.randn(size=(vocab_size,))
logits = result @ w2 + b2
print(logits.shape) # B by vocab_size, each batch has 1 output and there are vocab_size possible characters as an output

torch.Size([8, 53])


In [19]:
percents = torch.softmax(logits, dim=1)
preds = torch.argmax(percents, dim=1)
print(percents.shape)
print(percents[0].sum())
print(preds.shape, preds)

torch.Size([8, 53])
tensor(1., grad_fn=<SumBackward0>)
torch.Size([8]) tensor([28, 28, 23, 20,  6, 14,  6, 28])


# Turn all this mess into a model class

In [20]:
# vocab_size --> number of possible characters
# n_embd --> number of values associated with each token
class MLP(nn.Module):
    def __init__(self, context_size, vocab_size, n_embd):
        super().__init__()
        
        self.context_size = context_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        
        self.token_emb = nn.Embedding(vocab_size, n_embd) # B x T x C (B=batches; T=context_size, C=n_embd)
        self.linear1 = nn.Linear(in_features=context_size*n_embd, out_features=8*8)
        self.linear2 = nn.Linear(in_features=8*8, out_features=vocab_size)
        self.act_fn = nn.Tanh()
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.token_emb(x)
        
        B, T, C = x.shape
        x = x.view(B, T*C)
        x = self.act_fn(self.linear1(x))
        x = self.linear2(x)
        
        return x
    
    def generate_name(self, starting_char, max_length, randomize: bool):
        name = ""
        last_char = starting_char
        i = 0
        while last_char != "." and i < max_length:
            context = [stoi["."]] * (self.context_size - 1) + [stoi[last_char]]
            context = torch.tensor(context, dtype=torch.long).view(1, len(context))

            logits = self(context)
            percents = torch.softmax(logits, dim=1)
            
            if randomize:
                pred = torch.multinomial(percents, num_samples=1)
            else:
                pred = torch.argmax(percents, dim=1)
            
            i += 1
            name += itos[pred.item()]
            last_char = itos[pred.item()]
        return name

# Define the model, optimizer and loss function

In [21]:
model = MLP(vocab_size=vocab_size, context_size=context_size, n_embd=8)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [22]:
def list_to_characters(array):
    strings = []
    for v in array:
        string = [itos[idx] for idx in v]
        strings.append(string)
        
    return strings

# Model infernce

In [23]:
def model_inference(model, dataloader):
    model.eval()
    with torch.inference_mode():
        X, y = next(iter(dataloader))
        logits = model(X)
        percents = torch.softmax(logits, dim=1)
        preds = torch.argmax(percents, dim=1)
        preds_array = [idx_array.item() for idx_array in preds]
        labels_array = [label_array.item() for label_array in y]
        print(preds)
        print("\n")
        batches = [batch for batch in X]
        separated_inputs = [batch.tolist() for batch in batches]
        print(f"the batches are {list_to_characters(array=separated_inputs)}\n")
        print(f"model predicted {[itos[idx] for idx in preds_array]}\n")
        print(f"expected --> {[itos[label] for label in labels_array]}\n")

In [24]:
model_inference(model=model, dataloader=train_dataloader)

tensor([19, 22, 12, 12, 38, 12,  8, 12, 12, 29, 29, 12, 27, 29, 22, 31, 27, 29,
        45, 29, 29, 36, 12, 12, 38, 12, 47, 37, 47, 38, 50, 12, 28, 36, 29, 22,
        12, 30, 30, 37, 19, 29, 12, 37, 12, 38, 37, 42, 12, 22, 12, 38, 12,  9,
        12, 23, 29, 49, 29, 49, 30, 12, 12,  9])


the batches are [['.', '.', '.', 'S', 'h', 'a'], ['.', 'M', 'e', 'r', 'r', 'i'], ['n', 'c', 'o', 'i', 's', 'e'], ['.', '.', '.', '.', '.', 'R'], ['r', 'm', 'a', 'n', 'd', 'i'], ['B', 'l', 'a', 'i', 'n', 'e'], ['.', 'D', 'o', 'y', 'a', 'l'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', '.', '.', 'O'], ['H', 'e', 'n', 'd', 'e', 'r'], ['.', '.', '.', '.', '.', 'L'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', 'K', 'a', 't'], ['.', '.', '.', 'E', 'l', 'l'], ['K', 'i', 'r', 's', 't', 'i'], ['.', '.', '.', '.', 'T', 'e'], ['.', 'N', 'a', 'n', 'c', 'i'], ['.', '.', '.', 'R', 'o', 'l'], ['.', '.', 'D', 'a', 'r', 'e'], ['.', '.', '.', '.', 'I', 'r'], ['.', '.', '.', '.', '.', 'T'], ['.', '.', '.', '.',

# Training loop

In [25]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for batch, (X, y) in enumerate(dataloader):
            logits = model(X)
            loss = loss_fn(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if batch % 5000 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")
    print(f"loss for the very last batch --> {loss}")

In [26]:
train_model(model=model, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=2)

loss for batch 0 --> 3.9573333263397217 at epoch 0
loss for batch 5000 --> 2.03257417678833 at epoch 0
loss for batch 10000 --> 1.977204442024231 at epoch 0
loss for batch 15000 --> 2.212308406829834 at epoch 0
loss for batch 20000 --> 1.8828935623168945 at epoch 0
loss for batch 25000 --> 2.020533561706543 at epoch 0
loss for batch 30000 --> 2.5284740924835205 at epoch 0
loss for batch 35000 --> 2.233997106552124 at epoch 0
loss for batch 40000 --> 1.739423155784607 at epoch 0
loss for batch 45000 --> 2.1143953800201416 at epoch 0
loss for batch 50000 --> 2.036633014678955 at epoch 0
loss for batch 55000 --> 1.9260410070419312 at epoch 0
loss for batch 60000 --> 1.9914218187332153 at epoch 0
loss for batch 65000 --> 2.01112699508667 at epoch 0
loss for batch 70000 --> 2.0165247917175293 at epoch 0
loss for batch 75000 --> 2.1636102199554443 at epoch 0
loss for batch 80000 --> 1.8721739053726196 at epoch 0
loss for batch 85000 --> 2.0042881965637207 at epoch 0
loss for batch 90000 --> 

# Test the model

In [27]:
model_inference(model=model, dataloader=train_dataloader)

tensor([37, 26, 37, 16,  0,  0,  0, 38, 47, 37, 26,  0, 26, 26,  0, 47, 27, 47,
         0, 16, 44, 13, 38, 25, 37, 38, 26,  6, 25, 26, 43, 38,  6,  6, 13,  6,
        26, 38, 26, 20, 47, 37, 47,  0, 26, 21,  0, 37,  0,  0,  0, 37, 37, 38,
        37,  0, 25,  0, 16, 26,  0, 38, 37, 22])


the batches are [['.', '.', '.', 'K', 'a', 'm'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', '.', '.', 'L'], ['.', '.', 'G', 'e', 'r', 'd'], ['.', '.', 'M', 'y', 'r', 'a'], ['T', 'a', 'm', 'a', 'r', 'a'], ['L', 'o', 'w', 'a', 'n', 'a'], ['.', '.', 'E', 'l', 'v', 'e'], ['.', '.', 'L', 'a', 't', 'e'], ['.', '.', 'H', 'e', 'r', 'm'], ['.', '.', '.', '.', '.', '.'], ['.', 'L', 'a', 'k', 'e', 'n'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', '.', '.', '.'], ['B', 'r', 'e', 'i', 'o', 'n'], ['.', '.', 'J', 'a', 'y', 'n'], ['.', '.', '.', 'H', 'a', 'f'], ['.', '.', '.', 'A', 'a', 'l'], ['.', '.', 'S', 'a', 'r', 'a'], ['.', '.', '.', 'Y', 'a', 'd'], ['.', '.', 'B', 'r', 'i', 'c'], ['.', 'J', 'o', 'n',

# Generate name

In [28]:
name_from_distribution = model.generate_name(starting_char="L", max_length=5, randomize=True)
name = model.generate_name(starting_char="L", max_length=5, randomize=False)

print(name_from_distribution)
print(name)

okese
akese
