<a href="https://www.kaggle.com/code/evelynartoria/mlp-pytorch-nlp?scriptVersionId=187596114" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-baby-names/StateNames.csv
/kaggle/input/us-baby-names/NationalReadMe.pdf
/kaggle/input/us-baby-names/hashes.txt
/kaggle/input/us-baby-names/NationalNames.csv
/kaggle/input/us-baby-names/StateReadMe.pdf
/kaggle/input/us-baby-names/database.sqlite


# Introduction
- This is a notebook presesnting how to build a MLP (Multilayer perceptron) for character level machine learning model
- this notebook follows the Building makemore Part 2: MLP by Andrej Karpathy (https://www.youtube.com/watch?v=TCH_1BHY58I&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=4)

# Import needed libraries

In [2]:
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split


# Device agnostic code

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
generator = torch.Generator(device=device)
print(f"default devcie set to {device}")

default devcie set to cpu


# Read the names csv file

In [4]:
names_csv = pd.read_csv("/kaggle/input/us-baby-names/NationalNames.csv")
names_csv

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825428,1825429,Zykeem,2014,M,5
1825429,1825430,Zymeer,2014,M,5
1825430,1825431,Zymiere,2014,M,5
1825431,1825432,Zyran,2014,M,5


# Prepare the data

In [5]:
names = names_csv["Name"].values
vocab = set("".join(names))
print(vocab)

{'D', 'Z', 'l', 'p', 'z', 'C', 'U', 'i', 'B', 'N', 'c', 'n', 'g', 'Q', 'F', 'o', 'j', 'q', 'b', 's', 'm', 'J', 'f', 'T', 't', 'h', 'a', 'X', 'I', 'v', 'P', 'x', 'V', 'e', 'H', 'M', 'W', 'O', 'G', 'E', 'k', 'S', 'R', 'w', 'Y', 'u', 'r', 'K', 'd', 'L', 'A', 'y'}


In [6]:
# string to id
stoi = {c:v+1 for v, c in enumerate(vocab)}
stoi["."] = 0

# id to string
itos = {v:c for c, v in stoi.items()}

print(stoi["g"])
print(itos[21])

vocab_size = len(stoi)
print(f"vocab size --> {vocab_size}")

13
m
vocab size --> 53


In [7]:
def make_dataset(context_size, names, log: bool):

    inputs = []
    labels = []

    for name in names:
        chs = name + "."
        context = ["."] * context_size

        for ch in chs:
            idx = stoi[ch]
            inputs.append([stoi[ch_context] for ch_context in context])
            labels.append(idx)
            
            if log:
                print(f"for context {context}, expect --> {itos[idx]}")
            
            context = context[1:] + list(ch)
    
    if log:
        print(inputs)
    
    inputs = torch.tensor(inputs)
    labels = torch.tensor(labels)
    dataset = TensorDataset(inputs, labels)
    
    if log:
        print(inputs.shape)
        print(labels.shape)
    return dataset

In [8]:
context_size = 6
names_dataset = make_dataset(context_size=context_size, names=names, log=False)

In [9]:
train_split = int(len(names_dataset) * 0.8)
test_split = int(len(names_dataset) - train_split)
train_dataset, test_dataset = random_split(dataset=names_dataset, lengths=[train_split, test_split])

In [10]:
batch_size = 64
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, generator=generator)

# Sample from the dataloader

In [11]:
batch_sample_inputs, batch_sample_labels = next(iter(test_dataloader))
print(batch_sample_inputs.shape, batch_sample_labels.shape)

torch.Size([64, 6]) torch.Size([64])


# Model

In [12]:
# n_embd is the number of values used to represent each token
n_embd = 5
lookup_table = torch.randn(size=(vocab_size, n_embd))
print(lookup_table)

tensor([[ 0.1910,  0.2529,  0.1702,  0.3490,  1.1524],
        [ 0.8226, -1.2103, -2.0078,  0.0331,  0.1466],
        [-0.4766,  0.6657, -1.2633, -0.7203, -0.6937],
        [ 0.9686, -1.0285, -0.6936,  0.6835,  0.5472],
        [-0.7300, -0.3681, -1.1399,  0.9989,  1.3538],
        [-1.8471, -0.8797,  0.0491, -0.2660, -1.2394],
        [ 0.2073,  0.3121,  0.5729,  1.6215,  0.2775],
        [-0.5958,  0.2120, -0.6438,  0.7397, -0.8305],
        [-0.7792, -0.4257,  0.2491,  1.5282,  0.5284],
        [ 1.8956, -1.0551,  1.3927, -0.6194, -0.4414],
        [ 0.1667, -1.1706, -0.5769, -0.8891,  0.6357],
        [-0.6914,  0.9539, -0.0478,  0.2615, -1.0424],
        [ 0.1152, -2.4405, -0.0821, -2.6383,  0.5856],
        [-0.5742, -0.8190, -0.4573, -0.4212, -0.2012],
        [-0.1637,  0.9159,  0.5086, -0.4996, -0.2310],
        [ 0.2977,  1.0562, -0.5292, -0.8227, -0.9170],
        [-2.0740, -1.2263,  2.2683, -1.8315, -0.2507],
        [ 0.1066,  0.5343, -0.6043, -0.6283, -0.0803],
        [ 

In [13]:
random_char_idx = torch.randint(0, vocab_size, (1,)).item()
random_char = itos[random_char_idx]
print(random_char)
print(lookup_table[random_char_idx]) # the random char is going to get n_embd values related to itself

m
tensor([-1.1214, -0.3443, -1.0614, -0.2809, -0.4767])


In [14]:
random_char_idx_tensor = torch.randint(0, vocab_size, (8, 4))
stacked = torch.stack([idx for idx in random_char_idx_tensor])

for stack in stacked:
    input_chs = ""
    for v in stack:
        input_chs += itos[v.item()]
        
    print(f"for input {input_chs} --> {stack}")

print(random_char)
print(lookup_table[stacked]) # the random char is going to get n_embd values related to itself
print(lookup_table[stacked].shape) # shape of B x T x C --> batches by inputs by channels

for input EFSU --> tensor([40, 15, 42,  7])
for input ..Nf --> tensor([ 0,  0, 10, 23])
for input OcxD --> tensor([38, 11, 32,  1])
for input FzBi --> tensor([15,  5,  9,  8])
for input Pfxd --> tensor([31, 23, 32, 49])
for input THWZ --> tensor([24, 35, 37,  2])
for input fqJd --> tensor([23, 18, 22, 49])
for input bPhl --> tensor([19, 31, 26,  3])
m
tensor([[[-0.8389,  0.0050, -0.0150,  1.0552, -0.3970],
         [ 0.2977,  1.0562, -0.5292, -0.8227, -0.9170],
         [ 0.6327,  0.3981, -1.0385, -0.3724, -0.1561],
         [-0.5958,  0.2120, -0.6438,  0.7397, -0.8305]],

        [[ 0.1910,  0.2529,  0.1702,  0.3490,  1.1524],
         [ 0.1910,  0.2529,  0.1702,  0.3490,  1.1524],
         [ 0.1667, -1.1706, -0.5769, -0.8891,  0.6357],
         [ 0.2415,  0.2750,  1.7493, -0.9243,  0.1461]],

        [[ 1.1010,  1.2167,  1.9880, -1.3722,  0.0265],
         [-0.6914,  0.9539, -0.0478,  0.2615, -1.0424],
         [-1.0135, -0.8282,  0.0238, -0.3103, -0.3234],
         [ 0.8226, -1.2103

In [15]:
# another way is to use nn.Embedding()
emb_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
random_char_idx_tensor = torch.randint(0, vocab_size, (8, 4), dtype=torch.long)


embedded = emb_table(random_char_idx_tensor)

print(random_char)
print(embedded.shape) # B x T x C (batches by time by channels)
# time is the number of inputs, channels is the number of values for each input
print(embedded)

m
torch.Size([8, 4, 5])
tensor([[[ 0.5135, -0.9732, -2.2502, -0.7153, -0.6811],
         [ 0.3091, -0.4326, -1.0256, -0.5363, -0.3226],
         [ 0.8491,  0.8945,  0.8710, -1.1318, -0.4168],
         [ 1.7743, -0.1195, -0.3694, -0.1886, -0.3251]],

        [[-0.4650,  0.5792,  0.6693,  1.4846,  0.4195],
         [ 1.7743, -0.1195, -0.3694, -0.1886, -0.3251],
         [-0.4650,  0.5792,  0.6693,  1.4846,  0.4195],
         [ 0.3151,  0.0644, -0.5634, -0.7968,  0.7176]],

        [[ 1.3678,  0.1365, -0.6316, -0.6158, -0.8207],
         [ 0.8803, -0.9316, -1.0286, -0.1380,  1.1855],
         [-0.6524, -0.3893, -1.5866, -1.6253,  0.7163],
         [ 0.4129, -0.7507, -0.7812,  0.7463,  0.5595]],

        [[ 0.2092,  1.3673,  1.5639,  0.3987, -1.1506],
         [ 0.5637,  0.4472, -0.1323,  0.7090, -0.2301],
         [ 0.5637,  0.4472, -0.1323,  0.7090, -0.2301],
         [ 0.0961, -0.6239,  1.1914, -1.2992,  0.5193]],

        [[ 1.5983,  2.1535, -0.2992, -0.1406,  0.1330],
         [ 0.280

In [16]:
embedded = emb_table(random_char_idx_tensor)

# Batches, Time and Channels
B, T, C = embedded.shape
#embedded = embedded.reshape(B, T*C) # proper shape to multiply the embedded information by weights
embedded = embedded.view(B, T*C)
print(embedded.shape)
print(embedded)

torch.Size([8, 20])
tensor([[ 0.5135, -0.9732, -2.2502, -0.7153, -0.6811,  0.3091, -0.4326, -1.0256,
         -0.5363, -0.3226,  0.8491,  0.8945,  0.8710, -1.1318, -0.4168,  1.7743,
         -0.1195, -0.3694, -0.1886, -0.3251],
        [-0.4650,  0.5792,  0.6693,  1.4846,  0.4195,  1.7743, -0.1195, -0.3694,
         -0.1886, -0.3251, -0.4650,  0.5792,  0.6693,  1.4846,  0.4195,  0.3151,
          0.0644, -0.5634, -0.7968,  0.7176],
        [ 1.3678,  0.1365, -0.6316, -0.6158, -0.8207,  0.8803, -0.9316, -1.0286,
         -0.1380,  1.1855, -0.6524, -0.3893, -1.5866, -1.6253,  0.7163,  0.4129,
         -0.7507, -0.7812,  0.7463,  0.5595],
        [ 0.2092,  1.3673,  1.5639,  0.3987, -1.1506,  0.5637,  0.4472, -0.1323,
          0.7090, -0.2301,  0.5637,  0.4472, -0.1323,  0.7090, -0.2301,  0.0961,
         -0.6239,  1.1914, -1.2992,  0.5193],
        [ 1.5983,  2.1535, -0.2992, -0.1406,  0.1330,  0.2801,  0.1762,  0.7456,
         -0.5360, -1.1181, -1.8922,  0.7219, -1.9671, -0.4800, -0.3

In [17]:
hidden_units = 100
w1 = torch.randn(size=(T*C, hidden_units))
b1 = torch.randn(size=(hidden_units,))
result = torch.tanh(embedded @ w1 + b1)
print(result.shape)

torch.Size([8, 100])


In [18]:
w2 = torch.randn(size=(100, vocab_size))
b2 = torch.randn(size=(vocab_size,))
logits = result @ w2 + b2
print(logits.shape) # B by vocab_size, each batch has 1 output and there are vocab_size possible characters as an output

torch.Size([8, 53])


In [19]:
percents = torch.softmax(logits, dim=1)
preds = torch.argmax(percents, dim=1)
print(percents.shape)
print(percents[0].sum())
print(preds.shape, preds)

torch.Size([8, 53])
tensor(1.0000, grad_fn=<SumBackward0>)
torch.Size([8]) tensor([40,  7, 47, 26,  5, 29, 52,  3])


# Turn all this mess into a model class

In [20]:
# vocab_size --> number of possible characters
# n_embd --> number of values associated with each token
class MLP(nn.Module):
    def __init__(self, context_size, vocab_size, n_embd):
        super().__init__()
        
        self.context_size = context_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        
        self.token_emb = nn.Embedding(vocab_size, n_embd) # B x T x C (B=batches; T=context_size, C=n_embd)
        self.linear1 = nn.Linear(in_features=context_size*n_embd, out_features=8*8)
        self.linear2 = nn.Linear(in_features=8*8, out_features=vocab_size)
        self.act_fn = nn.Tanh()
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.token_emb(x)
        
        B, T, C = x.shape
        x = x.view(B, T*C)
        x = self.act_fn(self.linear1(x))
        x = self.linear2(x)
        
        return x
    
    def generate_name(self, starting_char, max_length, randomize: bool):
        name = ""
        last_char = starting_char
        i = 0
        while last_char != "." and i < max_length:
            context = [stoi["."]] * (self.context_size - 1) + [stoi[last_char]]
            context = torch.tensor(context, dtype=torch.long).view(1, len(context))

            logits = self(context)
            percents = torch.softmax(logits, dim=1)
            
            if randomize:
                pred = torch.multinomial(percents, num_samples=1)
            else:
                pred = torch.argmax(percents, dim=1)
            
            i += 1
            name += itos[pred.item()]
            last_char = itos[pred.item()]
        return name

# Define the model, optimizer and loss function

In [21]:
model = MLP(vocab_size=vocab_size, context_size=context_size, n_embd=8)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [22]:
def list_to_characters(array):
    strings = []
    for v in array:
        string = [itos[idx] for idx in v]
        strings.append(string)
        
    return strings

# Model infernce

In [23]:
def model_inference(model, dataloader):
    model.eval()
    with torch.inference_mode():
        X, y = next(iter(dataloader))
        logits = model(X)
        percents = torch.softmax(logits, dim=1)
        preds = torch.argmax(percents, dim=1)
        preds_array = [idx_array.item() for idx_array in preds]
        labels_array = [label_array.item() for label_array in y]
        print(preds)
        print("\n")
        batches = [batch for batch in X]
        separated_inputs = [batch.tolist() for batch in batches]
        print(f"the batches are {list_to_characters(array=separated_inputs)}\n")
        print(f"model predicted {[itos[idx] for idx in preds_array]}\n")
        print(f"expected --> {[itos[label] for label in labels_array]}\n")

In [24]:
model_inference(model=model, dataloader=train_dataloader)

tensor([37, 45, 16, 17, 39, 45, 17, 45,  4,  4, 17, 33,  4, 50, 26, 45, 17,  4,
        43, 50, 46, 25, 17, 45, 45, 17, 17, 17, 10,  4,  4, 39, 17,  4, 43,  7,
        17, 45, 17, 17, 15,  4, 51, 51, 16, 39, 45, 17, 37, 39, 45, 25, 17, 17,
        39, 37, 43, 39, 17, 17, 15, 17,  4, 45])


the batches are [['r', 't', 'e', 'm', 'i', 'o'], ['.', '.', '.', '.', 'G', 'e'], ['m', 'i', 's', 'o', 'l', 'a'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', 'M', 'a', 'u'], ['.', '.', '.', '.', 'W', 'i'], ['.', '.', '.', '.', '.', '.'], ['.', '.', 'E', 'v', 'y', 'n'], ['.', '.', '.', '.', 'U', 'm'], ['.', '.', '.', '.', '.', 'S'], ['.', '.', '.', 'V', 'a', 'n'], ['t', 'o', 'p', 'h', 'e', 'r'], ['.', '.', '.', 'S', 'h', 'a'], ['.', '.', '.', '.', 'Q', 'u'], ['.', 'C', 'o', 'r', 'n', 'e'], ['.', '.', '.', 'C', 'r', 'i'], ['.', '.', '.', '.', '.', 'R'], ['.', '.', '.', '.', '.', 'L'], ['.', '.', '.', 'J', 'e', 'n'], ['.', '.', 'C', 'a', 'm', 'y'], ['D', 'a', 'l', 'l', 'i', 's'], ['.', 'F', 'l', 'o',

# Training loop

In [25]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for batch, (X, y) in enumerate(dataloader):
            logits = model(X)
            loss = loss_fn(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if batch % 5000 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")
    print(f"loss for the very last batch --> {loss}")

In [26]:
train_model(model=model, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=1)

loss for batch 0 --> 3.974172592163086 at epoch 0
loss for batch 5000 --> 2.185049533843994 at epoch 0
loss for batch 10000 --> 2.342066764831543 at epoch 0
loss for batch 15000 --> 2.356318950653076 at epoch 0
loss for batch 20000 --> 1.946466326713562 at epoch 0
loss for batch 25000 --> 1.9315907955169678 at epoch 0
loss for batch 30000 --> 2.0936498641967773 at epoch 0
loss for batch 35000 --> 2.2282559871673584 at epoch 0
loss for batch 40000 --> 2.230684518814087 at epoch 0
loss for batch 45000 --> 2.0686068534851074 at epoch 0
loss for batch 50000 --> 2.162890672683716 at epoch 0
loss for batch 55000 --> 2.3669183254241943 at epoch 0
loss for batch 60000 --> 2.1972827911376953 at epoch 0
loss for batch 65000 --> 2.136941909790039 at epoch 0
loss for batch 70000 --> 2.2070326805114746 at epoch 0
loss for batch 75000 --> 2.09073805809021 at epoch 0
loss for batch 80000 --> 2.1862876415252686 at epoch 0
loss for batch 85000 --> 2.0733842849731445 at epoch 0
loss for batch 90000 --> 

# Test the model

In [27]:
model_inference(model=model, dataloader=train_dataloader)

tensor([27,  0, 49, 23, 27, 27,  3,  0, 47, 47,  8, 51, 27, 27, 12, 20, 12, 51,
        20,  0,  3,  0,  0, 47, 27, 51, 27,  0,  3,  0, 47,  0, 27,  8, 34, 27,
        20, 27,  0,  0, 27, 47, 12,  0, 47,  3,  0, 27,  0, 21, 27, 34, 51, 51,
        47, 27, 51,  0,  0, 51, 51,  0, 51, 27])


the batches are [['.', '.', '.', '.', '.', 'M'], ['e', 'a', 'r', 'l', 'i', 'n'], ['.', '.', '.', 'W', 'o', 'o'], ['.', '.', '.', 'R', 'u', 'f'], ['.', '.', '.', '.', '.', 'K'], ['.', '.', '.', '.', '.', 'M'], ['.', '.', 'M', 'c', 'c', 'a'], ['q', 'u', 'i', 'n', 't', 'a'], ['.', 'J', 'a', 's', 'p', 'a'], ['.', '.', '.', '.', 'T', 'a'], ['.', '.', '.', 'P', 'e', 'c'], ['.', '.', '.', '.', '.', '.'], ['.', '.', '.', '.', '.', 'M'], ['.', '.', '.', '.', '.', 'N'], ['.', '.', 'J', 'o', 'd', 'i'], ['.', '.', '.', '.', 'J', 'o'], ['.', '.', 'J', 'e', 'r', 'e'], ['.', '.', '.', '.', '.', '.'], ['B', 'r', 'e', 'l', 'e', 'i'], ['B', 'e', 'n', 'n', 'i', 'e'], ['.', '.', '.', 'V', 'i', 'b'], ['K', 'r', 'i', 's',

# Generate name

In [28]:
class model_generator:
    def __init__(self, model, length, num_samples):
        self.outputs = []
        self.model = model
        self.length = length
        self.num_samples = num_samples
    
    @torch.no_grad
    def generate(self, starting_char):
        model.eval()
        for i in range(self.num_samples):
            output = self.model.generate_name(starting_char=starting_char, max_length=self.length, randomize=True)
            self.outputs.append(output)
    
    def update_params(self, clear_outputs: bool, model=None, length=None, num_samples=None):
        if clear_outputs:
            self.clear_outputs()
            
        update_dict = {
            "model": model,
            "length": length,
            "num_samples": num_samples
        }
        
        for attribute, value in update_dict.items():
            if value is not None:
                setattr(self, attribute, value)
    
    def clear_outputs(self):
        self.outputs = []
    
    def print_outputs(self):
        for output in self.outputs:
            print(f"{output}\n\n")
    

In [29]:
name_generator = model_generator(model=model, length=7, num_samples=5)
name_generator.generate(starting_char="L")
name_generator.print_outputs()

elavila


eayacea


ayseaqo


umainer


eeanaea


