In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-baby-names/StateNames.csv
/kaggle/input/us-baby-names/NationalReadMe.pdf
/kaggle/input/us-baby-names/hashes.txt
/kaggle/input/us-baby-names/NationalNames.csv
/kaggle/input/us-baby-names/StateReadMe.pdf
/kaggle/input/us-baby-names/database.sqlite


# Import libraries

In [2]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split

from tqdm import tqdm

# Device agnostic code

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
generator = torch.Generator(device=device)
print(f"default device set to {device}")

default device set to cpu


In [4]:
names_csv = pd.read_csv("/kaggle/input/us-baby-names/NationalNames.csv")
names_csv

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825428,1825429,Zykeem,2014,M,5
1825429,1825430,Zymeer,2014,M,5
1825430,1825431,Zymiere,2014,M,5
1825431,1825432,Zyran,2014,M,5


# Model hyperparameters
- context_size --> how many characters the model look at before making a prediction
- n_embd --> number of values per character token

In [5]:
context_size = 4
n_embd = 5
vocab = set("".join(names_csv["Name"]))
vocab.add(".")
vocab_size = len(vocab)

# Prepare dataset

In [6]:
stoi = {c: v for v, c in enumerate(vocab)}
itos = {v: c for c, v in stoi.items()}
print(stoi["."])
print(itos[39])

36
O


In [7]:
def make_dataset(data, context_size):
    inputs = []
    labels = []
    context = [stoi["."]] * context_size
    for name in names_csv["Name"]:
        for ch in name:
            inputs.append(context)
            labels.append(stoi[ch])
            context = context[1:] + [stoi[ch]]
    
    inputs = torch.tensor(inputs, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)
    
    return TensorDataset(inputs, labels)

In [8]:
dataset = make_dataset(data=names_csv, context_size=context_size)

In [9]:
train_split = int(len(dataset) * 0.8)
test_split = int(len(dataset) - train_split)
train_dataset, test_dataset = random_split(dataset=dataset, lengths=[train_split, test_split])

In [10]:
batch_size = 128
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
test_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, generator=generator)

# Model

In [11]:
class MLP(nn.Module):
    def __init__(self, context_size, vocab_size, n_embd):
        super().__init__()
        
        self.context_size = context_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        
        self.token_emb = nn.Embedding(vocab_size, n_embd) # B x T x C (B=batches; T=context_size, C=n_embd)
        self.linear1 = nn.Linear(in_features=context_size*n_embd, out_features=8*8)
        self.linear2 = nn.Linear(in_features=8*8, out_features=vocab_size)
        self.act_fn = nn.Tanh()
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.token_emb(x)
        
        B, T, C = x.shape
        x = x.view(B, T*C)
        x = self.act_fn(self.linear1(x))
        x = self.linear2(x)
        
        return x
    
    def generate_name(self, starting_char, max_length, randomize: bool):
        name = ""
        last_char = starting_char
        i = 0
        while last_char != "." and i < max_length:
            context = [stoi["."]] * (self.context_size - 1) + [stoi[last_char]]
            context = torch.tensor(context, dtype=torch.long).view(1, len(context))

            logits = self(context)
            percents = torch.softmax(logits, dim=1)
            
            if randomize:
                pred = torch.multinomial(percents, num_samples=1)
            else:
                pred = torch.argmax(percents, dim=1)
            
            i += 1
            name += itos[pred.item()]
            last_char = itos[pred.item()]
        return name

# Define the model, optimizer and loss function

In [12]:
model = MLP(context_size=context_size, vocab_size=vocab_size, n_embd=n_embd)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [13]:
# way to generate characters from probability distribution
tensor = torch.softmax(torch.randn(size=(1, vocab_size)), dim=1)
print(tensor.sum(dim=1))
multinomial = torch.multinomial(tensor, num_samples=1)
print(itos[multinomial.item()])

tensor([1.0000])
R


In [14]:
name_from_distribution = model.generate_name(starting_char="L", max_length=5, randomize=True)
name = model.generate_name(starting_char="L", max_length=5, randomize=False)

print(name_from_distribution)
print(name)

.
XCXCX


# Train the model

In [15]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for batch, (X, y) in enumerate(dataloader):
            logits = model(X)
            loss = loss_fn(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if batch % 5000 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")
                
    print(f"loss for the very last batch --> {loss}")

In [16]:
train_model(model=model, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=1)

loss for batch 0 --> 4.044730186462402 at epoch 0
loss for batch 5000 --> 2.564631223678589 at epoch 0
loss for batch 10000 --> 2.4463095664978027 at epoch 0
loss for batch 15000 --> 2.381837844848633 at epoch 0
loss for batch 20000 --> 2.5656380653381348 at epoch 0
loss for batch 25000 --> 2.3179445266723633 at epoch 0
loss for batch 30000 --> 2.4941458702087402 at epoch 0
loss for batch 35000 --> 2.3904638290405273 at epoch 0
loss for batch 40000 --> 2.551481008529663 at epoch 0
loss for batch 45000 --> 2.371795654296875 at epoch 0
loss for batch 50000 --> 2.34086012840271 at epoch 0
loss for batch 55000 --> 2.4614946842193604 at epoch 0
loss for batch 60000 --> 2.264216184616089 at epoch 0
loss for batch 65000 --> 2.3846659660339355 at epoch 0
loss for batch 70000 --> 2.4509963989257812 at epoch 0
loss for the very last batch --> 2.38173246383667


# Sample from the model

In [17]:
name_from_distribution = model.generate_name(starting_char="L", max_length=5, randomize=True)
name = model.generate_name(starting_char="L", max_length=5, randomize=False)

print(name_from_distribution)
print(name)

eElCh
aShaS
