<a href="https://www.kaggle.com/code/evelynartoria/bigram-like-model-pytorch-nlp?scriptVersionId=187156852" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-baby-names/StateNames.csv
/kaggle/input/us-baby-names/NationalReadMe.pdf
/kaggle/input/us-baby-names/hashes.txt
/kaggle/input/us-baby-names/NationalNames.csv
/kaggle/input/us-baby-names/StateReadMe.pdf
/kaggle/input/us-baby-names/database.sqlite


# This is a bigram like aproach for next character prediction

# Import needed libraries

In [2]:
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split

from tqdm import tqdm

# Device agnostic code

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
print(f"default device set to {device}")

default device set to cpu


# Read data

In [4]:
names_csv = pd.read_csv("/kaggle/input/us-baby-names/NationalNames.csv")
names_csv

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825428,1825429,Zykeem,2014,M,5
1825429,1825430,Zymeer,2014,M,5
1825430,1825431,Zymiere,2014,M,5
1825431,1825432,Zyran,2014,M,5


# Prepare the data

In [5]:
names = names_csv["Name"].values
vocab = set("".join(names))

print(vocab)

{'C', 'Q', 'g', 'S', 'p', 'e', 'a', 'P', 'O', 'Y', 'X', 'N', 'L', 'U', 'E', 'm', 'b', 'd', 'y', 'c', 'H', 'T', 'h', 'f', 't', 'x', 'Z', 'r', 'q', 'I', 'k', 'J', 'A', 'z', 'o', 's', 'j', 'w', 'u', 'G', 'n', 'W', 'K', 'F', 'V', 'D', 'M', 'v', 'R', 'l', 'B', 'i'}


In [6]:
stoi = {c:v+2 for v, c in enumerate(vocab)}
stoi["<SOS>"] = 0
stoi["<EOS>"] = 1

itos = {v:c for c, v in stoi.items()}

vocab_size = len(stoi)

print(stoi["M"])
print(itos[43])
print(vocab_size)

48
W
54


In [7]:
inputs = []
labels = []
for name in names[:2]:
    chs = ["<SOS>"] + list(name) + ["<EOS>"]
    
    for ch1, ch2 in zip(chs, chs[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        
        inputs.append(idx1)
        labels.append(idx2)
        
inputs = torch.tensor(inputs, dtype=torch.int32)
labels = torch.tensor(labels, dtype=torch.int32)

# Visualize the data

In [8]:
for X, y in zip(inputs, labels):
    print(f"for the input {X.item()} ({itos[X.item()]}), expected --> {y.item()}; {itos[y.item()]}")

for the input 0 (<SOS>), expected --> 48; M
for the input 48 (M), expected --> 8; a
for the input 8 (a), expected --> 29; r
for the input 29 (r), expected --> 20; y
for the input 20 (y), expected --> 1; <EOS>
for the input 0 (<SOS>), expected --> 34; A
for the input 34 (A), expected --> 42; n
for the input 42 (n), expected --> 42; n
for the input 42 (n), expected --> 8; a
for the input 8 (a), expected --> 1; <EOS>


# Construct the datasets and dataloaders

In [9]:
def make_dataset(names):
    inputs = []
    labels = []
    for name in names:
        chs = ["<SOS>"] + list(name) + ["<EOS>"]

        for ch1, ch2 in zip(chs, chs[1:]):
            idx1 = stoi[ch1]
            idx2 = stoi[ch2]

            inputs.append(idx1)
            labels.append(idx2)

    inputs = torch.tensor(inputs, dtype=torch.int32)
    labels = torch.tensor(labels, dtype=torch.int32)
    
    dataset = TensorDataset(inputs, labels)
    return dataset

In [10]:
generator = torch.Generator(device=device)

names_dataset = make_dataset(names=names)

train_split = int(len(names_dataset) * 0.8)
test_split = int(len(names_dataset) - train_split)

train_dataset, test_dataset = random_split(dataset=names_dataset, lengths=[train_split, test_split], generator=generator)

In [11]:
batch_size = 64
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, generator=generator)

# Sample from the dataloader

In [12]:
batch_sample_inputs, batch_sample_labels = next(iter(test_dataloader))
print(batch_sample_inputs, batch_sample_labels)

tensor([ 7,  0,  0,  7,  0,  0,  0,  8,  7,  8,  0, 26, 23, 53,  8, 48, 24,  7,
         0,  5,  7, 20,  0, 26, 42, 29,  0,  4,  8, 51,  8,  0,  7, 53, 19,  7,
        37, 34, 53, 47, 37,  0,  7,  8,  0,  8,  8, 21, 42, 19, 24, 42, 51,  0,
        42,  9,  8, 42, 35, 34,  8, 53, 42, 42], dtype=torch.int32) tensor([25, 41, 10, 49, 33,  5, 50, 19, 29, 29,  9, 42,  8, 42, 40,  8, 29, 29,
        15, 26, 29,  1, 48,  1,  1, 53, 44,  7,  1, 51, 37,  2, 42, 26,  7,  1,
         1, 42, 42, 36, 26,  3,  8,  1, 34, 42, 20,  8,  1,  8,  1,  1,  1, 23,
         1, 36, 42,  1, 53, 17,  1, 51, 42,  7], dtype=torch.int32)


# Classifier model

In [13]:
class ClassifierModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.linear1 = nn.Linear(in_features=1, out_features=8*8, dtype=torch.float32)
        self.linear2 = nn.Linear(in_features=8*8, out_features=8*8*8, dtype=torch.float32)
        self.linear3 = nn.Linear(in_features=8*8*8, out_features=vocab_size, dtype=torch.float32)
        
        self.act_fn = nn.ReLU()
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.act_fn(self.linear1(x))
        x = self.act_fn(self.linear2(x))
        x = self.linear3(x)
        
        return x
        

# Bigram like model

In [14]:
class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size, dtype=torch.float32)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        logits = self.embedding_table(x)
        B, T, C = logits.shape
        
        logits = logits.reshape(B*T, C)
        
        return logits

In [15]:
classifier_model = ClassifierModel(vocab_size=vocab_size)
bigram_like_model = BigramModel(vocab_size=vocab_size)


bigram_optimizer = torch.optim.Adam(params=bigram_like_model.parameters(), lr=1e-1)
loss_fn = nn.CrossEntropyLoss()

In [16]:
emb = nn.Embedding(vocab_size, vocab_size)
inputs = torch.tensor([[0]], dtype=torch.long)
embedded = emb(inputs)
print(embedded)
print(embedded.shape) # batch by inputs by n_embd (n_embd is the number or possible tokens associated with each input)


tensor([[[-2.2135e-01,  1.8206e-01,  1.5204e+00, -4.1473e-01, -1.1851e-04,
          -6.9320e-01,  4.3481e-01, -8.3395e-01,  5.4692e-02, -9.8160e-01,
           6.4134e-01, -6.5377e-02,  5.8582e-03, -4.0661e-01, -1.9806e-01,
          -9.3439e-01, -1.0162e+00,  2.5442e-01,  3.0803e-01, -2.4471e-01,
          -5.3298e-02, -1.8201e+00, -1.9675e+00,  1.6593e+00, -1.5022e-01,
           7.1850e-01, -4.9820e-01,  1.3548e+00, -1.2402e+00,  3.4825e-01,
           5.9895e-01,  7.6718e-01,  1.8232e+00,  1.7031e+00, -1.4640e+00,
          -6.9740e-01,  9.4774e-02, -2.0993e-01,  4.3434e-01, -2.7525e-01,
          -2.4825e-01,  7.8016e-01,  5.5718e-01,  1.9110e-01,  8.8480e-02,
          -9.3037e-02,  3.7705e-01, -1.9262e-01,  1.2340e+00,  2.5133e-01,
          -9.5177e-01,  3.3684e-01, -1.3013e+00,  5.7847e-01]]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([1, 1, 54])


# Model inference

In [17]:
def model_inference(model, starting_char, bigram_model: bool):
    model.eval()
    #input_sample = inputs[0].to(torch.float32)
    if bigram_model:
        input_sample = torch.tensor(stoi[starting_char], dtype=torch.long)
    else:
        input_sample = torch.tensor(stoi[starting_char], dtype=torch.float32)
        
    input_sample = input_sample.reshape(1, 1)
    with torch.inference_mode():
        context = ""
        pred_char = ""
        max_length = 10

        while pred_char != "<EOS>" and len(context) < max_length:
            logits = model(input_sample)
            pred = torch.argmax(torch.softmax(logits, dim=1), dim=1)
            pred_char = itos[pred.item()]
            context += pred_char
            
            if bigram_model:
                input_sample = pred.reshape(1, 1).to(torch.long)
            else:
                input_sample = pred.reshape(1, 1).to(torch.float32)

        print(context)

In [18]:
model_inference(model=classifier_model, starting_char="<SOS>", bigram_model=False)
model_inference(model=bigram_like_model, starting_char="<SOS>", bigram_model=True)

RRRRRRRRRR
DdEezLo<EOS>


In [19]:
def train_model(model, train_dataloader, loss_fn, optimizer, epochs, bigram: bool):
    model.train()
    
    for epoch in range(epochs):
        for batch, (X, y) in enumerate(train_dataloader):
            if bigram:
                X = X.to(torch.long)
                X = X.reshape(1, len(X))
            logits = model(X)
            loss = loss_fn(logits, y.to(torch.long))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if batch % 5000 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")
            

    print(f"loss for th very last batch --> {loss}")

In [20]:
train_model(model=bigram_like_model, train_dataloader=train_dataloader, loss_fn=loss_fn, optimizer=bigram_optimizer, epochs=2, bigram=True)

loss for batch 0 --> 4.475108623504639 at epoch 0
loss for batch 5000 --> 2.389503240585327 at epoch 0
loss for batch 10000 --> 2.5362653732299805 at epoch 0
loss for batch 15000 --> 2.840265989303589 at epoch 0
loss for batch 20000 --> 2.41422963142395 at epoch 0
loss for batch 25000 --> 2.496122121810913 at epoch 0
loss for batch 30000 --> 2.580915927886963 at epoch 0
loss for batch 35000 --> 2.394439458847046 at epoch 0
loss for batch 40000 --> 2.421869993209839 at epoch 0
loss for batch 45000 --> 2.0717291831970215 at epoch 0
loss for batch 50000 --> 2.382190704345703 at epoch 0
loss for batch 55000 --> 2.4469566345214844 at epoch 0
loss for batch 60000 --> 2.474855899810791 at epoch 0
loss for batch 65000 --> 2.401779890060425 at epoch 0
loss for batch 70000 --> 2.3383610248565674 at epoch 0
loss for batch 75000 --> 2.238337993621826 at epoch 0
loss for batch 80000 --> 2.304372787475586 at epoch 0
loss for batch 85000 --> 2.4854366779327393 at epoch 0
loss for batch 90000 --> 2.31

In [21]:
model_inference(model=bigram_like_model, starting_char="<SOS>", bigram_model=True)

Ma<EOS>
