In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils as torch_utils

import numpy as np
import random
# Load the corpus: one word (name) per line of input.txt.
# The context manager guarantees the file handle is closed after reading.
with open("input.txt", "r") as corpus_file:
    words = corpus_file.read().splitlines()

# Creating the character dictionary.
# Every distinct character gets an index starting at 1; index 0 is reserved
# for "." which is used both as left padding and as the end-of-word marker.
chars = sorted(set("".join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi["."] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)  # 27 according to the original author's note

# Hyperparameters
context_length = 8          # number of previous characters fed to the model
input_size = vocab_size     # one-hot input width
hidden_size = 128           # width of the recurrent hidden state
output_size = vocab_size    # one logit per vocabulary entry

# One-hot encoding function
def one_hot_encode(index, size):
    """Return a float32 vector of length ``size`` that is 1.0 at ``index`` and 0.0 elsewhere."""
    encoded = np.zeros(size, dtype=np.float32)
    encoded[index] = 1.0
    return encoded

# Function to map one-hot encoded vectors to index
def one_hot_to_index(one_hot_vec):
    """Return the position of the largest entry of ``one_hot_vec`` (the hot index)."""
    hot_position = np.argmax(one_hot_vec)
    return hot_position

# Function to map index to character
def index_to_char(index):
    """Look up the character stored for ``index`` in the module-level ``itos`` table."""
    character = itos[index]
    return character

# Seed PyTorch's default random number generator; as the last expression of
# the cell, the returned Generator object is what the notebook displays.
torch.manual_seed(1219)

<torch._C.Generator at 0x230344e3db0>

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):
    """Next-character prediction dataset built from a list of words.

    Each word is terminated with '.', and every character of the terminated
    word becomes one example: the input is the one-hot encoding of the
    ``context_length`` preceding character indices (left-padded with index 0)
    and the target is the index of the character itself.

    Attributes:
        X: float tensor of shape (num_examples, context_length, vocab_size).
        Y: int64 tensor of shape (num_examples,) holding target indices.
    """

    def __init__(self, words, stoi, context_length, vocab_size):
        """Store the configuration and eagerly build ``X`` and ``Y``.

        Args:
            words: iterable of strings; every character (and '.') must be a key of ``stoi``.
            stoi: mapping from character to integer index.
            context_length: number of previous characters in each input.
            vocab_size: width of the one-hot encoding.
        """
        self.words = words
        self.stoi = stoi
        self.context_length = context_length
        self.vocab_size = vocab_size
        self.X, self.Y = self.build_dataset()

    def one_hot_encode(self, index):
        """Return a float32 numpy vector of length ``vocab_size`` with a 1.0 at ``index``."""
        vec = np.zeros(self.vocab_size, dtype=np.float32)
        vec[index] = 1.0
        return vec

    def build_dataset(self):
        """Slide a context window over every word and return ``(X, Y)``.

        Raises:
            KeyError: if a character is missing from ``stoi``.
        """
        contexts, targets = [], []
        for w in self.words:
            context = [0] * self.context_length
            for ch in w + '.':
                ix = self.stoi[ch]
                # `context` is rebound to a new list below, so storing the
                # current list object here is safe.
                contexts.append(context)
                targets.append(ix)
                context = context[1:] + [ix]

        # Collect plain integer indices first and one-hot encode them in a
        # single call; converting a Python list of numpy arrays with
        # torch.tensor() is very slow and triggers a UserWarning.
        # The explicit reshape keeps the rank stable when there are no examples.
        context_ids = torch.tensor(contexts, dtype=torch.long).reshape(
            len(targets), self.context_length
        )
        X = torch.nn.functional.one_hot(
            context_ids, num_classes=self.vocab_size
        ).to(torch.float)
        # Explicit dtype so an empty dataset still yields integer targets.
        Y = torch.tensor(targets, dtype=torch.long)
        return X, Y

    def __len__(self):
        """Number of (context, target) examples."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return the ``(X[idx], Y[idx])`` pair."""
        return self.X[idx], self.Y[idx]


In [19]:
# Shuffle a copy of the corpus instead of `words` itself. Shuffling in place
# makes this cell non-idempotent: re-running it would shuffle an already
# shuffled list and silently produce a different train/dev/test split.
# On a first run the resulting order is the same as shuffling in place.
shuffled_words = list(words)
random.seed(42)
random.shuffle(shuffled_words)

# 80% train / 10% dev / 10% test, split at word level.
n1 = int(0.8 * len(shuffled_words))
n2 = int(0.9 * len(shuffled_words))

train_words = shuffled_words[:n1]
dev_words = shuffled_words[n1:n2]
test_words = shuffled_words[n2:]

train_dataset = CharDataset(train_words, stoi, context_length, vocab_size)
dev_dataset = CharDataset(dev_words, stoi, context_length, vocab_size)
test_dataset = CharDataset(test_words, stoi, context_length, vocab_size)

# Only the training loader shuffles; dev/test keep a fixed order.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [20]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Gated recurrent units

![image.png](attachment:image.png)

In [21]:
class GRU(nn.Module):
    """Single-step gated recurrent unit cell with a separate output projection.

    ``forward`` advances the hidden state by one time step. ``output_layer``
    is not applied inside ``forward``; callers apply it to a hidden state
    when they need logits.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the gate, candidate and output linear layers.

        Args:
            input_size: width of each input vector ``xt``.
            hidden_size: width of the hidden state.
            output_size: width produced by ``output_layer``.
        """
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

        # Every gate sees the input concatenated with a hidden-sized vector.
        gate_in_features = self.input_size + self.hidden_size
        self.rt_layer = nn.Linear(gate_in_features, self.hidden_size)
        self.zt_layer = nn.Linear(gate_in_features, self.hidden_size)
        self.candidate_layer = nn.Linear(gate_in_features, self.hidden_size)
        self.output_layer = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, xt, hprev):
        """Return the next hidden state for input ``xt`` and previous state ``hprev``.

        Both arguments are concatenated along dim 1, so they are expected to
        be 2-D with matching first dimension.
        """
        gate_input = torch.cat((xt, hprev), dim=1)

        reset_gate = self.sigmoid(self.rt_layer(gate_input))
        update_gate = self.sigmoid(self.zt_layer(gate_input))

        # The reset gate scales how much of the old state feeds the candidate.
        gated_history = reset_gate * hprev
        candidate_input = torch.cat((xt, gated_history), dim=1)
        proposal = self.tanh(self.candidate_layer(candidate_input))

        # Blend old state and candidate using the update gate.
        return (1 - update_gate) * hprev + update_gate * proposal


In [22]:
# Scratch check of torch.stack: two (1, 3) tensors stacked along dim=1
# produce a single (1, 2, 3) tensor, as the displayed output shows.
x = torch.tensor([[1,2,3]])
y = torch.tensor([[4,5,6]])
torch.stack([x,y], 1)

tensor([[[1, 2, 3],
         [4, 5, 6]]])

In [23]:
# Stacking the same two (1, 3) tensors along dim=0 produces shape (2, 1, 3).
torch.stack([x,y], 0)

tensor([[[1, 2, 3]],

        [[4, 5, 6]]])

In [24]:
gru_model = GRU(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)

steps = []   # epoch indices, parallel to `losses`
losses = []  # average training loss per epoch
# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    gru_model.train()
    total_loss = 0.0
    total_samples = 0
    for X_batch, Y_batch in train_loader:

        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)
        n_in_batch = X_batch.size(0)

        # Initialize the hidden state.
        # X_batch is (B, T, C) = (batch, context_length, vocab_size).
        hprev = torch.zeros(n_in_batch, hidden_size, device=device)

        # Forward pass: unroll the cell over the context. Only the final
        # hidden state is used for prediction, so intermediate states are
        # not collected.
        for t in range(X_batch.size(1)):
            hprev = gru_model(X_batch[:, t, :], hprev)

        outputs = gru_model.output_layer(hprev)  # logits, (B, vocab_size)

        # Compute the loss (mean over the batch)
        loss = criterion(outputs, Y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a smaller final batch does
        # not skew the epoch average.
        total_loss += loss.item() * n_in_batch
        total_samples += n_in_batch

    avg_loss = total_loss / total_samples
    steps.append(epoch)
    losses.append(avg_loss)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


Epoch [1/20], Loss: 2.2752
Epoch [2/20], Loss: 2.1217
Epoch [3/20], Loss: 2.0603
Epoch [4/20], Loss: 2.0173
Epoch [5/20], Loss: 1.9841
Epoch [6/20], Loss: 1.9557
Epoch [7/20], Loss: 1.9324
Epoch [8/20], Loss: 1.9136
Epoch [9/20], Loss: 1.8961
Epoch [10/20], Loss: 1.8810
Epoch [11/20], Loss: 1.8673
Epoch [12/20], Loss: 1.8548
Epoch [13/20], Loss: 1.8447
Epoch [14/20], Loss: 1.8339
Epoch [15/20], Loss: 1.8263
Epoch [16/20], Loss: 1.8177
Epoch [17/20], Loss: 1.8096
Epoch [18/20], Loss: 1.8038
Epoch [19/20], Loss: 1.7981
Epoch [20/20], Loss: 1.7914


In [25]:
# Evaluation on the validation set
gru_model.eval()
with torch.no_grad():
    total_loss = 0.0
    total_samples = 0
    for X_batch, Y_batch in dev_loader:
        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)
        n_in_batch = X_batch.size(0)

        # Initialize the hidden state.
        # X_batch is (B, T, C) = (batch, context_length, vocab_size).
        hprev = torch.zeros(n_in_batch, hidden_size, device=device)

        # Unroll the cell over the context; only the final hidden state is
        # needed, so intermediate states are not collected.
        for t in range(X_batch.size(1)):
            hprev = gru_model(X_batch[:, t, :], hprev)

        outputs = gru_model.output_layer(hprev)

        loss = criterion(outputs, Y_batch)
        # The criterion returns a per-batch mean; weight it by batch size so
        # the reported value is the true per-example average even when the
        # last batch is smaller than the others.
        total_loss += loss.item() * n_in_batch
        total_samples += n_in_batch

    avg_loss = total_loss / total_samples
    print(f'Validation Loss: {avg_loss:.4f}')

Validation Loss: 2.0212


In [59]:
import torch.nn.functional as F

def generate_text(model, stoi, itos, initial_text, max_length=100, temperature=1.0):
    """Sample characters from ``model`` one at a time, starting from ``initial_text``.

    The whole seed is run through the recurrent cell so the hidden state
    reflects every seed character, not just the last one. Sampling stops
    after ``max_length`` characters or as soon as '.' is produced.

    Args:
        model: recurrent cell exposing ``input_size``, ``hidden_size``,
            ``__call__(xt, hprev) -> ht`` and ``output_layer(ht) -> logits``.
        stoi: mapping from character to index; must contain '.'.
        itos: mapping from index to character.
        initial_text: seed string; may be empty, in which case generation
            starts from the '.' boundary token.
        max_length: maximum number of characters to sample.
        temperature: positive softmax temperature; logits are divided by it.

    Returns:
        ``initial_text`` followed by the sampled characters (including the
        terminating '.' when one was sampled).

    Raises:
        ValueError: if ``temperature`` is not positive.
        KeyError: if ``initial_text`` contains a character missing from ``stoi``.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    def encode(token_id):
        # One-hot row of shape (1, input_size) for a single token index.
        ids = torch.tensor([token_id], device=device)
        return F.one_hot(ids, num_classes=model.input_size).to(torch.float)

    # An empty seed starts from the '.' token instead of crashing on an
    # empty context.
    token_ids = [stoi[ch] for ch in initial_text] or [stoi['.']]
    hprev = torch.zeros(1, model.hidden_size, device=device)

    generated_text = initial_text

    with torch.no_grad():
        # Warm up the hidden state on every seed character except the last;
        # the last one is consumed by the first sampling step below.
        for token_id in token_ids[:-1]:
            hprev = model(encode(token_id), hprev)

        xt = encode(token_ids[-1])
        for _ in range(max_length):
            hprev = model(xt, hprev)

            # Apply temperature and convert the logits to probabilities
            logits = model.output_layer(hprev) / temperature
            probs = F.softmax(logits, dim=1).squeeze(0)

            # Sample from the distribution
            char_idx = torch.multinomial(probs, 1).item()
            generated_char = itos[char_idx]
            generated_text += generated_char

            if generated_char == '.':
                break

            # Only the most recent character is fed back; the hidden state
            # carries the rest of the history.
            xt = encode(char_idx)

    return generated_text

# Example usage: sample one sequence from the trained model, starting from a
# one-character seed.
# NOTE: generate_text draws characters with torch.multinomial, so the printed
# text varies between runs unless the torch RNG is seeded right before the call.
initial_text = "a"  # Initial seed text
generated_text = generate_text(gru_model, stoi, itos, initial_text)
print("Generated Text:", generated_text)


Generated Text: anthony.
