In [17]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd

In [18]:
df = pd.read_csv(r'C:\Users\UGBOKE GEORGE\Downloads\NationalNames\N_gram-name-prediction\NationalNames.csv', encoding= 'latin1')

In [19]:
top_names = df['Name'].head(50000)

processed_names = ["^" + name + "$" for name in top_names]


In [20]:
class NameDataset(Dataset):
    def __init__(self, names, n=5):
        super().__init__()
        self.n = n
        chars = set("".join(names))
        self.char_to_int = {char: i for i, char in enumerate(chars, start=1)}
        self.int_to_char = {i: char for char, i in self.char_to_int.items()}
        self.char_to_int["<PAD>"] = 0  # Padding
        self.int_to_char[0] = "<PAD>"
        self.vocab_size = len(self.char_to_int)
        
        self.samples = []
        for name in names:
            encoded = [self.char_to_int[c] for c in name]
            for i in range(len(encoded) - n):
                self.samples.append((encoded[i:i+n], encoded[i+1:i+n+1]))
                
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        x, y = self.samples[idx]
        x_pad = x + [0] * (self.n - len(x))
        y_pad = y + [0] * (self.n - len(y))
        return torch.tensor(x_pad, dtype=torch.long), torch.tensor(y_pad, dtype=torch.long)

# Create the dataset
dataset = NameDataset(processed_names, n=5)

In [21]:
import torch.nn as nn

class NameGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=128):
        super(NameGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        out = self.fc(lstm_out)
        return out

model = NameGenerator(dataset.vocab_size)


In [22]:
from torch.optim import Adam
from torch.nn.functional import cross_entropy

def train_model(model, dataset, batch_size=64, epochs=10):
    model.train()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = Adam(model.parameters())
    for epoch in range(epochs):
        total_loss = 0
        for x, y in dataloader:
            optimizer.zero_grad()
            pred = model(x)
            loss = cross_entropy(pred.transpose(1, 2), y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}")

# Training the model
train_model(model, dataset)


Epoch 1, Loss: 1.9057051971484797
Epoch 2, Loss: 1.5308436500074718
Epoch 3, Loss: 1.4093202874694073
Epoch 4, Loss: 1.351604914161521
Epoch 5, Loss: 1.3202195391968383
Epoch 6, Loss: 1.3010358233966737
Epoch 7, Loss: 1.2879191190983768
Epoch 8, Loss: 1.2781892491618232
Epoch 9, Loss: 1.2707445428964677
Epoch 10, Loss: 1.264825610095906


In [23]:
def predict_name(model, dataset, start_char='^', max_length=20):
    """
    
    Parameters:
    - model: The trained PyTorch model.
    - dataset: The dataset object containing char_to_int and int_to_char mappings.
    - start_char: The starting character. Default is '^'.
    - max_length: The maximum length of the name to generate.
   
    """
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # No need to track gradients
        chars = [start_char]
        for _ in range(max_length):
            # Convert current sequence of characters to integers
            input_seq = [dataset.char_to_int[c] for c in chars]
            # Pad input 
            input_tensor = torch.tensor([input_seq], dtype=torch.long)
            # Get predictions
            output = model(input_tensor)
            # Convert output probabilities to next character
            probabilities = torch.softmax(output[0, -1], dim=0).numpy()
            next_char_int = np.random.choice(len(probabilities), p=probabilities)
            next_char = dataset.int_to_char[next_char_int]
            if next_char == '$':  # End of sequence
                break
            chars.append(next_char)
        return ''.join(chars[1:])  # Skip the start-of-sequence character in the output

# Example usage
generated_name = predict_name(model, dataset)
print(generated_name)


Salome


: 