## Importing necessary libraries 

In [10]:
import torch
import pandas as pd
import numpy as np

## Cleaning and Encoding Name Strings for Model Input

In [11]:
df = pd.read_excel("names.xlsx")
df.rename(columns={'Michael': 'Names'}, inplace=True)
names_list = df["Names"].dropna().astype(str).tolist()
names_list = ['.' + names + '.' for names in names_list]
chars = sorted(list(set(''.join(names_list))))
stoi = {ch:i for i, ch in enumerate(chars)}
stoi['.'] = 0
itos = {i:ch for ch, i in stoi.items()}
encoded_names = [[stoi[ch] for ch in name] for name in names_list]

## Generating Fixed-Length Sequences and Targets from Encoded Names

In [17]:
seq_length = 5
X, y = [], []
for name in encoded_names:
    for i in range(1, len(name)):
        start = max(0, i - seq_length)
        seq = name[start:i] 
        seq = [0] * (seq_length - len(seq)) + seq
        X.append(seq)
        y.append(name[i])
        
for i in range(len(encoded_names))[:10]:
    print(f"{i}: Input: {X[i]}, Target: {y[i]}")

0: Input: [0, 0, 0, 0, 0], Target: 3
1: Input: [0, 0, 0, 0, 3], Target: 34
2: Input: [0, 0, 0, 3, 34], Target: 44
3: Input: [0, 0, 3, 34, 44], Target: 35
4: Input: [0, 3, 34, 44, 35], Target: 45
5: Input: [3, 34, 44, 35, 45], Target: 46
6: Input: [34, 44, 35, 45, 46], Target: 41
7: Input: [44, 35, 45, 46, 41], Target: 42
8: Input: [35, 45, 46, 41, 42], Target: 34
9: Input: [45, 46, 41, 42, 34], Target: 31


## Splitting Data into Training and Test Sets

In [13]:
n = int(0.8 * len(X))
X_train, X_test = X[:n], X[n:]
y_train, y_test = y[:n], y[n:]

## Initializing Model Parameters

In [14]:
vocab_size = len(chars)
hidden_size = 100
embedding_size = 32

Wxh1 = torch.randn(hidden_size, embedding_size) * 0.01
Whh1 = torch.randn(hidden_size, hidden_size) * 0.01
bh1  = torch.zeros(hidden_size, 1)

Wxh2 = torch.randn(hidden_size, hidden_size) * 0.01  
Whh2 = torch.randn(hidden_size, hidden_size) * 0.01 
bh2  = torch.zeros(hidden_size, 1)

Why = torch.randn(vocab_size, hidden_size) * 0.01
by  = torch.zeros(vocab_size, 1)

Embedding_vector = torch.randn(vocab_size, embedding_size)

bgain1 = torch.ones((1, hidden_size))
bnbias1 = torch.zeros((1, hidden_size))

bgain2 = torch.ones((1, hidden_size))
bnbias2 = torch.zeros((1, hidden_size))

parameters = [Wxh1, Whh1, bh1, Wxh2, Whh2, bh2, Why, by, Embedding_vector, bgain1, bnbias1, bgain2, bnbias2]
for p in parameters:
    p.requires_grad = True

## Model Functions – Forward, Backward, and Loss Computation

In [15]:
def tanh(x):
    x = torch.clamp(x, -20, 20)
    e_pos = torch.exp(x)
    e_neg = torch.exp(-x)
    return (e_pos - e_neg) / (e_pos + e_neg)

def softmax_cross_entropy(out_batch, target_batch):
    out_batch = out_batch - torch.max(out_batch, dim=1, keepdim=True)[0]
    exp_out = torch.exp(out_batch)
    probs = exp_out / exp_out.sum(dim=1, keepdim=True)
    
    correct_probs = probs[torch.arange(len(target_batch)), target_batch]
    loss = -torch.log(correct_probs).mean()
    return loss, probs

def batch_normalization(h, bgain=None, bnbias=None, eps=1e-5):
    bnmean = torch.mean(h, dim=0, keepdim=True)   
    bnstd  = torch.std(h, dim=0, keepdim=True)       
    h_norm = (h - bnmean) / (bnstd + eps)         
    h_out = bgain * h_norm + bnbias                
    return h_out, bnmean, bnstd


def forward(X_batch, h1_prev, h2_prev, bnmean1, bnstd1, bnmean2, bnstd2):
    batch_size, seq_len = X_batch.shape
    h1_prev = h1_prev.repeat(batch_size, 1)
    h2_prev = h2_prev.repeat(batch_size, 1)

    outs = []
    h1s = []
    h2s = []

    for t in range(seq_len):
        X_t = X_batch[:, t]
        X_embed = Embedding_vector[X_t]

        h1_pre = X_embed @ Wxh1.T + h1_prev @ Whh1.T + bh1.T
        h1, mean1, std1 = batch_normalization(h1_pre, bgain1, bnbias1)
        h1 = torch.tanh(h1)

        h2_pre = h1 @ Wxh2.T + h2_prev @ Whh2.T + bh2.T
        h2, mean2, std2 = batch_normalization(h2_pre, bgain2, bnbias2)
        h2 = torch.tanh(h2)

        out = h2 @ Why.T + by.T

        outs.append(out)
        h1s.append(h1)
        h2s.append(h2)

        h1_prev = h1
        h2_prev = h2

    with torch.no_grad():
        bnmean1 = 0.999 * bnmean1 + 0.001 * mean1
        bnstd1  = 0.999 * bnstd1  + 0.001 * std1
        bnmean2 = 0.999 * bnmean2 + 0.001 * mean2
        bnstd2  = 0.999 * bnstd2  + 0.001 * std2

    outs = torch.stack(outs, dim=1)
    h1s = torch.stack(h1s, dim=1)
    h2s = torch.stack(h2s, dim=1)

    return outs, h1s, h2s, bnmean1, bnstd1, bnmean2, bnstd2


def backward_batch(X_batch, h1_batch, h2_batch, out_batch, y_batch):
    dWxh1 = torch.zeros_like(Wxh1)
    dWhh1 = torch.zeros_like(Whh1)
    dbh1  = torch.zeros_like(bh1)

    dWxh2 = torch.zeros_like(Wxh2)
    dWhh2 = torch.zeros_like(Whh2)
    dbh2  = torch.zeros_like(bh2)

    dWhy  = torch.zeros_like(Why)
    dby   = torch.zeros_like(by)

    dEmbedding = torch.zeros_like(Embedding_vector)

    batch_size = X_batch.shape[0]

    for b in range(batch_size):
        X_seq = X_batch[b]
        h1_seq = h1_batch[b]
        h2_seq = h2_batch[b]
        out_seq = out_batch[b]
        target_seq = y_batch[b]

        dh1_next = torch.zeros_like(h1_seq[0])
        dh2_next = torch.zeros_like(h2_seq[0])

        for t in reversed(range(len(X_seq))):
            out = out_seq[t]
            h1 = h1_seq[t]
            h2 = h2_seq[t]
            X_index = X_seq[t]
            target_index = target_seq if isinstance(target_seq, int) else target_seq[t]

            probs = torch.softmax(out, dim=0)
            probs[target_index] -= 1
            dout = probs

            dWhy += dout @ h2.T
            dby  += dout

            dh2 = Why.T @ dout + dh2_next
            dh2_raw = dh2 * (1 - h2**2)

            dWxh2 += dh2_raw @ h1.T
            dWhh2 += dh2_raw @ (h2_seq[t-1] if t > 0 else torch.zeros_like(h2)).T
            dbh2  += dh2_raw

            dh1 = Wxh2.T @ dh2_raw + dh1_next
            dh1_raw = dh1 * (1 - h1**2)

            dWxh1 += dh1_raw @ Embedding_vector[X_index].T
            dWhh1 += dh1_raw @ (h1_seq[t-1] if t > 0 else torch.zeros_like(h1)).T
            dbh1  += dh1_raw

            dEmbedding[X_index] += Wxh1.T @ dh1_raw

            dh1_next = Whh1.T @ dh1_raw
            dh2_next = Whh2.T @ dh2_raw

    return (dWxh1 / batch_size, dWhh1 / batch_size, dbh1 / batch_size,
            dWxh2 / batch_size, dWhh2 / batch_size, dbh2 / batch_size,
            dWhy / batch_size, dby / batch_size, dEmbedding / batch_size)

## Model Training Loop

In [None]:
max_steps = 2000
batch_size = 32

bnmean1 = torch.zeros(1, hidden_size)
bnstd1  = torch.ones(1, hidden_size)
bnmean2 = torch.zeros(1, hidden_size)
bnstd2  = torch.ones(1, hidden_size)

for i in range(max_steps):
    h1_prev = torch.zeros(hidden_size, 1)
    h2_prev = torch.zeros(hidden_size, 1)

    ix = torch.randint(0, len(X_train), (batch_size,))
    Xb, yb = X_train[ix], y_train[ix]

    out_batch, h1_batch, h2_batch, bnmean1, bnstd1, bnmean2, bnstd2 = forward(
        Xb, h1_prev, h2_prev, bnmean1, bnstd1, bnmean2, bnstd2
    )

    loss, _ = softmax_cross_entropy(out_batch, yb)

    grads = backward_batch(Xb, h1_batch, h2_batch, out_batch, yb)

    lr = 0.1 if i < 200 else 0.01

    for p, g in zip(parameters, grads):
        p.data -= lr * g
