In [6]:
import torch
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import torch.nn.functional as F

In [7]:
# Open training data
with open(f"./names.txt") as file:
    words = file.readlines()
    
# Make list of each word from data
words = [word.strip() for word in words]

# store alphabet of letters from data. 
alphabet = set()
alphabet.add('<S>')
alphabet.add('<E>')

for word in words:
    for char in word:
        alphabet.add(char)


# Store alphabet to index mapping with reverse functionality. 
alphabet = list(alphabet)
alphabet = sorted(alphabet)
char_to_int = dict()
int_to_char = dict()

for index, char in enumerate(alphabet):
    char_to_int[char] = index
    int_to_char[index] = char

# Store token counts
token_counts = dict()

# Iterate of each word
for word in words:
    
    # Add stop and start tokens
    chst = ['<S>'] + [char for char in word] + ['<E>']
    # print(type(chst))
    
    # This will include up to ending token in right index
    for tuplet in zip(chst, chst[1:]):
        # Store running counts of each token
        if tuplet in token_counts:
            token_counts[tuplet] += 1
        else: 
            token_counts[tuplet] = 1
    
    

In [8]:
# Create 2D tensor to store frequency of bigram pairs. 
data = torch.zeros((len(alphabet), len(alphabet)), dtype=torch.int32)


char_to_int
# # Iterate over tuplets
for tuplet in token_counts.items():
    # store characters pairs and freq
    chpairs, freq = tuplet
    # Assign index of data according to character values
    data_index = [char_to_int[char] for char in chpairs]
    print(chpairs, data_index)
    data[data_index[0]][data_index[1]] = freq
    


('<S>', 'e') [1, 6]
('e', 'm') [6, 14]
('m', 'm') [14, 14]
('m', 'a') [14, 2]
('a', '<E>') [2, 0]
('<S>', 'o') [1, 16]
('o', 'l') [16, 13]
('l', 'i') [13, 10]
('i', 'v') [10, 23]
('v', 'i') [23, 10]
('i', 'a') [10, 2]
('<S>', 'a') [1, 2]
('a', 'v') [2, 23]
('v', 'a') [23, 2]
('<S>', 'i') [1, 10]
('i', 's') [10, 20]
('s', 'a') [20, 2]
('a', 'b') [2, 3]
('b', 'e') [3, 6]
('e', 'l') [6, 13]
('l', 'l') [13, 13]
('l', 'a') [13, 2]
('<S>', 's') [1, 20]
('s', 'o') [20, 16]
('o', 'p') [16, 17]
('p', 'h') [17, 9]
('h', 'i') [9, 10]
('<S>', 'c') [1, 4]
('c', 'h') [4, 9]
('h', 'a') [9, 2]
('a', 'r') [2, 19]
('r', 'l') [19, 13]
('l', 'o') [13, 16]
('o', 't') [16, 21]
('t', 't') [21, 21]
('t', 'e') [21, 6]
('e', '<E>') [6, 0]
('<S>', 'm') [1, 14]
('m', 'i') [14, 10]
('a', 'm') [2, 14]
('m', 'e') [14, 6]
('<S>', 'h') [1, 9]
('r', 'p') [19, 17]
('p', 'e') [17, 6]
('e', 'r') [6, 19]
('r', '<E>') [19, 0]
('e', 'v') [6, 23]
('v', 'e') [23, 6]
('l', 'y') [13, 26]
('y', 'n') [26, 15]
('n', '<E>') [15, 0]


In [9]:
# Convert data tensor to numpy array
data_np = data.numpy()

# Create heatmap using Seaborn with annotations
plt.figure(figsize=(20, 15), dpi=200)
sns.heatmap(data_np, cmap='YlGnBu', annot=True, fmt='.0f', linewidths=.5)

# Set custom tick labels for x-axis (Second Character)
plt.xticks(np.arange(len(alphabet)) + 0.5, [int_to_char[i] for i in range(len(alphabet))])
# Set custom tick labels for y-axis (First Character)
plt.yticks(np.arange(len(alphabet)) + 0.5, [int_to_char[i] for i in range(len(alphabet))])

# Set labels and title
plt.title('Bigram Frequency Heatmap')
plt.xlabel('Second Character')
plt.ylabel('First Character')

# Display the plot
plt.show()

RuntimeError: Numpy is not available

In [10]:
# Create a generator object
generator = torch.Generator()

# Define the value to fill
epsilon = 1e-10

# Set seed using manual_seed method
generator.manual_seed(42)

# # Convert data to float for probabilities
norm_data = (data + epsilon).to(torch.float32)

# Normalize data according to valid pdf
norm_data /= norm_data.sum(dim=1, keepdim=True)

# # # Add very small number to clear null values
# norm_data[0].fill_(epsilon)

print(norm_data)


tensor([[3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02,
         3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02,
         3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02,
         3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02,
         3.5714e-02, 3.5714e-02, 3.5714e-02, 3.5714e-02],
        [3.1218e-15, 3.1218e-15, 1.3767e-01, 4.0770e-02, 4.8138e-02, 5.2758e-02,
         4.7794e-02, 1.3018e-02, 2.0885e-02, 2.7284e-02, 1.8450e-02, 7.5610e-02,
         9.2498e-02, 4.9074e-02, 7.9231e-02, 3.5776e-02, 1.2300e-02, 1.6077e-02,
         2.8720e-03, 5.1166e-02, 6.4153e-02, 4.0833e-02, 2.4350e-03, 1.1738e-02,
         9.5839e-03, 4.1832e-03, 1.6702e-02, 2.9001e-02],
        [1.9596e-01, 2.9512e-15, 1.6408e-02, 1.5966e-02, 1.3870e-02, 3.0751e-02,
         2.0422e-02, 3.9546e-03, 4.9579e-03, 6.8821e-02, 4.8694e-02, 5.1645e-03,
         1.6763e-02, 7.4605e-02, 4.8222e-02, 1.6048e-01, 1.8592e-03, 2.419

In [11]:
# Store list of sampled words
sample_outcome = []

for _ in range(10):
    # Word initializes with start token 
    ix = 1
    word = int_to_char[ix]

    while True:
        
        # get random character given previous character 
        ix = torch.multinomial(norm_data[ix], num_samples=1, generator=generator).item()
        
        # store new character token
        char = int_to_char[ix]
        
        # add sampled letter to word 
        word += char
        
        # check if terminating token has been reached
        if char == '<E>':
            # add word to list of sampled words
            sample_outcome.append(word)
            break
        

In [12]:
training = words[0]

inputs = []
targets = []

# Add start and end character tokens
training = ['<S>'] + [char for char in training] + ['<E>']

# Iterate over bigrams 
for ch1, ch2 in zip(training, training[1:]):
    # add prior
    inputs.append(char_to_int[ch1])
    # add target
    targets.append(char_to_int[ch2])

In [13]:
# targets_enc

In [14]:
inputs_enc = F.one_hot(torch.tensor(inputs), num_classes=28)
inputs_enc = inputs_enc.to(torch.float32)

targets_enc = F.one_hot(torch.tensor(targets), num_classes=28)
targets_enc = targets_enc.to(torch.float32)

In [15]:
weights = torch.rand((28, 28), requires_grad=True)
weights = weights.to(torch.float32)

In [16]:
# Calculate logits, unnormalized pre-probabilities
logits = weights @ inputs_enc.T
# print(logits)

# Transpose for readability
logits = logits.T

# Convert each output to valid probability distribution, across cols
# dim=0 / each row is a probability distribution over the alphabet given the prior / input token (char)
probabilities = logits.exp() / logits.exp().sum(dim=1, keepdim=True)
# print(probabilities) #compare to softmax

# Apply softmax to output of linear layer
softmax = F.softmax(logits, dim = 1)
# print(softmax) #compare to probabilities

# Calculate negative log likelihood = loss of data for practice (GLM)
lik = softmax[torch.arange(5),targets]
log_lik = lik.log().mean()
neg_log_lik = -log_lik
loss = neg_log_lik
print(loss)

# Compare calculated loss with cross entropy loss. 
criterion = torch.nn.CrossEntropyLoss()
criterion_result = criterion(softmax, targets_enc)
print(criterion_result)



tensor(3.5086, grad_fn=<NegBackward0>)
tensor(3.3379, grad_fn=<DivBackward1>)


In [20]:
targets

[6, 14, 14, 2, 0]

In [12]:
# Clear out prev gradients
weights.grad = None

# Backprop
loss.backward()

# Update weights
weights.data -= .01 * weights.grad

In [13]:
criterion = torch.nn.CrossEntropyLoss()
softmax.shape



torch.Size([5, 28])

In [14]:
# Initialize bigram array for storage of inputs and targets
train_inputs = []
train_targets = []


# Iterate over every word
for word in words:
    
    # append start and end tokens to word
    chst = ['<S>'] + [char for char in word] + ['<E>']
    
    # Add bigram to training data.
    for ch1, ch2 in zip(chst, chst[1:]):
        
        # add data to training set
        train_inputs.append(char_to_int[ch1])
        train_targets.append(char_to_int[ch2])
        
    # Define Stopping point 
    # no stopping point necessary
    

In [15]:
# Convert data to tensors
train_inputs = torch.tensor(train_inputs)
train_targets = torch.tensor(train_targets)

# Initialize training weights radnomly, uniform 0 to 1 exclusive
training_weights = torch.rand(28, 28, dtype=torch.float32)

In [16]:
train_inputs.shape

torch.Size([228146])

In [17]:
# Encode training data. 
train_inputs_enc = F.one_hot(train_inputs, num_classes=28).float()

# train_inputs_enc.shape -> 228146 x 28
train_targets_enc = F.one_hot(train_targets, num_classes=28).float()
# print(train_inputs_enc)
# train_inputs
# train_targets_enc = F.one_hot(train_targets, num_classes=28)

In [18]:
print(train_inputs_enc.shape)
print(training_weights.T.shape)

torch.Size([228146, 28])
torch.Size([28, 28])


In [22]:
# for k in range(100):
    
# Foward pass
# Calculate linear layer output
logits = train_inputs_enc @ training_weights.T

# Apply softmax layer for normalization and conversion to valid pdf
softmax = F.softmax(logits, dim=1)

# # # store likelihood of data
log_lik = softmax[:, train_targets]
# # log_lik = softmax[torch.arange(train_inputs.shape[0]), train_targets].log()
# neg_log_lik = -log_lik
# loss = neg_log_lik

# print(f"K: {k}, Loss: {loss}")

# # Clear weights
# training_weights.grad.zero_()

# # backward pass
# loss.backward()

# training_weights.data += .01 * training_weights.grad
    
    

In [21]:
softmax.dtype

torch.float32