In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [2]:
import torch.nn.functional as F

In [4]:
words = open('/content/names.txt', 'r').read().splitlines()

In [25]:
type(words)

list

In [8]:
b = {}
for w in words:
  chr = ['<S>'] + list(w) + ['<E>']
  for ch1, ch2, ch3 in zip(chr, chr[1:], chr[2:]):
    trigram = (ch1, ch2, ch3)
    b[trigram] = b.get(trigram, 0) + 1


In [None]:
list(b.items())[-10:]

In [48]:
sorted_list = sorted(b.items(), key = lambda kv: -kv[1])
sorted_list[:10]

[(('a', 'h', '<E>'), 1714),
 (('n', 'a', '<E>'), 1673),
 (('a', 'n', '<E>'), 1509),
 (('o', 'n', '<E>'), 1503),
 (('<S>', 'm', 'a'), 1453),
 (('<S>', 'j', 'a'), 1255),
 (('<S>', 'k', 'a'), 1254),
 (('e', 'n', '<E>'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '<E>'), 953)]

In [None]:
char_list = sorted(list(set(''.join(words))))

s2i = {c:i for i, c in enumerate(char_list)}

s2i['<S>'] = 26
s2i['<E>'] = 27
s2i

In [9]:
i2s = {i:s for s,i in s2i.items()}

In [None]:
for i in sorted_list[:5]:
  print(i, i[0], i[0][0])

In [11]:
# N = torch.zeros((len(b),28), dtype=torch.int32)
N = torch.zeros((28,28,28), dtype=torch.int32)

In [12]:
for w in words:
  chr = ['<S>'] + list(w) + ['<E>']
  for ch1, ch2, ch3 in zip(chr, chr[1:], chr[2:]):
    ix1 = s2i[ch1]
    ix2 = s2i[ch2]
    ix3 = s2i[ch3]
    #print(ix1,ix2,ix3)
    N[ix1, ix2,  ix3] += 1

In [14]:
P = (N+1).float()
P = P/ P.sum(dim=2,keepdim=True)

In [None]:
log_likelihood = 0.0
n= 0
for w in words:
  chr = ['<S>'] + list(w) + ['<E>']
  for ch1, ch2, ch3 in zip(chr, chr[1:], chr[2:]):
    ix1 = s2i[ch1]
    ix2 = s2i[ch2]
    ix3 = s2i[ch3]
    prob = P[ix1, ix2, ix3]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1
    #print(f"{ch1}{ch2}{ch3}: {prob:.4f} {logprob:.4f}")


print(f"{log_likelihood=}")
# negative_log_likelihood serves as a nice loss function;
negative_log_likelihood = -log_likelihood
print(f'{negative_log_likelihood=}')
print(f'{negative_log_likelihood/n}') # this is the actual loss function which is normalized

In [None]:
g = torch.Generator().manual_seed(2)
ix = [0, 0]
name = ''

while True:
  p = P[ix[0],ix[1],:]
  #p = N[ix,:]
  #p = p /p.sum()
  ix[0] = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
  ix[1] = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
  if ix[0] == 27 or ix[1] == 27: # remember <E> token is 27 idx
    break
  name += i2s[ix[0]]
  name += i2s[ix[1]]
  print(i2s[ix[0]], i2s[ix[1]])

name

In [None]:
g = torch.Generator().manual_seed(2)
ix = [0, 0]
name = ''

while True:
  p = P[ix[0],ix[1],:]
  #p = N[ix,:]
  #p = p /p.sum()
  i = torch.multinomial(p, num_samples=2, replacement=True,generator=g)
  ix[0] = i[0].item()
  ix[1] = i[1].item()
  if ix[0] == 27 or ix[1] == 27: # remember <E> token is 27 idx
    break
  name += i2s[ix[0]]
  name += i2s[ix[1]]
  print(i2s[ix[0]], i2s[ix[1]])

name

In [212]:
i = torch.multinomial(p, num_samples=2, replacement=True)
i, i[1].item()

(tensor([ 3, 12]), 12)

In [192]:
ix = [30, 4]
ix[1]

4

In [19]:
class Trigram:

  def __init__(self, words_list, s2i):
    self.t = {}
    self.words = words_list
    self.s2i = s2i

  def build_trigram(self):
    N = torch.zeros((28,28,28), dtype=torch.int32)
    for w in self.words:
      chr = ['<S>'] + list(w) + ['<E>']
      for ch1, ch2, ch3 in zip(chr, chr[1:], chr[2:]):
        ix1 = self.s2i[ch1]
        ix2 = self.s2i[ch2]
        ix3 = self.s2i[ch3]
        N[ix1, ix2,  ix3] += 1

    return N

  def build_trigram_count(self):
    for w in self.words:
      chr = ['<S>'] + list(w) + ['<E>']
      for ch1, ch2, ch3 in zip(chr, chr[1:], chr[2:]):
        trigram = (ch1, ch2, ch3)
        self.t[trigram] = self.t.get(trigram, 0) + 1
    sorted_t = sorted(self.t.items(), key= lambda kv: -kv[1])
    return sorted_t



In [15]:
def count_loss(self, P):
  log_likelihood = 0.0
  n= 0
  for w in words:
    chr = ['<S>'] + list(w) + ['<E>']
    for ch1, ch2, ch3 in zip(chr, chr[1:], chr[2:]):
      ix1 = s2i[ch1]
      ix2 = s2i[ch2]
      ix3 = s2i[ch3]
      prob = P[ix1, ix2, ix3]
      logprob = torch.log(prob)
      log_likelihood += logprob
      n += 1

  negative_log_likelihood = -log_likelihood
  return negative_log_likelihood/n

def main():
  l = sorted(list(set(''.join(words))))
  s2i = {s:i for i,s in enumerate(l)}
  s2i['<S>'] = 26
  s2i['<E>'] = 27

  train, test = train_test_split(words, test_size=0.2)
  val, test = train_test_split(test, test_size=0.5)

  train = Trigram(train, s2i)
  train_data = train.build_trigram()

  val = Trigram(val, s2i)
  val_data = val.build_trigram()

  test = Trigram(test, s2i)
  test_data = test.build_trigram()


  smooth = [0,1,10,100]
  for i in smooth:
    P = (train_data+i).float()
    P = P/ P.sum(dim=2,keepdim=True)

    print(count_loss(P))

In [None]:
main()

In [None]:
class simpleModels():

  def __init__(self):
      self.s2i = self.stoi()

  def stoi(self):
      l = sorted(list(set(''.join(words))))
      s2i = {s:i for i,s in enumerate(l)}
      s2i['<S>'] = 26
      s2i['<E>'] = 27
      return s2i

  def count_model(self):
      pass

  def count_loss(self, P):
      log_likelihood = 0.0
      n= 0
      for w in words:
        chr = ['<S>'] + list(w) + ['<E>']
        for ch1, ch2, ch3 in zip(chr, chr[1:], chr[2:]):
          ix1 = self.s2i[ch1]
          ix2 = self.s2i[ch2]
          ix3 = self.s2i[ch3]
          prob = P[ix1, ix2, ix3]
          logprob = torch.log(prob)
          log_likelihood += logprob
          n += 1

      negative_log_likelihood = -log_likelihood
      return negative_log_likelihood/n

  def nn_model(self):
    xenc = F.one_hot()



    W = torch.randn((28,28,28), dtype=torch.float32)


In [None]:
train, test = train_test_split(words, test_size=0.2)
val, test = train_test_split(test, test_size=0.5)

train_obj = Trigram(train, s2i)
train_data = train_obj.build_trigram()
tt_count = train_obj.build_trigram_count()


In [72]:
x1s,x2s, ys =[], [], []
xs, ys =[], []
for w in train[:1]:
  chr = ['<S>'] + list(w) + ['<E>']
  for ch1, ch2, ch3 in zip(chr, chr[1:], chr[2:]):
    ix1 = s2i[ch1]
    ix2 = s2i[ch2]
    ix3 = s2i[ch3]
    print(ch1 , ch2, ch3)
    x1s.append(ix1)
    x2s.append(ix2)
    # xs.append(ix1)
    # xs.append(ix2)
    ys.append(ix3)

#print(len(x1s), len(x2s))

x1s = torch.tensor(x1s)
x2s = torch.tensor(x2s)
ys = torch.tensor(ys)


x1enc = F.one_hot(x1s, num_classes=28).float()
x2enc = F.one_hot(x2s, num_classes=28).float()
xenc = torch.cat((x1enc, x2enc), dim=1)
xenc.shape, x1enc.shape, x2enc.shape

<S> e d
e d a
d a l
a l i
l i n
i n e
n e <E>


(torch.Size([7, 56]), torch.Size([7, 28]), torch.Size([7, 28]))

In [68]:
xs, xenc

(tensor([[26,  4,  3,  0, 11,  8, 13],
         [ 4,  3,  0, 11,  8, 13,  4]]),
 torch.Size([2, 20, 28]))

In [53]:
W = torch.randn((28,28,28), dtype=torch.float32)

asd = xenc @ W # (2,7,28) @ (28,28,28)
asd.shape

torch.Size([28, 52, 28])