<a href="https://colab.research.google.com/github/VinitVpANDEY/Neural-Network/blob/main/MLP_BatchNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read one name per line. The context manager closes the file handle as soon
# as the contents have been read instead of leaving it open.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
print(words[:8])

len(words)

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


32033

In [None]:
# Character vocabulary: every distinct character in the names gets an integer
# id starting at 1; id 0 is reserved for the '.' boundary token.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse mapping (id -> character), built from stoi so the two always agree.
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)


{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [None]:
# Build the dataset: each example pairs a fixed-length window of preceding
# character ids with the id of the character that follows it.
block_size = 3  # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    """Turn a list of words into (context, next-character) training tensors.

    Each word is terminated with '.' and scanned left to right. For every
    character, the current window of `block_size` previous character ids
    (initially all 0) is recorded as the input and the character's id as the
    target. Uses the module-level `stoi` and `block_size`.

    Prints the shapes of both tensors and returns them as `(X, Y)`.
    """
    contexts, targets = [], []

    for word in words:
        window = [0] * block_size
        for ch in word + '.':
            target = stoi[ch]
            contexts.append(window)
            targets.append(target)
            # slide the window: drop the oldest id, append the newest
            window = window[1:] + [target]

    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

import random

# Shuffle the names with a fixed seed before splitting.
# NOTE: random.shuffle mutates `words` in place, so re-running this cell
# without reloading `words` shuffles the already-shuffled list and produces
# a different split.
random.seed(42)
random.shuffle(words)

# Word-level cut points for the 80% / 10% / 10% split.
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:n1])      # first 80% of the words
Xdev, Ydev = build_dataset(words[n1:n2])  # next 10%
Xte, Yte = build_dataset(words[n2:])      # final 10%

# Seeded generator used by the random-number calls in the cells below.
g = torch.Generator()
g.manual_seed(2147483647)

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

class Linear:
  """Affine layer computing `x @ weight (+ bias)`.

  The weight is drawn from a standard normal using the module-level generator
  `g` and divided by sqrt(fan_in); the bias, when enabled, starts at zero.
  The most recent output is kept on `self.out`.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    scale = fan_in ** 0.5
    self.weight = torch.randn((fan_in, fan_out), generator=g) / scale
    if bias:
      self.bias = torch.zeros(fan_out)
    else:
      self.bias = None

  def __call__(self, x):
    result = x @ self.weight
    if self.bias is not None:
      result += self.bias
    self.out = result
    return result

  def parameters(self):
    """Return the tensors of this layer: weight, then bias if present."""
    params = [self.weight]
    if self.bias is not None:
      params.append(self.bias)
    return params


class BatchNorm1d:
  """Batch normalization over dimension 0 of the input.

  While `self.training` is True the input is normalized with the mean and
  (unbiased) variance of the current batch, and exponential moving averages
  of those statistics are stored in `running_mean` / `running_var`. When
  `self.training` is False the stored running statistics are used instead.
  The result is scaled by `gamma` and shifted by `beta`, and the most recent
  output is kept on `self.out`.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps, self.momentum = eps, momentum
    self.training = True
    # scale and shift, returned by parameters()
    self.gamma, self.beta = torch.ones(dim), torch.zeros(dim)
    # moving averages of the batch statistics (updated outside autograd)
    self.running_mean, self.running_var = torch.zeros(dim), torch.ones(dim)

  def __call__(self, x):
    use_batch_stats = self.training
    if use_batch_stats:
      mu = x.mean(0, keepdim=True)
      sigma2 = x.var(0, keepdim=True, unbiased=True)
    else:
      mu, sigma2 = self.running_mean, self.running_var

    normalized = (x - mu) / torch.sqrt(sigma2 + self.eps)
    self.out = self.gamma * normalized + self.beta

    if use_batch_stats:
      # blend the batch statistics into the running estimates
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mu
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * sigma2

    return self.out

  def parameters(self):
    """Return the scale and shift tensors, in that order."""
    return [self.gamma, self.beta]


class Tanh:
  """Element-wise tanh activation; keeps the latest output on `self.out`."""

  def __call__(self, x):
    activated = torch.tanh(x)
    self.out = activated
    return activated

  def parameters(self):
    """This layer has no tensors to train."""
    return list()

In [None]:
# @title Train the MLP with batch normalization
def train(max_steps, X, Y, bs=32, vocab_size = 27, n_embd = 10, n_hidden = 100, block_size = 3, weight_scale = 5/3, learning_rate = None):
  """Train an embedding table plus a stack of Linear/BatchNorm1d/Tanh layers with SGD.

  The network is five hidden blocks of Linear -> BatchNorm1d -> Tanh followed
  by an output Linear -> BatchNorm1d. All random draws use the module-level
  generator `g`.

  Args:
    max_steps: number of minibatch SGD steps to run.
    X: inputs; rows are sampled with `X[ix]` and used to index the embedding table.
    Y: targets passed to `F.cross_entropy` alongside the network output.
    bs: minibatch size.
    vocab_size: number of rows in the embedding table and size of the output layer.
    n_embd: embedding dimension per input position.
    n_hidden: width of each hidden layer.
    block_size: number of input positions per example (first Linear takes n_embd * block_size).
    weight_scale: factor applied to every Linear weight after initialization.
    learning_rate: if not None, a constant step size for every step; otherwise
      0.1 is used for steps below 100000 and 0.01 afterwards.

  Returns:
    (layers, C): the list of layer objects and the embedding table.
  """
  C = torch.randn((vocab_size, n_embd), generator=g)

  layers = [
      Linear(n_embd * block_size, n_hidden), BatchNorm1d(n_hidden), Tanh(),
      Linear(n_hidden, n_hidden), BatchNorm1d(n_hidden), Tanh(),
      Linear(n_hidden, n_hidden), BatchNorm1d(n_hidden), Tanh(),
      Linear(n_hidden, n_hidden), BatchNorm1d(n_hidden), Tanh(),
      Linear(n_hidden, n_hidden), BatchNorm1d(n_hidden), Tanh(),
      Linear(n_hidden, vocab_size), BatchNorm1d(vocab_size)
  ]

  with torch.no_grad():
    # shrink the scale of the final BatchNorm so its initial outputs are small
    layers[-1].gamma *= 0.1
    for layer in layers:
      if isinstance(layer, Linear):
        layer.weight *= weight_scale

  parameters = [C] + [p for layer in layers for p in layer.parameters()]
  print('Total number of parameters are:', sum(p.nelement() for p in parameters))

  for p in parameters:
    p.requires_grad = True

  for i in range(max_steps):
    # minibatch construct
    ix = torch.randint(0, X.shape[0], (bs,), generator = g)
    Xb, Yb = X[ix], Y[ix]

    # forward pass: look up embeddings, flatten per example, run the stack
    emb = C[Xb]
    x = emb.view(emb.shape[0], -1)
    for layer in layers:
      x = layer(x)
    loss = F.cross_entropy(x, Yb)

    # backward pass
    for p in parameters:
      p.grad = None
    loss.backward()

    # update; compare against None so an explicit learning_rate of 0.0 is honoured
    if learning_rate is not None:
      lr = learning_rate
    else:
      lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
      p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0:
      print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')

  return layers, C



In [None]:
# Train on the training split for 150,000 steps and keep the resulting layer
# list and embedding table for the evaluation and sampling cells.
layers, C = train(150000, Xtr, Ytr)

Total number of parameters are: 47551
      0/ 150000: 3.2998
  10000/ 150000: 2.4799
  20000/ 150000: 2.2382
  30000/ 150000: 2.2865
  40000/ 150000: 2.0540
  50000/ 150000: 2.3933
  60000/ 150000: 2.2979
  70000/ 150000: 2.1845
  80000/ 150000: 1.9803
  90000/ 150000: 2.3812
 100000/ 150000: 2.1609
 110000/ 150000: 1.8543
 120000/ 150000: 2.0962
 130000/ 150000: 2.3420
 140000/ 150000: 1.7519


In [None]:
@torch.no_grad()
def split_loss(split):
    """Print the cross-entropy loss of the trained model on one data split.

    `split` must be 'train', 'val' or 'test'. Every BatchNorm1d layer in the
    module-level `layers` is switched to `training = False` before the forward
    pass and is left that way afterwards.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }
    inputs, targets = datasets[split]

    embedded = C[inputs]
    activations = embedded.view(embedded.shape[0], -1)

    # use running statistics rather than batch statistics from here on
    for bn in (layer for layer in layers if isinstance(layer, BatchNorm1d)):
        bn.training = False

    for layer in layers:
        activations = layer(activations)
    loss = F.cross_entropy(activations, targets)
    print(split, loss.item())


# Report the loss on the training and validation splits.
# Note: split_loss also switches the BatchNorm1d layers to training = False.
split_loss('train')
split_loss('val')

train 2.0267608165740967
val 2.0910773277282715


In [None]:
@torch.no_grad()
def generate_name(count):
  """Sample and print `count` names from the trained model.

  Each name starts from an all-zero context of `block_size` ids. The model's
  output is turned into a distribution with softmax, one id is sampled using
  the module-level generator `g`, and the context is shifted by that id.
  Sampling stops after id 0 (the '.' token) has been drawn and appended.

  The forward pass here uses a batch of one, so every BatchNorm1d layer is
  first switched to `training = False`: batch statistics over a single row
  are not usable (the unbiased variance of one sample is NaN) and would also
  be blended into the running estimates. Doing this here means the function
  no longer depends on an earlier cell having changed the mode. Gradients are
  not needed for sampling, hence `torch.no_grad()`.
  """
  for layer in layers:
    if isinstance(layer, BatchNorm1d):
      layer.training = False

  for _ in range(count):
    out = []
    context = [0] * block_size
    while True:
      emb = C[torch.tensor([context])]
      x = emb.view(emb.shape[0], -1)
      for layer in layers:
        x = layer(x)
      p = F.softmax(x, dim=1)
      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      # slide the context window and record the sampled character
      context = context[1:] + [ix]
      out.append(itos[ix])
      if ix == 0:
        break
    print(''.join(out))

# Sample and print 10 names from the trained model.
generate_name(10)

chevi.
rub.
jerssieyel.
mikella.
dema.
mael.
lidi.
geria.
zephonesise.
del.
