In [3]:
import jax
from jax import lax,random,numpy as jnp

import flax
from flax.core import freeze, unfreeze
from flax import linen as nn
from flax.training import train_state

# import haiku as hk

import optax


from torchvision.datasets import MNIST
from torch.utils.data import DataLoader

import functools
from typing import Any,Callable,Sequence,Optional

import numpy as np
import matplotlib.pyplot as plt

In [113]:
#input is CxHxW latent image, transform into patches of size Nx(P^2*C)

def patchify(lat_img, p):
    C, H, W = lat_img.shape
    assert (H % p == 0 and W % p == 0)
    
    lat_img = lat_img.reshape(C, H//p, p, W//p, p)

    lat_img = lat_img.transpose(1, 3, 2, 4, 0)

    patches = lat_img.reshape(-1, p*p*C)
    return patches

inp = random.normal(random.PRNGKey(23), (3,4,32,32))

pat = jax.vmap(patchify, in_axes=(0,None))(inp, 4)
# pat = patchify(inp,4)
# print(pat.shape)

def batch_patchify(batch_lat_img, p):
    return jax.vmap(patchify, in_axes=(0,None))(batch_lat_img,p)

#TODO: Positional encoding
class EmbedPatch(nn.Module):
    # patchdim: int
    embed_dim: int

    def setup(self):
        self.layer = nn.Dense(self.embed_dim)

    # @nn.compact
    def __call__(self, x):
        return self.layer(x)
    
embed = EmbedPatch(embed_dim=32)
key = random.PRNGKey(23)
params = embed.init(key, pat)
output = embed.apply(params, pat)
print(output.shape)

class MHA(nn.Module):
    num_heads: int
    embed_dim: int

    def setup(self):
        assert self.embed_dim%self.num_heads == 0, "embed_dim not divisible by num_heads"
        self.W = nn.Dense(self.embed_dim)
        self.K = nn.Dense(self.embed_dim)
        self.Q = nn.Dense(self.embed_dim)
        self.W0 = nn.Dense(self.embed_dim)

    def __call__(self, x):
        #Assume x has shape (Batches, Seq_len, embed_dim)
        B,S,_ = x.shape
        q = self.Q(x)
        w = self.W(x)
        k = self.K(x)

        head_dim = self.embed_dim//self.num_heads
        multi_q = q.reshape(B,S,self.num_heads,head_dim).transpose(0,2,1,3)
        multi_w = w.reshape(B,S,self.num_heads,head_dim).transpose(0,2,1,3)
        multi_k = k.reshape(B,S,self.num_heads,head_dim).transpose(0,2,1,3)
        
        attention = jnp.matmul(multi_q, multi_k.transpose(0,1,3,2))/jnp.sqrt(head_dim)
        attention = nn.softmax(attention,-1)
        z = jnp.matmul(attention,multi_w)
        multi_z = self.W0(z.transpose(0,2,1,3).reshape(B,S,self.embed_dim))

        return multi_z
    

mha = MHA(4,32)
key, key1 = random.split(key)
params = mha.init(key1, output) 
print(output)
output = mha.apply(params, output)
print(output.shape)



(3, 64, 32)
[[[ 1.0468478   0.8035271   0.8496734  ... -0.5753434  -0.03359401
   -0.68675864]
  [ 2.342903   -0.8281858   2.0300694  ...  0.6713694   0.05638641
   -0.3618956 ]
  [ 1.7352185   0.23705341  1.5971271  ... -0.5574417   2.1689997
    0.6870029 ]
  ...
  [ 0.15635595 -0.37036014  0.42485115 ...  1.8889856  -0.14340451
   -1.5819112 ]
  [-0.3020094  -0.59322274  1.704061   ... -1.8944708   0.79693174
    0.60269856]
  [ 0.04926859 -0.7514953   1.3294649  ... -0.8573161   1.750353
    0.7611191 ]]

 [[-0.9173674   2.233478   -0.42316583 ... -1.3016429   0.06169784
    0.35906047]
  [ 0.9953332   0.9354058  -0.8280301  ... -0.4074973   0.15709904
    1.0425365 ]
  [ 0.7179083  -0.51288664  0.31902295 ... -0.38125518  0.17373572
    0.48471907]
  ...
  [-0.13593046 -0.20349844 -0.34313315 ...  1.0903944  -1.975092
    1.3367975 ]
  [ 1.3926562  -0.46960866  0.11220901 ...  1.8033906  -0.26224324
   -0.01628546]
  [-1.048764    0.73301977  0.92024714 ... -0.66073483  0.2708147


In [116]:
#Test with MNIST

def custom_collate(batch):
    transposed_data = list(zip(*batch))
    # print((transposed_data))

    imgs = np.array(transposed_data[0])
    imgs = imgs.reshape(imgs.shape[0],1,imgs.shape[1],imgs.shape[2])
    labels = np.array(transposed_data[1])

    # print(len(imgs))

    return imgs, labels


train_dataset = MNIST(root='./train_mnist',train=True, download=True,transform=lambda x:(np.array(x, dtype=np.float32)))
test_dataset = MNIST(root='./test_mnist',train=False, download=True,transform=lambda x: np.ravel(np.array(x, dtype=np.float32)))
# print(type(train_dataset))
# print((train_dataset[0][0].shape))

BATCH_SIZE = 128

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate, drop_last=True)

# batch_data = (next(iter(train_loader)))
# batch_data = next(iter(train_loader))
# batch_data = next(iter(train_loader))
# print((batch_data[0].shape))

In [114]:
class Model(nn.Module):
    embed_dim: int
    num_heads: int

    def setup(self):
        self.embedder = EmbedPatch(self.embed_dim)
        self.mha = MHA(self.num_heads,self.embed_dim)
        self.linear = nn.Dense(10)
        self.mlp = nn.Dense(self.embed_dim)
        self.layernorm = nn.LayerNorm()
        # self.relu = nn.relu()

    def __call__(self,x):
        activation = x
        activation = batch_patchify(activation, 7)
        activation = self.embedder(activation)
        activation = self.layernorm(activation)
        activation = self.mha(activation)
        activation = self.layernorm(activation)
        activation = self.mlp(activation)
        activation = nn.relu(activation)
        activation = self.mha(activation)
        # activation = self.layernorm(activation)
        activation = activation.reshape(activation.shape[0], -1)
        activation = self.linear(activation)
        return activation

In [119]:
from jax import grad, value_and_grad


NUM_EPOCHS = 10
model = Model(64,4)
dummy = random.normal(key, (1,1,28,28))
params = model.init(key, dummy)

def cross_entropy_loss(params,imgs, labels):
    # logits: (batch_size, num_classes)
    # labels: (batch_size,) with class indices
    # patches = patchify(imgs, 7)
    # embed = EmbedPatch(embed_dim=28)
    # key = random.PRNGKey(23)
    # params = embed.init(key, patches)
    # output = embed.apply(params, patches)
    logits = model.apply(params,imgs)
    # print(logits.shape)
    log_probs = jax.nn.log_softmax(logits)  # (batch_size, num_classes)
    one_hot_labels = jax.nn.one_hot(labels, num_classes=logits.shape[-1])
    loss = -jnp.sum(one_hot_labels * log_probs, axis=-1)  # (batch_size,)
    return loss.mean()

def loss(params, imgs, labels):
    output = model.apply(params, imgs)
    log_probs = jax.nn.log_softmax(output)
    one_hot_labels = jax.nn.one_hot(labels, num_classes=10)
    return -jnp.mean(log_probs*one_hot_labels)

def update(params, imgs, gt_labels, lr=0.01):
    l, grads = value_and_grad(loss)(params,imgs,gt_labels)
    return l, jax.tree.map(lambda p, g: p - lr*g, params, grads)

for epoch in range((NUM_EPOCHS)):

    for cnt, (imgs, labels) in enumerate(train_loader):
        # gt_labels = jax.nn.one_hot(labels,len(MNIST.classes))
        # print(imgs.shape)
        l, params = update(params, imgs, labels)

        # if cnt % 50 == 0:
        print(l)
    break



0.25109115
0.2500823
0.24290256
0.23581591
0.24253109
0.24870744
0.23344178
0.23673497
0.228478
0.23841698
0.24092574
0.23596537
0.23822665
0.2312181
0.22915141
0.23142305
0.23298745
0.22781669
0.22971149
0.22985671
0.22713505
0.22869109
0.22695763
0.2300754
0.23193073
0.22666247
0.2249848
0.23209445
0.22721867
0.23245668
0.23182836
0.23433402
0.22791429
0.22480035
0.22570558
0.23200707
0.2315812
0.22769487
0.22909795
0.22765999
0.22670327
0.22494598
0.22672169
0.22118668
0.22670498
0.22857343


KeyboardInterrupt: 