In [None]:
#!pip install python-utils

In [None]:
from torch import nn, optim
import torch
import torch.nn.utils
from pathlib import Path
import python_utils
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

from tqdm import tqdm

import math

import os
import pandas as pd

In [None]:
# Global RNG seed used for torch / numpy below.
seed = 1
# Prefer the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tuning Parameters

In [None]:
# --- optimisation ---
learning_rate = 5e-3  # peak learning rate (0.1 originally; 0.001 / 0.01 are other candidates)
epochs = 80           # number of passes over the training set
grad_norm = 1         # norm threshold passed to the gradient-clipping helpers

# --- data loading ---
batch_size_train = 100  # mini-batch size for the training loader
batch_size_test = 128   # mini-batch size for the validation / test loaders

# --- model ---
nhidden = 32  # dimension of the hidden state

# Other Parameters

In [None]:
ninp = 14  # number of input features fed to the RNN at each step
nout = 10  # number of outputs produced by the RNN classifier

## Reproducibility: seed both RNGs and force deterministic cuDNN kernels
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
## Fixed random permutation of the 56 sequence positions
## (a flattened 784-step variant would use torch.randperm(784) instead)
perm = torch.randperm(n=56)


# Create a Dataset

In [None]:
def get_data(bs_train,bs_test):
    """Build the MNIST train / validation / test data loaders.

    The official training split is downloaded to ``data/`` if necessary and
    randomly divided into 57000 training and 3000 validation samples.

    Args:
        bs_train: batch size of the (shuffled) training loader.
        bs_test: batch size of the (unshuffled) validation and test loaders.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``.
    """
    DataLoader = torch.utils.data.DataLoader
    to_tensor = transforms.ToTensor()

    full_train = torchvision.datasets.MNIST(
        root='data/', train=True, transform=to_tensor, download=True)
    test_part = torchvision.datasets.MNIST(
        root='data/', train=False, transform=to_tensor)

    # Hold out 3000 of the 60000 training images for validation.
    train_part, valid_part = torch.utils.data.random_split(full_train, [57000, 3000])

    # Only the training loader reshuffles its samples.
    loaders = (
        DataLoader(dataset=train_part, batch_size=bs_train, shuffle=True),
        DataLoader(dataset=valid_part, batch_size=bs_test, shuffle=False),
        DataLoader(dataset=test_part, batch_size=bs_test, shuffle=False),
    )
    return loaders

In [None]:
# Materialise the three loaders with the batch sizes chosen above.
train_loader, valid_loader, test_loader = get_data(bs_train=batch_size_train,
                                                   bs_test=batch_size_test)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw



# Create a Model

In [None]:
class rnnCell(nn.Module):
    """Single-step tanh recurrent cell: ``h_new = tanh(W1(h) + U1(x))``.

    Args:
        ninp: size of the input vector ``x`` at each step.
        nhid: size of the hidden state ``h``.
    """

    def __init__(self, ninp, nhid):
        super().__init__()
        self.ninp, self.nhid = ninp, nhid

        # Creation order (W1, then U1) is kept so seeded initialisation is unchanged.
        self.W1 = nn.Linear(nhid, nhid)  # hidden-to-hidden map
        self.U1 = nn.Linear(ninp, nhid)  # input-to-hidden map
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw every weight and bias from U(-1/sqrt(nhid), 1/sqrt(nhid))."""
        bound = 1.0 / math.sqrt(self.nhid)
        with torch.no_grad():
            for param in self.parameters():
                param.uniform_(-bound, bound)

    def forward(self, h, x):
        """Advance the hidden state ``h`` by one step given the input ``x``."""
        pre_activation = self.W1(h) + self.U1(x)
        return torch.tanh(pre_activation)

class RNN(nn.Module):
    """Recurrent classifier: unrolls an ``rnnCell`` over dim 0 of the input
    and maps the final hidden state to ``nout`` outputs with a linear layer.

    Args:
        ninp: size of the input vector at each step.
        nhid: size of the hidden state.
        nout: number of outputs of the final linear layer.
    """

    def __init__(self, ninp, nhid, nout):
        super().__init__()
        self.nhid = nhid
        self.ninp = ninp

        self.cell = rnnCell(ninp, nhid)
        self.classifier = nn.Linear(nhid, nout)

        self.init_weights()

    def init_weights(self):
        """Apply Kaiming-normal init to parameters whose name contains both
        'classifier' and 'weight'; all other parameters are left untouched."""
        selected = [
            param
            for name, param in self.named_parameters()
            if 'classifier' in name and 'weight' in name
        ]
        for param in selected:
            nn.init.kaiming_normal_(param.data)

    def forward(self, input):
        """Run the cell over ``input`` and classify the last hidden state.

        ``input`` is iterated along dim 0, and ``input.size(1)`` sets the number
        of rows of the hidden state, so it is laid out as (steps, batch, features).
        """
        # Zero initial hidden state with the same dtype and device as the input.
        h = input.new_zeros(input.size(1), self.nhid)

        for x in input:
            h = self.cell(h, x)

        # Final decoder mapping the hidden state to the target scores.
        return self.classifier(h)



# Create a new model

In [None]:
def grad_clipping(model, anorm):
    """Rescale all gradients of ``model`` in place so their global L2 norm is at most ``anorm``.

    Parameters without a gradient (``p.grad is None``) are skipped, and the
    function is a no-op when no parameter has a gradient yet.

    Args:
        model: module whose parameter gradients are clipped.
        anorm: maximum allowed global gradient norm.
    """
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    if not grads:
        return

    # Gradient surgery must not be recorded by autograd.
    with torch.no_grad():
        norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
        if norm > anorm:
            scale = anorm / norm
            for g in grads:
                g.mul_(scale)


def grad_CR(model, anorm):
    """Back-propagate the penalty ``(||p|| - 1)**2`` for the first 32x32 parameter.

    Args:
        model: module whose parameters are scanned in ``model.parameters()`` order.
        anorm: unused by this function.

    Returns:
        ``p.grad`` of the first parameter whose size is exactly ``[32, 32]``, as it
        stands after the penalty's ``backward`` call; ``None`` (implicitly) when no
        parameter has that size.
    """
    for p in model.parameters():
      # Shape is hard-coded; presumably this targets the hidden-to-hidden weight
      # when the hidden size is 32 -- TODO confirm / derive from the model.
      if p.size() == torch.Size([32, 32]):
        norm = (torch.norm(p)-1)**2 
        # backward() accumulates into p.grad; the returned object is p.grad itself,
        # not a copy.
        norm.backward(retain_graph=True)
        return p.grad

def grad_CR_clipping(model, anorm,tensor_grad):  
    """Overwrite the gradient of every 32x32 parameter, then clip the global gradient norm.

    Args:
        model: module whose parameter gradients are modified in place.
        anorm: maximum allowed global L2 norm of all gradients.
        tensor_grad: tensor assigned as ``p.grad`` for each parameter of size ``[32, 32]``.
    """
    for p in model.parameters():
      # Hard-coded shape filter: every parameter of exactly this size is affected.
      if p.size() == torch.Size([32, 32]):
         p.grad = tensor_grad
    # Global L2 norm over all parameter gradients (every p.grad must be set here;
    # a None gradient would raise).
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in model.parameters()))
    if norm > anorm:
        for param in model.parameters():
            # In-place rescale so the global norm becomes anorm.
            param.grad[:] *= anorm / norm

def grad_log(model):
    """Log-compress, in place, the gradients of every 32x32 parameter of ``model``.

    For each parameter ``p`` of size ``[32, 32]`` that has a gradient, with
    ``norm = ||p|| / p.numel()``:

    1. gradient entries equal to 0 are replaced by ``norm``;
    2. entries whose magnitude lies outside ``[exp(-norm), exp(norm)]`` are
       replaced by ``sign(g) * log(|g|)``; all other entries are kept.

    The element-wise rule is applied with vectorised tensor ops under
    ``torch.no_grad()``, so the gradient never becomes part of an autograd
    graph and no per-element Python loop is needed.

    Args:
        model: module whose parameter gradients are transformed.
    """
    target_shape = torch.Size([32, 32])
    with torch.no_grad():
        for p in model.parameters():
            if p.size() != target_shape or p.grad is None:
                continue
            norm = torch.norm(p) / p.numel()
            # Step 1: exact zeros take the value ``norm``.
            grad = torch.where(p.grad == 0, norm, p.grad)
            magnitude = grad.abs()
            # Step 2: signed log for entries outside the [exp(-norm), exp(norm)] band.
            outside = (magnitude < torch.exp(-norm)) | (magnitude > torch.exp(norm))
            compressed = torch.log(magnitude) * torch.sign(grad)
            p.grad.copy_(torch.where(outside, compressed, grad))



In [None]:
# Build the recurrent classifier and move its parameters to the chosen device.
model = RNN(ninp=ninp, nhid=nhidden, nout=nout)
model = model.to(device)

In [None]:
#==============================================================================
# Model summary: parameter count (in thousands) followed by the module tree
#==============================================================================
n_params = sum(p.numel() for p in model.parameters())
for line in ('**** Setup ****',
             'Total params: %.4fK' % (n_params/1000.0),
             '************'):
    print(line)
print(model)


**** Setup ****
Total params: 1.8660K
************
RNN(
  (cell): rnnCell(
    (W1): Linear(in_features=32, out_features=32, bias=True)
    (U1): Linear(in_features=14, out_features=32, bias=True)
  )
  (classifier): Linear(in_features=32, out_features=10, bias=True)
)


In [None]:
# Show (||p|| - 1)^2 for every parameter of size 32x32.
for p in model.parameters():
    if p.size() != torch.Size([32, 32]):
        continue
    penalty = (torch.norm(p)-1)**2
    print(penalty)


tensor(5.0220, device='cuda:0', grad_fn=<PowBackward0>)


# Define Objective

In [None]:
# Cross-entropy on the classifier outputs, optimised with Adam.
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Start Training

In [None]:
def get_lr(step, total_steps, lr_max, lr_min):
    """Cosine-annealed value between ``lr_max`` (at step 0) and ``lr_min`` (at ``total_steps``).

    Args:
        step: current step index.
        total_steps: number of steps over which the half cosine is spread.
        lr_max: value returned at ``step == 0``.
        lr_min: value returned at ``step == total_steps``.
    """
    phase = step / total_steps * np.pi
    half_range = (lr_max - lr_min) * 0.5
    return lr_min + half_range * (1 + np.cos(phase))

def _lr_factor(step):
    """Multiplicative LR factor at ``step``: cosine decay from 1 to 1e-8 / learning_rate."""
    total_steps = epochs * len(train_loader)
    # LambdaLR multiplies the base LR by this factor, hence the max/min of
    # 1 and 1e-8 / learning_rate rather than absolute learning rates.
    return get_lr(step, total_steps, 1, 1e-8 / learning_rate)


scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=_lr_factor)


In [None]:
def test(data_loader):
    """Return the accuracy (in percent) of the global ``model`` on ``data_loader``.

    Puts ``model`` in eval mode and applies the same reshape / permute /
    ``perm`` preprocessing as the training loop.

    Args:
        data_loader: loader yielding ``(images, labels)`` batches and exposing
            ``.dataset`` (its length is the accuracy denominator).
    """
    model.eval()
    n_correct = 0
    with torch.no_grad():
        for batch_images, batch_labels in data_loader:
            ## Reshape each image to (14, 56), then bring the length-56 axis to
            ## the front so the model iterates over it; finally apply the fixed
            ## position permutation.
            seq = batch_images.reshape(batch_images.size(0), 14, 56).permute(2, 0, 1)
            seq = seq[perm, :, :]

            scores = model(seq.to(device))
            _, pred = scores.data.max(1, keepdim=True)
            targets = batch_labels.data.view_as(pred).to(device)
            n_correct += pred.eq(targets).sum()
    accuracy = 100. * n_correct / len(data_loader.dataset)
    return accuracy.item()

In [None]:
# Model selection state: the reported test accuracy is the one measured at the
# epoch with the best validation accuracy.
best_epoch = 0
best_eval = 0.
final_test_acc = 0.  # defined up front so the final summary never hits an unbound name
epochlog = []        # one [valid_acc, test_acc] pair per epoch

print('****')
print('start training')
print('****')
for epoch in range(epochs):
    model.train()
    for images, labels in tqdm(train_loader):
        ## Reshape images for sequence learning: (batch, 14, 56) -> (56, batch, 14),
        ## then apply the fixed permutation of the 56 sequence positions.
        #images = images.reshape(images.size(0), 1, 784)
        images = images.reshape(images.size(0), 14, 56)

        images = images.permute(2, 0, 1)
        images = images[perm, :, :]

        optimizer.zero_grad()
        # Call the module itself (not .forward) so registered hooks run.
        output = model(images.to(device))
        #tensor_grad = grad_CR(model, grad_norm)
        loss = objective(output, labels.to(device))
        loss.backward()
        # Alternative gradient treatments kept for experimentation:
        #nn.utils.clip_grad_norm_(model.parameters(), grad_norm)
        #grad_CR_clipping(model, grad_norm,tensor_grad) 
        grad_log(model)
        optimizer.step()
        scheduler.step()  # the LR schedule advances once per mini-batch

    valid_acc = test(valid_loader)
    test_acc = test(test_loader)
    if (valid_acc > best_eval):
        best_eval = valid_acc
        final_test_acc = test_acc
        best_epoch = epoch

    print("epoch: %s, val accuracy: %.2f, test accuracy: %.2f" % (epoch, valid_acc, test_acc))     
    epochlog.append([valid_acc, test_acc])


print('****')
print("Final test accuracy: %.2f, obtained at epoch: %s," % (final_test_acc, best_epoch))      
print('****')
print(' ')  


****
start training
****


  4%|▎         | 20/570 [00:06<02:59,  3.07it/s]


KeyboardInterrupt: ignored

In [None]:
# Debug aid: print the shape of each slice along dim 0 of the last training batch.
steps = torch.unbind(images.to(device), dim=0)
for t, x in enumerate(steps):
    print(t,x.shape)

0 torch.Size([100, 14])
1 torch.Size([100, 14])
2 torch.Size([100, 14])
3 torch.Size([100, 14])
4 torch.Size([100, 14])
5 torch.Size([100, 14])
6 torch.Size([100, 14])
7 torch.Size([100, 14])
8 torch.Size([100, 14])
9 torch.Size([100, 14])
10 torch.Size([100, 14])
11 torch.Size([100, 14])
12 torch.Size([100, 14])
13 torch.Size([100, 14])
14 torch.Size([100, 14])
15 torch.Size([100, 14])
16 torch.Size([100, 14])
17 torch.Size([100, 14])
18 torch.Size([100, 14])
19 torch.Size([100, 14])
20 torch.Size([100, 14])
21 torch.Size([100, 14])
22 torch.Size([100, 14])
23 torch.Size([100, 14])
24 torch.Size([100, 14])
25 torch.Size([100, 14])
26 torch.Size([100, 14])
27 torch.Size([100, 14])
28 torch.Size([100, 14])
29 torch.Size([100, 14])
30 torch.Size([100, 14])
31 torch.Size([100, 14])
32 torch.Size([100, 14])
33 torch.Size([100, 14])
34 torch.Size([100, 14])
35 torch.Size([100, 14])
36 torch.Size([100, 14])
37 torch.Size([100, 14])
38 torch.Size([100, 14])
39 torch.Size([100, 14])
40 torch.S

# Data Storage

In [None]:
# Results live in ../data/RNNor.csv; make sure the directory exists first.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'RNNor.csv')

In [None]:
# Append this run's per-epoch accuracies as two new columns of the results CSV.
# On the very first run the file does not exist yet, so start from an empty frame
# instead of failing in read_csv.
csv_data = pd.read_csv(data_file) if os.path.exists(data_file) else pd.DataFrame()
csv_df = pd.DataFrame(csv_data)
csv_df['RNNv0.005log'] = [row[0] for row in epochlog]   # validation accuracy per epoch
csv_df['RNNt0.005log'] = [row[-1] for row in epochlog]  # test accuracy per epoch
csv_df.to_csv(data_file,index = None)

# Read the file back to confirm what was stored.
data = pd.read_csv(data_file)
print(data)